diff options
Diffstat (limited to '0085-Revert-all-remove-code-which-is-not-being-considered.patch')
-rw-r--r-- | 0085-Revert-all-remove-code-which-is-not-being-considered.patch | 8976 |
1 files changed, 8976 insertions, 0 deletions
diff --git a/0085-Revert-all-remove-code-which-is-not-being-considered.patch b/0085-Revert-all-remove-code-which-is-not-being-considered.patch new file mode 100644 index 0000000..6addaff --- /dev/null +++ b/0085-Revert-all-remove-code-which-is-not-being-considered.patch @@ -0,0 +1,8976 @@ +From 379b9f7247a4daac9545e3dec79d3c2660111d8d Mon Sep 17 00:00:00 2001 +From: Hari Gowtham <hgowtham@redhat.com> +Date: Mon, 8 Apr 2019 11:32:09 +0530 +Subject: [PATCH 085/124] Revert "all: remove code which is not being + considered in build" + +This reverts most part of commit 8293d21280fd6ddfc9bb54068cf87794fc6be207. +It adds in the changes for tier and CTR with the neccesary changes for building it. + +Label: DOWNSTREAM ONLY + +Change-Id: I8f7978618f2a6a949b09dbcfd25722494cb8f1cd +Signed-off-by: Hari Gowtham <hgowtham@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/166245 +Reviewed-by: Nithya Balachandran <nbalacha@redhat.com> +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + Makefile.am | 8 +- + configure.ac | 34 + + glusterfs.spec.in | 19 + + libglusterfs/Makefile.am | 4 +- + libglusterfs/src/glusterfs/mem-types.h | 1 + + xlators/cluster/dht/src/Makefile.am | 14 +- + xlators/cluster/dht/src/dht-rebalance.c | 12 + + xlators/cluster/dht/src/tier-common.c | 1199 ++++++++ + xlators/cluster/dht/src/tier-common.h | 55 + + xlators/cluster/dht/src/tier.c | 3105 ++++++++++++++++++++ + xlators/cluster/dht/src/tier.h | 110 + + xlators/features/Makefile.am | 2 +- + xlators/features/changetimerecorder/Makefile.am | 3 + + .../features/changetimerecorder/src/Makefile.am | 26 + + .../changetimerecorder/src/changetimerecorder.c | 2371 +++++++++++++++ + .../changetimerecorder/src/changetimerecorder.h | 21 + + .../features/changetimerecorder/src/ctr-helper.c | 293 ++ + .../features/changetimerecorder/src/ctr-helper.h | 854 ++++++ + .../features/changetimerecorder/src/ctr-messages.h | 61 + + 
.../changetimerecorder/src/ctr-xlator-ctx.c | 362 +++ + .../changetimerecorder/src/ctr-xlator-ctx.h | 68 + + .../changetimerecorder/src/ctr_mem_types.h | 22 + + 22 files changed, 8637 insertions(+), 7 deletions(-) + create mode 100644 xlators/cluster/dht/src/tier-common.c + create mode 100644 xlators/cluster/dht/src/tier-common.h + create mode 100644 xlators/cluster/dht/src/tier.c + create mode 100644 xlators/cluster/dht/src/tier.h + create mode 100644 xlators/features/changetimerecorder/Makefile.am + create mode 100644 xlators/features/changetimerecorder/src/Makefile.am + create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.c + create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.h + create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.c + create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.h + create mode 100644 xlators/features/changetimerecorder/src/ctr-messages.h + create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.c + create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.h + create mode 100644 xlators/features/changetimerecorder/src/ctr_mem_types.h + +diff --git a/Makefile.am b/Makefile.am +index e0c795f..613382f 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -3,7 +3,7 @@ SOURCES = site.h + EXTRA_DIST = autogen.sh \ + COPYING-GPLV2 COPYING-LGPLV3 COMMITMENT \ + INSTALL README.md AUTHORS THANKS NEWS \ +- glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in \ ++ glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in libgfdb.pc.in \ + run-tests.sh \ + build-aux/pkg-version \ + contrib/umountd \ +@@ -15,8 +15,12 @@ SUBDIRS = $(ARGP_STANDALONE_DIR) rpc/xdr/gen libglusterfs rpc api xlators \ + + pkgconfigdir = @pkgconfigdir@ + pkgconfig_DATA = glusterfs-api.pc libgfchangelog.pc ++if USE_GFDB ++pkgconfig_DATA += libgfdb.pc ++endif + +-CLEANFILES = glusterfs-api.pc libgfchangelog.pc contrib/umountd/Makefile ++CLEANFILES = 
glusterfs-api.pc libgfchangelog.pc libgfdb.pc \ ++ contrib/umountd/Makefile + + gitclean: distclean + find . -name Makefile.in -exec rm -f {} \; +diff --git a/configure.ac b/configure.ac +index baa811a..633e850 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -30,6 +30,7 @@ AC_CONFIG_HEADERS([config.h site.h]) + AC_CONFIG_FILES([Makefile + libglusterfs/Makefile + libglusterfs/src/Makefile ++ libglusterfs/src/gfdb/Makefile + geo-replication/src/peer_gsec_create + geo-replication/src/peer_mountbroker + geo-replication/src/peer_mountbroker.py +@@ -121,6 +122,8 @@ AC_CONFIG_FILES([Makefile + xlators/features/changelog/src/Makefile + xlators/features/changelog/lib/Makefile + xlators/features/changelog/lib/src/Makefile ++ xlators/features/changetimerecorder/Makefile ++ xlators/features/changetimerecorder/src/Makefile + xlators/features/locks/Makefile + xlators/features/locks/src/Makefile + xlators/features/quota/Makefile +@@ -237,6 +240,7 @@ AC_CONFIG_FILES([Makefile + contrib/umountd/Makefile + glusterfs-api.pc + libgfchangelog.pc ++ libgfdb.pc + api/Makefile + api/src/Makefile + api/examples/Makefile +@@ -866,6 +870,33 @@ AM_CONDITIONAL([USE_FIREWALLD],test ["x${BUILD_FIREWALLD}" = "xyes"]) + + #endof firewald section + ++# Data tiering requires sqlite ++AC_ARG_ENABLE([tiering], ++ AC_HELP_STRING([--disable-tiering], ++ [Disable data classification/tiering]), ++ [BUILD_GFDB="${enableval}"], [BUILD_GFDB="yes"]) ++ ++case $host_os in ++ darwin*) ++ SQLITE_LIBS="-lsqlite3" ++ AC_CHECK_HEADERS([sqlite3.h], AC_DEFINE(USE_GFDB, 1)) ++ ;; ++ *) ++ if test "x${BUILD_GFDB}" = "xyes"; then ++ PKG_CHECK_MODULES([SQLITE], [sqlite3], ++ AC_DEFINE(USE_GFDB, 1), ++ AC_MSG_ERROR([pass --disable-tiering to build without sqlite])) ++ else ++ AC_DEFINE(USE_GFDB, 0, [no sqlite, gfdb is disabled]) ++ fi ++ ;; ++esac ++ ++AC_SUBST(SQLITE_CFLAGS) ++AC_SUBST(SQLITE_LIBS) ++AM_CONDITIONAL(BUILD_GFDB, test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes") ++AM_CONDITIONAL(USE_GFDB, 
test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes") ++ + # xml-output + AC_ARG_ENABLE([xml-output], + AC_HELP_STRING([--disable-xml-output], +@@ -1544,6 +1575,8 @@ GFAPI_VERSION="7."${PACKAGE_VERSION} + LIBGFCHANGELOG_VERSION="0.0.1" + AC_SUBST(GFAPI_VERSION) + AC_SUBST(LIBGFCHANGELOG_VERSION) ++LIBGFDB_VERSION="0.0.1" ++AC_SUBST(LIBGFDB_VERSION) + + dnl libtool versioning + LIBGFXDR_LT_VERSION="0:1:0" +@@ -1584,6 +1617,7 @@ echo "XML output : $BUILD_XML_OUTPUT" + echo "Unit Tests : $BUILD_UNITTEST" + echo "Track priv ports : $TRACK_PRIVPORTS" + echo "POSIX ACLs : $BUILD_POSIX_ACLS" ++echo "Data Classification : $BUILD_GFDB" + echo "firewalld-config : $BUILD_FIREWALLD" + echo "Events : $BUILD_EVENTS" + echo "EC dynamic support : $EC_DYNAMIC_SUPPORT" +diff --git a/glusterfs.spec.in b/glusterfs.spec.in +index 2149f86..e0607ba 100644 +--- a/glusterfs.spec.in ++++ b/glusterfs.spec.in +@@ -154,6 +154,7 @@ + %global _without_events --disable-events + %global _without_georeplication --disable-georeplication + %global _with_gnfs %{nil} ++%global _without_tiering --disable-tiering + %global _without_ocf --without-ocf + %endif + +@@ -287,6 +288,9 @@ BuildRequires: libuuid-devel + %if ( 0%{?_with_cmocka:1} ) + BuildRequires: libcmocka-devel >= 1.0.1 + %endif ++%if ( 0%{!?_without_tiering:1} ) ++BuildRequires: sqlite-devel ++%endif + %if ( 0%{!?_without_georeplication:1} ) + BuildRequires: libattr-devel + %endif +@@ -797,6 +801,7 @@ export LDFLAGS + %{?_without_rdma} \ + %{?_without_server} \ + %{?_without_syslog} \ ++ %{?_without_tiering} \ + %{?_with_ipv6default} \ + %{?_without_libtirpc} + +@@ -1232,9 +1237,15 @@ exit 0 + %if ( 0%{?_without_server:1} ) + %exclude %{_libdir}/pkgconfig/libgfchangelog.pc + %exclude %{_libdir}/libgfchangelog.so ++%if ( 0%{!?_without_tiering:1} ) ++%{_libdir}/pkgconfig/libgfdb.pc ++%endif + %else + %{_libdir}/pkgconfig/libgfchangelog.pc + %{_libdir}/libgfchangelog.so ++%if ( 0%{!?_without_tiering:1} ) ++%{_libdir}/pkgconfig/libgfdb.pc 
++%endif + %endif + + %files client-xlators +@@ -1330,6 +1341,10 @@ exit 0 + %files libs + %{_libdir}/*.so.* + %exclude %{_libdir}/libgfapi.* ++%if ( 0%{!?_without_tiering:1} ) ++# libgfdb is only needed server-side ++%exclude %{_libdir}/libgfdb.* ++%endif + + %files -n python%{_pythonver}-gluster + # introducing glusterfs module in site packages. +@@ -1417,6 +1432,10 @@ exit 0 + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bit-rot.so + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bitrot-stub.so + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/sdfs.so ++%if ( 0%{!?_without_tiering:1} ) ++ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/changetimerecorder.so ++ %{_libdir}/libgfdb.so.* ++%endif + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/index.so + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/locks.so + %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/posix* +diff --git a/libglusterfs/Makefile.am b/libglusterfs/Makefile.am +index d471a3f..7e72f61 100644 +--- a/libglusterfs/Makefile.am ++++ b/libglusterfs/Makefile.am +@@ -1,3 +1,3 @@ +-SUBDIRS = src ++SUBDIRS = src src/gfdb + +-CLEANFILES = ++CLEANFILES = +diff --git a/libglusterfs/src/glusterfs/mem-types.h b/libglusterfs/src/glusterfs/mem-types.h +index 832f68c..92730a9 100644 +--- a/libglusterfs/src/glusterfs/mem-types.h ++++ b/libglusterfs/src/glusterfs/mem-types.h +@@ -138,6 +138,7 @@ enum gf_common_mem_types_ { + gf_common_volfile_t, + gf_common_mt_mgmt_v3_lock_timer_t, /* used only in one location */ + gf_common_mt_server_cmdline_t, /* used only in one location */ ++ gf_mt_gfdb_query_record_t, + gf_common_mt_end + }; + #endif +diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am +index 56f1f2a..5532047 100644 +--- a/xlators/cluster/dht/src/Makefile.am ++++ b/xlators/cluster/dht/src/Makefile.am +@@ -1,4 +1,7 @@ + xlator_LTLIBRARIES = dht.la nufa.la switch.la ++if BUILD_GFDB 
++ xlator_LTLIBRARIES += tier.la ++endif + + AM_CFLAGS = -Wall $(GF_CFLAGS) + +@@ -13,6 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c + + nufa_la_SOURCES = $(dht_common_source) nufa.c + switch_la_SOURCES = $(dht_common_source) switch.c ++tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c + + dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la +@@ -23,15 +27,21 @@ nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) + switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la + ++tier_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) $(LIB_DL) ++tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la ++ + noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \ +- dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h ++ dht-lock.h tier-common.h tier.h \ ++ $(top_builddir)/xlators/lib/src/libxlator.h + + AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ ++ -I$(top_srcdir)/libglusterfs/src/gfdb \ + -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ + -I$(top_srcdir)/rpc/rpc-lib/src \ + -I$(top_srcdir)/xlators/lib/src \ + -DDATADIR=\"$(localstatedir)\" \ +- -DLIBDIR=\"$(libdir)\" ++ -DLIBDIR=\"$(libdir)\" \ ++ -DLIBGFDB_VERSION=\"$(LIBGFDB_VERSION)\" + + CLEANFILES = + +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index e0f25b1..efbe8a4 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -8,6 +8,7 @@ + cases as published by the Free Software Foundation. 
+ */ + ++#include "tier.h" + #include "dht-common.h" + #include <glusterfs/xlator.h> + #include <glusterfs/syscall.h> +@@ -2134,6 +2135,17 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to, + } + } + ++ /* store size of previous migrated file */ ++ if (defrag && defrag->tier_conf.is_tier) { ++ if (from != TIER_HASHED_SUBVOL) { ++ defrag->tier_conf.st_last_promoted_size = stbuf.ia_size; ++ } else { ++ /* Don't delete the linkto file on the hashed subvol */ ++ delete_src_linkto = _gf_false; ++ defrag->tier_conf.st_last_demoted_size = stbuf.ia_size; ++ } ++ } ++ + /* The src file is being unlinked after this so we don't need + to clean it up */ + clean_src = _gf_false; +diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c +new file mode 100644 +index 0000000..b22f477 +--- /dev/null ++++ b/xlators/cluster/dht/src/tier-common.c +@@ -0,0 +1,1199 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. 
++*/ ++ ++#include <glusterfs/glusterfs.h> ++#include <glusterfs/xlator.h> ++#include "libxlator.h" ++#include "dht-common.h" ++#include <glusterfs/defaults.h> ++#include "tier-common.h" ++#include "tier.h" ++ ++int ++dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata); ++ ++int ++tier_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ loc_t *oldloc = NULL; ++ loc_t *newloc = NULL; ++ ++ local = frame->local; ++ ++ oldloc = &local->loc; ++ newloc = &local->loc2; ++ ++ if (op_ret == -1) { ++ /* No continuation on DHT inode missing errors, as we should ++ * then have a good stbuf that states P2 happened. We would ++ * get inode missing if, the file completed migrated between ++ * the lookup and the link call */ ++ goto out; ++ } ++ ++ if (local->call_cnt != 1) { ++ goto out; ++ } ++ ++ local->call_cnt = 2; ++ ++ /* Do this on the hot tier now */ ++ ++ STACK_WIND(frame, tier_link_cbk, local->cached_subvol, ++ local->cached_subvol->fops->link, oldloc, newloc, xdata); ++ ++ return 0; ++ ++out: ++ DHT_STRIP_PHASE1_FLAGS(stbuf); ++ ++ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent, ++ postparent, NULL); ++ ++ return 0; ++} ++ ++int ++tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ xlator_t *cached_subvol = NULL; ++ xlator_t *hashed_subvol = NULL; ++ int op_errno = -1; ++ int ret = -1; ++ dht_local_t *local = NULL; ++ dht_conf_t *conf = NULL; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(oldloc, err); ++ VALIDATE_OR_GOTO(newloc, err); ++ ++ conf = this->private; ++ ++ local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK); ++ if 
(!local) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ local->call_cnt = 1; ++ ++ cached_subvol = local->cached_subvol; ++ ++ if (!cached_subvol) { ++ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", ++ oldloc->path); ++ op_errno = ENOENT; ++ goto err; ++ } ++ ++ hashed_subvol = TIER_HASHED_SUBVOL; ++ ++ ret = loc_copy(&local->loc2, newloc); ++ if (ret == -1) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ if (hashed_subvol == cached_subvol) { ++ STACK_WIND(frame, dht_link_cbk, cached_subvol, ++ cached_subvol->fops->link, oldloc, newloc, xdata); ++ return 0; ++ } ++ ++ /* Create hardlinks to both the data file on the hot tier ++ and the linkto file on the cold tier */ ++ ++ gf_uuid_copy(local->gfid, oldloc->inode->gfid); ++ ++ STACK_WIND(frame, tier_link_cbk, hashed_subvol, hashed_subvol->fops->link, ++ oldloc, newloc, xdata); ++ ++ return 0; ++err: ++ op_errno = (op_errno == -1) ? errno : op_errno; ++ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL); ++ return 0; ++} ++ ++int ++tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int op_ret, int op_errno, ++ struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ ++ local = frame->local; ++ ++ if (local->params) { ++ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); ++ } ++ ++ DHT_STACK_UNWIND(create, frame, -1, local->op_errno, NULL, NULL, NULL, NULL, ++ NULL, NULL); ++ ++ return 0; ++} ++ ++int ++tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata) ++{ ++ xlator_t *prev = NULL; ++ int ret = -1; ++ dht_local_t *local = NULL; ++ xlator_t *hashed_subvol = NULL; ++ dht_conf_t *conf = NULL; ++ ++ local = frame->local; ++ conf = this->private; ++ ++ hashed_subvol = TIER_HASHED_SUBVOL; ++ ++ if (!local) { ++ op_ret = -1; ++ op_errno = EINVAL; 
++ goto out; ++ } ++ ++ if (op_ret == -1) { ++ if (local->linked == _gf_true && local->xattr_req) { ++ local->op_errno = op_errno; ++ local->op_ret = op_ret; ++ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file( ++ local->xattr_req); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value to " ++ "unlink of migrating file"); ++ goto out; ++ } ++ ++ STACK_WIND(frame, tier_create_unlink_stale_linkto_cbk, ++ hashed_subvol, hashed_subvol->fops->unlink, &local->loc, ++ 0, local->xattr_req); ++ return 0; ++ } ++ goto out; ++ } ++ ++ prev = cookie; ++ ++ if (local->loc.parent) { ++ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0); ++ ++ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1); ++ } ++ ++ ret = dht_layout_preset(this, prev, inode); ++ if (ret != 0) { ++ gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s", ++ prev->name); ++ op_ret = -1; ++ op_errno = EINVAL; ++ goto out; ++ } ++ ++ local->op_errno = op_errno; ++ ++ if (local->linked == _gf_true) { ++ local->stbuf = *stbuf; ++ dht_linkfile_attr_heal(frame, this); ++ } ++out: ++ if (local) { ++ if (local->xattr_req) { ++ dict_del(local->xattr_req, TIER_LINKFILE_GFID); ++ } ++ } ++ ++ DHT_STRIP_PHASE1_FLAGS(stbuf); ++ ++ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf, ++ preparent, postparent, xdata); ++ ++ return 0; ++} ++ ++int ++tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, ++ struct iatt *stbuf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *cached_subvol = NULL; ++ dht_conf_t *conf = NULL; ++ int ret = -1; ++ unsigned char *gfid = NULL; ++ ++ local = frame->local; ++ if (!local) { ++ op_errno = EINVAL; ++ goto err; ++ } ++ ++ if (op_ret == -1) { ++ local->op_errno = op_errno; ++ goto err; ++ } ++ ++ conf = this->private; 
++ if (!conf) { ++ local->op_errno = EINVAL; ++ op_errno = EINVAL; ++ goto err; ++ } ++ ++ cached_subvol = TIER_UNHASHED_SUBVOL; ++ ++ if (local->params) { ++ dict_del(local->params, conf->link_xattr_name); ++ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY); ++ } ++ ++ /* ++ * We will delete the linkfile if data file creation fails. ++ * When deleting this stale linkfile, there is a possibility ++ * for a race between this linkfile deletion and a stale ++ * linkfile deletion triggered by another lookup from different ++ * client. ++ * ++ * For eg: ++ * ++ * Client 1 Client 2 ++ * ++ * 1 linkfile created for foo ++ * ++ * 2 data file creation failed ++ * ++ * 3 creating a file with same name ++ * ++ * 4 lookup before creation deleted ++ * the linkfile created by client1 ++ * considering as a stale linkfile. ++ * ++ * 5 New linkfile created for foo ++ * with different gfid. ++ * ++ * 6 Trigger linkfile deletion as ++ * data file creation failed. ++ * ++ * 7 Linkfile deleted which is ++ * created by client2. ++ * ++ * 8 Data file created. ++ * ++ * With this race, we will end up having a file in a non-hashed subvol ++ * without a linkfile in hashed subvol. ++ * ++ * To avoid this, we store the gfid of linkfile created by client, So ++ * If we delete the linkfile , we validate gfid of existing file with ++ * stored value from posix layer. ++ * ++ * Storing this value in local->xattr_req as local->params was also used ++ * to create the data file. During the linkfile deletion we will use ++ * local->xattr_req dictionary. 
++ */ ++ if (!local->xattr_req) { ++ local->xattr_req = dict_new(); ++ if (!local->xattr_req) { ++ local->op_errno = ENOMEM; ++ op_errno = ENOMEM; ++ goto err; ++ } ++ } ++ ++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char); ++ if (!gfid) { ++ local->op_errno = ENOMEM; ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ gf_uuid_copy(gfid, stbuf->ia_gfid); ++ ret = dict_set_dynptr(local->xattr_req, TIER_LINKFILE_GFID, gfid, ++ sizeof(uuid_t)); ++ if (ret) { ++ GF_FREE(gfid); ++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value" ++ " : key = %s", ++ TIER_LINKFILE_GFID); ++ } ++ ++ STACK_WIND_COOKIE(frame, tier_create_cbk, cached_subvol, cached_subvol, ++ cached_subvol->fops->create, &local->loc, local->flags, ++ local->mode, local->umask, local->fd, local->params); ++ ++ return 0; ++err: ++ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, ++ NULL); ++ return 0; ++} ++ ++gf_boolean_t ++tier_is_hot_tier_decommissioned(xlator_t *this) ++{ ++ dht_conf_t *conf = NULL; ++ xlator_t *hot_tier = NULL; ++ int i = 0; ++ ++ conf = this->private; ++ hot_tier = conf->subvolumes[1]; ++ ++ if (conf->decommission_subvols_cnt) { ++ for (i = 0; i < conf->subvolume_cnt; i++) { ++ if (conf->decommissioned_bricks[i] && ++ conf->decommissioned_bricks[i] == hot_tier) ++ return _gf_true; ++ } ++ } ++ ++ return _gf_false; ++} ++ ++int ++tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *params) ++{ ++ int op_errno = -1; ++ dht_local_t *local = NULL; ++ dht_conf_t *conf = NULL; ++ xlator_t *hot_subvol = NULL; ++ xlator_t *cold_subvol = NULL; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(loc, err); ++ ++ conf = this->private; ++ ++ dht_get_du_info(frame, this, loc); ++ ++ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE); ++ if (!local) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ cold_subvol = 
TIER_HASHED_SUBVOL; ++ hot_subvol = TIER_UNHASHED_SUBVOL; ++ ++ if (conf->subvolumes[0] != cold_subvol) { ++ hot_subvol = conf->subvolumes[0]; ++ } ++ /* ++ * if hot tier full, write to cold. ++ * Also if hot tier is full, create in cold ++ */ ++ if (dht_is_subvol_filled(this, hot_subvol) || ++ tier_is_hot_tier_decommissioned(this)) { ++ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path, ++ cold_subvol->name); ++ ++ STACK_WIND_COOKIE(frame, tier_create_cbk, cold_subvol, cold_subvol, ++ cold_subvol->fops->create, loc, flags, mode, umask, ++ fd, params); ++ } else { ++ local->params = dict_ref(params); ++ local->flags = flags; ++ local->mode = mode; ++ local->umask = umask; ++ local->cached_subvol = hot_subvol; ++ local->hashed_subvol = cold_subvol; ++ ++ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", loc->path, ++ hot_subvol->name, cold_subvol->name); ++ ++ dht_linkfile_create(frame, tier_create_linkfile_create_cbk, this, ++ hot_subvol, cold_subvol, loc); ++ ++ goto out; ++ } ++out: ++ return 0; ++ ++err: ++ ++ op_errno = (op_errno == -1) ? 
errno : op_errno; ++ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL, ++ NULL); ++ ++ return 0; ++} ++ ++int ++tier_unlink_nonhashed_linkfile_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int op_ret, int op_errno, ++ struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *prev = NULL; ++ ++ local = frame->local; ++ prev = cookie; ++ ++ LOCK(&frame->lock); ++ { ++ if ((op_ret == -1) && (op_errno != ENOENT)) { ++ local->op_errno = op_errno; ++ local->op_ret = op_ret; ++ gf_msg_debug(this->name, op_errno, ++ "Unlink link: subvolume %s" ++ " returned -1", ++ prev->name); ++ goto unlock; ++ } ++ ++ local->op_ret = 0; ++ } ++unlock: ++ UNLOCK(&frame->lock); ++ ++ if (local->op_ret == -1) ++ goto err; ++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preparent, &local->postparent, NULL); ++ ++ return 0; ++ ++err: ++ DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL); ++ return 0; ++} ++ ++int ++tier_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int op_ret, int op_errno, inode_t *inode, ++ struct iatt *preparent, dict_t *xdata, ++ struct iatt *postparent) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *prev = NULL; ++ dht_conf_t *conf = NULL; ++ xlator_t *hot_subvol = NULL; ++ ++ local = frame->local; ++ prev = cookie; ++ conf = this->private; ++ hot_subvol = TIER_UNHASHED_SUBVOL; ++ ++ if (!op_ret) { ++ /* ++ * linkfile present on hot tier. 
unlinking the linkfile ++ */ ++ STACK_WIND_COOKIE(frame, tier_unlink_nonhashed_linkfile_cbk, hot_subvol, ++ hot_subvol, hot_subvol->fops->unlink, &local->loc, ++ local->flags, NULL); ++ return 0; ++ } ++ ++ LOCK(&frame->lock); ++ { ++ if (op_errno == ENOENT) { ++ local->op_ret = 0; ++ local->op_errno = op_errno; ++ } else { ++ local->op_ret = op_ret; ++ local->op_errno = op_errno; ++ } ++ gf_msg_debug(this->name, op_errno, "Lookup : subvolume %s returned -1", ++ prev->name); ++ } ++ ++ UNLOCK(&frame->lock); ++ ++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preparent, &local->postparent, xdata); ++ ++ return 0; ++} ++ ++int ++tier_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int op_ret, int op_errno, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *prev = NULL; ++ ++ local = frame->local; ++ prev = cookie; ++ ++ LOCK(&frame->lock); ++ { ++ /* Ignore EINVAL for tier to ignore error when the file ++ does not exist on the other tier */ ++ if ((op_ret == -1) && !((op_errno == ENOENT) || (op_errno == EINVAL))) { ++ local->op_errno = op_errno; ++ local->op_ret = op_ret; ++ gf_msg_debug(this->name, op_errno, ++ "Unlink link: subvolume %s" ++ " returned -1", ++ prev->name); ++ goto unlock; ++ } ++ ++ local->op_ret = 0; ++ } ++unlock: ++ UNLOCK(&frame->lock); ++ ++ if (local->op_ret == -1) ++ goto err; ++ ++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preparent, &local->postparent, xdata); ++ ++ return 0; ++ ++err: ++ DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL); ++ return 0; ++} ++ ++int32_t ++tier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, struct iatt *preparent, struct iatt *postparent, ++ dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ xlator_t *prev = NULL; ++ struct iatt *stbuf = NULL; ++ dht_conf_t *conf = NULL; ++ int ret = -1; ++ 
xlator_t *hot_tier = NULL; ++ xlator_t *cold_tier = NULL; ++ ++ local = frame->local; ++ prev = cookie; ++ conf = this->private; ++ ++ cold_tier = TIER_HASHED_SUBVOL; ++ hot_tier = TIER_UNHASHED_SUBVOL; ++ ++ LOCK(&frame->lock); ++ { ++ if (op_ret == -1) { ++ if (op_errno == ENOENT) { ++ local->op_ret = 0; ++ } else { ++ local->op_ret = -1; ++ local->op_errno = op_errno; ++ } ++ gf_msg_debug(this->name, op_errno, ++ "Unlink: subvolume %s returned -1" ++ " with errno = %d", ++ prev->name, op_errno); ++ goto unlock; ++ } ++ ++ local->op_ret = 0; ++ ++ local->postparent = *postparent; ++ local->preparent = *preparent; ++ ++ if (local->loc.parent) { ++ dht_inode_ctx_time_update(local->loc.parent, this, ++ &local->preparent, 0); ++ dht_inode_ctx_time_update(local->loc.parent, this, ++ &local->postparent, 1); ++ } ++ } ++unlock: ++ UNLOCK(&frame->lock); ++ ++ if (local->op_ret) ++ goto out; ++ ++ if (cold_tier != local->cached_subvol) { ++ /* ++ * File is present in hot tier, so there will be ++ * a link file on cold tier, deleting the linkfile ++ * from cold tier ++ */ ++ STACK_WIND_COOKIE(frame, tier_unlink_linkfile_cbk, cold_tier, cold_tier, ++ cold_tier->fops->unlink, &local->loc, local->flags, ++ xdata); ++ return 0; ++ } ++ ++ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf); ++ if (!ret && stbuf && ++ ((IS_DHT_MIGRATION_PHASE2(stbuf)) || IS_DHT_MIGRATION_PHASE1(stbuf))) { ++ /* ++ * File is migrating from cold to hot tier. ++ * Delete the destination linkfile. 
++ */ ++ STACK_WIND_COOKIE(frame, tier_unlink_lookup_cbk, hot_tier, hot_tier, ++ hot_tier->fops->lookup, &local->loc, NULL); ++ return 0; ++ } ++ ++out: ++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno, ++ &local->preparent, &local->postparent, xdata); ++ ++ return 0; ++} ++ ++int ++tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata) ++{ ++ xlator_t *cached_subvol = NULL; ++ xlator_t *hashed_subvol = NULL; ++ dht_conf_t *conf = NULL; ++ int op_errno = -1; ++ dht_local_t *local = NULL; ++ int ret = -1; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(loc, err); ++ ++ conf = this->private; ++ ++ local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK); ++ if (!local) { ++ op_errno = ENOMEM; ++ ++ goto err; ++ } ++ ++ hashed_subvol = TIER_HASHED_SUBVOL; ++ ++ cached_subvol = local->cached_subvol; ++ if (!cached_subvol) { ++ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s", ++ loc->path); ++ op_errno = EINVAL; ++ goto err; ++ } ++ ++ local->flags = xflag; ++ if (IA_ISREG(loc->inode->ia_type) && (hashed_subvol == cached_subvol)) { ++ /* ++ * File resides in cold tier. We need to stat ++ * the file to see if it is being promoted. ++ * If yes we need to delete the destination ++ * file as well. ++ * ++ * Currently we are doing this check only for ++ * regular files. ++ */ ++ xdata = xdata ? dict_ref(xdata) : dict_new(); ++ if (xdata) { ++ ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1); ++ if (ret) { ++ gf_msg_debug(this->name, 0, "Failed to set dictionary key %s", ++ DHT_IATT_IN_XDATA_KEY); ++ } ++ } ++ } ++ ++ /* ++ * File is on hot tier, delete the data file first, then ++ * linkfile from cold. ++ */ ++ STACK_WIND_COOKIE(frame, tier_unlink_cbk, cached_subvol, cached_subvol, ++ cached_subvol->fops->unlink, loc, xflag, xdata); ++ if (xdata) ++ dict_unref(xdata); ++ return 0; ++err: ++ op_errno = (op_errno == -1) ? 
errno : op_errno; ++ DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL); ++ ++ return 0; ++} ++ ++int ++tier_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) ++{ ++ gf_dirent_t entries; ++ gf_dirent_t *orig_entry = NULL; ++ gf_dirent_t *entry = NULL; ++ int count = 0; ++ ++ INIT_LIST_HEAD(&entries.list); ++ ++ if (op_ret < 0) ++ goto unwind; ++ ++ list_for_each_entry(orig_entry, (&orig_entries->list), list) ++ { ++ entry = gf_dirent_for_name(orig_entry->d_name); ++ if (!entry) { ++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, ++ "Memory allocation failed "); ++ goto unwind; ++ } ++ ++ entry->d_off = orig_entry->d_off; ++ entry->d_ino = orig_entry->d_ino; ++ entry->d_type = orig_entry->d_type; ++ entry->d_len = orig_entry->d_len; ++ ++ list_add_tail(&entry->list, &entries.list); ++ count++; ++ } ++ op_ret = count; ++ ++unwind: ++ if (op_ret < 0) ++ op_ret = 0; ++ ++ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL); ++ ++ gf_dirent_free(&entries); ++ ++ return 0; ++} ++ ++int ++tier_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ gf_dirent_t entries; ++ gf_dirent_t *orig_entry = NULL; ++ gf_dirent_t *entry = NULL; ++ xlator_t *prev = NULL; ++ xlator_t *next_subvol = NULL; ++ off_t next_offset = 0; ++ int count = 0; ++ dht_conf_t *conf = NULL; ++ int ret = 0; ++ inode_table_t *itable = NULL; ++ inode_t *inode = NULL; ++ ++ INIT_LIST_HEAD(&entries.list); ++ prev = cookie; ++ local = frame->local; ++ itable = local->fd ? 
local->fd->inode->table : NULL; ++ ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO(this->name, conf, unwind); ++ ++ if (op_ret < 0) ++ goto done; ++ ++ list_for_each_entry(orig_entry, (&orig_entries->list), list) ++ { ++ next_offset = orig_entry->d_off; ++ ++ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) { ++ /*stat failed somewhere- ignore this entry*/ ++ continue; ++ } ++ ++ entry = gf_dirent_for_name(orig_entry->d_name); ++ if (!entry) { ++ goto unwind; ++ } ++ ++ entry->d_off = orig_entry->d_off; ++ entry->d_stat = orig_entry->d_stat; ++ entry->d_ino = orig_entry->d_ino; ++ entry->d_type = orig_entry->d_type; ++ entry->d_len = orig_entry->d_len; ++ ++ if (orig_entry->dict) ++ entry->dict = dict_ref(orig_entry->dict); ++ ++ if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict, ++ conf->link_xattr_name)) { ++ goto entries; ++ ++ } else if (IA_ISDIR(entry->d_stat.ia_type)) { ++ if (orig_entry->inode) { ++ dht_inode_ctx_time_update(orig_entry->inode, this, ++ &entry->d_stat, 1); ++ } ++ } else { ++ if (orig_entry->inode) { ++ ret = dht_layout_preset(this, prev, orig_entry->inode); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, ++ DHT_MSG_LAYOUT_SET_FAILED, ++ "failed to link the layout " ++ "in inode"); ++ ++ entry->inode = inode_ref(orig_entry->inode); ++ } else if (itable) { ++ /* ++ * orig_entry->inode might be null if any upper ++ * layer xlators below client set to null, to ++ * force a lookup on the inode even if the inode ++ * is present in the inode table. In that case ++ * we just update the ctx to make sure we didn't ++ * missed anything. 
++ */ ++ inode = inode_find(itable, orig_entry->d_stat.ia_gfid); ++ if (inode) { ++ ret = dht_layout_preset(this, TIER_HASHED_SUBVOL, inode); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, ++ DHT_MSG_LAYOUT_SET_FAILED, ++ "failed to link the layout" ++ " in inode"); ++ inode_unref(inode); ++ inode = NULL; ++ } ++ } ++ } ++ ++ entries: ++ list_add_tail(&entry->list, &entries.list); ++ count++; ++ } ++ op_ret = count; ++ ++done: ++ if (count == 0) { ++ /* non-zero next_offset means that ++ EOF is not yet hit on the current subvol ++ */ ++ if (next_offset != 0) { ++ next_subvol = prev; ++ } else { ++ goto unwind; ++ } ++ ++ STACK_WIND_COOKIE(frame, tier_readdirp_cbk, next_subvol, next_subvol, ++ next_subvol->fops->readdirp, local->fd, local->size, ++ next_offset, local->xattr); ++ return 0; ++ } ++ ++unwind: ++ if (op_ret < 0) ++ op_ret = 0; ++ ++ DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL); ++ ++ gf_dirent_free(&entries); ++ ++ return 0; ++} ++ ++int ++tier_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t yoff, int whichop, dict_t *dict) ++{ ++ dht_local_t *local = NULL; ++ int op_errno = -1; ++ xlator_t *hashed_subvol = NULL; ++ int ret = 0; ++ dht_conf_t *conf = NULL; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(fd, err); ++ VALIDATE_OR_GOTO(this->private, err); ++ ++ conf = this->private; ++ ++ local = dht_local_init(frame, NULL, NULL, whichop); ++ if (!local) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ local->fd = fd_ref(fd); ++ local->size = size; ++ local->xattr_req = (dict) ? 
dict_ref(dict) : NULL; ++ ++ hashed_subvol = TIER_HASHED_SUBVOL; ++ ++ /* TODO: do proper readdir */ ++ if (whichop == GF_FOP_READDIRP) { ++ if (dict) ++ local->xattr = dict_ref(dict); ++ else ++ local->xattr = dict_new(); ++ ++ if (local->xattr) { ++ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256); ++ if (ret) ++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED, ++ "Failed to set dictionary value" ++ " : key = %s", ++ conf->link_xattr_name); ++ } ++ ++ STACK_WIND_COOKIE(frame, tier_readdirp_cbk, hashed_subvol, ++ hashed_subvol, hashed_subvol->fops->readdirp, fd, ++ size, yoff, local->xattr); ++ ++ } else { ++ STACK_WIND_COOKIE(frame, tier_readdir_cbk, hashed_subvol, hashed_subvol, ++ hashed_subvol->fops->readdir, fd, size, yoff, ++ local->xattr); ++ } ++ ++ return 0; ++ ++err: ++ op_errno = (op_errno == -1) ? errno : op_errno; ++ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL); ++ ++ return 0; ++} ++ ++int ++tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t yoff, dict_t *xdata) ++{ ++ int op = GF_FOP_READDIR; ++ dht_conf_t *conf = NULL; ++ int i = 0; ++ ++ conf = this->private; ++ if (!conf) ++ goto out; ++ ++ for (i = 0; i < conf->subvolume_cnt; i++) { ++ if (!conf->subvolume_status[i]) { ++ op = GF_FOP_READDIRP; ++ break; ++ } ++ } ++ ++ if (conf->use_readdirp) ++ op = GF_FOP_READDIRP; ++ ++out: ++ tier_do_readdir(frame, this, fd, size, yoff, op, 0); ++ return 0; ++} ++ ++int ++tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t yoff, dict_t *dict) ++{ ++ tier_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict); ++ return 0; ++} ++ ++int ++tier_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, struct statvfs *statvfs, dict_t *xdata) ++{ ++ gf_boolean_t event = _gf_false; ++ qdstatfs_action_t action = qdstatfs_action_OFF; ++ dht_local_t *local = NULL; ++ int this_call_cnt = 0; ++ int bsize = 0; ++ int frsize = 
0; ++ GF_UNUSED int ret = 0; ++ unsigned long new_usage = 0; ++ unsigned long cur_usage = 0; ++ xlator_t *prev = NULL; ++ dht_conf_t *conf = NULL; ++ tier_statvfs_t *tier_stat = NULL; ++ ++ prev = cookie; ++ local = frame->local; ++ GF_ASSERT(local); ++ ++ conf = this->private; ++ ++ if (xdata) ++ ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event); ++ ++ tier_stat = &local->tier_statvfs; ++ ++ LOCK(&frame->lock); ++ { ++ if (op_ret == -1) { ++ local->op_errno = op_errno; ++ goto unlock; ++ } ++ if (!statvfs) { ++ op_errno = EINVAL; ++ local->op_ret = -1; ++ goto unlock; ++ } ++ local->op_ret = 0; ++ ++ if (local->quota_deem_statfs) { ++ if (event == _gf_true) { ++ action = qdstatfs_action_COMPARE; ++ } else { ++ action = qdstatfs_action_NEGLECT; ++ } ++ } else { ++ if (event == _gf_true) { ++ action = qdstatfs_action_REPLACE; ++ local->quota_deem_statfs = _gf_true; ++ } ++ } ++ ++ if (local->quota_deem_statfs) { ++ switch (action) { ++ case qdstatfs_action_NEGLECT: ++ goto unlock; ++ ++ case qdstatfs_action_REPLACE: ++ local->statvfs = *statvfs; ++ goto unlock; ++ ++ case qdstatfs_action_COMPARE: ++ new_usage = statvfs->f_blocks - statvfs->f_bfree; ++ cur_usage = local->statvfs.f_blocks - ++ local->statvfs.f_bfree; ++ ++ /* Take the max of the usage from subvols */ ++ if (new_usage >= cur_usage) ++ local->statvfs = *statvfs; ++ goto unlock; ++ ++ default: ++ break; ++ } ++ } ++ ++ if (local->statvfs.f_bsize != 0) { ++ bsize = max(local->statvfs.f_bsize, statvfs->f_bsize); ++ frsize = max(local->statvfs.f_frsize, statvfs->f_frsize); ++ dht_normalize_stats(&local->statvfs, bsize, frsize); ++ dht_normalize_stats(statvfs, bsize, frsize); ++ } else { ++ local->statvfs.f_bsize = statvfs->f_bsize; ++ local->statvfs.f_frsize = statvfs->f_frsize; ++ } ++ ++ if (prev == TIER_HASHED_SUBVOL) { ++ local->statvfs.f_blocks = statvfs->f_blocks; ++ local->statvfs.f_files = statvfs->f_files; ++ local->statvfs.f_fsid = statvfs->f_fsid; ++ local->statvfs.f_flag = 
statvfs->f_flag; ++ local->statvfs.f_namemax = statvfs->f_namemax; ++ tier_stat->blocks_used = (statvfs->f_blocks - statvfs->f_bfree); ++ tier_stat->pblocks_used = (statvfs->f_blocks - statvfs->f_bavail); ++ tier_stat->files_used = (statvfs->f_files - statvfs->f_ffree); ++ tier_stat->pfiles_used = (statvfs->f_files - statvfs->f_favail); ++ tier_stat->hashed_fsid = statvfs->f_fsid; ++ } else { ++ tier_stat->unhashed_fsid = statvfs->f_fsid; ++ tier_stat->unhashed_blocks_used = (statvfs->f_blocks - ++ statvfs->f_bfree); ++ tier_stat->unhashed_pblocks_used = (statvfs->f_blocks - ++ statvfs->f_bavail); ++ tier_stat->unhashed_files_used = (statvfs->f_files - ++ statvfs->f_ffree); ++ tier_stat->unhashed_pfiles_used = (statvfs->f_files - ++ statvfs->f_favail); ++ } ++ } ++unlock: ++ UNLOCK(&frame->lock); ++ ++ this_call_cnt = dht_frame_return(frame); ++ if (is_last_call(this_call_cnt)) { ++ if (tier_stat->unhashed_fsid != tier_stat->hashed_fsid) { ++ tier_stat->blocks_used += tier_stat->unhashed_blocks_used; ++ tier_stat->pblocks_used += tier_stat->unhashed_pblocks_used; ++ tier_stat->files_used += tier_stat->unhashed_files_used; ++ tier_stat->pfiles_used += tier_stat->unhashed_pfiles_used; ++ } ++ local->statvfs.f_bfree = local->statvfs.f_blocks - ++ tier_stat->blocks_used; ++ local->statvfs.f_bavail = local->statvfs.f_blocks - ++ tier_stat->pblocks_used; ++ local->statvfs.f_ffree = local->statvfs.f_files - tier_stat->files_used; ++ local->statvfs.f_favail = local->statvfs.f_files - ++ tier_stat->pfiles_used; ++ DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno, ++ &local->statvfs, xdata); ++ } ++ ++ return 0; ++} ++ ++int ++tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) ++{ ++ dht_local_t *local = NULL; ++ dht_conf_t *conf = NULL; ++ int op_errno = -1; ++ int i = -1; ++ inode_t *inode = NULL; ++ inode_table_t *itable = NULL; ++ uuid_t root_gfid = { ++ 0, ++ }; ++ loc_t newloc = { ++ 0, ++ }; ++ ++ VALIDATE_OR_GOTO(frame, err); ++ 
VALIDATE_OR_GOTO(this, err); ++ VALIDATE_OR_GOTO(loc, err); ++ VALIDATE_OR_GOTO(this->private, err); ++ ++ conf = this->private; ++ ++ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS); ++ if (!local) { ++ op_errno = ENOMEM; ++ goto err; ++ } ++ ++ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) { ++ itable = loc->inode->table; ++ if (!itable) { ++ op_errno = EINVAL; ++ goto err; ++ } ++ ++ loc = &local->loc2; ++ root_gfid[15] = 1; ++ ++ inode = inode_find(itable, root_gfid); ++ if (!inode) { ++ op_errno = EINVAL; ++ goto err; ++ } ++ ++ dht_build_root_loc(inode, &newloc); ++ loc = &newloc; ++ } ++ ++ local->call_cnt = conf->subvolume_cnt; ++ ++ for (i = 0; i < conf->subvolume_cnt; i++) { ++ STACK_WIND_COOKIE(frame, tier_statfs_cbk, conf->subvolumes[i], ++ conf->subvolumes[i], ++ conf->subvolumes[i]->fops->statfs, loc, xdata); ++ } ++ ++ return 0; ++ ++err: ++ op_errno = (op_errno == -1) ? errno : op_errno; ++ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL); ++ ++ return 0; ++} +diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h +new file mode 100644 +index 0000000..b1ebaa8 +--- /dev/null ++++ b/xlators/cluster/dht/src/tier-common.h +@@ -0,0 +1,55 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. 
++*/ ++ ++#ifndef _TIER_COMMON_H_ ++#define _TIER_COMMON_H_ ++/* Function definitions */ ++int ++tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int op_ret, int op_errno, ++ struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata); ++ ++int ++tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata); ++ ++int ++tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie, ++ xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, ++ struct iatt *stbuf, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata); ++ ++int ++tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *params); ++ ++int32_t ++tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata); ++ ++int32_t ++tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t off, dict_t *dict); ++ ++int ++tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, ++ off_t yoff, dict_t *xdata); ++ ++int ++tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata); ++ ++int ++tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata); ++ ++#endif +diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c +new file mode 100644 +index 0000000..94b4c63 +--- /dev/null ++++ b/xlators/cluster/dht/src/tier.c +@@ -0,0 +1,3105 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. 
++*/ ++ ++#include <dlfcn.h> ++ ++#include "dht-common.h" ++#include "tier.h" ++#include "tier-common.h" ++#include <glusterfs/syscall.h> ++#include <glusterfs/events.h> ++#include "tier-ctr-interface.h" ++ ++/*Hard coded DB info*/ ++static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3; ++/*Hard coded DB info*/ ++ ++/*Mutex for updating the data movement stats*/ ++static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER; ++ ++/* Stores the path location of promotion query files */ ++static char *promotion_qfile; ++/* Stores the path location of demotion query files */ ++static char *demotion_qfile; ++ ++static void *libhandle; ++static gfdb_methods_t gfdb_methods; ++ ++#define DB_QUERY_RECORD_SIZE 4096 ++ ++/* ++ * Closes all the fds and frees the qfile_array ++ * */ ++static void ++qfile_array_free(tier_qfile_array_t *qfile_array) ++{ ++ ssize_t i = 0; ++ ++ if (qfile_array) { ++ if (qfile_array->fd_array) { ++ for (i = 0; i < qfile_array->array_size; i++) { ++ if (qfile_array->fd_array[i] != -1) { ++ sys_close(qfile_array->fd_array[i]); ++ } ++ } ++ } ++ GF_FREE(qfile_array->fd_array); ++ } ++ GF_FREE(qfile_array); ++} ++ ++/* Create a new query file list with given size */ ++static tier_qfile_array_t * ++qfile_array_new(ssize_t array_size) ++{ ++ int ret = -1; ++ tier_qfile_array_t *qfile_array = NULL; ++ ssize_t i = 0; ++ ++ GF_VALIDATE_OR_GOTO("tier", (array_size > 0), out); ++ ++ qfile_array = GF_CALLOC(1, sizeof(tier_qfile_array_t), ++ gf_tier_mt_qfile_array_t); ++ if (!qfile_array) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to allocate memory for tier_qfile_array_t"); ++ goto out; ++ } ++ ++ qfile_array->fd_array = GF_MALLOC(array_size * sizeof(int), ++ gf_dht_mt_int32_t); ++ if (!qfile_array->fd_array) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to allocate memory for " ++ "tier_qfile_array_t->fd_array"); ++ goto out; ++ } ++ ++ /* Init all the fds to -1 */ ++ for (i = 0; i < array_size; 
i++) { ++ qfile_array->fd_array[i] = -1; ++ } ++ ++ qfile_array->array_size = array_size; ++ qfile_array->next_index = 0; ++ ++ /* Set exhausted count to list size as the list is empty */ ++ qfile_array->exhausted_count = qfile_array->array_size; ++ ++ ret = 0; ++out: ++ if (ret) { ++ qfile_array_free(qfile_array); ++ qfile_array = NULL; ++ } ++ return qfile_array; ++} ++ ++/* Checks if the query file list is empty or totally exhausted. */ ++static gf_boolean_t ++is_qfile_array_empty(tier_qfile_array_t *qfile_array) ++{ ++ return (qfile_array->exhausted_count == qfile_array->array_size) ++ ? _gf_true ++ : _gf_false; ++} ++ ++/* Shifts the next_fd pointer to the next available fd in the list */ ++static void ++shift_next_index(tier_qfile_array_t *qfile_array) ++{ ++ int qfile_fd = 0; ++ int spin_count = 0; ++ ++ if (is_qfile_array_empty(qfile_array)) { ++ return; ++ } ++ ++ do { ++ /* change next_index in a rotional manner */ ++ (qfile_array->next_index == (qfile_array->array_size - 1)) ++ ? qfile_array->next_index = 0 ++ : qfile_array->next_index++; ++ ++ qfile_fd = (qfile_array->fd_array[qfile_array->next_index]); ++ ++ spin_count++; ++ ++ } while ((qfile_fd == -1) && (spin_count < qfile_array->array_size)); ++} ++ ++/* ++ * This is a non-thread safe function to read query records ++ * from a list of query files in a Round-Robin manner. ++ * As in when the query files get exhuasted they are closed. ++ * Returns: ++ * 0 if all the query records in all the query files of the list are ++ * exhausted. ++ * > 0 if a query record is successfully read. Indicates the size of the query ++ * record read. 
++ * < 0 if there was failure ++ * */ ++static int ++read_query_record_list(tier_qfile_array_t *qfile_array, ++ gfdb_query_record_t **query_record) ++{ ++ int ret = -1; ++ int qfile_fd = 0; ++ ++ GF_VALIDATE_OR_GOTO("tier", qfile_array, out); ++ GF_VALIDATE_OR_GOTO("tier", qfile_array->fd_array, out); ++ ++ do { ++ if (is_qfile_array_empty(qfile_array)) { ++ ret = 0; ++ break; ++ } ++ ++ qfile_fd = qfile_array->fd_array[qfile_array->next_index]; ++ ret = gfdb_methods.gfdb_read_query_record(qfile_fd, query_record); ++ if (ret <= 0) { ++ /*The qfile_fd has reached EOF or ++ * there was an error. ++ * 1. Close the exhausted fd ++ * 2. increment the exhausted count ++ * 3. shift next_qfile to next qfile ++ **/ ++ sys_close(qfile_fd); ++ qfile_array->fd_array[qfile_array->next_index] = -1; ++ qfile_array->exhausted_count++; ++ /* shift next_qfile to next qfile */ ++ shift_next_index(qfile_array); ++ continue; ++ } else { ++ /* shift next_qfile to next qfile */ ++ shift_next_index(qfile_array); ++ break; ++ } ++ } while (1); ++out: ++ return ret; ++} ++ ++/* Check and update the watermark every WM_INTERVAL seconds */ ++#define WM_INTERVAL 5 ++#define WM_INTERVAL_EMERG 1 ++ ++static int ++tier_check_same_node(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag) ++{ ++ int ret = -1; ++ dict_t *dict = NULL; ++ char *uuid_str = NULL; ++ uuid_t node_uuid = { ++ 0, ++ }; ++ ++ GF_VALIDATE_OR_GOTO("tier", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, loc, out); ++ GF_VALIDATE_OR_GOTO(this->name, defrag, out); ++ ++ if (syncop_getxattr(this, loc, &dict, GF_XATTR_NODE_UUID_KEY, NULL, NULL)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Unable to get NODE_UUID_KEY %s %s\n", loc->name, loc->path); ++ goto out; ++ } ++ ++ if (dict_get_str(dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to get node-uuids for %s", loc->path); ++ goto out; ++ } ++ ++ if (gf_uuid_parse(uuid_str, 
node_uuid)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "uuid_parse failed for %s", loc->path); ++ goto out; ++ } ++ ++ if (gf_uuid_compare(node_uuid, defrag->node_uuid)) { ++ gf_msg_debug(this->name, 0, "%s does not belong to this node", ++ loc->path); ++ ret = 1; ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ if (dict) ++ dict_unref(dict); ++ ++ return ret; ++} ++ ++int ++tier_get_fs_stat(xlator_t *this, loc_t *root_loc) ++{ ++ int ret = 0; ++ gf_defrag_info_t *defrag = NULL; ++ dht_conf_t *conf = NULL; ++ dict_t *xdata = NULL; ++ struct statvfs statfs = { ++ 0, ++ }; ++ gf_tier_conf_t *tier_conf = NULL; ++ ++ conf = this->private; ++ if (!conf) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS, ++ "conf is NULL"); ++ ret = -1; ++ goto exit; ++ } ++ ++ defrag = conf->defrag; ++ if (!defrag) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS, ++ "defrag is NULL"); ++ ret = -1; ++ goto exit; ++ } ++ ++ tier_conf = &defrag->tier_conf; ++ ++ xdata = dict_new(); ++ if (!xdata) { ++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY, ++ "failed to allocate dictionary"); ++ ret = -1; ++ goto exit; ++ } ++ ++ ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED, ++ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict"); ++ ret = -1; ++ goto exit; ++ } ++ ++ /* Find how much free space is on the hot subvolume. ++ * Then see if that value */ ++ /* is less than or greater than user defined watermarks. ++ * Stash results in */ ++ /* the tier_conf data structure. 
*/ ++ ++ ret = syncop_statfs(conf->subvolumes[1], root_loc, &statfs, xdata, NULL); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_STATUS, ++ "Unable to obtain statfs."); ++ goto exit; ++ } ++ ++ pthread_mutex_lock(&dm_stat_mutex); ++ ++ tier_conf->block_size = statfs.f_bsize; ++ tier_conf->blocks_total = statfs.f_blocks; ++ tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree; ++ ++ tier_conf->percent_full = GF_PERCENTAGE(tier_conf->blocks_used, ++ statfs.f_blocks); ++ pthread_mutex_unlock(&dm_stat_mutex); ++ ++exit: ++ if (xdata) ++ dict_unref(xdata); ++ return ret; ++} ++ ++static void ++tier_send_watermark_event(const char *volname, tier_watermark_op_t old_wm, ++ tier_watermark_op_t new_wm) ++{ ++ if (old_wm == TIER_WM_LOW || old_wm == TIER_WM_NONE) { ++ if (new_wm == TIER_WM_MID) { ++ gf_event(EVENT_TIER_WATERMARK_RAISED_TO_MID, "vol=%s", volname); ++ } else if (new_wm == TIER_WM_HI) { ++ gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname); ++ } ++ } else if (old_wm == TIER_WM_MID) { ++ if (new_wm == TIER_WM_LOW) { ++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname); ++ } else if (new_wm == TIER_WM_HI) { ++ gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname); ++ } ++ } else if (old_wm == TIER_WM_HI) { ++ if (new_wm == TIER_WM_MID) { ++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_MID, "vol=%s", volname); ++ } else if (new_wm == TIER_WM_LOW) { ++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname); ++ } ++ } ++} ++ ++int ++tier_check_watermark(xlator_t *this) ++{ ++ int ret = -1; ++ gf_defrag_info_t *defrag = NULL; ++ dht_conf_t *conf = NULL; ++ gf_tier_conf_t *tier_conf = NULL; ++ tier_watermark_op_t wm = TIER_WM_NONE; ++ ++ conf = this->private; ++ if (!conf) ++ goto exit; ++ ++ defrag = conf->defrag; ++ if (!defrag) ++ goto exit; ++ ++ tier_conf = &defrag->tier_conf; ++ ++ if (tier_conf->percent_full < tier_conf->watermark_low) { ++ wm = TIER_WM_LOW; ++ ++ } else if (tier_conf->percent_full < 
tier_conf->watermark_hi) { ++ wm = TIER_WM_MID; ++ ++ } else { ++ wm = TIER_WM_HI; ++ } ++ ++ if (wm != tier_conf->watermark_last) { ++ tier_send_watermark_event(tier_conf->volname, tier_conf->watermark_last, ++ wm); ++ ++ tier_conf->watermark_last = wm; ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Tier watermark now %d", wm); ++ } ++ ++ ret = 0; ++ ++exit: ++ return ret; ++} ++ ++static gf_boolean_t ++is_hot_tier_full(gf_tier_conf_t *tier_conf) ++{ ++ if (tier_conf && (tier_conf->mode == TIER_MODE_WM) && ++ (tier_conf->watermark_last == TIER_WM_HI)) ++ return _gf_true; ++ ++ return _gf_false; ++} ++ ++int ++tier_do_migration(xlator_t *this, int promote) ++{ ++ gf_defrag_info_t *defrag = NULL; ++ dht_conf_t *conf = NULL; ++ long rand = 0; ++ int migrate = 0; ++ gf_tier_conf_t *tier_conf = NULL; ++ ++ conf = this->private; ++ if (!conf) ++ goto exit; ++ ++ defrag = conf->defrag; ++ if (!defrag) ++ goto exit; ++ ++ if (tier_check_watermark(this) != 0) { ++ gf_msg(this->name, GF_LOG_CRITICAL, errno, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to get watermark"); ++ goto exit; ++ } ++ ++ tier_conf = &defrag->tier_conf; ++ ++ switch (tier_conf->watermark_last) { ++ case TIER_WM_LOW: ++ migrate = promote ? 1 : 0; ++ break; ++ case TIER_WM_HI: ++ migrate = promote ? 
0 : 1; ++ break; ++ case TIER_WM_MID: ++ /* coverity[DC.WEAK_CRYPTO] */ ++ rand = random() % 100; ++ if (promote) { ++ migrate = (rand > tier_conf->percent_full); ++ } else { ++ migrate = (rand <= tier_conf->percent_full); ++ } ++ break; ++ } ++ ++exit: ++ return migrate; ++} ++ ++int ++tier_migrate(xlator_t *this, int is_promotion, dict_t *migrate_data, loc_t *loc, ++ gf_tier_conf_t *tier_conf) ++{ ++ int ret = -1; ++ ++ pthread_mutex_lock(&tier_conf->pause_mutex); ++ if (is_promotion) ++ tier_conf->promote_in_progress = 1; ++ else ++ tier_conf->demote_in_progress = 1; ++ pthread_mutex_unlock(&tier_conf->pause_mutex); ++ ++ /* Data migration */ ++ ret = syncop_setxattr(this, loc, migrate_data, 0, NULL, NULL); ++ ++ pthread_mutex_lock(&tier_conf->pause_mutex); ++ if (is_promotion) ++ tier_conf->promote_in_progress = 0; ++ else ++ tier_conf->demote_in_progress = 0; ++ pthread_mutex_unlock(&tier_conf->pause_mutex); ++ ++ return ret; ++} ++ ++/* returns _gf_true: if file can be promoted ++ * returns _gf_false: if file cannot be promoted ++ */ ++static gf_boolean_t ++tier_can_promote_file(xlator_t *this, char const *file_name, ++ struct iatt *current, gf_defrag_info_t *defrag) ++{ ++ gf_boolean_t ret = _gf_false; ++ fsblkcnt_t estimated_usage = 0; ++ ++ if (defrag->tier_conf.tier_max_promote_size && ++ (current->ia_size > defrag->tier_conf.tier_max_promote_size)) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "File %s (gfid:%s) with size (%" PRIu64 ++ ") exceeds maxsize " ++ "(%d) for promotion. 
File will not be promoted.", ++ file_name, uuid_utoa(current->ia_gfid), current->ia_size, ++ defrag->tier_conf.tier_max_promote_size); ++ goto err; ++ } ++ ++ /* bypass further validations for TEST mode */ ++ if (defrag->tier_conf.mode != TIER_MODE_WM) { ++ ret = _gf_true; ++ goto err; ++ } ++ ++ /* convert the file size to blocks as per the block size of the ++ * destination tier ++ * NOTE: add (block_size - 1) to get the correct block size when ++ * there is a remainder after a modulo ++ */ ++ estimated_usage = ((current->ia_size + defrag->tier_conf.block_size - 1) / ++ defrag->tier_conf.block_size) + ++ defrag->tier_conf.blocks_used; ++ ++ /* test if the estimated block usage goes above HI watermark */ ++ if (GF_PERCENTAGE(estimated_usage, defrag->tier_conf.blocks_total) >= ++ defrag->tier_conf.watermark_hi) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Estimated block count consumption on " ++ "hot tier (%" PRIu64 ++ ") exceeds hi watermark (%d%%). " ++ "File will not be promoted.", ++ estimated_usage, defrag->tier_conf.watermark_hi); ++ goto err; ++ } ++ ret = _gf_true; ++err: ++ return ret; ++} ++ ++static int ++tier_set_migrate_data(dict_t *migrate_data) ++{ ++ int failed = 1; ++ ++ failed = dict_set_str(migrate_data, GF_XATTR_FILE_MIGRATE_KEY, "force"); ++ if (failed) { ++ goto bail_out; ++ } ++ ++ /* Flag to suggest the xattr call is from migrator */ ++ failed = dict_set_str(migrate_data, "from.migrator", "yes"); ++ if (failed) { ++ goto bail_out; ++ } ++ ++ /* Flag to suggest its a tiering migration ++ * The reason for this dic key-value is that ++ * promotions and demotions are multithreaded ++ * so the original frame from gf_defrag_start() ++ * is not carried. A new frame will be created when ++ * we do syncop_setxattr(). This does not have the ++ * frame->root->pid of the original frame. 
So we pass ++ * this dic key-value when we do syncop_setxattr() to do ++ * data migration and set the frame->root->pid to ++ * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before ++ * calling dht_start_rebalance_task() */ ++ failed = dict_set_str(migrate_data, TIERING_MIGRATION_KEY, "yes"); ++ if (failed) { ++ goto bail_out; ++ } ++ ++ failed = 0; ++ ++bail_out: ++ return failed; ++} ++ ++static char * ++tier_get_parent_path(xlator_t *this, loc_t *p_loc, struct iatt *par_stbuf, ++ int *per_link_status) ++{ ++ int ret = -1; ++ char *parent_path = NULL; ++ dict_t *xdata_request = NULL; ++ dict_t *xdata_response = NULL; ++ ++ xdata_request = dict_new(); ++ if (!xdata_request) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to create xdata_request dict"); ++ goto err; ++ } ++ ret = dict_set_int32(xdata_request, GET_ANCESTRY_PATH_KEY, 42); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to set value to dict : key %s \n", ++ GET_ANCESTRY_PATH_KEY); ++ goto err; ++ } ++ ++ ret = syncop_lookup(this, p_loc, par_stbuf, NULL, xdata_request, ++ &xdata_response); ++ /* When the parent gfid is a stale entry, the lookup ++ * will fail and stop the demotion process. 
++ * The parent gfid can be stale when a huge folder is ++ * deleted while the files within it are being migrated ++ */ ++ if (ret == -ESTALE) { ++ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP, ++ "Stale entry in parent lookup for %s", uuid_utoa(p_loc->gfid)); ++ *per_link_status = 1; ++ goto err; ++ } else if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR, ++ "Error in parent lookup for %s", uuid_utoa(p_loc->gfid)); ++ *per_link_status = -1; ++ goto err; ++ } ++ ret = dict_get_str(xdata_response, GET_ANCESTRY_PATH_KEY, &parent_path); ++ if (ret || !parent_path) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to get parent path for %s", uuid_utoa(p_loc->gfid)); ++ *per_link_status = -1; ++ goto err; ++ } ++ ++err: ++ if (xdata_request) { ++ dict_unref(xdata_request); ++ } ++ ++ if (xdata_response) { ++ dict_unref(xdata_response); ++ xdata_response = NULL; ++ } ++ ++ return parent_path; ++} ++ ++static int ++tier_get_file_name_and_path(xlator_t *this, uuid_t gfid, ++ gfdb_link_info_t *link_info, ++ char const *parent_path, loc_t *loc, ++ int *per_link_status) ++{ ++ int ret = -1; ++ ++ loc->name = gf_strdup(link_info->file_name); ++ if (!loc->name) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Memory " ++ "allocation failed for %s", ++ uuid_utoa(gfid)); ++ *per_link_status = -1; ++ goto err; ++ } ++ ret = gf_asprintf((char **)&(loc->path), "%s/%s", parent_path, loc->name); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to " ++ "construct file path for %s %s\n", ++ parent_path, loc->name); ++ *per_link_status = -1; ++ goto err; ++ } ++ ++ ret = 0; ++ ++err: ++ return ret; ++} ++ ++static int ++tier_lookup_file(xlator_t *this, loc_t *p_loc, loc_t *loc, struct iatt *current, ++ int *per_link_status) ++{ ++ int ret = -1; ++ ++ ret = syncop_lookup(this, loc, current, NULL, NULL, NULL); ++ ++ /* The file may be deleted even when the 
parent ++ * is available and the lookup will ++ * return a stale entry which would stop the ++ * migration. so if its a stale entry, then skip ++ * the file and keep migrating. ++ */ ++ if (ret == -ESTALE) { ++ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP, ++ "Stale lookup for %s", uuid_utoa(p_loc->gfid)); ++ *per_link_status = 1; ++ goto err; ++ } else if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to " ++ "lookup file %s\n", ++ loc->name); ++ *per_link_status = -1; ++ goto err; ++ } ++ ret = 0; ++ ++err: ++ return ret; ++} ++ ++static gf_boolean_t ++tier_is_file_already_at_destination(xlator_t *src_subvol, ++ query_cbk_args_t *query_cbk_args, ++ dht_conf_t *conf, int *per_link_status) ++{ ++ gf_boolean_t at_destination = _gf_true; ++ ++ if (src_subvol == NULL) { ++ *per_link_status = 1; ++ goto err; ++ } ++ if (query_cbk_args->is_promotion && src_subvol == conf->subvolumes[1]) { ++ *per_link_status = 1; ++ goto err; ++ } ++ ++ if (!query_cbk_args->is_promotion && src_subvol == conf->subvolumes[0]) { ++ *per_link_status = 1; ++ goto err; ++ } ++ at_destination = _gf_false; ++ ++err: ++ return at_destination; ++} ++ ++static void ++tier_update_migration_counters(query_cbk_args_t *query_cbk_args, ++ gf_defrag_info_t *defrag, ++ uint64_t *total_migrated_bytes, int *total_files) ++{ ++ if (query_cbk_args->is_promotion) { ++ defrag->total_files_promoted++; ++ *total_migrated_bytes += defrag->tier_conf.st_last_promoted_size; ++ pthread_mutex_lock(&dm_stat_mutex); ++ defrag->tier_conf.blocks_used += defrag->tier_conf ++ .st_last_promoted_size; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } else { ++ defrag->total_files_demoted++; ++ *total_migrated_bytes += defrag->tier_conf.st_last_demoted_size; ++ pthread_mutex_lock(&dm_stat_mutex); ++ defrag->tier_conf.blocks_used -= defrag->tier_conf.st_last_demoted_size; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } ++ if (defrag->tier_conf.blocks_total) { ++ 
pthread_mutex_lock(&dm_stat_mutex); ++ defrag->tier_conf.percent_full = GF_PERCENTAGE( ++ defrag->tier_conf.blocks_used, defrag->tier_conf.blocks_total); ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } ++ ++ (*total_files)++; ++} ++ ++static int ++tier_migrate_link(xlator_t *this, dht_conf_t *conf, uuid_t gfid, ++ gfdb_link_info_t *link_info, gf_defrag_info_t *defrag, ++ query_cbk_args_t *query_cbk_args, dict_t *migrate_data, ++ int *per_link_status, int *total_files, ++ uint64_t *total_migrated_bytes) ++{ ++ int ret = -1; ++ struct iatt current = { ++ 0, ++ }; ++ struct iatt par_stbuf = { ++ 0, ++ }; ++ loc_t p_loc = { ++ 0, ++ }; ++ loc_t loc = { ++ 0, ++ }; ++ xlator_t *src_subvol = NULL; ++ inode_t *linked_inode = NULL; ++ char *parent_path = NULL; ++ ++ /* Lookup for parent and get the path of parent */ ++ gf_uuid_copy(p_loc.gfid, link_info->pargfid); ++ p_loc.inode = inode_new(defrag->root_inode->table); ++ if (!p_loc.inode) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to create reference to inode" ++ " for %s", ++ uuid_utoa(p_loc.gfid)); ++ ++ *per_link_status = -1; ++ goto err; ++ } ++ ++ parent_path = tier_get_parent_path(this, &p_loc, &par_stbuf, ++ per_link_status); ++ if (!parent_path) { ++ goto err; ++ } ++ ++ linked_inode = inode_link(p_loc.inode, NULL, NULL, &par_stbuf); ++ inode_unref(p_loc.inode); ++ p_loc.inode = linked_inode; ++ ++ /* Preparing File Inode */ ++ gf_uuid_copy(loc.gfid, gfid); ++ loc.inode = inode_new(defrag->root_inode->table); ++ gf_uuid_copy(loc.pargfid, link_info->pargfid); ++ loc.parent = inode_ref(p_loc.inode); ++ ++ /* Get filename and Construct file path */ ++ if (tier_get_file_name_and_path(this, gfid, link_info, parent_path, &loc, ++ per_link_status) != 0) { ++ goto err; ++ } ++ gf_uuid_copy(loc.parent->gfid, link_info->pargfid); ++ ++ /* lookup file inode */ ++ if (tier_lookup_file(this, &p_loc, &loc, ¤t, per_link_status) != 0) { ++ goto err; ++ } ++ ++ if (query_cbk_args->is_promotion) { ++ 
if (!tier_can_promote_file(this, link_info->file_name, ¤t, ++ defrag)) { ++ *per_link_status = 1; ++ goto err; ++ } ++ } ++ ++ linked_inode = inode_link(loc.inode, NULL, NULL, ¤t); ++ inode_unref(loc.inode); ++ loc.inode = linked_inode; ++ ++ /* ++ * Do not promote/demote if file already is where it ++ * should be. It means another brick moved the file ++ * so is not an error. So we set per_link_status = 1 ++ * so that we ignore counting this. ++ */ ++ src_subvol = dht_subvol_get_cached(this, loc.inode); ++ ++ if (tier_is_file_already_at_destination(src_subvol, query_cbk_args, conf, ++ per_link_status)) { ++ goto err; ++ } ++ ++ gf_msg_debug(this->name, 0, "Tier %s: src_subvol %s file %s", ++ (query_cbk_args->is_promotion ? "promote" : "demote"), ++ src_subvol->name, loc.path); ++ ++ ret = tier_check_same_node(this, &loc, defrag); ++ if (ret != 0) { ++ if (ret < 0) { ++ *per_link_status = -1; ++ goto err; ++ } ++ ret = 0; ++ /* By setting per_link_status to 1 we are ++ * ignoring this status and will not be counting ++ * this file for migration */ ++ *per_link_status = 1; ++ goto err; ++ } ++ ++ gf_uuid_copy(loc.gfid, loc.inode->gfid); ++ ++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Tiering paused. 
" ++ "Exiting tier_migrate_link"); ++ goto err; ++ } ++ ++ ret = tier_migrate(this, query_cbk_args->is_promotion, migrate_data, &loc, ++ &defrag->tier_conf); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to " ++ "migrate %s ", ++ loc.path); ++ *per_link_status = -1; ++ goto err; ++ } ++ ++ tier_update_migration_counters(query_cbk_args, defrag, total_migrated_bytes, ++ total_files); ++ ++ ret = 0; ++ ++err: ++ GF_FREE((char *)loc.name); ++ loc.name = NULL; ++ loc_wipe(&loc); ++ loc_wipe(&p_loc); ++ ++ if ((*total_files >= defrag->tier_conf.max_migrate_files) || ++ (*total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Reached cycle migration limit." ++ "migrated bytes %" PRId64 " files %d", ++ *total_migrated_bytes, *total_files); ++ ret = -1; ++ } ++ ++ return ret; ++} ++ ++static int ++tier_migrate_using_query_file(void *_args) ++{ ++ int ret = -1; ++ query_cbk_args_t *query_cbk_args = (query_cbk_args_t *)_args; ++ xlator_t *this = NULL; ++ gf_defrag_info_t *defrag = NULL; ++ gfdb_query_record_t *query_record = NULL; ++ gfdb_link_info_t *link_info = NULL; ++ dict_t *migrate_data = NULL; ++ /* ++ * per_file_status and per_link_status ++ * 0 : success ++ * -1 : failure ++ * 1 : ignore the status and don't count for migration ++ * */ ++ int per_file_status = 0; ++ int per_link_status = 0; ++ int total_status = 0; ++ dht_conf_t *conf = NULL; ++ uint64_t total_migrated_bytes = 0; ++ int total_files = 0; ++ loc_t root_loc = {0}; ++ gfdb_time_t start_time = {0}; ++ gfdb_time_t current_time = {0}; ++ int total_time = 0; ++ int max_time = 0; ++ gf_boolean_t emergency_demote_mode = _gf_false; ++ ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out); ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out); ++ this = query_cbk_args->this; ++ GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->defrag, out); ++ GF_VALIDATE_OR_GOTO(this->name, 
query_cbk_args->qfile_array, out); ++ GF_VALIDATE_OR_GOTO(this->name, this->private, out); ++ ++ conf = this->private; ++ ++ defrag = query_cbk_args->defrag; ++ migrate_data = dict_new(); ++ if (!migrate_data) ++ goto out; ++ ++ emergency_demote_mode = (!query_cbk_args->is_promotion && ++ is_hot_tier_full(&defrag->tier_conf)); ++ ++ if (tier_set_migrate_data(migrate_data) != 0) { ++ goto out; ++ } ++ ++ dht_build_root_loc(defrag->root_inode, &root_loc); ++ ++ ret = gettimeofday(&start_time, NULL); ++ if (query_cbk_args->is_promotion) { ++ max_time = defrag->tier_conf.tier_promote_frequency; ++ } else { ++ max_time = defrag->tier_conf.tier_demote_frequency; ++ } ++ ++ /* Per file */ ++ while ((ret = read_query_record_list(query_cbk_args->qfile_array, ++ &query_record)) != 0) { ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to fetch query record " ++ "from query file"); ++ goto out; ++ } ++ ++ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { ++ ret = -1; ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Exiting tier migration as" ++ "defrag status is not started"); ++ goto out; ++ } ++ ++ ret = gettimeofday(¤t_time, NULL); ++ if (ret < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Could not get current time."); ++ goto out; ++ } ++ ++ total_time = current_time.tv_sec - start_time.tv_sec; ++ if (total_time > max_time) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Max cycle time reached. Exiting migration."); ++ goto out; ++ } ++ ++ per_file_status = 0; ++ per_link_status = 0; ++ ++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Tiering paused. 
" ++ "Exiting tier_migrate_using_query_file"); ++ break; ++ } ++ ++ if (defrag->tier_conf.mode == TIER_MODE_WM) { ++ ret = tier_get_fs_stat(this, &root_loc); ++ if (ret != 0) { ++ gfdb_methods.gfdb_query_record_free(query_record); ++ query_record = NULL; ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS, ++ "tier_get_fs_stat() FAILED ... " ++ "skipping file migrations until next cycle"); ++ break; ++ } ++ ++ if (!tier_do_migration(this, query_cbk_args->is_promotion)) { ++ gfdb_methods.gfdb_query_record_free(query_record); ++ query_record = NULL; ++ ++ /* We have crossed the high watermark. Stop processing ++ * files if this is a promotion cycle so demotion gets ++ * a chance to start if not already running*/ ++ ++ if (query_cbk_args->is_promotion && ++ is_hot_tier_full(&defrag->tier_conf)) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "High watermark crossed during " ++ "promotion. Exiting " ++ "tier_migrate_using_query_file"); ++ break; ++ } ++ continue; ++ } ++ } ++ ++ per_link_status = 0; ++ ++ /* For now we only support single link migration. And we will ++ * ignore other hard links in the link info list of query record ++ * TODO: Multiple hard links migration */ ++ if (!list_empty(&query_record->link_list)) { ++ link_info = list_first_entry(&query_record->link_list, ++ gfdb_link_info_t, list); ++ } ++ if (link_info != NULL) { ++ if (tier_migrate_link(this, conf, query_record->gfid, link_info, ++ defrag, query_cbk_args, migrate_data, ++ &per_link_status, &total_files, ++ &total_migrated_bytes) != 0) { ++ gf_msg( ++ this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "%s failed for %s(gfid:%s)", ++ (query_cbk_args->is_promotion ? 
"Promotion" : "Demotion"), ++ link_info->file_name, uuid_utoa(query_record->gfid)); ++ } ++ } ++ per_file_status = per_link_status; ++ ++ if (per_file_status < 0) { /* Failure */ ++ pthread_mutex_lock(&dm_stat_mutex); ++ defrag->total_failures++; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } else if (per_file_status == 0) { /* Success */ ++ pthread_mutex_lock(&dm_stat_mutex); ++ defrag->total_files++; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } else if (per_file_status == 1) { /* Ignore */ ++ per_file_status = 0; ++ /* Since this attempt was ignored we ++ * decrement the lookup count*/ ++ pthread_mutex_lock(&dm_stat_mutex); ++ defrag->num_files_lookedup--; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ } ++ total_status = total_status + per_file_status; ++ per_link_status = 0; ++ per_file_status = 0; ++ ++ gfdb_methods.gfdb_query_record_free(query_record); ++ query_record = NULL; ++ ++ /* If we are demoting and the entry watermark was HI, then ++ * we are done with emergency demotions if the current ++ * watermark has fallen below hi-watermark level ++ */ ++ if (emergency_demote_mode) { ++ if (tier_check_watermark(this) == 0) { ++ if (!is_hot_tier_full(&defrag->tier_conf)) { ++ break; ++ } ++ } ++ } ++ } ++ ++out: ++ if (migrate_data) ++ dict_unref(migrate_data); ++ ++ gfdb_methods.gfdb_query_record_free(query_record); ++ query_record = NULL; ++ ++ return total_status; ++} ++ ++/* This is the call back function per record/file from data base */ ++static int ++tier_gf_query_callback(gfdb_query_record_t *gfdb_query_record, void *_args) ++{ ++ int ret = -1; ++ query_cbk_args_t *query_cbk_args = _args; ++ ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out); ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->defrag, out); ++ GF_VALIDATE_OR_GOTO("tier", (query_cbk_args->query_fd > 0), out); ++ ++ ret = gfdb_methods.gfdb_write_query_record(query_cbk_args->query_fd, ++ gfdb_query_record); ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed 
writing query record to query file"); ++ goto out; ++ } ++ ++ pthread_mutex_lock(&dm_stat_mutex); ++ query_cbk_args->defrag->num_files_lookedup++; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ ++ ret = 0; ++out: ++ return ret; ++} ++ ++/* Create query file in tier process */ ++static int ++tier_process_self_query(tier_brick_list_t *local_brick, void *args) ++{ ++ int ret = -1; ++ char *db_path = NULL; ++ query_cbk_args_t *query_cbk_args = NULL; ++ xlator_t *this = NULL; ++ gfdb_conn_node_t *conn_node = NULL; ++ dict_t *params_dict = NULL; ++ dict_t *ctr_ipc_dict = NULL; ++ gfdb_brick_info_t *gfdb_brick_info = args; ++ ++ /*Init of all the essentials*/ ++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out); ++ query_cbk_args = gfdb_brick_info->_query_cbk_args; ++ ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out); ++ this = query_cbk_args->this; ++ ++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out); ++ ++ db_path = local_brick->brick_db_path; ++ ++ /*Preparing DB parameters before init_db i.e getting db connection*/ ++ params_dict = dict_new(); ++ if (!params_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "DB Params cannot initialized"); ++ goto out; ++ } ++ SET_DB_PARAM_TO_DICT(this->name, params_dict, ++ (char *)gfdb_methods.get_db_path_key(), db_path, ret, ++ out); ++ ++ /*Get the db connection*/ ++ conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type); ++ if (!conn_node) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "FATAL: Failed initializing db operations"); ++ goto out; ++ } ++ ++ /* Query for eligible files from db */ ++ query_cbk_args->query_fd = open(local_brick->qfile_path, ++ O_WRONLY | O_CREAT | O_APPEND, ++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); ++ if 
(query_cbk_args->query_fd < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to open query file %s", local_brick->qfile_path); ++ goto out; ++ } ++ if (!gfdb_brick_info->_gfdb_promote) { ++ if (query_cbk_args->defrag->tier_conf.watermark_last == TIER_WM_HI) { ++ /* emergency demotion mode */ ++ ret = gfdb_methods.find_all( ++ conn_node, tier_gf_query_callback, (void *)query_cbk_args, ++ query_cbk_args->defrag->tier_conf.query_limit); ++ } else { ++ if (query_cbk_args->defrag->write_freq_threshold == 0 && ++ query_cbk_args->defrag->read_freq_threshold == 0) { ++ ret = gfdb_methods.find_unchanged_for_time( ++ conn_node, tier_gf_query_callback, (void *)query_cbk_args, ++ gfdb_brick_info->time_stamp); ++ } else { ++ ret = gfdb_methods.find_unchanged_for_time_freq( ++ conn_node, tier_gf_query_callback, (void *)query_cbk_args, ++ gfdb_brick_info->time_stamp, ++ query_cbk_args->defrag->write_freq_threshold, ++ query_cbk_args->defrag->read_freq_threshold, _gf_false); ++ } ++ } ++ } else { ++ if (query_cbk_args->defrag->write_freq_threshold == 0 && ++ query_cbk_args->defrag->read_freq_threshold == 0) { ++ ret = gfdb_methods.find_recently_changed_files( ++ conn_node, tier_gf_query_callback, (void *)query_cbk_args, ++ gfdb_brick_info->time_stamp); ++ } else { ++ ret = gfdb_methods.find_recently_changed_files_freq( ++ conn_node, tier_gf_query_callback, (void *)query_cbk_args, ++ gfdb_brick_info->time_stamp, ++ query_cbk_args->defrag->write_freq_threshold, ++ query_cbk_args->defrag->read_freq_threshold, _gf_false); ++ } ++ } ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "FATAL: query from db failed"); ++ goto out; ++ } ++ ++ /*Clear the heat on the DB entries*/ ++ /*Preparing ctr_ipc_dict*/ ++ ctr_ipc_dict = dict_new(); ++ if (!ctr_ipc_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "ctr_ipc_dict cannot initialized"); ++ goto out; ++ } ++ ++ SET_DB_PARAM_TO_DICT(this->name, 
ctr_ipc_dict, GFDB_IPC_CTR_KEY, ++ GFDB_IPC_CTR_CLEAR_OPS, ret, out); ++ ++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict, ++ NULL); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed clearing the heat " ++ "on db %s error %d", ++ local_brick->brick_db_path, ret); ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ if (params_dict) { ++ dict_unref(params_dict); ++ params_dict = NULL; ++ } ++ ++ if (ctr_ipc_dict) { ++ dict_unref(ctr_ipc_dict); ++ ctr_ipc_dict = NULL; ++ } ++ ++ if (query_cbk_args && query_cbk_args->query_fd >= 0) { ++ sys_close(query_cbk_args->query_fd); ++ query_cbk_args->query_fd = -1; ++ } ++ gfdb_methods.fini_db(conn_node); ++ ++ return ret; ++} ++ ++/*Ask CTR to create the query file*/ ++static int ++tier_process_ctr_query(tier_brick_list_t *local_brick, void *args) ++{ ++ int ret = -1; ++ query_cbk_args_t *query_cbk_args = NULL; ++ xlator_t *this = NULL; ++ dict_t *ctr_ipc_in_dict = NULL; ++ dict_t *ctr_ipc_out_dict = NULL; ++ gfdb_brick_info_t *gfdb_brick_info = args; ++ gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL; ++ int count = 0; ++ ++ /*Init of all the essentials*/ ++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out); ++ query_cbk_args = gfdb_brick_info->_query_cbk_args; ++ ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out); ++ this = query_cbk_args->this; ++ ++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out); ++ ++ /*Preparing ctr_ipc_in_dict*/ ++ ctr_ipc_in_dict = dict_new(); ++ if (!ctr_ipc_in_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "ctr_ipc_in_dict cannot initialized"); ++ goto out; ++ } ++ ++ ipc_ctr_params = GF_CALLOC(1, sizeof(gfdb_ipc_ctr_params_t), ++ gf_tier_mt_ipc_ctr_params_t); ++ if (!ipc_ctr_params) { ++ goto out; ++ } 
++ ++ /* set all the query params*/ ++ ipc_ctr_params->is_promote = gfdb_brick_info->_gfdb_promote; ++ ++ ipc_ctr_params->write_freq_threshold = query_cbk_args->defrag ++ ->write_freq_threshold; ++ ++ ipc_ctr_params->read_freq_threshold = query_cbk_args->defrag ++ ->read_freq_threshold; ++ ++ ipc_ctr_params->query_limit = query_cbk_args->defrag->tier_conf.query_limit; ++ ++ ipc_ctr_params->emergency_demote = (!gfdb_brick_info->_gfdb_promote && ++ query_cbk_args->defrag->tier_conf ++ .watermark_last == TIER_WM_HI); ++ ++ memcpy(&ipc_ctr_params->time_stamp, gfdb_brick_info->time_stamp, ++ sizeof(gfdb_time_t)); ++ ++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, GFDB_IPC_CTR_KEY, ++ GFDB_IPC_CTR_QUERY_OPS, ret, out); ++ ++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, ++ GFDB_IPC_CTR_GET_QFILE_PATH, local_brick->qfile_path, ++ ret, out); ++ ++ ret = dict_set_bin(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS, ++ ipc_ctr_params, sizeof(*ipc_ctr_params)); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed setting %s to params dictionary", ++ GFDB_IPC_CTR_GET_QUERY_PARAMS); ++ GF_FREE(ipc_ctr_params); ++ goto out; ++ } ++ ipc_ctr_params = NULL; ++ ++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_in_dict, ++ &ctr_ipc_out_dict); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_IPC_TIER_ERROR, ++ "Failed query on %s ret %d", local_brick->brick_db_path, ret); ++ goto out; ++ } ++ ++ ret = dict_get_int32(ctr_ipc_out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT, ++ &count); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed getting count " ++ "of records on %s", ++ local_brick->brick_db_path); ++ goto out; ++ } ++ ++ if (count < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed query on %s", local_brick->brick_db_path); ++ ret = -1; ++ goto out; ++ } ++ ++ pthread_mutex_lock(&dm_stat_mutex); ++ query_cbk_args->defrag->num_files_lookedup = 
count; ++ pthread_mutex_unlock(&dm_stat_mutex); ++ ++ ret = 0; ++out: ++ ++ if (ctr_ipc_in_dict) { ++ dict_unref(ctr_ipc_in_dict); ++ ctr_ipc_in_dict = NULL; ++ } ++ ++ if (ctr_ipc_out_dict) { ++ dict_unref(ctr_ipc_out_dict); ++ ctr_ipc_out_dict = NULL; ++ } ++ ++ GF_FREE(ipc_ctr_params); ++ ++ return ret; ++} ++ ++/* This is the call back function for each brick from hot/cold bricklist ++ * It picks up each bricks db and queries for eligible files for migration. ++ * The list of eligible files are populated in appropriate query files*/ ++static int ++tier_process_brick(tier_brick_list_t *local_brick, void *args) ++{ ++ int ret = -1; ++ dict_t *ctr_ipc_in_dict = NULL; ++ dict_t *ctr_ipc_out_dict = NULL; ++ char *strval = NULL; ++ ++ GF_VALIDATE_OR_GOTO("tier", local_brick, out); ++ ++ GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out); ++ ++ if (dht_tier_db_type == GFDB_SQLITE3) { ++ /*Preparing ctr_ipc_in_dict*/ ++ ctr_ipc_in_dict = dict_new(); ++ if (!ctr_ipc_in_dict) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "ctr_ipc_in_dict cannot initialized"); ++ goto out; ++ } ++ ++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_KEY, ++ GFDB_IPC_CTR_GET_DB_PARAM_OPS); ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed to set %s " ++ "to params dictionary", ++ GFDB_IPC_CTR_KEY); ++ goto out; ++ } ++ ++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_PARAM_OPS, ""); ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed to set %s " ++ "to params dictionary", ++ GFDB_IPC_CTR_GET_DB_PARAM_OPS); ++ goto out; ++ } ++ ++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_KEY, ++ "journal_mode"); ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed to set %s " ++ "to params dictionary", ++ GFDB_IPC_CTR_GET_DB_KEY); ++ goto out; ++ } ++ ++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ++ ctr_ipc_in_dict, &ctr_ipc_out_dict); ++ if (ret || 
ctr_ipc_out_dict == NULL) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to get " ++ "journal_mode of sql db %s", ++ local_brick->brick_db_path); ++ goto out; ++ } ++ ++ ret = dict_get_str(ctr_ipc_out_dict, "journal_mode", &strval); ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_GET_PARAM_FAILED, ++ "Failed to get %s " ++ "from params dictionary" ++ "journal_mode", ++ strval); ++ goto out; ++ } ++ ++ if (strval && (strncmp(strval, "wal", SLEN("wal")) == 0)) { ++ ret = tier_process_self_query(local_brick, args); ++ if (ret) { ++ goto out; ++ } ++ } else { ++ ret = tier_process_ctr_query(local_brick, args); ++ if (ret) { ++ goto out; ++ } ++ } ++ ret = 0; ++ ++ } else { ++ ret = tier_process_self_query(local_brick, args); ++ if (ret) { ++ goto out; ++ } ++ } ++ ++ ret = 0; ++out: ++ if (ctr_ipc_in_dict) ++ dict_unref(ctr_ipc_in_dict); ++ ++ if (ctr_ipc_out_dict) ++ dict_unref(ctr_ipc_out_dict); ++ ++ return ret; ++} ++ ++static int ++tier_build_migration_qfile(migration_args_t *args, ++ query_cbk_args_t *query_cbk_args, ++ gf_boolean_t is_promotion) ++{ ++ gfdb_time_t current_time; ++ gfdb_brick_info_t gfdb_brick_info; ++ gfdb_time_t time_in_past; ++ int ret = -1; ++ tier_brick_list_t *local_brick = NULL; ++ int i = 0; ++ time_in_past.tv_sec = args->freq_time; ++ time_in_past.tv_usec = 0; ++ ++ ret = gettimeofday(¤t_time, NULL); ++ if (ret == -1) { ++ gf_msg(args->this->name, GF_LOG_ERROR, errno, ++ DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time"); ++ goto out; ++ } ++ time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; ++ ++ /* The migration daemon may run a varying numberof usec after the */ ++ /* sleep call triggers. A file may be registered in CTR some number */ ++ /* of usec X after the daemon started and missed in the subsequent */ ++ /* cycle if the daemon starts Y usec after the period in seconds */ ++ /* where Y>X. Normalize away this problem by always setting usec */ ++ /* to 0. 
*/ ++ time_in_past.tv_usec = 0; ++ ++ gfdb_brick_info.time_stamp = &time_in_past; ++ gfdb_brick_info._gfdb_promote = is_promotion; ++ gfdb_brick_info._query_cbk_args = query_cbk_args; ++ ++ list_for_each_entry(local_brick, args->brick_list, list) ++ { ++ /* Construct query file path for this brick ++ * i.e ++ * /var/run/gluster/xlator_name/ ++ * {promote/demote}-brickname-indexinbricklist ++ * So that no two query files will have same path even ++ * bricks have the same name ++ * */ ++ snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d", ++ GET_QFILE_PATH(gfdb_brick_info._gfdb_promote), ++ local_brick->brick_name, i); ++ ++ /* Delete any old query files for this brick */ ++ sys_unlink(local_brick->qfile_path); ++ ++ ret = tier_process_brick(local_brick, &gfdb_brick_info); ++ if (ret) { ++ gf_msg(args->this->name, GF_LOG_ERROR, 0, ++ DHT_MSG_BRICK_QUERY_FAILED, "Brick %s query failed\n", ++ local_brick->brick_db_path); ++ } ++ i++; ++ } ++ ret = 0; ++out: ++ return ret; ++} ++ ++static int ++tier_migrate_files_using_qfile(migration_args_t *comp, ++ query_cbk_args_t *query_cbk_args) ++{ ++ int ret = -1; ++ tier_brick_list_t *local_brick = NULL; ++ tier_brick_list_t *temp = NULL; ++ gfdb_time_t current_time = { ++ 0, ++ }; ++ ssize_t qfile_array_size = 0; ++ int count = 0; ++ int temp_fd = 0; ++ gf_tier_conf_t *tier_conf = NULL; ++ ++ tier_conf = &(query_cbk_args->defrag->tier_conf); ++ ++ /* Time for error query files */ ++ gettimeofday(¤t_time, NULL); ++ ++ /* Build the qfile list */ ++ list_for_each_entry_safe(local_brick, temp, comp->brick_list, list) ++ { ++ qfile_array_size++; ++ } ++ query_cbk_args->qfile_array = qfile_array_new(qfile_array_size); ++ if (!query_cbk_args->qfile_array) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to create new " ++ "qfile_array"); ++ goto out; ++ } ++ ++ /*Open all qfiles*/ ++ count = 0; ++ query_cbk_args->qfile_array->exhausted_count = 0; ++ list_for_each_entry_safe(local_brick, temp, 
comp->brick_list, list) ++ { ++ temp_fd = query_cbk_args->qfile_array->fd_array[count]; ++ temp_fd = open(local_brick->qfile_path, O_RDONLY, ++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); ++ if (temp_fd < 0) { ++ gf_msg("tier", GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to open " ++ "%s to the query file", ++ local_brick->qfile_path); ++ query_cbk_args->qfile_array->exhausted_count++; ++ } ++ query_cbk_args->qfile_array->fd_array[count] = temp_fd; ++ count++; ++ } ++ ++ /* Moving the query file index to the next, so that we won't the same ++ * query file every cycle as the first one */ ++ query_cbk_args->qfile_array ++ ->next_index = (query_cbk_args->is_promotion) ++ ? tier_conf->last_promote_qfile_index ++ : tier_conf->last_demote_qfile_index; ++ shift_next_index(query_cbk_args->qfile_array); ++ if (query_cbk_args->is_promotion) { ++ tier_conf->last_promote_qfile_index = query_cbk_args->qfile_array ++ ->next_index; ++ } else { ++ tier_conf->last_demote_qfile_index = query_cbk_args->qfile_array ++ ->next_index; ++ } ++ ++ /* Migrate files using query file list */ ++ ret = tier_migrate_using_query_file((void *)query_cbk_args); ++out: ++ qfile_array_free(query_cbk_args->qfile_array); ++ ++ /* If there is an error rename all the query files to .err files ++ * with a timestamp for better debugging */ ++ if (ret) { ++ struct tm tm = { ++ 0, ++ }; ++ char time_str[128] = { ++ 0, ++ }; ++ char query_file_path_err[PATH_MAX] = { ++ 0, ++ }; ++ int32_t len = 0; ++ ++ /* Time format for error query files */ ++ gmtime_r(¤t_time.tv_sec, &tm); ++ strftime(time_str, sizeof(time_str), "%F-%T", &tm); ++ ++ list_for_each_entry_safe(local_brick, temp, comp->brick_list, list) ++ { ++ /* rename error qfile*/ ++ len = snprintf(query_file_path_err, sizeof(query_file_path_err), ++ "%s-%s.err", local_brick->qfile_path, time_str); ++ if ((len >= 0) && (len < sizeof(query_file_path_err))) { ++ if (sys_rename(local_brick->qfile_path, query_file_path_err) == ++ -1) ++ 
gf_msg_debug("tier", 0, ++ "rename " ++ "failed"); ++ } ++ } ++ } ++ ++ query_cbk_args->qfile_array = NULL; ++ ++ return ret; ++} ++ ++int ++tier_demote(migration_args_t *demotion_args) ++{ ++ query_cbk_args_t query_cbk_args; ++ int ret = -1; ++ ++ GF_VALIDATE_OR_GOTO("tier", demotion_args, out); ++ GF_VALIDATE_OR_GOTO("tier", demotion_args->this, out); ++ GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->brick_list, ++ out); ++ GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->defrag, out); ++ ++ THIS = demotion_args->this; ++ ++ query_cbk_args.this = demotion_args->this; ++ query_cbk_args.defrag = demotion_args->defrag; ++ query_cbk_args.is_promotion = 0; ++ ++ /*Build the query file using bricklist*/ ++ ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, _gf_false); ++ if (ret) ++ goto out; ++ ++ /* Migrate files using the query file */ ++ ret = tier_migrate_files_using_qfile(demotion_args, &query_cbk_args); ++ if (ret) ++ goto out; ++ ++out: ++ demotion_args->return_value = ret; ++ return ret; ++} ++ ++int ++tier_promote(migration_args_t *promotion_args) ++{ ++ int ret = -1; ++ query_cbk_args_t query_cbk_args; ++ ++ GF_VALIDATE_OR_GOTO("tier", promotion_args->this, out); ++ GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->brick_list, ++ out); ++ GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->defrag, ++ out); ++ ++ THIS = promotion_args->this; ++ ++ query_cbk_args.this = promotion_args->this; ++ query_cbk_args.defrag = promotion_args->defrag; ++ query_cbk_args.is_promotion = 1; ++ ++ /*Build the query file using bricklist*/ ++ ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, _gf_true); ++ if (ret) ++ goto out; ++ ++ /* Migrate files using the query file */ ++ ret = tier_migrate_files_using_qfile(promotion_args, &query_cbk_args); ++ if (ret) ++ goto out; ++ ++out: ++ promotion_args->return_value = ret; ++ return ret; ++} ++ ++/* ++ * Command the CTR on a brick to compact the 
local database using an IPC ++ */ ++static int ++tier_process_self_compact(tier_brick_list_t *local_brick, void *args) ++{ ++ int ret = -1; ++ char *db_path = NULL; ++ query_cbk_args_t *query_cbk_args = NULL; ++ xlator_t *this = NULL; ++ gfdb_conn_node_t *conn_node = NULL; ++ dict_t *params_dict = NULL; ++ dict_t *ctr_ipc_dict = NULL; ++ gfdb_brick_info_t *gfdb_brick_info = args; ++ ++ /*Init of all the essentials*/ ++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out); ++ query_cbk_args = gfdb_brick_info->_query_cbk_args; ++ ++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out); ++ this = query_cbk_args->this; ++ ++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out); ++ ++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out); ++ ++ db_path = local_brick->brick_db_path; ++ ++ /*Preparing DB parameters before init_db i.e getting db connection*/ ++ params_dict = dict_new(); ++ if (!params_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "DB Params cannot initialized"); ++ goto out; ++ } ++ SET_DB_PARAM_TO_DICT(this->name, params_dict, ++ (char *)gfdb_methods.get_db_path_key(), db_path, ret, ++ out); ++ ++ /*Get the db connection*/ ++ conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type); ++ if (!conn_node) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "FATAL: Failed initializing db operations"); ++ goto out; ++ } ++ ++ ret = 0; ++ ++ /*Preparing ctr_ipc_dict*/ ++ ctr_ipc_dict = dict_new(); ++ if (!ctr_ipc_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "ctr_ipc_dict cannot initialized"); ++ goto out; ++ } ++ ++ ret = dict_set_int32(ctr_ipc_dict, "compact_active", ++ query_cbk_args->defrag->tier_conf.compact_active); ++ ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed to set %s " ++ "to 
params dictionary", ++ "compact_active"); ++ goto out; ++ } ++ ++ ret = dict_set_int32( ++ ctr_ipc_dict, "compact_mode_switched", ++ query_cbk_args->defrag->tier_conf.compact_mode_switched); ++ ++ if (ret) { ++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED, ++ "Failed to set %s " ++ "to params dictionary", ++ "compact_mode_switched"); ++ goto out; ++ } ++ ++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY, ++ GFDB_IPC_CTR_SET_COMPACT_PRAGMA, ret, out); ++ ++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Starting Compaction IPC"); ++ ++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict, ++ NULL); ++ ++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Ending Compaction IPC"); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed compaction " ++ "on db %s error %d", ++ local_brick->brick_db_path, ret); ++ goto out; ++ } ++ ++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "SUCCESS: %s Compaction", local_brick->brick_name); ++ ++ ret = 0; ++out: ++ if (params_dict) { ++ dict_unref(params_dict); ++ params_dict = NULL; ++ } ++ ++ if (ctr_ipc_dict) { ++ dict_unref(ctr_ipc_dict); ++ ctr_ipc_dict = NULL; ++ } ++ ++ gfdb_methods.fini_db(conn_node); ++ ++ return ret; ++} ++ ++/* ++ * This is the call back function for each brick from hot/cold bricklist. ++ * It determines the database type on each brick and calls the corresponding ++ * function to prepare the compaction IPC. 
++ */ ++static int ++tier_compact_db_brick(tier_brick_list_t *local_brick, void *args) ++{ ++ int ret = -1; ++ ++ GF_VALIDATE_OR_GOTO("tier", local_brick, out); ++ ++ GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out); ++ ++ ret = tier_process_self_compact(local_brick, args); ++ if (ret) { ++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Brick %s did not compact", local_brick->brick_name); ++ goto out; ++ } ++ ++ ret = 0; ++ ++out: ++ ++ return ret; ++} ++ ++static int ++tier_send_compact(migration_args_t *args, query_cbk_args_t *query_cbk_args) ++{ ++ gfdb_time_t current_time; ++ gfdb_brick_info_t gfdb_brick_info; ++ gfdb_time_t time_in_past; ++ int ret = -1; ++ tier_brick_list_t *local_brick = NULL; ++ ++ time_in_past.tv_sec = args->freq_time; ++ time_in_past.tv_usec = 0; ++ ++ ret = gettimeofday(¤t_time, NULL); ++ if (ret == -1) { ++ gf_msg(args->this->name, GF_LOG_ERROR, errno, ++ DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time"); ++ goto out; ++ } ++ time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec; ++ ++ /* The migration daemon may run a varying numberof usec after the sleep ++ call triggers. A file may be registered in CTR some number of usec X ++ after the daemon started and missed in the subsequent cycle if the ++ daemon starts Y usec after the period in seconds where Y>X. Normalize ++ away this problem by always setting usec to 0. 
*/ ++ time_in_past.tv_usec = 0; ++ ++ gfdb_brick_info.time_stamp = &time_in_past; ++ ++ /* This is meant to say we are always compacting at this point */ ++ /* We simply borrow the promotion flag to do this */ ++ gfdb_brick_info._gfdb_promote = 1; ++ ++ gfdb_brick_info._query_cbk_args = query_cbk_args; ++ ++ list_for_each_entry(local_brick, args->brick_list, list) ++ { ++ gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Start compaction for %s", local_brick->brick_name); ++ ++ ret = tier_compact_db_brick(local_brick, &gfdb_brick_info); ++ if (ret) { ++ gf_msg(args->this->name, GF_LOG_ERROR, 0, ++ DHT_MSG_BRICK_QUERY_FAILED, "Brick %s compaction failed\n", ++ local_brick->brick_db_path); ++ } ++ ++ gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "End compaction for %s", local_brick->brick_name); ++ } ++ ret = 0; ++out: ++ return ret; ++} ++ ++static int ++tier_compact(void *args) ++{ ++ int ret = -1; ++ query_cbk_args_t query_cbk_args; ++ migration_args_t *compaction_args = args; ++ ++ GF_VALIDATE_OR_GOTO("tier", compaction_args->this, out); ++ GF_VALIDATE_OR_GOTO(compaction_args->this->name, ++ compaction_args->brick_list, out); ++ GF_VALIDATE_OR_GOTO(compaction_args->this->name, compaction_args->defrag, ++ out); ++ ++ THIS = compaction_args->this; ++ ++ query_cbk_args.this = compaction_args->this; ++ query_cbk_args.defrag = compaction_args->defrag; ++ query_cbk_args.is_compaction = 1; ++ ++ /* Send the compaction pragma out to all the bricks on the bricklist. */ ++ /* tier_get_bricklist ensures all bricks on the list are local to */ ++ /* this node. 
*/ ++ ret = tier_send_compact(compaction_args, &query_cbk_args); ++ if (ret) ++ goto out; ++ ++ ret = 0; ++out: ++ compaction_args->return_value = ret; ++ return ret; ++} ++ ++static int ++tier_get_bricklist(xlator_t *xl, struct list_head *local_bricklist_head) ++{ ++ xlator_list_t *child = NULL; ++ char *rv = NULL; ++ char *rh = NULL; ++ char *brickname = NULL; ++ char db_name[PATH_MAX] = ""; ++ int ret = 0; ++ tier_brick_list_t *local_brick = NULL; ++ int32_t len = 0; ++ ++ GF_VALIDATE_OR_GOTO("tier", xl, out); ++ GF_VALIDATE_OR_GOTO("tier", local_bricklist_head, out); ++ ++ /* ++ * This function obtains remote subvolumes and filters out only ++ * those running on the same node as the tier daemon. ++ */ ++ if (strcmp(xl->type, "protocol/client") == 0) { ++ ret = dict_get_str(xl->options, "remote-host", &rh); ++ if (ret < 0) ++ goto out; ++ ++ if (gf_is_local_addr(rh)) { ++ local_brick = GF_CALLOC(1, sizeof(tier_brick_list_t), ++ gf_tier_mt_bricklist_t); ++ if (!local_brick) { ++ goto out; ++ } ++ ++ ret = dict_get_str(xl->options, "remote-subvolume", &rv); ++ if (ret < 0) ++ goto out; ++ ++ brickname = strrchr(rv, '/') + 1; ++ snprintf(db_name, sizeof(db_name), "%s.db", brickname); ++ ++ local_brick->brick_db_path = GF_MALLOC(PATH_MAX, gf_common_mt_char); ++ if (!local_brick->brick_db_path) { ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Failed to allocate memory for" ++ " bricklist."); ++ ret = -1; ++ goto out; ++ } ++ ++ len = snprintf(local_brick->brick_db_path, PATH_MAX, "%s/%s/%s", rv, ++ GF_HIDDEN_PATH, db_name); ++ if ((len < 0) || (len >= PATH_MAX)) { ++ gf_msg("tier", GF_LOG_ERROR, EINVAL, DHT_MSG_LOG_TIER_STATUS, ++ "DB path too long"); ++ ret = -1; ++ goto out; ++ } ++ ++ local_brick->xlator = xl; ++ ++ snprintf(local_brick->brick_name, NAME_MAX, "%s", brickname); ++ ++ list_add_tail(&(local_brick->list), local_bricklist_head); ++ ++ ret = 0; ++ goto out; ++ } ++ } ++ ++ for (child = xl->children; child; child = child->next) { ++ ret 
= tier_get_bricklist(child->xlator, local_bricklist_head); ++ if (ret) { ++ goto out; ++ } ++ } ++ ++ ret = 0; ++out: ++ ++ if (ret) { ++ if (local_brick) { ++ GF_FREE(local_brick->brick_db_path); ++ } ++ GF_FREE(local_brick); ++ } ++ ++ return ret; ++} ++ ++int ++tier_get_freq_demote(gf_tier_conf_t *tier_conf) ++{ ++ if ((tier_conf->mode == TIER_MODE_WM) && ++ (tier_conf->watermark_last == TIER_WM_HI)) ++ return DEFAULT_DEMOTE_DEGRADED; ++ else ++ return tier_conf->tier_demote_frequency; ++} ++ ++int ++tier_get_freq_promote(gf_tier_conf_t *tier_conf) ++{ ++ return tier_conf->tier_promote_frequency; ++} ++ ++int ++tier_get_freq_compact_hot(gf_tier_conf_t *tier_conf) ++{ ++ return tier_conf->tier_compact_hot_frequency; ++} ++ ++int ++tier_get_freq_compact_cold(gf_tier_conf_t *tier_conf) ++{ ++ return tier_conf->tier_compact_cold_frequency; ++} ++ ++static int ++tier_check_demote(gfdb_time_t current_time, int freq) ++{ ++ return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false; ++} ++ ++static gf_boolean_t ++tier_check_promote(gf_tier_conf_t *tier_conf, gfdb_time_t current_time, ++ int freq) ++{ ++ if ((tier_conf->mode == TIER_MODE_WM) && ++ (tier_conf->watermark_last == TIER_WM_HI)) ++ return _gf_false; ++ ++ else ++ return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false; ++} ++ ++static gf_boolean_t ++tier_check_compact(gf_tier_conf_t *tier_conf, gfdb_time_t current_time, ++ int freq_compact) ++{ ++ if (!(tier_conf->compact_active || tier_conf->compact_mode_switched)) ++ return _gf_false; ++ ++ return ((current_time.tv_sec % freq_compact) == 0) ? 
_gf_true : _gf_false; ++} ++ ++void ++clear_bricklist(struct list_head *brick_list) ++{ ++ tier_brick_list_t *local_brick = NULL; ++ tier_brick_list_t *temp = NULL; ++ ++ if (list_empty(brick_list)) { ++ return; ++ } ++ ++ list_for_each_entry_safe(local_brick, temp, brick_list, list) ++ { ++ list_del(&local_brick->list); ++ GF_FREE(local_brick->brick_db_path); ++ GF_FREE(local_brick); ++ } ++} ++ ++static void ++set_brick_list_qpath(struct list_head *brick_list, gf_boolean_t is_cold) ++{ ++ tier_brick_list_t *local_brick = NULL; ++ int i = 0; ++ ++ GF_VALIDATE_OR_GOTO("tier", brick_list, out); ++ ++ list_for_each_entry(local_brick, brick_list, list) ++ { ++ /* Construct query file path for this brick ++ * i.e ++ * /var/run/gluster/xlator_name/ ++ * {promote/demote}-brickname-indexinbricklist ++ * So that no two query files will have same path even ++ * bricks have the same name ++ * */ ++ snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d", ++ GET_QFILE_PATH(is_cold), local_brick->brick_name, i); ++ i++; ++ } ++out: ++ return; ++} ++ ++static int ++tier_prepare_compact(migration_args_t *args, gfdb_time_t current_time) ++{ ++ xlator_t *this = NULL; ++ dht_conf_t *conf = NULL; ++ gf_defrag_info_t *defrag = NULL; ++ gf_tier_conf_t *tier_conf = NULL; ++ gf_boolean_t is_hot_tier = args->is_hot_tier; ++ int freq = 0; ++ int ret = -1; ++ const char *tier_type = is_hot_tier ? "hot" : "cold"; ++ ++ this = args->this; ++ ++ conf = this->private; ++ ++ defrag = conf->defrag; ++ ++ tier_conf = &defrag->tier_conf; ++ ++ freq = is_hot_tier ? tier_get_freq_compact_hot(tier_conf) ++ : tier_get_freq_compact_cold(tier_conf); ++ ++ defrag->tier_conf.compact_mode_switched = ++ is_hot_tier ? 
defrag->tier_conf.compact_mode_switched_hot ++ : defrag->tier_conf.compact_mode_switched_cold; ++ ++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Compact mode %i", defrag->tier_conf.compact_mode_switched); ++ ++ if (tier_check_compact(tier_conf, current_time, freq)) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Start compaction on %s tier", tier_type); ++ ++ args->freq_time = freq; ++ ret = tier_compact(args); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Compaction failed on " ++ "%s tier", ++ tier_type); ++ goto out; ++ } ++ ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "End compaction on %s tier", tier_type); ++ ++ if (is_hot_tier) { ++ defrag->tier_conf.compact_mode_switched_hot = _gf_false; ++ } else { ++ defrag->tier_conf.compact_mode_switched_cold = _gf_false; ++ } ++ } ++ ++out: ++ return ret; ++} ++ ++static int ++tier_get_wm_interval(tier_mode_t mode, tier_watermark_op_t wm) ++{ ++ if (mode == TIER_MODE_WM && wm == TIER_WM_HI) ++ return WM_INTERVAL_EMERG; ++ ++ return WM_INTERVAL; ++} ++ ++/* ++ * Main tiering loop. This is called from the promotion and the ++ * demotion threads spawned in tier_start(). ++ * ++ * Every second, wake from sleep to perform tasks. ++ * 1. Check trigger to migrate data. ++ * 2. Check for state changes (pause, unpause, stop). 
++ */ ++static void * ++tier_run(void *in_args) ++{ ++ dht_conf_t *conf = NULL; ++ gfdb_time_t current_time = {0}; ++ int freq = 0; ++ int ret = 0; ++ xlator_t *any = NULL; ++ xlator_t *xlator = NULL; ++ gf_tier_conf_t *tier_conf = NULL; ++ loc_t root_loc = {0}; ++ int check_watermark = 0; ++ gf_defrag_info_t *defrag = NULL; ++ xlator_t *this = NULL; ++ migration_args_t *args = in_args; ++ GF_VALIDATE_OR_GOTO("tier", args, out); ++ GF_VALIDATE_OR_GOTO("tier", args->brick_list, out); ++ ++ this = args->this; ++ GF_VALIDATE_OR_GOTO("tier", this, out); ++ ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO("tier", conf, out); ++ ++ defrag = conf->defrag; ++ GF_VALIDATE_OR_GOTO("tier", defrag, out); ++ ++ if (list_empty(args->brick_list)) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Brick list for tier is empty. Exiting."); ++ goto out; ++ } ++ ++ defrag->defrag_status = GF_DEFRAG_STATUS_STARTED; ++ tier_conf = &defrag->tier_conf; ++ ++ dht_build_root_loc(defrag->root_inode, &root_loc); ++ ++ while (1) { ++ /* ++ * Check if a graph switch occurred. If so, stop migration ++ * thread. It will need to be restarted manually. ++ */ ++ any = THIS->ctx->active->first; ++ xlator = xlator_search_by_name(any, this->name); ++ ++ if (xlator != this) { ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Detected graph switch. 
Exiting migration " ++ "daemon."); ++ goto out; ++ } ++ ++ gf_defrag_check_pause_tier(tier_conf); ++ ++ sleep(1); ++ ++ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) { ++ ret = 1; ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "defrag->defrag_status != " ++ "GF_DEFRAG_STATUS_STARTED"); ++ goto out; ++ } ++ ++ if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER || ++ defrag->cmd == GF_DEFRAG_CMD_DETACH_START) { ++ ret = 0; ++ defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE; ++ gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_TIER_ERROR, ++ "defrag->defrag_cmd == " ++ "GF_DEFRAG_CMD_START_DETACH_TIER"); ++ goto out; ++ } ++ ++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) ++ continue; ++ ++ /* To have proper synchronization amongst all ++ * brick holding nodes, so that promotion and demotions ++ * start atomically w.r.t promotion/demotion frequency ++ * period, all nodes should have their system time ++ * in-sync with each other either manually set or ++ * using a NTP server*/ ++ ret = gettimeofday(&current_time, NULL); ++ if (ret == -1) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, ++ DHT_MSG_SYS_CALL_GET_TIME_FAILED, ++ "Failed to get current time"); ++ goto out; ++ } ++ ++ check_watermark++; ++ ++ /* emergency demotion requires frequent watermark monitoring */ ++ if (check_watermark >= ++ tier_get_wm_interval(tier_conf->mode, tier_conf->watermark_last)) { ++ check_watermark = 0; ++ if (tier_conf->mode == TIER_MODE_WM) { ++ ret = tier_get_fs_stat(this, &root_loc); ++ if (ret != 0) { ++ continue; ++ } ++ ret = tier_check_watermark(this); ++ if (ret != 0) { ++ gf_msg(this->name, GF_LOG_CRITICAL, errno, ++ DHT_MSG_LOG_TIER_ERROR, "Failed to get watermark"); ++ continue; ++ } ++ } ++ } ++ ++ if (args->is_promotion) { ++ freq = tier_get_freq_promote(tier_conf); ++ ++ if (tier_check_promote(tier_conf, current_time, freq)) { ++ args->freq_time = freq; ++ ret = tier_promote(args); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 
0, DHT_MSG_LOG_TIER_ERROR, ++ "Promotion failed"); ++ } ++ } ++ } else if (args->is_compaction) { ++ tier_prepare_compact(args, current_time); ++ } else { ++ freq = tier_get_freq_demote(tier_conf); ++ ++ if (tier_check_demote(current_time, freq)) { ++ args->freq_time = freq; ++ ret = tier_demote(args); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Demotion failed"); ++ } ++ } ++ } ++ ++ /* Check the statfs immediately after the processing threads ++ return */ ++ check_watermark = WM_INTERVAL; ++ } ++ ++ ret = 0; ++out: ++ ++ args->return_value = ret; ++ ++ return NULL; ++} ++ ++int ++tier_start(xlator_t *this, gf_defrag_info_t *defrag) ++{ ++ pthread_t promote_thread; ++ pthread_t demote_thread; ++ pthread_t hot_compact_thread; ++ pthread_t cold_compact_thread; ++ int ret = -1; ++ struct list_head bricklist_hot = {0}; ++ struct list_head bricklist_cold = {0}; ++ migration_args_t promotion_args = {0}; ++ migration_args_t demotion_args = {0}; ++ migration_args_t hot_compaction_args = {0}; ++ migration_args_t cold_compaction_args = {0}; ++ dht_conf_t *conf = NULL; ++ ++ INIT_LIST_HEAD((&bricklist_hot)); ++ INIT_LIST_HEAD((&bricklist_cold)); ++ ++ conf = this->private; ++ ++ tier_get_bricklist(conf->subvolumes[1], &bricklist_hot); ++ set_brick_list_qpath(&bricklist_hot, _gf_false); ++ ++ demotion_args.this = this; ++ demotion_args.brick_list = &bricklist_hot; ++ demotion_args.defrag = defrag; ++ demotion_args.is_promotion = _gf_false; ++ demotion_args.is_compaction = _gf_false; ++ ++ ret = gf_thread_create(&demote_thread, NULL, &tier_run, &demotion_args, ++ "tierdem"); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to start demotion thread."); ++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; ++ goto cleanup; ++ } ++ ++ tier_get_bricklist(conf->subvolumes[0], &bricklist_cold); ++ set_brick_list_qpath(&bricklist_cold, _gf_true); ++ ++ promotion_args.this = this; ++ promotion_args.brick_list = 
&bricklist_cold; ++ promotion_args.defrag = defrag; ++ promotion_args.is_promotion = _gf_true; ++ ++ ret = gf_thread_create(&promote_thread, NULL, &tier_run, &promotion_args, ++ "tierpro"); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to start promotion thread."); ++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; ++ goto waitforspawned; ++ } ++ ++ hot_compaction_args.this = this; ++ hot_compaction_args.brick_list = &bricklist_hot; ++ hot_compaction_args.defrag = defrag; ++ hot_compaction_args.is_promotion = _gf_false; ++ hot_compaction_args.is_compaction = _gf_true; ++ hot_compaction_args.is_hot_tier = _gf_true; ++ ++ ret = gf_thread_create(&hot_compact_thread, NULL, &tier_run, ++ &hot_compaction_args, "tierhcom"); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to start compaction thread."); ++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; ++ goto waitforspawnedpromote; ++ } ++ ++ cold_compaction_args.this = this; ++ cold_compaction_args.brick_list = &bricklist_cold; ++ cold_compaction_args.defrag = defrag; ++ cold_compaction_args.is_promotion = _gf_false; ++ cold_compaction_args.is_compaction = _gf_true; ++ cold_compaction_args.is_hot_tier = _gf_false; ++ ++ ret = gf_thread_create(&cold_compact_thread, NULL, &tier_run, ++ &cold_compaction_args, "tierccom"); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Failed to start compaction thread."); ++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED; ++ goto waitforspawnedhotcompact; ++ } ++ pthread_join(cold_compact_thread, NULL); ++ ++waitforspawnedhotcompact: ++ pthread_join(hot_compact_thread, NULL); ++ ++waitforspawnedpromote: ++ pthread_join(promote_thread, NULL); ++ ++waitforspawned: ++ pthread_join(demote_thread, NULL); ++ ++cleanup: ++ clear_bricklist(&bricklist_cold); ++ clear_bricklist(&bricklist_hot); ++ return ret; ++} ++ ++int32_t ++tier_migration_needed(xlator_t *this) ++{ ++ 
gf_defrag_info_t *defrag = NULL; ++ dht_conf_t *conf = NULL; ++ int ret = 0; ++ ++ conf = this->private; ++ ++ GF_VALIDATE_OR_GOTO(this->name, conf, out); ++ GF_VALIDATE_OR_GOTO(this->name, conf->defrag, out); ++ ++ defrag = conf->defrag; ++ ++ if ((defrag->cmd == GF_DEFRAG_CMD_START_TIER) || ++ (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER)) ++ ret = 1; ++out: ++ return ret; ++} ++ ++int32_t ++tier_migration_get_dst(xlator_t *this, dht_local_t *local) ++{ ++ dht_conf_t *conf = NULL; ++ int32_t ret = -1; ++ gf_defrag_info_t *defrag = NULL; ++ ++ GF_VALIDATE_OR_GOTO("tier", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, this->private, out); ++ ++ conf = this->private; ++ ++ defrag = conf->defrag; ++ ++ if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) { ++ local->rebalance.target_node = conf->subvolumes[0]; ++ ++ } else if (conf->subvolumes[0] == local->cached_subvol) ++ local->rebalance.target_node = conf->subvolumes[1]; ++ else ++ local->rebalance.target_node = conf->subvolumes[0]; ++ ++ if (local->rebalance.target_node) ++ ret = 0; ++ ++out: ++ return ret; ++} ++ ++xlator_t * ++tier_search(xlator_t *this, dht_layout_t *layout, const char *name) ++{ ++ xlator_t *subvol = NULL; ++ dht_conf_t *conf = NULL; ++ ++ GF_VALIDATE_OR_GOTO("tier", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, this->private, out); ++ ++ conf = this->private; ++ ++ subvol = TIER_HASHED_SUBVOL; ++ ++out: ++ return subvol; ++} ++ ++static int ++tier_load_externals(xlator_t *this) ++{ ++ int ret = -1; ++ char *libpathfull = (LIBDIR "/libgfdb.so.0"); ++ get_gfdb_methods_t get_gfdb_methods; ++ ++ GF_VALIDATE_OR_GOTO("this", this, out); ++ ++ libhandle = dlopen(libpathfull, RTLD_NOW); ++ if (!libhandle) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Error loading libgfdb.so %s\n", dlerror()); ++ ret = -1; ++ goto out; ++ } ++ ++ get_gfdb_methods = dlsym(libhandle, "get_gfdb_methods"); ++ if (!get_gfdb_methods) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, 
DHT_MSG_LOG_TIER_ERROR, ++ "Error loading get_gfdb_methods()"); ++ ret = -1; ++ goto out; ++ } ++ ++ get_gfdb_methods(&gfdb_methods); ++ ++ ret = 0; ++ ++out: ++ if (ret && libhandle) ++ dlclose(libhandle); ++ ++ return ret; ++} ++ ++static tier_mode_t ++tier_validate_mode(char *mode) ++{ ++ int ret = -1; ++ ++ if (strcmp(mode, "test") == 0) { ++ ret = TIER_MODE_TEST; ++ } else { ++ ret = TIER_MODE_WM; ++ } ++ ++ return ret; ++} ++ ++static gf_boolean_t ++tier_validate_compact_mode(char *mode) ++{ ++ gf_boolean_t ret = _gf_false; ++ ++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "tier_validate_compact_mode: mode = %s", mode); ++ ++ if (!strcmp(mode, "on")) { ++ ret = _gf_true; ++ } else { ++ ret = _gf_false; ++ } ++ ++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS, ++ "tier_validate_compact_mode: ret = %i", ret); ++ ++ return ret; ++} ++ ++int ++tier_init_methods(xlator_t *this) ++{ ++ int ret = -1; ++ dht_conf_t *conf = NULL; ++ dht_methods_t *methods = NULL; ++ ++ GF_VALIDATE_OR_GOTO("tier", this, err); ++ ++ conf = this->private; ++ ++ methods = &(conf->methods); ++ ++ methods->migration_get_dst_subvol = tier_migration_get_dst; ++ methods->migration_other = tier_start; ++ methods->migration_needed = tier_migration_needed; ++ methods->layout_search = tier_search; ++ ++ ret = 0; ++err: ++ return ret; ++} ++ ++static void ++tier_save_vol_name(xlator_t *this) ++{ ++ dht_conf_t *conf = NULL; ++ gf_defrag_info_t *defrag = NULL; ++ char *suffix = NULL; ++ int name_len = 0; ++ ++ conf = this->private; ++ defrag = conf->defrag; ++ ++ suffix = strstr(this->name, "-tier-dht"); ++ ++ if (suffix) ++ name_len = suffix - this->name; ++ else ++ name_len = strlen(this->name); ++ ++ if (name_len > GD_VOLUME_NAME_MAX) ++ name_len = GD_VOLUME_NAME_MAX; ++ ++ strncpy(defrag->tier_conf.volname, this->name, name_len); ++ defrag->tier_conf.volname[name_len] = 0; ++} ++ ++int ++tier_init(xlator_t *this) ++{ ++ int ret = -1; ++ int freq = 0; ++ int maxsize = 0; 
++ dht_conf_t *conf = NULL; ++ gf_defrag_info_t *defrag = NULL; ++ char *voldir = NULL; ++ char *mode = NULL; ++ char *paused = NULL; ++ tier_mode_t tier_mode = DEFAULT_TIER_MODE; ++ gf_boolean_t compact_mode = _gf_false; ++ ++ ret = dht_init(this); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "tier_init failed"); ++ goto out; ++ } ++ ++ conf = this->private; ++ ++ ret = tier_init_methods(this); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "tier_init_methods failed"); ++ goto out; ++ } ++ ++ if (conf->subvolume_cnt != 2) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Invalid number of subvolumes %d", conf->subvolume_cnt); ++ goto out; ++ } ++ ++ /* if instantiated from client side, initialization is complete. */ ++ if (!conf->defrag) { ++ ret = 0; ++ goto out; ++ } ++ ++ /* if instantiated from server side, load db libraries */ ++ ret = tier_load_externals(this); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "Could not load externals. 
Aborting"); ++ goto out; ++ } ++ ++ defrag = conf->defrag; ++ ++ defrag->tier_conf.last_demote_qfile_index = 0; ++ defrag->tier_conf.last_promote_qfile_index = 0; ++ ++ defrag->tier_conf.is_tier = 1; ++ defrag->this = this; ++ ++ ret = dict_get_int32(this->options, "tier-max-promote-file-size", &maxsize); ++ if (ret) { ++ maxsize = 0; ++ } ++ ++ defrag->tier_conf.tier_max_promote_size = maxsize; ++ ++ ret = dict_get_int32(this->options, "tier-promote-frequency", &freq); ++ if (ret) { ++ freq = DEFAULT_PROMOTE_FREQ_SEC; ++ } ++ ++ defrag->tier_conf.tier_promote_frequency = freq; ++ ++ ret = dict_get_int32(this->options, "tier-demote-frequency", &freq); ++ if (ret) { ++ freq = DEFAULT_DEMOTE_FREQ_SEC; ++ } ++ ++ defrag->tier_conf.tier_demote_frequency = freq; ++ ++ ret = dict_get_int32(this->options, "tier-hot-compact-frequency", &freq); ++ if (ret) { ++ freq = DEFAULT_HOT_COMPACT_FREQ_SEC; ++ } ++ ++ defrag->tier_conf.tier_compact_hot_frequency = freq; ++ ++ ret = dict_get_int32(this->options, "tier-cold-compact-frequency", &freq); ++ if (ret) { ++ freq = DEFAULT_COLD_COMPACT_FREQ_SEC; ++ } ++ ++ defrag->tier_conf.tier_compact_cold_frequency = freq; ++ ++ ret = dict_get_int32(this->options, "watermark-hi", &freq); ++ if (ret) { ++ freq = DEFAULT_WM_HI; ++ } ++ ++ defrag->tier_conf.watermark_hi = freq; ++ ++ ret = dict_get_int32(this->options, "watermark-low", &freq); ++ if (ret) { ++ freq = DEFAULT_WM_LOW; ++ } ++ ++ defrag->tier_conf.watermark_low = freq; ++ ++ ret = dict_get_int32(this->options, "write-freq-threshold", &freq); ++ if (ret) { ++ freq = DEFAULT_WRITE_FREQ_SEC; ++ } ++ ++ defrag->write_freq_threshold = freq; ++ ++ ret = dict_get_int32(this->options, "read-freq-threshold", &freq); ++ if (ret) { ++ freq = DEFAULT_READ_FREQ_SEC; ++ } ++ ++ defrag->read_freq_threshold = freq; ++ ++ ret = dict_get_int32(this->options, "tier-max-mb", &freq); ++ if (ret) { ++ freq = DEFAULT_TIER_MAX_MIGRATE_MB; ++ } ++ ++ defrag->tier_conf.max_migrate_bytes = (uint64_t)freq 
* 1024 * 1024; ++ ++ ret = dict_get_int32(this->options, "tier-max-files", &freq); ++ if (ret) { ++ freq = DEFAULT_TIER_MAX_MIGRATE_FILES; ++ } ++ ++ defrag->tier_conf.max_migrate_files = freq; ++ ++ ret = dict_get_int32(this->options, "tier-query-limit", ++ &(defrag->tier_conf.query_limit)); ++ if (ret) { ++ defrag->tier_conf.query_limit = DEFAULT_TIER_QUERY_LIMIT; ++ } ++ ++ ret = dict_get_str(this->options, "tier-compact", &mode); ++ ++ if (ret) { ++ defrag->tier_conf.compact_active = DEFAULT_COMP_MODE; ++ } else { ++ compact_mode = tier_validate_compact_mode(mode); ++ /* If compaction is now active, we need to inform the bricks on ++ the hot and cold tier of this. See dht-common.h for more. */ ++ defrag->tier_conf.compact_active = compact_mode; ++ if (compact_mode) { ++ defrag->tier_conf.compact_mode_switched_hot = _gf_true; ++ defrag->tier_conf.compact_mode_switched_cold = _gf_true; ++ } ++ } ++ ++ ret = dict_get_str(this->options, "tier-mode", &mode); ++ if (ret) { ++ defrag->tier_conf.mode = DEFAULT_TIER_MODE; ++ } else { ++ tier_mode = tier_validate_mode(mode); ++ defrag->tier_conf.mode = tier_mode; ++ } ++ ++ pthread_mutex_init(&defrag->tier_conf.pause_mutex, 0); ++ ++ gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING); ++ ++ ret = dict_get_str(this->options, "tier-pause", &paused); ++ ++ if (paused && strcmp(paused, "on") == 0) ++ gf_defrag_set_pause_state(&defrag->tier_conf, TIER_REQUEST_PAUSE); ++ ++ ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, this->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = mkdir_p(voldir, 0777, _gf_true); ++ if (ret == -1 && errno != EEXIST) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "tier_init failed"); ++ ++ GF_FREE(voldir); ++ goto out; ++ } ++ ++ GF_FREE(voldir); ++ ++ ret = gf_asprintf(&promotion_qfile, "%s/%s/promote", ++ DEFAULT_VAR_RUN_DIRECTORY, this->name); ++ if (ret < 0) ++ goto out; ++ ++ ret = gf_asprintf(&demotion_qfile, "%s/%s/demote", ++ 
DEFAULT_VAR_RUN_DIRECTORY, this->name); ++ if (ret < 0) { ++ GF_FREE(promotion_qfile); ++ goto out; ++ } ++ ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "Promote/demote frequency %d/%d " ++ "Write/Read freq thresholds %d/%d", ++ defrag->tier_conf.tier_promote_frequency, ++ defrag->tier_conf.tier_demote_frequency, ++ defrag->write_freq_threshold, defrag->read_freq_threshold); ++ ++ tier_save_vol_name(this); ++ ++ ret = 0; ++ ++out: ++ ++ return ret; ++} ++ ++int ++tier_cli_pause_done(int op_ret, call_frame_t *sync_frame, void *data) ++{ ++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED, ++ "Migrate file paused with op_ret %d", op_ret); ++ ++ return op_ret; ++} ++ ++int ++tier_cli_pause(void *data) ++{ ++ gf_defrag_info_t *defrag = NULL; ++ xlator_t *this = NULL; ++ dht_conf_t *conf = NULL; ++ int ret = -1; ++ ++ this = data; ++ ++ conf = this->private; ++ GF_VALIDATE_OR_GOTO(this->name, conf, exit); ++ ++ defrag = conf->defrag; ++ GF_VALIDATE_OR_GOTO(this->name, defrag, exit); ++ ++ gf_defrag_pause_tier(this, defrag); ++ ++ ret = 0; ++exit: ++ return ret; ++} ++ ++int ++tier_reconfigure(xlator_t *this, dict_t *options) ++{ ++ dht_conf_t *conf = NULL; ++ gf_defrag_info_t *defrag = NULL; ++ char *mode = NULL; ++ int migrate_mb = 0; ++ gf_boolean_t req_pause = _gf_false; ++ int ret = 0; ++ call_frame_t *frame = NULL; ++ gf_boolean_t last_compact_setting = _gf_false; ++ ++ conf = this->private; ++ ++ if (conf->defrag) { ++ defrag = conf->defrag; ++ GF_OPTION_RECONF("tier-max-promote-file-size", ++ defrag->tier_conf.tier_max_promote_size, options, ++ int32, out); ++ ++ GF_OPTION_RECONF("tier-promote-frequency", ++ defrag->tier_conf.tier_promote_frequency, options, ++ int32, out); ++ ++ GF_OPTION_RECONF("tier-demote-frequency", ++ defrag->tier_conf.tier_demote_frequency, options, ++ int32, out); ++ ++ GF_OPTION_RECONF("write-freq-threshold", defrag->write_freq_threshold, ++ options, int32, out); ++ ++ GF_OPTION_RECONF("read-freq-threshold", 
defrag->read_freq_threshold, ++ options, int32, out); ++ ++ GF_OPTION_RECONF("watermark-hi", defrag->tier_conf.watermark_hi, ++ options, int32, out); ++ ++ GF_OPTION_RECONF("watermark-low", defrag->tier_conf.watermark_low, ++ options, int32, out); ++ ++ last_compact_setting = defrag->tier_conf.compact_active; ++ ++ GF_OPTION_RECONF("tier-compact", defrag->tier_conf.compact_active, ++ options, bool, out); ++ ++ if (last_compact_setting != defrag->tier_conf.compact_active) { ++ defrag->tier_conf.compact_mode_switched_hot = _gf_true; ++ defrag->tier_conf.compact_mode_switched_cold = _gf_true; ++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS, ++ "compact mode switched"); ++ } ++ ++ GF_OPTION_RECONF("tier-hot-compact-frequency", ++ defrag->tier_conf.tier_compact_hot_frequency, options, ++ int32, out); ++ ++ GF_OPTION_RECONF("tier-cold-compact-frequency", ++ defrag->tier_conf.tier_compact_cold_frequency, options, ++ int32, out); ++ ++ GF_OPTION_RECONF("tier-mode", mode, options, str, out); ++ defrag->tier_conf.mode = tier_validate_mode(mode); ++ ++ GF_OPTION_RECONF("tier-max-mb", migrate_mb, options, int32, out); ++ defrag->tier_conf.max_migrate_bytes = (uint64_t)migrate_mb * 1024 * ++ 1024; ++ ++ GF_OPTION_RECONF("tier-max-files", defrag->tier_conf.max_migrate_files, ++ options, int32, out); ++ ++ GF_OPTION_RECONF("tier-query-limit", defrag->tier_conf.query_limit, ++ options, int32, out); ++ ++ GF_OPTION_RECONF("tier-pause", req_pause, options, bool, out); ++ ++ if (req_pause == _gf_true) { ++ frame = create_frame(this, this->ctx->pool); ++ if (!frame) ++ goto out; ++ ++ frame->root->pid = GF_CLIENT_PID_DEFRAG; ++ ++ ret = synctask_new(this->ctx->env, tier_cli_pause, ++ tier_cli_pause_done, frame, this); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR, ++ "pause tier failed on reconfigure"); ++ } ++ } else { ++ ret = gf_defrag_resume_tier(this, defrag); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, 
DHT_MSG_LOG_TIER_ERROR, ++ "resume tier failed on reconfigure"); ++ } ++ } ++ } ++ ++out: ++ return dht_reconfigure(this, options); ++} ++ ++void ++tier_fini(xlator_t *this) ++{ ++ if (libhandle) ++ dlclose(libhandle); ++ ++ GF_FREE(demotion_qfile); ++ GF_FREE(promotion_qfile); ++ ++ dht_fini(this); ++} ++ ++struct xlator_fops fops = { ++ ++ .lookup = dht_lookup, ++ .create = tier_create, ++ .mknod = dht_mknod, ++ ++ .open = dht_open, ++ .statfs = tier_statfs, ++ .opendir = dht_opendir, ++ .readdir = tier_readdir, ++ .readdirp = tier_readdirp, ++ .fsyncdir = dht_fsyncdir, ++ .symlink = dht_symlink, ++ .unlink = tier_unlink, ++ .link = tier_link, ++ .mkdir = dht_mkdir, ++ .rmdir = dht_rmdir, ++ .rename = dht_rename, ++ .entrylk = dht_entrylk, ++ .fentrylk = dht_fentrylk, ++ ++ /* Inode read operations */ ++ .stat = dht_stat, ++ .fstat = dht_fstat, ++ .access = dht_access, ++ .readlink = dht_readlink, ++ .getxattr = dht_getxattr, ++ .fgetxattr = dht_fgetxattr, ++ .readv = dht_readv, ++ .flush = dht_flush, ++ .fsync = dht_fsync, ++ .inodelk = dht_inodelk, ++ .finodelk = dht_finodelk, ++ .lk = dht_lk, ++ ++ /* Inode write operations */ ++ .fremovexattr = dht_fremovexattr, ++ .removexattr = dht_removexattr, ++ .setxattr = dht_setxattr, ++ .fsetxattr = dht_fsetxattr, ++ .truncate = dht_truncate, ++ .ftruncate = dht_ftruncate, ++ .writev = dht_writev, ++ .xattrop = dht_xattrop, ++ .fxattrop = dht_fxattrop, ++ .setattr = dht_setattr, ++ .fsetattr = dht_fsetattr, ++ .fallocate = dht_fallocate, ++ .discard = dht_discard, ++ .zerofill = dht_zerofill, ++}; ++ ++struct xlator_cbks cbks = {.release = dht_release, .forget = dht_forget}; ++ ++extern int32_t ++mem_acct_init(xlator_t *this); ++ ++extern struct volume_options dht_options[]; ++ ++xlator_api_t xlator_api = { ++ .init = tier_init, ++ .fini = tier_fini, ++ .notify = dht_notify, ++ .reconfigure = tier_reconfigure, ++ .mem_acct_init = mem_acct_init, ++ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial 
version */ ++ .fops = &fops, ++ .cbks = &cbks, ++ .options = dht_options, ++ .identifier = "tier", ++ .category = GF_MAINTAINED, ++}; ++ +diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h +new file mode 100644 +index 0000000..a20b1db +--- /dev/null ++++ b/xlators/cluster/dht/src/tier.h +@@ -0,0 +1,110 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#ifndef _TIER_H_ ++#define _TIER_H_ ++ ++/******************************************************************************/ ++/* This is from dht-rebalancer.c as we don't have dht-rebalancer.h */ ++#include "dht-common.h" ++#include <glusterfs/xlator.h> ++#include <signal.h> ++#include <fnmatch.h> ++#include <signal.h> ++ ++/* ++ * Size of timer wheel. We would not promote or demote less ++ * frequently than this number. ++ */ ++#define TIMER_SECS 3600 ++ ++#include "gfdb_data_store.h" ++#include <ctype.h> ++#include <sys/stat.h> ++ ++#define PROMOTION_QFILE "promotequeryfile" ++#define DEMOTION_QFILE "demotequeryfile" ++ ++#define TIER_HASHED_SUBVOL conf->subvolumes[0] ++#define TIER_UNHASHED_SUBVOL conf->subvolumes[1] ++ ++#define GET_QFILE_PATH(is_promotion) \ ++ (is_promotion) ? 
promotion_qfile : demotion_qfile ++ ++typedef struct tier_qfile_array { ++ int *fd_array; ++ ssize_t array_size; ++ ssize_t next_index; ++ /* Indicates the number of exhausted FDs */ ++ ssize_t exhausted_count; ++} tier_qfile_array_t; ++ ++typedef struct _query_cbk_args { ++ xlator_t *this; ++ gf_defrag_info_t *defrag; ++ /* This is for write */ ++ int query_fd; ++ int is_promotion; ++ int is_compaction; ++ /* This is for read */ ++ tier_qfile_array_t *qfile_array; ++} query_cbk_args_t; ++ ++int ++gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag); ++ ++typedef struct gfdb_brick_info { ++ gfdb_time_t *time_stamp; ++ gf_boolean_t _gfdb_promote; ++ query_cbk_args_t *_query_cbk_args; ++} gfdb_brick_info_t; ++ ++typedef struct brick_list { ++ xlator_t *xlator; ++ char *brick_db_path; ++ char brick_name[NAME_MAX]; ++ char qfile_path[PATH_MAX]; ++ struct list_head list; ++} tier_brick_list_t; ++ ++typedef struct _dm_thread_args { ++ xlator_t *this; ++ gf_defrag_info_t *defrag; ++ struct list_head *brick_list; ++ int freq_time; ++ int return_value; ++ int is_promotion; ++ int is_compaction; ++ gf_boolean_t is_hot_tier; ++} migration_args_t; ++ ++typedef enum tier_watermark_op_ { ++ TIER_WM_NONE = 0, ++ TIER_WM_LOW, ++ TIER_WM_HI, ++ TIER_WM_MID ++} tier_watermark_op_t; ++ ++#define DEFAULT_PROMOTE_FREQ_SEC 120 ++#define DEFAULT_DEMOTE_FREQ_SEC 120 ++#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800 ++#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800 ++#define DEFAULT_DEMOTE_DEGRADED 1 ++#define DEFAULT_WRITE_FREQ_SEC 0 ++#define DEFAULT_READ_FREQ_SEC 0 ++#define DEFAULT_WM_LOW 75 ++#define DEFAULT_WM_HI 90 ++#define DEFAULT_TIER_MODE TIER_MODE_TEST ++#define DEFAULT_COMP_MODE _gf_true ++#define DEFAULT_TIER_MAX_MIGRATE_MB 1000 ++#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000 ++#define DEFAULT_TIER_QUERY_LIMIT 100 ++ ++#endif +diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am +index 194634b..545c02b 100644 +--- a/xlators/features/Makefile.am ++++ 
b/xlators/features/Makefile.am +@@ -5,6 +5,6 @@ endif + SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \ + compress changelog gfid-access snapview-client snapview-server trash \ + shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \ +- utime ++ utime changetimerecorder + + CLEANFILES = +diff --git a/xlators/features/changetimerecorder/Makefile.am b/xlators/features/changetimerecorder/Makefile.am +new file mode 100644 +index 0000000..a985f42 +--- /dev/null ++++ b/xlators/features/changetimerecorder/Makefile.am +@@ -0,0 +1,3 @@ ++SUBDIRS = src ++ ++CLEANFILES = +diff --git a/xlators/features/changetimerecorder/src/Makefile.am b/xlators/features/changetimerecorder/src/Makefile.am +new file mode 100644 +index 0000000..620017e +--- /dev/null ++++ b/xlators/features/changetimerecorder/src/Makefile.am +@@ -0,0 +1,26 @@ ++xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features ++ ++# changetimerecorder can only get build when libgfdb is enabled ++if BUILD_GFDB ++ xlator_LTLIBRARIES = changetimerecorder.la ++endif ++ ++changetimerecorder_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) ++ ++changetimerecorder_la_SOURCES = changetimerecorder.c \ ++ ctr-helper.c ctr-xlator-ctx.c ++ ++changetimerecorder_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\ ++ $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la ++ ++noinst_HEADERS = ctr-messages.h changetimerecorder.h ctr_mem_types.h \ ++ ctr-helper.h ctr-xlator-ctx.h ++ ++AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \ ++ -I$(top_srcdir)/libglusterfs/src/gfdb \ ++ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \ ++ -DDATADIR=\"$(localstatedir)\" ++ ++AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS) ++ ++CLEANFILES = +diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c +new file mode 100644 +index 0000000..f2aa4a9 +--- /dev/null ++++ 
b/xlators/features/changetimerecorder/src/changetimerecorder.c +@@ -0,0 +1,2371 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++#include <ctype.h> ++#include <sys/uio.h> ++ ++#include "gfdb_sqlite3.h" ++#include "ctr-helper.h" ++#include "ctr-messages.h" ++#include <glusterfs/syscall.h> ++ ++#include "changetimerecorder.h" ++#include "tier-ctr-interface.h" ++ ++/*******************************inode forget***********************************/ ++int ++ctr_forget(xlator_t *this, inode_t *inode) ++{ ++ fini_ctr_xlator_ctx(this, inode); ++ return 0; ++} ++ ++/************************** Look up heal **************************************/ ++/* ++Problem: The CTR xlator records file meta (heat/hardlinks) ++into the data. This works fine for files which are created ++after ctr xlator is switched ON. But for files which were ++created before CTR xlator is ON, CTR xlator is not able to ++record either of the meta i.e heat or hardlinks. Thus making ++those files immune to promotions/demotions. ++ ++Solution: The solution that is implemented in this patch is ++do ctr-db heal of all those pre-existent files, using named lookup. ++For this purpose we use the inode-xlator context variable option ++in gluster. ++The inode-xlator context variable for ctr xlator will have the ++following, ++ a. A Lock for the context variable ++ b. A hardlink list: This list represents the successful looked ++ up hardlinks. 
++These are the scenarios when the hardlink list is updated: ++1) Named-Lookup: Whenever a named lookup happens on a file, in the ++ wind path we copy all required hardlink and inode information to ++ ctr_db_record structure, which resides in the frame->local variable. ++ We don't update the database in wind. During the unwind, we read the ++ information from the ctr_db_record and , ++ Check if the inode context variable is created, if not we create it. ++ Check if the hard link is there in the hardlink list. ++ If its not there we add it to the list and send a update to the ++ database using libgfdb. ++ Please note: The database transaction can fail(and we ignore) as there ++ already might be a record in the db. This update to the db is to heal ++ if its not there. ++ If its there in the list we ignore it. ++2) Inode Forget: Whenever an inode forget hits we clear the hardlink list in ++ the inode context variable and delete the inode context variable. ++ Please note: An inode forget may happen for two reason, ++ a. when the inode is delete. ++ b. the in-memory inode is evicted from the inode table due to cache limits. ++3) create: whenever a create happens we create the inode context variable and ++ add the hardlink. The database updation is done as usual by ctr. ++4) link: whenever a hardlink is created for the inode, we create the inode ++ context variable, if not present, and add the hardlink to the list. ++5) unlink: whenever a unlink happens we delete the hardlink from the list. ++6) mknod: same as create. ++7) rename: whenever a rename happens we update the hardlink in list. if the ++ hardlink was not present for updation, we add the hardlink to the list. ++ ++What is pending: ++1) This solution will only work for named lookups. ++2) We don't track afr-self-heal/dht-rebalancer traffic for healing. 
++ ++*/ ++ ++/* This function does not write anything to the db, ++ * just created the local variable ++ * for the frame and sets values for the ctr_db_record */ ++static int ++ctr_lookup_wind(call_frame_t *frame, xlator_t *this, ++ gf_ctr_inode_context_t *ctr_inode_cx) ++{ ++ int ret = -1; ++ gf_ctr_private_t *_priv = NULL; ++ gf_ctr_local_t *ctr_local = NULL; ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ GF_ASSERT(this); ++ IS_CTR_INODE_CX_SANE(ctr_inode_cx); ++ ++ _priv = this->private; ++ GF_ASSERT(_priv); ++ ++ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) { ++ frame->local = init_ctr_local_t(this); ++ if (!frame->local) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND, ++ "WIND: Error while creating ctr local"); ++ goto out; ++ }; ++ ctr_local = frame->local; ++ /*Definitely no internal fops will reach here*/ ++ ctr_local->is_internal_fop = _gf_false; ++ /*Don't record counters*/ ++ CTR_DB_REC(ctr_local).do_record_counters = _gf_false; ++ /*Don't record time at all*/ ++ CTR_DB_REC(ctr_local).do_record_times = _gf_false; ++ ++ /* Copy gfid into db record*/ ++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid)); ++ ++ /* Set fop_path and fop_type, required by libgfdb to make ++ * decision while inserting the record */ ++ CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path; ++ CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type; ++ ++ /* Copy hard link info*/ ++ gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid, ++ *((NEW_LINK_CX(ctr_inode_cx))->pargfid)); ++ if (snprintf(CTR_DB_REC(ctr_local).file_name, ++ sizeof(CTR_DB_REC(ctr_local).file_name), "%s", ++ NEW_LINK_CX(ctr_inode_cx)->basename) >= ++ sizeof(CTR_DB_REC(ctr_local).file_name)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND, ++ "WIND: Error copying filename of ctr local"); ++ goto out; ++ } ++ /* Since we are in lookup we can ignore errors while ++ * Inserting in the DB, because there may 
be many ++ * to write to the DB attempts for healing. ++ * We don't want to log all failed attempts and ++ * bloat the log*/ ++ ctr_local->gfdb_db_record.ignore_errors = _gf_true; ++ } ++ ++ ret = 0; ++ ++out: ++ ++ if (ret) { ++ free_ctr_local(ctr_local); ++ frame->local = NULL; ++ } ++ ++ return ret; ++} ++ ++/* This function inserts the ctr_db_record populated by ctr_lookup_wind ++ * in to the db. It also destroys the frame->local created by ctr_lookup_wind */ ++static int ++ctr_lookup_unwind(call_frame_t *frame, xlator_t *this) ++{ ++ int ret = -1; ++ gf_ctr_private_t *_priv = NULL; ++ gf_ctr_local_t *ctr_local = NULL; ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(this); ++ ++ _priv = this->private; ++ GF_ASSERT(_priv); ++ ++ GF_ASSERT(_priv->_db_conn); ++ ++ ctr_local = frame->local; ++ ++ if (ctr_local && (ctr_local->ia_inode_type != IA_IFDIR)) { ++ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record); ++ if (ret == -1) { ++ gf_msg(this->name, ++ _gfdb_log_level(GF_LOG_ERROR, ++ ctr_local->gfdb_db_record.ignore_errors), ++ 0, CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND, ++ "UNWIND: Error filling ctr local"); ++ goto out; ++ } ++ } ++ ret = 0; ++out: ++ free_ctr_local(ctr_local); ++ frame->local = NULL; ++ return ret; ++} ++ ++/****************************************************************************** ++ * ++ * FOPS HANDLING BELOW ++ * ++ * ***************************************************************************/ ++ ++/****************************LOOKUP********************************************/ ++ ++int32_t ++ctr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, inode_t *inode, ++ struct iatt *buf, dict_t *dict, struct iatt *postparent) ++{ ++ int ret = -1; ++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL; ++ gf_ctr_local_t *ctr_local = NULL; ++ ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR; ++ gf_boolean_t _is_heal_needed = _gf_false; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /* if the lookup failed lookup 
don't do anything*/ ++ if (op_ret == -1) { ++ gf_msg_trace(this->name, 0, "lookup failed with %s", ++ strerror(op_errno)); ++ goto out; ++ } ++ ++ /* Ignore directory lookups */ ++ if (inode->ia_type == IA_IFDIR) { ++ goto out; ++ } ++ ++ /* if frame local was not set by the ctr_lookup() ++ * so don't so anything*/ ++ if (!frame->local) { ++ goto out; ++ } ++ ++ /* if the lookup is for dht link donot record*/ ++ if (dht_is_linkfile(buf, dict)) { ++ gf_msg_trace(this->name, 0, ++ "Ignoring Lookup " ++ "for dht link file"); ++ goto out; ++ } ++ ++ ctr_local = frame->local; ++ /*Assign the proper inode type*/ ++ ctr_local->ia_inode_type = inode->ia_type; ++ ++ /* Copy gfid directly from inode */ ++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid); ++ ++ /* Checking if gfid and parent gfid is valid */ ++ if (gf_uuid_is_null(CTR_DB_REC(ctr_local).gfid) || ++ gf_uuid_is_null(CTR_DB_REC(ctr_local).pargfid)) { ++ gf_msg_trace(this->name, 0, "Invalid GFID"); ++ goto out; ++ } ++ ++ /* if its a first entry ++ * then mark the ctr_record for create ++ * A create will attempt a file and a hard link created in the db*/ ++ ctr_xlator_ctx = get_ctr_xlator_ctx(this, inode); ++ if (!ctr_xlator_ctx) { ++ /* This marks inode heal */ ++ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE; ++ _is_heal_needed = _gf_true; ++ } ++ ++ /* Copy the correct gfid from resolved inode */ ++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid); ++ ++ /* Add hard link to the list */ ++ ret_val = add_hard_link_ctx(frame, this, inode); ++ if (ret_val == CTR_CTX_ERROR) { ++ gf_msg_trace(this->name, 0, "Failed adding hardlink to list"); ++ goto out; ++ } ++ /* If inode needs healing then heal the hardlink also */ ++ else if (ret_val & CTR_TRY_INODE_HEAL) { ++ /* This marks inode heal */ ++ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE; ++ _is_heal_needed = _gf_true; ++ } ++ /* If hardlink needs healing */ ++ else if (ret_val & CTR_TRY_HARDLINK_HEAL) { ++ _is_heal_needed = 
_gf_true; ++ } ++ ++ /* If lookup heal needed */ ++ if (!_is_heal_needed) ++ goto out; ++ ++ /* FINALLY HEAL : Inserts the ctr_db_record populated by ctr_lookup_wind ++ * in to the db. It also destroys the frame->local ++ * created by ctr_lookup_wind */ ++ ret = ctr_lookup_unwind(frame, this); ++ if (ret) { ++ gf_msg_trace(this->name, 0, "Failed healing/inserting link"); ++ } ++ ++out: ++ free_ctr_local((gf_ctr_local_t *)frame->local); ++ frame->local = NULL; ++ ++ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, dict, ++ postparent); ++ ++ return 0; ++} ++ ++int32_t ++ctr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) ++{ ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t ctr_link_cx; ++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx; ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ ++ /* Don't handle nameless lookups*/ ++ if (!loc->parent || !loc->name) ++ goto out; ++ ++ /*fill ctr link context*/ ++ FILL_CTR_LINK_CX(_link_cx, loc->parent->gfid, loc->name, out); ++ ++ /* Fill ctr inode context*/ ++ /* IA_IFREG : We assume its a file in the wind ++ * but in the unwind we are sure what the inode is a file ++ * or directory ++ * gfid: we are just filling loc->gfid which is not correct. 
++ * In unwind we fill the correct gfid for successful lookup*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, IA_IFREG, loc->gfid, _link_cx, NULL, ++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND); ++ ++ /* Create the frame->local and populate ctr_db_record ++ * No writing to the db yet */ ++ ret = ctr_lookup_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED, ++ "Failed to insert link wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xdata); ++ return 0; ++} ++ ++/****************************WRITEV********************************************/ ++int32_t ++ctr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_UNWIND_FAILED, ++ "Failed to insert writev unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf, ++ xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector, ++ int32_t count, off_t off, uint32_t flags, struct iobref *iobref, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ 
gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_WIND_FAILED, ++ "Failed to insert writev wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_writev_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags, ++ iobref, xdata); ++ ++ return 0; ++} ++ ++/******************************setattr*****************************************/ ++ ++int32_t ++ctr_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf, ++ struct iatt *postop_stbuf, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED, ++ "Failed to insert setattr unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop_stbuf, ++ postop_stbuf, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, ++ int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid, ++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED, ++ "Failed to insert setattr wind"); ++ } ++out: ++ ++ STACK_WIND(frame, ctr_setattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata); ++ ++ return 0; ++} 
++ ++/*************************** fsetattr ***************************************/ ++int32_t ++ctr_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf, ++ struct iatt *postop_stbuf, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED, ++ "Failed to insert fsetattr unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, preop_stbuf, ++ postop_stbuf, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf, ++ int32_t valid, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED, ++ "Failed to insert fsetattr wind"); ++ } ++out: ++ STACK_WIND(frame, ctr_fsetattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata); ++ ++ return 0; ++} ++/****************************fremovexattr************************************/ ++ ++int32_t ++ctr_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ 
CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED, ++ "Failed to insert fremovexattr unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, ++ const char *name, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED, ++ "Failed to insert fremovexattr wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_fremovexattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata); ++ return 0; ++} ++ ++/****************************removexattr*************************************/ ++ ++int32_t ++ctr_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED, ++ "Failed to insert removexattr unwind"); ++ } ++ ++out: ++ 
ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc, ++ const char *name, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid, ++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED, ++ "Failed to insert removexattr wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_removexattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata); ++ return 0; ++} ++ ++/****************************truncate****************************************/ ++ ++int32_t ++ctr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED, ++ "Failed to insert truncate unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf, ++ xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ 
gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid, ++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_TRUNCATE_WIND_FAILED, ++ "Failed to insert truncate wind"); ++ } ++out: ++ STACK_WIND(frame, ctr_truncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata); ++ return 0; ++} ++ ++/****************************ftruncate***************************************/ ++ ++int32_t ++ctr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf, ++ struct iatt *postbuf, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED, ++ "Failed to insert ftruncate unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf, ++ xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, 
_inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED, ++ "Failed to insert ftruncate wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_ftruncate_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata); ++ return 0; ++} ++ ++/****************************rename******************************************/ ++int32_t ++ctr_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *buf, ++ struct iatt *preoldparent, struct iatt *postoldparent, ++ struct iatt *prenewparent, struct iatt *postnewparent, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ uint32_t remaining_links = -1; ++ gf_ctr_local_t *ctr_local = NULL; ++ gfdb_fop_type_t fop_type = GFDB_FOP_INVALID_OP; ++ gfdb_fop_path_t fop_path = GFDB_FOP_INVALID; ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(this); ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE, ++ GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED, ++ "Failed to insert rename unwind"); ++ goto out; ++ } ++ ++ if (!xdata) ++ goto out; ++ /* ++ * ++ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator ++ * This is only set when we are overwriting hardlinks. 
++ * ++ * */ ++ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, ++ &remaining_links); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED, ++ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA"); ++ remaining_links = -1; ++ goto out; ++ } ++ ++ ctr_local = frame->local; ++ if (!ctr_local) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_NULL_LOCAL, ++ "ctr_local is NULL."); ++ goto out; ++ } ++ ++ /* This is not the only link */ ++ if (remaining_links > 1) { ++ fop_type = GFDB_FOP_DENTRY_WRITE; ++ fop_path = GFDB_FOP_UNDEL; ++ } ++ /* Last link that was deleted */ ++ else if (remaining_links == 1) { ++ fop_type = GFDB_FOP_DENTRY_WRITE; ++ fop_path = GFDB_FOP_UNDEL_ALL; ++ } else { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED, ++ "Invalid link count from posix"); ++ goto out; ++ } ++ ++ ret = ctr_delete_hard_link_from_db( ++ this, CTR_DB_REC(ctr_local).old_gfid, CTR_DB_REC(ctr_local).pargfid, ++ CTR_DB_REC(ctr_local).file_name, fop_type, fop_path); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, ++ "Failed to delete records of %s", ++ CTR_DB_REC(ctr_local).old_file_name); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent, ++ postoldparent, prenewparent, postnewparent, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t new_link_cx, old_link_cx; ++ gf_ctr_link_context_t *_nlink_cx = &new_link_cx; ++ gf_ctr_link_context_t *_olink_cx = &old_link_cx; ++ int is_dict_created = 0; ++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill old link 
context*/ ++ FILL_CTR_LINK_CX(_olink_cx, oldloc->pargfid, oldloc->name, out); ++ ++ /*Fill new link context*/ ++ FILL_CTR_LINK_CX(_nlink_cx, newloc->pargfid, newloc->name, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type, ++ oldloc->inode->gfid, _nlink_cx, _olink_cx, ++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND); ++ ++ /* If the rename is a overwrite of hardlink ++ * rename ("file1", "file2") ++ * file1 is hardlink for gfid say 00000000-0000-0000-0000-00000000000A ++ * file2 is hardlink for gfid say 00000000-0000-0000-0000-00000000000B ++ * so we are saving file2 gfid in old_gfid so that we delete entries ++ * from the db during rename callback if the fop is successful ++ * */ ++ if (newloc->inode) { ++ /* This is the GFID from where the newloc hardlink will be ++ * unlinked */ ++ _inode_cx->old_gfid = &newloc->inode->gfid; ++ } ++ ++ /* Is a metatdata fop */ ++ _inode_cx->is_metadata_fop = _gf_true; ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_WIND_FAILED, ++ "Failed to insert rename wind"); ++ } else { ++ /* We are doing updation of hard link in inode context in wind ++ * As we don't get the "inode" in the call back for rename */ ++ ret = update_hard_link_ctx(frame, this, oldloc->inode); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED, ++ "Failed " ++ "updating hard link in ctr inode context"); ++ goto out; ++ } ++ ++ /* If the newloc has an inode. i.e acquiring hardlink of an ++ * exisitng file i.e overwritting a file. 
++ * */ ++ if (newloc->inode) { ++ /* Getting the ctr inode context variable for ++ * inode whose hardlink will be acquired during ++ * the rename ++ * */ ++ ctr_xlator_ctx = get_ctr_xlator_ctx(this, newloc->inode); ++ if (!ctr_xlator_ctx) { ++ /* Since there is no ctr inode context ++ * so nothing more to do */ ++ ret = 0; ++ goto out; ++ } ++ ++ /* Deleting hardlink from context variable */ ++ ret = ctr_delete_hard_link(this, ctr_xlator_ctx, newloc->pargfid, ++ newloc->name); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_DELETE_HARDLINK_FAILED, ++ "Failed to delete hard link"); ++ goto out; ++ } ++ ++ /* Requesting for number of hardlinks on the newloc ++ * inode from POSIX. ++ * */ ++ is_dict_created = set_posix_link_request(this, &xdata); ++ if (is_dict_created == -1) { ++ ret = -1; ++ goto out; ++ } ++ } ++ } ++ ++out: ++ STACK_WIND(frame, ctr_rename_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); ++ ++ if (is_dict_created == 1) { ++ dict_unref(xdata); ++ } ++ ++ return 0; ++} ++ ++/****************************unlink******************************************/ ++int32_t ++ctr_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, struct iatt *preparent, ++ struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ uint32_t remaining_links = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ if (!xdata) ++ goto out; ++ ++ /* ++ * ++ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator ++ * ++ * */ ++ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA, ++ &remaining_links); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED, ++ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA"); ++ remaining_links = -1; ++ } ++ ++ /*This is not the only link*/ ++ if (remaining_links != 1) { ++ ret = ctr_insert_unwind(frame, this, 
GFDB_FOP_DENTRY_WRITE, ++ GFDB_FOP_UNDEL); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, ++ "Failed to insert unlink unwind"); ++ } ++ } ++ /*Last link that was deleted*/ ++ else if (remaining_links == 1) { ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE, ++ GFDB_FOP_UNDEL_ALL); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, ++ "Failed to insert unlink unwind"); ++ } ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent, ++ xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t ctr_link_cx; ++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx; ++ gf_boolean_t is_xdata_created = _gf_false; ++ struct iatt dummy_stat = {0}; ++ ++ GF_ASSERT(frame); ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill link context*/ ++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid, ++ _link_cx, NULL, GFDB_FOP_DENTRY_WRITE, ++ GFDB_FOP_WDEL); ++ ++ /*Internal FOP*/ ++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata); ++ ++ /* Is a metadata FOP */ ++ _inode_cx->is_metadata_fop = _gf_true; ++ ++ /* If its a internal FOP and dht link file donot record*/ ++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) { ++ goto out; ++ } ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, ++ "Failed to insert unlink wind"); ++ } else { ++ /* We are doing delete of hard link in inode context in wind ++ * As we don't get the "inode" in the call 
back for rename */ ++ ret = delete_hard_link_ctx(frame, this, loc->inode); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED, ++ "Failed " ++ "deleting hard link from ctr inode context"); ++ } ++ } ++ ++ /* ++ * ++ * Sending GF_REQUEST_LINK_COUNT_XDATA ++ * to POSIX Xlator to send link count in unwind path ++ * ++ * */ ++ /*create xdata if NULL*/ ++ if (!xdata) { ++ xdata = dict_new(); ++ is_xdata_created = (xdata) ? _gf_true : _gf_false; ++ } ++ if (!xdata) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL, ++ "xdata is NULL :Cannot send " ++ "GF_REQUEST_LINK_COUNT_XDATA to posix"); ++ goto out; ++ } ++ ++ ret = dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED, ++ "Failed setting GF_REQUEST_LINK_COUNT_XDATA"); ++ if (is_xdata_created) { ++ dict_unref(xdata); ++ } ++ goto out; ++ } ++ ++out: ++ STACK_WIND(frame, ctr_unlink_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); ++ ++ if (is_xdata_created) ++ dict_unref(xdata); ++ ++ return 0; ++} ++ ++/****************************fsync******************************************/ ++int32_t ++ctr_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ++ int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf, ++ dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, ++ "Failed to insert fsync unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags, ++ dict_t *xdata) ++{ ++ int 
ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_WIND_FAILED, ++ "Failed to insert fsync wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_fsync_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsync, fd, flags, xdata); ++ return 0; ++} ++ ++/****************************setxattr****************************************/ ++ ++int ++ctr_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, ++ "Failed to insert setxattr unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata); ++ ++ return 0; ++} ++ ++int ++ctr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr, ++ int flags, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid, ++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ 
gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED, ++ "Failed to insert setxattr wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_setxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata); ++ return 0; ++} ++/**************************** fsetxattr *************************************/ ++int32_t ++ctr_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this, ++ int32_t op_ret, int32_t op_errno, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, ++ "Failed to insert fsetxattr unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata); ++ ++ return 0; ++} ++ ++int32_t ++ctr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict, ++ int32_t flags, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED, ++ "Failed to insert fsetxattr wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_fsetxattr_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata); ++ return 0; ++} ++/****************************mknod*******************************************/ ++ ++int32_t 
++ctr_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, ++ int32_t op_errno, inode_t *inode, struct iatt *buf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ /* Add hard link to the list */ ++ ret_val = add_hard_link_ctx(frame, this, inode); ++ if (ret_val == CTR_CTX_ERROR) { ++ gf_msg_trace(this->name, 0, "Failed adding hard link"); ++ } ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE, ++ GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED, ++ "Failed to insert mknod unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent, ++ postparent, xdata); ++ ++ return 0; ++} ++ ++int ++ctr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode, ++ dev_t rdev, mode_t umask, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t ctr_link_cx; ++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx; ++ uuid_t gfid = { ++ 0, ++ }; ++ uuid_t *ptr_gfid = &gfid; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ ++ /*get gfid from xdata dict*/ ++ ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); ++ if (ret) { ++ gf_msg_debug(this->name, 0, "failed to get gfid from dict"); ++ goto out; ++ } ++ ++ /*fill ctr link context*/ ++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx, ++ NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, 
_inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_WIND_FAILED, ++ "Failed to insert mknod wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_mknod_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata); ++ return 0; ++} ++ ++/****************************create******************************************/ ++int ++ctr_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = add_hard_link_ctx(frame, this, inode); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_ADD_HARDLINK_FAILED, ++ "Failed adding hard link"); ++ } ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE, ++ GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED, ++ "Failed to insert create unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf, ++ preparent, postparent, xdata); ++ ++ return 0; ++} ++ ++int ++ctr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags, ++ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t ctr_link_cx; ++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx; ++ uuid_t gfid = { ++ 0, ++ }; ++ uuid_t *ptr_gfid = &gfid; ++ struct iatt dummy_stat = {0}; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ ++ /*Get GFID from Xdata dict*/ ++ ret = dict_get_gfuuid(xdata, "gfid-req", &gfid); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_GET_GFID_FROM_DICT_FAILED, ++ "failed to 
get gfid from dict"); ++ goto out; ++ } ++ ++ /*fill ctr link context*/ ++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx, ++ NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND); ++ ++ /*Internal FOP*/ ++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata); ++ ++ /* If its a internal FOP and dht link file donot record*/ ++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) { ++ goto out; ++ } ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, &ctr_inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_WIND_FAILED, ++ "Failed to insert create wind"); ++ } ++out: ++ STACK_WIND(frame, ctr_create_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd, ++ xdata); ++ return 0; ++} ++ ++/****************************link********************************************/ ++ ++int ++ctr_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, inode_t *inode, struct iatt *stbuf, ++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ /* Add hard link to the list */ ++ ret = add_hard_link_ctx(frame, this, inode); ++ if (ret) { ++ gf_msg_trace(this->name, 0, "Failed adding hard link"); ++ } ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE, ++ GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED, ++ "Failed to insert create unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, stbuf, preparent, ++ postparent, xdata); ++ return 0; ++} ++ ++int ++ctr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, ++ dict_t *xdata) ++{ ++ int ret = 
-1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ gf_ctr_link_context_t ctr_link_cx; ++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx; ++ struct iatt dummy_stat = {0}; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ ++ /*fill ctr link context*/ ++ FILL_CTR_LINK_CX(_link_cx, newloc->pargfid, newloc->name, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type, ++ oldloc->inode->gfid, _link_cx, NULL, ++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND); ++ ++ /*Internal FOP*/ ++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata); ++ ++ /* Is a metadata fop */ ++ _inode_cx->is_metadata_fop = _gf_true; ++ ++ /* If its a internal FOP and dht link file donot record*/ ++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) { ++ goto out; ++ } ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED, ++ "Failed to insert link wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_link_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ return 0; ++} ++ ++/******************************readv*****************************************/ ++int ++ctr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, ++ int op_errno, struct iovec *vector, int count, struct iatt *stbuf, ++ struct iobref *iobref, dict_t *xdata) ++{ ++ int ret = -1; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out); ++ ++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_READ, GFDB_FOP_UNWIND); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED, ++ "Failed to insert create unwind"); ++ } ++ ++out: ++ ctr_free_frame_local(frame); ++ ++ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf, 
++ iobref, xdata); ++ return 0; ++} ++ ++int ++ctr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off, ++ uint32_t flags, dict_t *xdata) ++{ ++ int ret = -1; ++ gf_ctr_inode_context_t ctr_inode_cx; ++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx; ++ ++ CTR_IS_DISABLED_THEN_GOTO(this, out); ++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out); ++ ++ /*Fill ctr inode context*/ ++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL, ++ NULL, GFDB_FOP_INODE_READ, GFDB_FOP_WIND); ++ ++ /*record into the database*/ ++ ret = ctr_insert_wind(frame, this, _inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_READV_WIND_FAILED, ++ "Failed to insert readv wind"); ++ } ++ ++out: ++ STACK_WIND(frame, ctr_readv_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->readv, fd, size, off, flags, xdata); ++ return 0; ++} ++ ++/*******************************ctr_ipc****************************************/ ++ ++/*This is the call back function per record/file from data base*/ ++static int ++ctr_db_query_callback(gfdb_query_record_t *gfdb_query_record, void *args) ++{ ++ int ret = -1; ++ ctr_query_cbk_args_t *query_cbk_args = args; ++ ++ GF_VALIDATE_OR_GOTO("ctr", query_cbk_args, out); ++ ++ ret = gfdb_write_query_record(query_cbk_args->query_fd, gfdb_query_record); ++ if (ret) { ++ gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "Failed to write to query file"); ++ goto out; ++ } ++ ++ query_cbk_args->count++; ++ ++ ret = 0; ++out: ++ return ret; ++} ++ ++/* This function does all the db queries related to tiering and ++ * generates/populates new/existing query file ++ * inputs: ++ * xlator_t *this : CTR Translator ++ * void *conn_node : Database connection ++ * char *query_file: the query file that needs to be updated ++ * gfdb_ipc_ctr_params_t *ipc_ctr_params: the query parameters ++ * Return: ++ * On success 0 ++ * On failure -1 ++ * */ ++int ++ctr_db_query(xlator_t *this, void *conn_node, char 
*query_file, ++ gfdb_ipc_ctr_params_t *ipc_ctr_params) ++{ ++ int ret = -1; ++ ctr_query_cbk_args_t query_cbk_args = {0}; ++ ++ GF_VALIDATE_OR_GOTO("ctr", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, conn_node, out); ++ GF_VALIDATE_OR_GOTO(this->name, query_file, out); ++ GF_VALIDATE_OR_GOTO(this->name, ipc_ctr_params, out); ++ ++ /*Query for eligible files from db*/ ++ query_cbk_args.query_fd = open(query_file, O_WRONLY | O_CREAT | O_APPEND, ++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); ++ if (query_cbk_args.query_fd < 0) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, CTR_MSG_FATAL_ERROR, ++ "Failed to open query file %s", query_file); ++ goto out; ++ } ++ if (!ipc_ctr_params->is_promote) { ++ if (ipc_ctr_params->emergency_demote) { ++ /* emergency demotion mode */ ++ ret = find_all(conn_node, ctr_db_query_callback, ++ (void *)&query_cbk_args, ++ ipc_ctr_params->query_limit); ++ } else { ++ if (ipc_ctr_params->write_freq_threshold == 0 && ++ ipc_ctr_params->read_freq_threshold == 0) { ++ ret = find_unchanged_for_time(conn_node, ctr_db_query_callback, ++ (void *)&query_cbk_args, ++ &ipc_ctr_params->time_stamp); ++ } else { ++ ret = find_unchanged_for_time_freq( ++ conn_node, ctr_db_query_callback, (void *)&query_cbk_args, ++ &ipc_ctr_params->time_stamp, ++ ipc_ctr_params->write_freq_threshold, ++ ipc_ctr_params->read_freq_threshold, _gf_false); ++ } ++ } ++ } else { ++ if (ipc_ctr_params->write_freq_threshold == 0 && ++ ipc_ctr_params->read_freq_threshold == 0) { ++ ret = find_recently_changed_files(conn_node, ctr_db_query_callback, ++ (void *)&query_cbk_args, ++ &ipc_ctr_params->time_stamp); ++ } else { ++ ret = find_recently_changed_files_freq( ++ conn_node, ctr_db_query_callback, (void *)&query_cbk_args, ++ &ipc_ctr_params->time_stamp, ++ ipc_ctr_params->write_freq_threshold, ++ ipc_ctr_params->read_freq_threshold, _gf_false); ++ } ++ } ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: query from db failed"); ++ goto out; ++ } ++ 
++ ret = clear_files_heat(conn_node); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: Failed to clear db entries"); ++ goto out; ++ } ++ ++ ret = 0; ++out: ++ ++ if (!ret) ++ ret = query_cbk_args.count; ++ ++ if (query_cbk_args.query_fd >= 0) { ++ sys_close(query_cbk_args.query_fd); ++ query_cbk_args.query_fd = -1; ++ } ++ ++ return ret; ++} ++ ++void * ++ctr_compact_thread(void *args) ++{ ++ int ret = -1; ++ void *db_conn = NULL; ++ ++ xlator_t *this = NULL; ++ gf_ctr_private_t *priv = NULL; ++ gf_boolean_t compact_active = _gf_false; ++ gf_boolean_t compact_mode_switched = _gf_false; ++ ++ this = (xlator_t *)args; ++ ++ GF_VALIDATE_OR_GOTO("ctr", this, out); ++ ++ priv = this->private; ++ ++ db_conn = priv->_db_conn; ++ compact_active = priv->compact_active; ++ compact_mode_switched = priv->compact_mode_switched; ++ ++ gf_msg("ctr-compact", GF_LOG_INFO, 0, CTR_MSG_SET, "Starting compaction"); ++ ++ ret = compact_db(db_conn, compact_active, compact_mode_switched); ++ ++ if (ret) { ++ gf_msg("ctr-compact", GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to perform the compaction"); ++ } ++ ++ ret = pthread_mutex_lock(&priv->compact_lock); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to acquire lock"); ++ goto out; ++ } ++ ++ /* We are done compaction on this brick. 
Set all flags to false */ ++ priv->compact_active = _gf_false; ++ priv->compact_mode_switched = _gf_false; ++ ++ ret = pthread_mutex_unlock(&priv->compact_lock); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to release lock"); ++ goto out; ++ } ++ ++out: ++ return NULL; ++} ++ ++int ++ctr_ipc_helper(xlator_t *this, dict_t *in_dict, dict_t *out_dict) ++{ ++ int ret = -1; ++ char *ctr_ipc_ops = NULL; ++ gf_ctr_private_t *priv = NULL; ++ char *db_version = NULL; ++ char *db_param_key = NULL; ++ char *db_param = NULL; ++ char *query_file = NULL; ++ gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL; ++ int result = 0; ++ pthread_t compact_thread; ++ ++ GF_VALIDATE_OR_GOTO("ctr", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, this->private, out); ++ priv = this->private; ++ GF_VALIDATE_OR_GOTO(this->name, priv->_db_conn, out); ++ GF_VALIDATE_OR_GOTO(this->name, in_dict, out); ++ GF_VALIDATE_OR_GOTO(this->name, out_dict, out); ++ ++ GET_DB_PARAM_FROM_DICT(this->name, in_dict, GFDB_IPC_CTR_KEY, ctr_ipc_ops, ++ out); ++ ++ /*if its a db clear operation */ ++ if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_CLEAR_OPS, ++ SLEN(GFDB_IPC_CTR_CLEAR_OPS)) == 0) { ++ ret = clear_files_heat(priv->_db_conn); ++ if (ret) ++ goto out; ++ ++ } /* if its a query operation, in which case its query + clear db*/ ++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_QUERY_OPS, ++ SLEN(GFDB_IPC_CTR_QUERY_OPS)) == 0) { ++ ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_QFILE_PATH, &query_file); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed extracting query file path"); ++ goto out; ++ } ++ ++ ret = dict_get_bin(in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS, ++ (void *)&ipc_ctr_params); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed extracting query parameters"); ++ goto out; ++ } ++ ++ ret = ctr_db_query(this, priv->_db_conn, query_file, ipc_ctr_params); ++ ++ ret = dict_set_int32(out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT, ret); ++ if 
(ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed setting query reply"); ++ goto out; ++ } ++ ++ } /* if its a query for db version */ ++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_VERSION_OPS, ++ SLEN(GFDB_IPC_CTR_GET_DB_VERSION_OPS)) == 0) { ++ ret = get_db_version(priv->_db_conn, &db_version); ++ if (ret == -1 || !db_version) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed extracting db version "); ++ goto out; ++ } ++ ++ SET_DB_PARAM_TO_DICT(this->name, out_dict, GFDB_IPC_CTR_RET_DB_VERSION, ++ db_version, ret, error); ++ ++ } /* if its a query for a db setting */ ++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_PARAM_OPS, ++ SLEN(GFDB_IPC_CTR_GET_DB_PARAM_OPS)) == 0) { ++ ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_DB_KEY, &db_param_key); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed extracting db param key"); ++ goto out; ++ } ++ ++ ret = get_db_params(priv->_db_conn, db_param_key, &db_param); ++ if (ret == -1 || !db_param) { ++ goto out; ++ } ++ ++ SET_DB_PARAM_TO_DICT(this->name, out_dict, db_param_key, db_param, ret, ++ error); ++ } /* if its an attempt to compact the database */ ++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_SET_COMPACT_PRAGMA, ++ SLEN(GFDB_IPC_CTR_SET_COMPACT_PRAGMA)) == 0) { ++ ret = pthread_mutex_lock(&priv->compact_lock); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to acquire lock for compaction"); ++ goto out; ++ } ++ ++ if ((priv->compact_active || priv->compact_mode_switched)) { ++ /* Compaction in progress. 
LEAVE */ ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Compaction already in progress."); ++ pthread_mutex_unlock(&priv->compact_lock); ++ goto out; ++ } ++ /* At this point, we should be the only one on the brick */ ++ /* compacting */ ++ ++ /* Grab the arguments from the dictionary */ ++ ret = dict_get_int32(in_dict, "compact_active", &result); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to get compaction type"); ++ goto out; ++ } ++ ++ if (result) { ++ priv->compact_active = _gf_true; ++ } ++ ++ ret = dict_get_int32(in_dict, "compact_mode_switched", &result); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to see if compaction switched"); ++ goto out; ++ } ++ ++ if (result) { ++ priv->compact_mode_switched = _gf_true; ++ gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, ++ "Pre-thread: Compact mode switch is true"); ++ } else { ++ gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET, ++ "Pre-thread: Compact mode switch is false"); ++ } ++ ++ ret = pthread_mutex_unlock(&priv->compact_lock); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to release lock for compaction"); ++ goto out; ++ } ++ ++ ret = gf_thread_create(&compact_thread, NULL, ctr_compact_thread, ++ (void *)this, "ctrcomp"); ++ ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed to spawn compaction thread"); ++ goto out; ++ } ++ ++ goto out; ++ } /* default case */ ++ else { ++ goto out; ++ } ++ ++ ret = 0; ++ goto out; ++error: ++ GF_FREE(db_param_key); ++ GF_FREE(db_param); ++ GF_FREE(db_version); ++out: ++ return ret; ++} ++ ++/* IPC Call from tier migrator to clear the heat on the DB */ ++int32_t ++ctr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *in_dict) ++{ ++ int ret = -1; ++ gf_ctr_private_t *priv = NULL; ++ dict_t *out_dict = NULL; ++ ++ GF_ASSERT(this); ++ priv = this->private; ++ GF_ASSERT(priv); ++ GF_ASSERT(priv->_db_conn); ++ 
GF_VALIDATE_OR_GOTO(this->name, in_dict, wind); ++ ++ if (op != GF_IPC_TARGET_CTR) ++ goto wind; ++ ++ out_dict = dict_new(); ++ if (!out_dict) { ++ goto out; ++ } ++ ++ ret = ctr_ipc_helper(this, in_dict, out_dict); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET, ++ "Failed in ctr_ipc_helper"); ++ } ++out: ++ ++ STACK_UNWIND_STRICT(ipc, frame, ret, 0, out_dict); ++ ++ if (out_dict) ++ dict_unref(out_dict); ++ ++ return 0; ++ ++wind: ++ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->ipc, op, in_dict); ++ ++ return 0; ++} ++ ++/* Call to initialize db for ctr xlator while ctr is enabled */ ++int32_t ++initialize_ctr_resource(xlator_t *this, gf_ctr_private_t *priv) ++{ ++ int ret_db = -1; ++ dict_t *params_dict = NULL; ++ ++ if (!priv) ++ goto error; ++ ++ /* For compaction */ ++ priv->compact_active = _gf_false; ++ priv->compact_mode_switched = _gf_false; ++ ret_db = pthread_mutex_init(&priv->compact_lock, NULL); ++ ++ if (ret_db) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: Failed initializing compaction mutex"); ++ goto error; ++ } ++ ++ params_dict = dict_new(); ++ if (!params_dict) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INIT_DB_PARAMS_FAILED, ++ "DB Params cannot initialized!"); ++ goto error; ++ } ++ ++ /*Extract db params options*/ ++ ret_db = extract_db_params(this, params_dict, priv->gfdb_db_type); ++ if (ret_db) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED, ++ "Failed extracting db params options"); ++ goto error; ++ } ++ ++ /*Create a memory pool for ctr xlator*/ ++ this->local_pool = mem_pool_new(gf_ctr_local_t, 64); ++ if (!this->local_pool) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED, ++ "failed to create local memory pool"); ++ goto error; ++ } ++ ++ /*Initialize Database Connection*/ ++ priv->_db_conn = init_db(params_dict, priv->gfdb_db_type); ++ if (!priv->_db_conn) { ++ 
gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: Failed initializing data base"); ++ goto error; ++ } ++ ++ ret_db = 0; ++ goto out; ++ ++error: ++ if (this) ++ mem_pool_destroy(this->local_pool); ++ ++ if (priv) { ++ GF_FREE(priv->ctr_db_path); ++ } ++ GF_FREE(priv); ++ ret_db = -1; ++out: ++ if (params_dict) ++ dict_unref(params_dict); ++ ++ return ret_db; ++} ++ ++/******************************************************************************/ ++int ++reconfigure(xlator_t *this, dict_t *options) ++{ ++ char *temp_str = NULL; ++ int ret = 0; ++ gf_ctr_private_t *priv = NULL; ++ ++ priv = this->private; ++ ++ if (dict_get_str(options, "changetimerecorder.frequency", &temp_str)) { ++ gf_msg(this->name, GF_LOG_TRACE, 0, CTR_MSG_SET, "set"); ++ } ++ ++ GF_OPTION_RECONF("ctr-enabled", priv->enabled, options, bool, out); ++ if (!priv->enabled) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED, ++ "CTR Xlator is not enabled so skip ctr reconfigure"); ++ goto out; ++ } ++ ++ /* If ctr is enabled after skip init for ctr xlator then call ++ initialize_ctr_resource during reconfigure phase to allocate resources ++ for xlator ++ */ ++ if (priv->enabled && !priv->_db_conn) { ++ ret = initialize_ctr_resource(this, priv); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: Failed ctr initialize resource"); ++ goto out; ++ } ++ } ++ ++ GF_OPTION_RECONF("record-counters", priv->ctr_record_counter, options, bool, ++ out); ++ ++ GF_OPTION_RECONF("ctr-record-metadata-heat", priv->ctr_record_metadata_heat, ++ options, bool, out); ++ ++ GF_OPTION_RECONF("ctr_link_consistency", priv->ctr_link_consistency, ++ options, bool, out); ++ ++ GF_OPTION_RECONF("ctr_lookupheal_inode_timeout", ++ priv->ctr_lookupheal_inode_timeout, options, uint64, out); ++ ++ GF_OPTION_RECONF("ctr_lookupheal_link_timeout", ++ priv->ctr_lookupheal_link_timeout, options, uint64, out); ++ ++ GF_OPTION_RECONF("record-exit", 
priv->ctr_record_unwind, options, bool, ++ out); ++ ++ GF_OPTION_RECONF("record-entry", priv->ctr_record_wind, options, bool, out); ++ ++ /* If database is sqlite */ ++ if (priv->gfdb_db_type == GFDB_SQLITE3) { ++ /* AUTOCHECKPOINT */ ++ if (dict_get_str(options, GFDB_SQL_PARAM_WAL_AUTOCHECK, &temp_str) == ++ 0) { ++ ret = set_db_params(priv->_db_conn, "wal_autocheckpoint", temp_str); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED, ++ "Failed to set %s", GFDB_SQL_PARAM_WAL_AUTOCHECK); ++ } ++ } ++ ++ /* CACHE_SIZE */ ++ if (dict_get_str(options, GFDB_SQL_PARAM_CACHE_SIZE, &temp_str) == 0) { ++ ret = set_db_params(priv->_db_conn, "cache_size", temp_str); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED, ++ "Failed to set %s", GFDB_SQL_PARAM_CACHE_SIZE); ++ } ++ } ++ } ++ ++ ret = 0; ++ ++out: ++ ++ return ret; ++} ++ ++/****************************init********************************************/ ++ ++int32_t ++init(xlator_t *this) ++{ ++ gf_ctr_private_t *priv = NULL; ++ int ret_db = -1; ++ ++ if (!this) { ++ gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: ctr this is not initialized"); ++ return -1; ++ } ++ ++ if (!this->children || this->children->next) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: ctr should have exactly one child"); ++ return -1; ++ } ++ ++ if (!this->parents) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DANGLING_VOLUME, ++ "dangling volume. 
check volfile "); ++ } ++ ++ priv = GF_CALLOC(1, sizeof(*priv), gf_ctr_mt_private_t); ++ if (!priv) { ++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED, ++ "Calloc did not work!!!"); ++ return -1; ++ } ++ ++ /*Default values for the translator*/ ++ priv->ctr_record_wind = _gf_true; ++ priv->ctr_record_unwind = _gf_false; ++ priv->ctr_hot_brick = _gf_false; ++ priv->gfdb_db_type = GFDB_SQLITE3; ++ priv->gfdb_sync_type = GFDB_DB_SYNC; ++ priv->_db_conn = NULL; ++ priv->ctr_lookupheal_link_timeout = CTR_DEFAULT_HARDLINK_EXP_PERIOD; ++ priv->ctr_lookupheal_inode_timeout = CTR_DEFAULT_INODE_EXP_PERIOD; ++ ++ /*Extract ctr xlator options*/ ++ ret_db = extract_ctr_options(this, priv); ++ if (ret_db) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED, ++ "Failed extracting ctr xlator options"); ++ GF_FREE(priv); ++ return -1; ++ } ++ ++ if (!priv->enabled) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED, ++ "CTR Xlator is not enabled so skip ctr init"); ++ goto out; ++ } ++ ++ ret_db = initialize_ctr_resource(this, priv); ++ if (ret_db) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR, ++ "FATAL: Failed ctr initialize resource"); ++ return -1; ++ } ++ ++out: ++ this->private = (void *)priv; ++ return 0; ++} ++ ++int ++notify(xlator_t *this, int event, void *data, ...) 
++{ ++ gf_ctr_private_t *priv = NULL; ++ int ret = 0; ++ ++ priv = this->private; ++ ++ if (!priv) ++ goto out; ++ ++ ret = default_notify(this, event, data); ++ ++out: ++ return ret; ++} ++ ++int32_t ++mem_acct_init(xlator_t *this) ++{ ++ int ret = -1; ++ ++ GF_VALIDATE_OR_GOTO("ctr", this, out); ++ ++ ret = xlator_mem_acct_init(this, gf_ctr_mt_end + 1); ++ ++ if (ret != 0) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_MEM_ACC_INIT_FAILED, ++ "Memory accounting init" ++ "failed"); ++ return ret; ++ } ++out: ++ return ret; ++} ++ ++void ++fini(xlator_t *this) ++{ ++ gf_ctr_private_t *priv = NULL; ++ ++ priv = this->private; ++ ++ if (priv && priv->enabled) { ++ if (fini_db(priv->_db_conn)) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED, ++ "Failed closing " ++ "db connection"); ++ } ++ ++ if (priv->_db_conn) ++ priv->_db_conn = NULL; ++ ++ GF_FREE(priv->ctr_db_path); ++ if (pthread_mutex_destroy(&priv->compact_lock)) { ++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED, ++ "Failed to " ++ "destroy the compaction mutex"); ++ } ++ } ++ GF_FREE(priv); ++ mem_pool_destroy(this->local_pool); ++ this->local_pool = NULL; ++ ++ return; ++} ++ ++struct xlator_fops fops = { ++ /*lookup*/ ++ .lookup = ctr_lookup, ++ /*write fops */ ++ .mknod = ctr_mknod, ++ .create = ctr_create, ++ .truncate = ctr_truncate, ++ .ftruncate = ctr_ftruncate, ++ .setxattr = ctr_setxattr, ++ .fsetxattr = ctr_fsetxattr, ++ .removexattr = ctr_removexattr, ++ .fremovexattr = ctr_fremovexattr, ++ .unlink = ctr_unlink, ++ .link = ctr_link, ++ .rename = ctr_rename, ++ .writev = ctr_writev, ++ .setattr = ctr_setattr, ++ .fsetattr = ctr_fsetattr, ++ /*read fops*/ ++ .readv = ctr_readv, ++ /* IPC call*/ ++ .ipc = ctr_ipc}; ++ ++struct xlator_cbks cbks = {.forget = ctr_forget}; ++ ++struct volume_options options[] = { ++ {.key = ++ { ++ "ctr-enabled", ++ }, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off", ++ .description = 
"Enables the CTR", ++ .flags = OPT_FLAG_SETTABLE}, ++ {.key = {"record-entry"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "on"}, ++ {.key = {"record-exit"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off"}, ++ {.key = {"record-counters"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off", ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .flags = OPT_FLAG_SETTABLE, ++ .tags = {}}, ++ {.key = {"ctr-record-metadata-heat"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {"ctr_link_consistency"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {"ctr_lookupheal_link_timeout"}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "300", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_2}, ++ .tags = {}}, ++ {.key = {"ctr_lookupheal_inode_timeout"}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "300", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_2}, ++ .tags = {}}, ++ {.key = {"hot-brick"}, ++ .type = GF_OPTION_TYPE_BOOL, ++ .value = {"on", "off"}, ++ .default_value = "off"}, ++ {.key = {"db-type"}, ++ .type = GF_OPTION_TYPE_STR, ++ .value = {"hashfile", "rocksdb", "changelog", "sqlite3", "hyperdex"}, ++ .default_value = "sqlite3", ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .flags = OPT_FLAG_SETTABLE, ++ .tags = {}}, ++ {.key = {"db-sync"}, ++ .type = GF_OPTION_TYPE_STR, ++ .value = {"sync", "async"}, ++ .default_value = "sync"}, ++ {.key = {"db-path"}, .type = GF_OPTION_TYPE_PATH}, ++ {.key = {"db-name"}, .type = GF_OPTION_TYPE_STR}, ++ {.key = {GFDB_SQL_PARAM_SYNC}, ++ .type = GF_OPTION_TYPE_STR, ++ .value = {"off", "normal", "full"}, ++ .default_value = 
"normal"}, ++ {.key = {GFDB_SQL_PARAM_JOURNAL_MODE}, ++ .type = GF_OPTION_TYPE_STR, ++ .value = {"delete", "truncate", "persist", "memory", "wal", "off"}, ++ .default_value = "wal", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {GFDB_SQL_PARAM_AUTO_VACUUM}, ++ .type = GF_OPTION_TYPE_STR, ++ .value = {"off", "full", "incr"}, ++ .default_value = "off", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {GFDB_SQL_PARAM_WAL_AUTOCHECK}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "25000", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {GFDB_SQL_PARAM_CACHE_SIZE}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "12500", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {GFDB_SQL_PARAM_PAGE_SIZE}, ++ .type = GF_OPTION_TYPE_INT, ++ .default_value = "4096", ++ .flags = OPT_FLAG_SETTABLE, ++ .op_version = {GD_OP_VERSION_3_7_0}, ++ .tags = {}}, ++ {.key = {NULL}}, ++}; ++ ++xlator_api_t xlator_api = { ++ .init = init, ++ .fini = fini, ++ .notify = notify, ++ .reconfigure = reconfigure, ++ .mem_acct_init = mem_acct_init, ++ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial version */ ++ .fops = &fops, ++ .cbks = &cbks, ++ .identifier = "changetimerecorder", ++ .category = GF_MAINTAINED, ++ .options = options, ++}; +diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.h b/xlators/features/changetimerecorder/src/changetimerecorder.h +new file mode 100644 +index 0000000..0150a1c +--- /dev/null ++++ b/xlators/features/changetimerecorder/src/changetimerecorder.h +@@ -0,0 +1,21 @@ ++/* ++ Copyright (c) 2006-2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. 
++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#ifndef __CTR_H ++#define __CTR_H ++ ++#include <glusterfs/glusterfs.h> ++#include <glusterfs/xlator.h> ++#include <glusterfs/logging.h> ++#include <glusterfs/common-utils.h> ++#include "ctr_mem_types.h" ++#include "ctr-helper.h" ++ ++#endif /* __CTR_H */ +diff --git a/xlators/features/changetimerecorder/src/ctr-helper.c b/xlators/features/changetimerecorder/src/ctr-helper.c +new file mode 100644 +index 0000000..e1e6573 +--- /dev/null ++++ b/xlators/features/changetimerecorder/src/ctr-helper.c +@@ -0,0 +1,293 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#include "gfdb_sqlite3.h" ++#include "ctr-helper.h" ++#include "ctr-messages.h" ++ ++/******************************************************************************* ++ * ++ * Fill unwind into db record ++ * ++ ******************************************************************************/ ++int ++fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local, ++ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path) ++{ ++ int ret = -1; ++ gfdb_time_t *ctr_uwtime = NULL; ++ gf_ctr_private_t *_priv = NULL; ++ ++ GF_ASSERT(this); ++ _priv = this->private; ++ GF_ASSERT(_priv); ++ ++ GF_ASSERT(ctr_local); ++ ++ /*If not unwind path error*/ ++ if (!isunwindpath(fop_path)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH, ++ "Wrong fop_path. 
Should be unwind"); ++ goto out; ++ } ++ ++ ctr_uwtime = &CTR_DB_REC(ctr_local).gfdb_unwind_change_time; ++ CTR_DB_REC(ctr_local).gfdb_fop_path = fop_path; ++ CTR_DB_REC(ctr_local).gfdb_fop_type = fop_type; ++ ++ ret = gettimeofday(ctr_uwtime, NULL); ++ if (ret == -1) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, ++ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR, ++ "Error " ++ "filling unwind time record %s", ++ strerror(errno)); ++ goto out; ++ } ++ ++ /* Special case i.e if its a tier rebalance ++ * + cold tier brick ++ * + its a create/mknod FOP ++ * we record unwind time as zero */ ++ if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG && ++ (!_priv->ctr_hot_brick) && isdentrycreatefop(fop_type)) { ++ memset(ctr_uwtime, 0, sizeof(*ctr_uwtime)); ++ } ++ ret = 0; ++out: ++ return ret; ++} ++ ++/******************************************************************************* ++ * ++ * Fill wind into db record ++ * ++ ******************************************************************************/ ++int ++fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local, ++ gf_ctr_inode_context_t *ctr_inode_cx) ++{ ++ int ret = -1; ++ gfdb_time_t *ctr_wtime = NULL; ++ gf_ctr_private_t *_priv = NULL; ++ ++ GF_ASSERT(this); ++ _priv = this->private; ++ GF_ASSERT(_priv); ++ GF_ASSERT(ctr_local); ++ IS_CTR_INODE_CX_SANE(ctr_inode_cx); ++ ++ /*if not wind path error!*/ ++ if (!iswindpath(ctr_inode_cx->fop_path)) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH, ++ "Wrong fop_path. 
Should be wind"); ++ goto out; ++ } ++ ++ ctr_wtime = &CTR_DB_REC(ctr_local).gfdb_wind_change_time; ++ CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path; ++ CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type; ++ CTR_DB_REC(ctr_local).link_consistency = _priv->ctr_link_consistency; ++ ++ ret = gettimeofday(ctr_wtime, NULL); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, errno, ++ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR, ++ "Error filling wind time record %s", strerror(errno)); ++ goto out; ++ } ++ ++ /* Special case i.e if its a tier rebalance ++ * + cold tier brick ++ * + its a create/mknod FOP ++ * we record wind time as zero */ ++ if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG && ++ (!_priv->ctr_hot_brick) && isdentrycreatefop(ctr_inode_cx->fop_type)) { ++ memset(ctr_wtime, 0, sizeof(*ctr_wtime)); ++ } ++ ++ /* Copy gfid into db record */ ++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid)); ++ ++ /* Copy older gfid if any */ ++ if (ctr_inode_cx->old_gfid && ++ (!gf_uuid_is_null(*(ctr_inode_cx->old_gfid)))) { ++ gf_uuid_copy(CTR_DB_REC(ctr_local).old_gfid, *(ctr_inode_cx->old_gfid)); ++ } ++ ++ /*Hard Links: names are copied bounded, the record buffers hold GF_NAME_MAX + 1 bytes*/ ++ if (isdentryfop(ctr_inode_cx->fop_type)) { ++ /*new link fop*/ ++ if (NEW_LINK_CX(ctr_inode_cx)) { ++ gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid, ++ *((NEW_LINK_CX(ctr_inode_cx))->pargfid)); ++ snprintf(CTR_DB_REC(ctr_local).file_name, GF_NAME_MAX + 1, "%s", ++ NEW_LINK_CX(ctr_inode_cx)->basename); ++ } ++ /*rename fop*/ ++ if (OLD_LINK_CX(ctr_inode_cx)) { ++ gf_uuid_copy(CTR_DB_REC(ctr_local).old_pargfid, ++ *((OLD_LINK_CX(ctr_inode_cx))->pargfid)); ++ snprintf(CTR_DB_REC(ctr_local).old_file_name, GF_NAME_MAX + 1, "%s", ++ OLD_LINK_CX(ctr_inode_cx)->basename); ++ } ++ } ++ ++ ret = 0; ++out: ++ /*On error roll back and clean the record*/ ++ if (ret == -1) { ++ CLEAR_CTR_DB_RECORD(ctr_local); ++ } ++ return ret; ++} ++ 
++/****************************************************************************** ++ * ++ * CTR xlator init related functions ++ * ++ * ++ * 
****************************************************************************/ ++static int ++extract_sql_params(xlator_t *this, dict_t *params_dict) ++{ ++ int ret = -1; ++ char *db_path = NULL; ++ char *db_name = NULL; ++ char *db_full_path = NULL; ++ ++ GF_ASSERT(this); ++ GF_ASSERT(params_dict); ++ ++ /*Extract the path of the db*/ ++ db_path = NULL; ++ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-path", ++ db_path, "/var/run/gluster/"); ++ ++ /*Extract the name of the db*/ ++ db_name = NULL; ++ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-name", ++ db_name, "gf_ctr_db.db"); ++ ++ /*Construct full path of the db*/ ++ ret = gf_asprintf(&db_full_path, "%s/%s", db_path, db_name); ++ if (ret < 0) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0, ++ CTR_MSG_CONSTRUCT_DB_PATH_FAILED, ++ "Construction of full db path failed!"); ++ goto out; ++ } ++ ++ /*Setting the SQL DB Path*/ ++ SET_DB_PARAM_TO_DICT(this->name, params_dict, GFDB_SQL_PARAM_DBPATH, ++ db_full_path, ret, out); ++ ++ /*Extract rest of the sql params*/ ++ ret = gfdb_set_sql_params(this->name, this->options, params_dict); ++ if (ret) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0, ++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED, ++ "Failed setting values to sql param dict!"); ++ } ++ ++ ret = 0; ++ ++out: ++ if (ret) ++ GF_FREE(db_full_path); ++ return ret; ++} ++ ++int ++extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type) ++{ ++ int ret = -1; ++ ++ GF_ASSERT(this); ++ GF_ASSERT(params_dict); ++ ++ switch (db_type) { ++ case GFDB_SQLITE3: ++ ret = extract_sql_params(this, params_dict); ++ if (ret) ++ goto out; ++ break; ++ case GFDB_ROCKS_DB: ++ case GFDB_HYPERDEX: ++ case GFDB_HASH_FILE_STORE: ++ case GFDB_INVALID_DB: ++ case GFDB_DB_END: ++ goto out; ++ } ++ ret = 0; ++out: ++ return ret; ++} ++ ++int ++extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv) ++{ ++ int ret = -1; ++ char *_val_str = NULL; ++ ++ GF_ASSERT(this); ++ GF_ASSERT(_priv); ++ 
++ /*Checking if the CTR Translator is enabled. By default its disabled*/ ++ _priv->enabled = _gf_false; ++ GF_OPTION_INIT("ctr-enabled", _priv->enabled, bool, out); ++ if (!_priv->enabled) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED, ++ "CTR Xlator is disabled."); ++ ret = 0; ++ goto out; ++ } ++ ++ /*Extract db type*/ ++ GF_OPTION_INIT("db-type", _val_str, str, out); ++ _priv->gfdb_db_type = gf_string2gfdbdbtype(_val_str); ++ ++ /*Extract flag for record on wind*/ ++ GF_OPTION_INIT("record-entry", _priv->ctr_record_wind, bool, out); ++ ++ /*Extract flag for record on unwind*/ ++ GF_OPTION_INIT("record-exit", _priv->ctr_record_unwind, bool, out); ++ ++ /*Extract flag for record on counters*/ ++ GF_OPTION_INIT("record-counters", _priv->ctr_record_counter, bool, out); ++ ++ /* Extract flag for record metadata heat */ ++ GF_OPTION_INIT("ctr-record-metadata-heat", _priv->ctr_record_metadata_heat, ++ bool, out); ++ ++ /*Extract flag for link consistency*/ ++ GF_OPTION_INIT("ctr_link_consistency", _priv->ctr_link_consistency, bool, ++ out); ++ ++ /*Extract ctr_lookupheal_inode_timeout */ ++ GF_OPTION_INIT("ctr_lookupheal_inode_timeout", ++ _priv->ctr_lookupheal_inode_timeout, uint64, out); ++ ++ /*Extract ctr_lookupheal_link_timeout*/ ++ GF_OPTION_INIT("ctr_lookupheal_link_timeout", ++ _priv->ctr_lookupheal_link_timeout, uint64, out); ++ ++ /*Extract flag for hot tier brick*/ ++ GF_OPTION_INIT("hot-brick", _priv->ctr_hot_brick, bool, out); ++ ++ /*Extract flag for sync mode*/ ++ GF_OPTION_INIT("db-sync", _val_str, str, out); ++ _priv->gfdb_sync_type = gf_string2gfdbdbsync(_val_str); ++ ++ ret = 0; ++ ++out: ++ return ret; ++} +diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h +new file mode 100644 +index 0000000..517fbb0 +--- /dev/null ++++ b/xlators/features/changetimerecorder/src/ctr-helper.h +@@ -0,0 +1,854 @@ ++/* ++ Copyright (c) 2015 Red Hat, Inc. 
<http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#ifndef __CTR_HELPER_H ++#define __CTR_HELPER_H ++ ++#include <glusterfs/xlator.h> ++#include "ctr_mem_types.h" ++#include <glusterfs/iatt.h> ++#include <glusterfs/glusterfs.h> ++#include <glusterfs/xlator.h> ++#include <glusterfs/defaults.h> ++#include <glusterfs/logging.h> ++#include <glusterfs/common-utils.h> ++#include <time.h> ++#include <sys/time.h> ++#include <pthread.h> ++ ++#include "gfdb_data_store.h" ++#include "ctr-xlator-ctx.h" ++#include "ctr-messages.h" ++ ++#define CTR_DEFAULT_HARDLINK_EXP_PERIOD 300 /* Five mins */ ++#define CTR_DEFAULT_INODE_EXP_PERIOD 300 /* Five mins */ ++ ++typedef struct ctr_query_cbk_args { ++ int query_fd; ++ int count; ++} ctr_query_cbk_args_t; ++ ++/*CTR Xlator Private structure*/ ++typedef struct gf_ctr_private { ++ gf_boolean_t enabled; ++ char *ctr_db_path; ++ gf_boolean_t ctr_hot_brick; ++ gf_boolean_t ctr_record_wind; ++ gf_boolean_t ctr_record_unwind; ++ gf_boolean_t ctr_record_counter; ++ gf_boolean_t ctr_record_metadata_heat; ++ gf_boolean_t ctr_link_consistency; ++ gfdb_db_type_t gfdb_db_type; ++ gfdb_sync_type_t gfdb_sync_type; ++ gfdb_conn_node_t *_db_conn; ++ uint64_t ctr_lookupheal_link_timeout; ++ uint64_t ctr_lookupheal_inode_timeout; ++ gf_boolean_t compact_active; ++ gf_boolean_t compact_mode_switched; ++ pthread_mutex_t compact_lock; ++} gf_ctr_private_t; ++ ++/* ++ * gf_ctr_local_t is the ctr xlator local data structure that is stored in ++ * the call_frame of each FOP. ++ * ++ * gfdb_db_record: The gf_ctr_local contains a gfdb_db_record object, which is ++ * used by the insert_record() api from the libgfdb. 
The gfdb_db_record object ++ * will contain all the inode and hardlink(only for dentry fops: create, ++ * mknod,link, unlink, rename).The ctr_local is keep alive till the unwind ++ * call and will be release during the unwind. The same gfdb_db_record will ++ * used for the unwind insert_record() api, to record unwind in the database. ++ * ++ * ia_inode_type in gf_ctr_local will tell the type of the inode. This is ++ * important for during the unwind path. As we will not have the inode during ++ * the unwind path. We would have include this in the gfdb_db_record itself ++ * but currently we record only file inode information. ++ * ++ * is_internal_fop in gf_ctr_local will tell us if this is a internal fop and ++ * take special/no action. We don't record change/access times or increement ++ * heat counter for internal fops from rebalancer. ++ * */ ++typedef struct gf_ctr_local { ++ gfdb_db_record_t gfdb_db_record; ++ ia_type_t ia_inode_type; ++ gf_boolean_t is_internal_fop; ++ gf_special_pid_t client_pid; ++} gf_ctr_local_t; ++/* ++ * Easy access of gfdb_db_record of ctr_local ++ * */ ++#define CTR_DB_REC(ctr_local) (ctr_local->gfdb_db_record) ++ ++/*Clear db record*/ ++#define CLEAR_CTR_DB_RECORD(ctr_local) \ ++ do { \ ++ ctr_local->gfdb_db_record.gfdb_fop_path = GFDB_FOP_INVALID; \ ++ memset(&(ctr_local->gfdb_db_record.gfdb_wind_change_time), 0, \ ++ sizeof(gfdb_time_t)); \ ++ memset(&(ctr_local->gfdb_db_record.gfdb_unwind_change_time), 0, \ ++ sizeof(gfdb_time_t)); \ ++ gf_uuid_clear(ctr_local->gfdb_db_record.gfid); \ ++ gf_uuid_clear(ctr_local->gfdb_db_record.pargfid); \ ++ memset(ctr_local->gfdb_db_record.file_name, 0, GF_NAME_MAX + 1); \ ++ memset(ctr_local->gfdb_db_record.old_file_name, 0, GF_NAME_MAX + 1); \ ++ ctr_local->gfdb_db_record.gfdb_fop_type = GFDB_FOP_INVALID_OP; \ ++ ctr_local->ia_inode_type = IA_INVAL; \ ++ } while (0) ++ ++static gf_ctr_local_t * ++init_ctr_local_t(xlator_t *this) ++{ ++ gf_ctr_local_t *ctr_local = NULL; ++ ++ GF_ASSERT(this); ++ 
++ ctr_local = mem_get0(this->local_pool); ++ if (!ctr_local) { ++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0, ++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND, ++ "Error while creating ctr local"); ++ goto out; ++ } ++ ++ CLEAR_CTR_DB_RECORD(ctr_local); ++out: ++ return ctr_local; ++} ++ ++static void ++free_ctr_local(gf_ctr_local_t *ctr_local) ++{ ++ if (ctr_local) ++ mem_put(ctr_local); ++} ++ ++/****************************************************************************** ++ * ++ * ++ * Context Carrier Structures ++ * ++ * ++ * ****************************************************************************/ ++ ++/* ++ * Context Carrier structures are used to carry relevant information about ++ * inodes and links from the fops calls to the ctr_insert_wind. ++ * These structure just have pointers to the original data and donot ++ * do a deep copy of any data. This info is deep copied to ++ * ctr_local->gfdb_db_record and passed to insert_record() api of libgfdb. This ++ * info remains persistent for the unwind in ctr_local->gfdb_db_record ++ * and once used will be destroyed. 
++ * ++ * gf_ctr_link_context_t : Context structure for hard links ++ * gf_ctr_inode_context_t : Context structure for inodes ++ * ++ * */ ++ ++/*Context Carrier Structure for hard links*/ ++typedef struct gf_ctr_link_context { ++ uuid_t *pargfid; ++ const char *basename; ++} gf_ctr_link_context_t; ++ ++/*Context Carrier Structure for inodes*/ ++typedef struct gf_ctr_inode_context { ++ ia_type_t ia_type; ++ uuid_t *gfid; ++ uuid_t *old_gfid; ++ gf_ctr_link_context_t *new_link_cx; ++ gf_ctr_link_context_t *old_link_cx; ++ gfdb_fop_type_t fop_type; ++ gfdb_fop_path_t fop_path; ++ gf_boolean_t is_internal_fop; ++ /* Indicating metadata fops */ ++ gf_boolean_t is_metadata_fop; ++} gf_ctr_inode_context_t; ++ ++/*******************Util Macros for Context Carrier Structures*****************/ ++ ++/*Checks if ctr_link_cx is sane!*/ ++#define IS_CTR_LINK_CX_SANE(ctr_link_cx) \ ++ do { \ ++ if (ctr_link_cx) { \ ++ if (ctr_link_cx->pargfid) \ ++ GF_ASSERT(*(ctr_link_cx->pargfid)); \ ++ GF_ASSERT(ctr_link_cx->basename); \ ++ }; \ ++ } while (0) ++ ++/*Clear and fill the ctr_link_context with values*/ ++#define FILL_CTR_LINK_CX(ctr_link_cx, _pargfid, _basename, label) \ ++ do { \ ++ GF_VALIDATE_OR_GOTO("ctr", ctr_link_cx, label); \ ++ GF_VALIDATE_OR_GOTO("ctr", _pargfid, label); \ ++ GF_VALIDATE_OR_GOTO("ctr", _basename, label); \ ++ memset(ctr_link_cx, 0, sizeof(*ctr_link_cx)); \ ++ ctr_link_cx->pargfid = &_pargfid; \ ++ ctr_link_cx->basename = _basename; \ ++ } while (0) ++ ++#define NEW_LINK_CX(ctr_inode_cx) ctr_inode_cx->new_link_cx ++ ++#define OLD_LINK_CX(ctr_inode_cx) ctr_inode_cx->old_link_cx ++ ++/*Checks if ctr_inode_cx is sane!*/ ++#define IS_CTR_INODE_CX_SANE(ctr_inode_cx) \ ++ do { \ ++ GF_ASSERT(ctr_inode_cx); \ ++ GF_ASSERT(ctr_inode_cx->gfid); \ ++ GF_ASSERT(*(ctr_inode_cx->gfid)); \ ++ GF_ASSERT(ctr_inode_cx->fop_type != GFDB_FOP_INVALID_OP); \ ++ GF_ASSERT(ctr_inode_cx->fop_path != GFDB_FOP_INVALID); \ ++ IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \ ++ 
IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \ ++ } while (0) ++ ++/*Clear and fill the ctr_inode_context with values*/ ++#define FILL_CTR_INODE_CONTEXT(ctr_inode_cx, _ia_type, _gfid, _new_link_cx, \ ++ _old_link_cx, _fop_type, _fop_path) \ ++ do { \ ++ GF_ASSERT(ctr_inode_cx); \ ++ GF_ASSERT(_gfid); \ ++ GF_ASSERT(_fop_type != GFDB_FOP_INVALID_OP); \ ++ GF_ASSERT(_fop_path != GFDB_FOP_INVALID); \ ++ memset(ctr_inode_cx, 0, sizeof(*ctr_inode_cx)); \ ++ ctr_inode_cx->ia_type = _ia_type; \ ++ ctr_inode_cx->gfid = &_gfid; \ ++ IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \ ++ if (_new_link_cx) \ ++ NEW_LINK_CX(ctr_inode_cx) = _new_link_cx; \ ++ IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \ ++ if (_old_link_cx) \ ++ OLD_LINK_CX(ctr_inode_cx) = _old_link_cx; \ ++ ctr_inode_cx->fop_type = _fop_type; \ ++ ctr_inode_cx->fop_path = _fop_path; \ ++ } while (0) ++ ++/****************************************************************************** ++ * ++ * Util functions or macros used by ++ * insert wind and insert unwind ++ * ++ * ****************************************************************************/ ++/* Free ctr frame local */ ++static inline void ++ctr_free_frame_local(call_frame_t *frame) ++{ ++ if (frame) { ++ free_ctr_local((gf_ctr_local_t *)frame->local); ++ frame->local = NULL; ++ } ++} ++ ++/* Setting GF_REQUEST_LINK_COUNT_XDATA in dict ++ * that has to be sent to POSIX Xlator to send ++ * link count in unwind path. ++ * return 0 for success with not creation of dict ++ * return 1 for success with creation of dict ++ * return -1 for failure. 
++ * */ ++static inline int ++set_posix_link_request(xlator_t *this, dict_t **xdata) ++{ ++ int ret = -1; ++ gf_boolean_t is_created = _gf_false; ++ ++ GF_VALIDATE_OR_GOTO("ctr", this, out); ++ GF_VALIDATE_OR_GOTO(this->name, xdata, out); ++ ++ /*create xdata if NULL*/ ++ if (!*xdata) { ++ *xdata = dict_new(); ++ is_created = _gf_true; ++ ret = 1; ++ } else { ++ ret = 0; ++ } ++ ++ if (!*xdata) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL, ++ "xdata is NULL :Cannot send " ++ "GF_REQUEST_LINK_COUNT_XDATA to posix"); ++ ret = -1; ++ goto out; ++ } ++ ++ ret = dict_set_int32(*xdata, GF_REQUEST_LINK_COUNT_XDATA, 1); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED, ++ "Failed setting GF_REQUEST_LINK_COUNT_XDATA"); ++ ret = -1; ++ goto out; ++ } ++ ret = 0; ++out: ++ if (ret == -1) { ++ if (*xdata && is_created) { ++ dict_unref(*xdata); ++ } ++ } ++ return ret; ++} ++ ++/* ++ * If a bitrot fop ++ * */ ++#define BITROT_FOP(frame) \ ++ (frame->root->pid == GF_CLIENT_PID_BITD || \ ++ frame->root->pid == GF_CLIENT_PID_SCRUB) ++ ++/* ++ * If a rebalancer fop ++ * */ ++#define REBALANCE_FOP(frame) (frame->root->pid == GF_CLIENT_PID_DEFRAG) ++ ++/* ++ * If its a tiering rebalancer fop ++ * */ ++#define TIER_REBALANCE_FOP(frame) \ ++ (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG) ++ ++/* ++ * If its a AFR SELF HEAL ++ * */ ++#define AFR_SELF_HEAL_FOP(frame) (frame->root->pid == GF_CLIENT_PID_SELF_HEALD) ++ ++/* ++ * if a rebalancer fop goto ++ * */ ++#define CTR_IF_REBALANCE_FOP_THEN_GOTO(frame, label) \ ++ do { \ ++ if (REBALANCE_FOP(frame)) \ ++ goto label; \ ++ } while (0) ++ ++/* ++ * Internal fop ++ * ++ * */ ++static inline gf_boolean_t ++is_internal_fop(call_frame_t *frame, dict_t *xdata) ++{ ++ gf_boolean_t ret = _gf_false; ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ ++ if (AFR_SELF_HEAL_FOP(frame)) { ++ ret = _gf_true; ++ } ++ if (BITROT_FOP(frame)) { ++ ret = _gf_true; ++ } ++ if 
(REBALANCE_FOP(frame) || TIER_REBALANCE_FOP(frame)) { ++ ret = _gf_true; ++ if (xdata && dict_get(xdata, CTR_ATTACH_TIER_LOOKUP)) { ++ ret = _gf_false; ++ } ++ } ++ if (xdata && dict_get(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) { ++ ret = _gf_true; ++ } ++ ++ return ret; ++} ++ ++#define CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, dict, label) \ ++ do { \ ++ if (is_internal_fop(frame, dict)) \ ++ goto label; \ ++ } while (0) ++ ++/* if fop has failed exit */ ++#define CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, label) \ ++ do { \ ++ if (op_ret == -1) { \ ++ gf_msg_trace(this->name, 0, "Failed fop with %s", \ ++ strerror(op_errno)); \ ++ goto label; \ ++ }; \ ++ } while (0) ++ ++/* ++ * IS CTR Xlator is disabled then goto to label ++ * */ ++#define CTR_IS_DISABLED_THEN_GOTO(this, label) \ ++ do { \ ++ gf_ctr_private_t *_priv = NULL; \ ++ GF_ASSERT(this); \ ++ GF_ASSERT(this->private); \ ++ _priv = this->private; \ ++ if (!_priv->_db_conn) \ ++ goto label; \ ++ } while (0) ++ ++/* ++ * IS CTR record metadata heat is disabled then goto to label ++ * */ ++#define CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, label) \ ++ do { \ ++ gf_ctr_private_t *_priv = NULL; \ ++ GF_ASSERT(this); \ ++ GF_ASSERT(this->private); \ ++ _priv = this->private; \ ++ if (!_priv->ctr_record_metadata_heat) \ ++ goto label; \ ++ } while (0) ++ ++int ++fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local, ++ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path); ++ ++int ++fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local, ++ gf_ctr_inode_context_t *ctr_inode_cx); ++ ++/******************************************************************************* ++ * CTR INSERT WIND ++ * ***************************************************************************** ++ * Function used to insert/update record into the database during a wind fop ++ * This function creates ctr_local structure into the frame of the fop ++ * call. 
++ * ****************************************************************************/ ++ ++static inline int ++ctr_insert_wind(call_frame_t *frame, xlator_t *this, ++ gf_ctr_inode_context_t *ctr_inode_cx) ++{ ++ int ret = -1; ++ gf_ctr_private_t *_priv = NULL; ++ gf_ctr_local_t *ctr_local = NULL; ++ ++ GF_ASSERT(frame); ++ GF_ASSERT(frame->root); ++ GF_ASSERT(this); ++ IS_CTR_INODE_CX_SANE(ctr_inode_cx); ++ ++ _priv = this->private; ++ GF_ASSERT(_priv); ++ ++ GF_ASSERT(_priv->_db_conn); ++ ++ /*If record_wind option of CTR is on record wind for ++ * regular files only*/ ++ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) { ++ frame->local = init_ctr_local_t(this); ++ if (!frame->local) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND, ++ "WIND: Error while creating ctr local"); ++ goto out; ++ }; ++ ctr_local = frame->local; ++ ctr_local->client_pid = frame->root->pid; ++ ctr_local->is_internal_fop = ctr_inode_cx->is_internal_fop; ++ ++ /* Decide whether to record counters or not */ ++ CTR_DB_REC(ctr_local).do_record_counters = _gf_false; ++ /* If record counter is enabled */ ++ if (_priv->ctr_record_counter) { ++ /* If not a internal fop */ ++ if (!(ctr_local->is_internal_fop)) { ++ /* If its a metadata fop AND ++ * record metadata heat ++ * OR ++ * its NOT a metadata fop */ ++ if ((ctr_inode_cx->is_metadata_fop && ++ _priv->ctr_record_metadata_heat) || ++ (!ctr_inode_cx->is_metadata_fop)) { ++ CTR_DB_REC(ctr_local).do_record_counters = _gf_true; ++ } ++ } ++ } ++ ++ /* Decide whether to record times or not ++ * For non internal FOPS record times as usual*/ ++ CTR_DB_REC(ctr_local).do_record_times = _gf_false; ++ if (!ctr_local->is_internal_fop) { ++ /* If its a metadata fop AND ++ * record metadata heat ++ * OR ++ * its NOT a metadata fop */ ++ if ((ctr_inode_cx->is_metadata_fop && ++ _priv->ctr_record_metadata_heat) || ++ (!ctr_inode_cx->is_metadata_fop)) { ++ CTR_DB_REC(ctr_local).do_record_times = ++ 
(_priv->ctr_record_wind || _priv->ctr_record_unwind); ++ } ++ } ++ /* when its a internal FOPS*/ ++ else { ++ /* Record times only for create ++ * i.e when the inode is created */ ++ CTR_DB_REC(ctr_local).do_record_times = (isdentrycreatefop( ++ ctr_inode_cx->fop_type)) ++ ? _gf_true ++ : _gf_false; ++ } ++ ++ /*Fill the db record for insertion*/ ++ ret = fill_db_record_for_wind(this, ctr_local, ctr_inode_cx); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND, ++ "WIND: Error filling ctr local"); ++ goto out; ++ } ++ ++ /*Insert the db record*/ ++ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record); ++ if (ret) { ++ gf_msg(this->name, GF_LOG_ERROR, 0, ++ CTR_MSG_INSERT_RECORD_WIND_FAILED, ++ "WIND: Inserting of record failed!"); ++ goto out; ++ } ++ } ++ ret = 0; ++out: ++ ++ if (ret) { ++ free_ctr_local(ctr_local); ++ frame->local = NULL; ++ } ++ ++ return ret; ++} ++ ++/******************************************************************************* ++ * CTR INSERT UNWIND ++ * ***************************************************************************** ++ * Function used to insert/update record into the database during a unwind fop ++ * This function destroys ctr_local structure into the frame of the fop ++ * call at the end. 
++ * ****************************************************************************/
++/* Record the unwind (response) path of a FOP in the database.
++ *
++ * A record is written only when frame->local is set, the inode is not a
++ * directory, and either unwind recording is enabled or the FOP is a dentry
++ * FOP. Returns 0 on success (including "nothing to record"), -1 on failure. */
++static inline int
++ctr_insert_unwind(call_frame_t *frame, xlator_t *this, gfdb_fop_type_t fop_type,
++                  gfdb_fop_path_t fop_path)
++{
++    int ret = -1;
++    gf_ctr_private_t *_priv = NULL;
++    gf_ctr_local_t *ctr_local = NULL;
++
++    GF_ASSERT(frame);
++    GF_ASSERT(this);
++
++    _priv = this->private;
++    GF_ASSERT(_priv);
++
++    GF_ASSERT(_priv->_db_conn);
++
++    ctr_local = frame->local;
++
++    if (ctr_local && (_priv->ctr_record_unwind || isdentryfop(fop_type)) &&
++        (ctr_local->ia_inode_type != IA_IFDIR)) {
++        CTR_DB_REC(ctr_local).do_record_uwind_time = _priv->ctr_record_unwind;
++
++        ret = fill_db_record_for_unwind(this, ctr_local, fop_type, fop_path);
++        if (ret == -1) {
++            gf_msg(this->name, GF_LOG_ERROR, 0,
++                   CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
++                   "UNWIND: Error filling ctr local");
++            goto out;
++        }
++
++        ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
++        if (ret == -1) {
++            /* Distinct text so an insert failure is not mistaken for a
++             * fill failure in the logs. */
++            gf_msg(this->name, GF_LOG_ERROR, 0,
++                   CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
++                   "UNWIND: Inserting of record failed");
++            goto out;
++        }
++    }
++    ret = 0;
++out:
++    return ret;
++}
++
++/******************************************************************************
++ *                          Delete file/flink record/s from db
++ * ****************************************************************************/
++/* Build a minimal db record (gfid, parent gfid, basename) and hand it to
++ * insert_record() with an "undelete" fop path so the db layer removes the
++ * hard link entry.
++ *
++ * fop_type must be GFDB_FOP_DENTRY_WRITE and fop_path must be GFDB_FOP_UNDEL
++ * or GFDB_FOP_UNDEL_ALL. Returns 0 on success; non-zero on validation
++ * failure, basename truncation, or db failure. */
++static inline int
++ctr_delete_hard_link_from_db(xlator_t *this, uuid_t gfid, uuid_t pargfid,
++                             char *basename, gfdb_fop_type_t fop_type,
++                             gfdb_fop_path_t fop_path)
++{
++    int ret = -1;
++    gfdb_db_record_t gfdb_db_record;
++    gf_ctr_private_t *_priv = NULL;
++
++    _priv = this->private;
++    GF_VALIDATE_OR_GOTO(this->name, _priv, out);
++    GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(gfid)), out);
++    GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(pargfid)), out);
++    GF_VALIDATE_OR_GOTO(this->name, (fop_type == GFDB_FOP_DENTRY_WRITE), out);
++    /* Both alternatives must be compared against fop_path; a bare constant
++     * on the right of || would be tested for truthiness instead. */
++    GF_VALIDATE_OR_GOTO(this->name, (fop_path == GFDB_FOP_UNDEL ||
++                                     fop_path == GFDB_FOP_UNDEL_ALL), out);
++
++    /* Set gfdb_db_record to 0 */
++    memset(&gfdb_db_record, 0, sizeof(gfdb_db_record));
++
++    /* Copy basename; refuse names that would be truncated */
++    if (snprintf(gfdb_db_record.file_name, GF_NAME_MAX, "%s", basename) >=
++        GF_NAME_MAX)
++        goto out;
++
++    /* Copy gfid into db record */
++    gf_uuid_copy(gfdb_db_record.gfid, gfid);
++
++    /* Copy pargfid into db record */
++    gf_uuid_copy(gfdb_db_record.pargfid, pargfid);
++
++    gfdb_db_record.gfdb_fop_path = fop_path;
++    gfdb_db_record.gfdb_fop_type = fop_type;
++
++    /* Send delete request to db */
++    ret = insert_record(_priv->_db_conn, &gfdb_db_record);
++    if (ret) {
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RECORD_WIND_FAILED,
++               "Failed to delete record. %s", basename);
++        goto out;
++    }
++
++    ret = 0;
++out:
++    return ret;
++}
++
++/******************************* Hard link function ***************************/
++
++/* True when at least ctr_lookupheal_inode_timeout seconds have elapsed since
++ * the inode's last recorded heal time. */
++static inline gf_boolean_t
++__is_inode_expired(ctr_xlator_ctx_t *ctr_xlator_ctx, gf_ctr_private_t *_priv,
++                   gfdb_time_t *current_time)
++{
++    gf_boolean_t ret = _gf_false;
++    uint64_t time_diff = 0;
++
++    GF_ASSERT(ctr_xlator_ctx);
++    GF_ASSERT(_priv);
++    GF_ASSERT(current_time);
++
++    time_diff = current_time->tv_sec - ctr_xlator_ctx->inode_heal_period;
++
++    ret = (time_diff >= _priv->ctr_lookupheal_inode_timeout) ? _gf_true
++                                                             : _gf_false;
++    return ret;
++}
++
++/* True when at least ctr_lookupheal_link_timeout seconds have elapsed since
++ * the hard link's last recorded heal time. */
++static inline gf_boolean_t
++__is_hardlink_expired(ctr_hard_link_t *ctr_hard_link, gf_ctr_private_t *_priv,
++                      gfdb_time_t *current_time)
++{
++    gf_boolean_t ret = _gf_false;
++    uint64_t time_diff = 0;
++
++    GF_ASSERT(ctr_hard_link);
++    GF_ASSERT(_priv);
++    GF_ASSERT(current_time);
++
++    time_diff = current_time->tv_sec - ctr_hard_link->hardlink_heal_period;
++
++    ret = (time_diff >= _priv->ctr_lookupheal_link_timeout) ? _gf_true
++                                                            : _gf_false;
++
++    return ret;
++}
++
++/* Return values of heal. The two TRY_*_HEAL values are OR-ed together by
++ * add_hard_link_ctx(), so test them with '&'. */
++typedef enum ctr_heal_ret_val {
++    CTR_CTX_ERROR = -1,
++    /* No healing required */
++    CTR_TRY_NO_HEAL = 0,
++    /* Try healing hard link */
++    CTR_TRY_HARDLINK_HEAL = 1,
++    /* Try healing inode */
++    CTR_TRY_INODE_HEAL = 2,
++} ctr_heal_ret_val_t;
++
++/**
++ * @brief Function to add hard link to the inode context variable.
++ *        The inode context maintains an in-memory list, which is used for
++ *        smart healing of the database.
++ *        If the hard link is already in the list, its heal timers (and the
++ *        inode's) are checked and, when expired, reset to the current time.
++ * @param frame of the FOP
++ * @param this is the Xlator instance
++ * @param inode
++ * @return Return ctr_heal_ret_val_t
++ */
++
++static inline ctr_heal_ret_val_t
++add_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++    ctr_heal_ret_val_t ret_val = CTR_TRY_NO_HEAL;
++    int ret = -1;
++    gf_ctr_local_t *ctr_local = NULL;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++    ctr_hard_link_t *ctr_hard_link = NULL;
++    gf_ctr_private_t *_priv = NULL;
++    gfdb_time_t current_time = {0};
++
++    GF_ASSERT(frame);
++    GF_ASSERT(this);
++    GF_ASSERT(inode);
++    GF_ASSERT(this->private);
++
++    _priv = this->private;
++
++    ctr_local = frame->local;
++    if (!ctr_local) {
++        goto out;
++    }
++
++    ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
++    if (!ctr_xlator_ctx) {
++        gf_msg(this->name, GF_LOG_ERROR, 0,
++               CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
++               "Failed accessing ctr inode context");
++        goto out;
++    }
++
++    LOCK(&ctr_xlator_ctx->lock);
++
++    /* Check if the hard link already exists
++     * in the ctr inode context */
++    ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx,
++                                             CTR_DB_REC(ctr_local).pargfid,
++                                             CTR_DB_REC(ctr_local).file_name);
++    /* If it is there, only evaluate the heal timers */
++    if (ctr_hard_link) {
++        ret = gettimeofday(&current_time, NULL);
++        if (ret == -1) {
++            gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++            ret_val = CTR_CTX_ERROR;
++            goto unlock;
++        }
++
++        if (__is_hardlink_expired(ctr_hard_link, _priv, &current_time)) {
++            ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++            ret_val = ret_val | CTR_TRY_HARDLINK_HEAL;
++        }
++
++        if (__is_inode_expired(ctr_xlator_ctx, _priv, &current_time)) {
++            ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
++            ret_val = ret_val | CTR_TRY_INODE_HEAL;
++        }
++
++        goto unlock;
++    }
++
++    /* Add the hard link to the list */
++    ret = ctr_add_hard_link(this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
++                            CTR_DB_REC(ctr_local).file_name);
++    if (ret) {
++        gf_msg(this->name, GF_LOG_ERROR, 0,
++               CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED,
++               "Failed to add hardlink to the ctr inode context");
++        ret_val = CTR_CTX_ERROR;
++        goto unlock;
++    }
++
++    ret_val = CTR_TRY_NO_HEAL;
++unlock:
++    UNLOCK(&ctr_xlator_ctx->lock);
++out:
++    return ret_val;
++}
++
++/* Remove the hard link described by frame->local from the inode's ctr
++ * context. A missing context is not an error (returns 0). Returns -1 when
++ * frame->local is NULL, non-zero when the list removal fails. */
++static inline int
++delete_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++    int ret = -1;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++    gf_ctr_local_t *ctr_local = NULL;
++
++    GF_ASSERT(frame);
++    GF_ASSERT(this);
++    GF_ASSERT(inode);
++
++    ctr_local = frame->local;
++    if (!ctr_local) {
++        goto out;
++    }
++
++    ctr_xlator_ctx = get_ctr_xlator_ctx(this, inode);
++    if (!ctr_xlator_ctx) {
++        /* Since there is no ctr inode context so nothing more to do */
++        ret = 0;
++        goto out;
++    }
++
++    ret = ctr_delete_hard_link(this, ctr_xlator_ctx,
++                               CTR_DB_REC(ctr_local).pargfid,
++                               CTR_DB_REC(ctr_local).file_name);
++    if (ret) {
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
++               "Failed to delete hard link");
++        goto out;
++    }
++
++    ret = 0;
++
++out:
++    return ret;
++}
++
++/* Replace the (old_pargfid, old_file_name) hard link in the inode's ctr
++ * context with (pargfid, file_name) taken from frame->local, creating the
++ * context if needed. Returns 0 on success, non-zero on failure. */
++static inline int
++update_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++    int ret = -1;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++    gf_ctr_local_t *ctr_local = NULL;
++
++    GF_ASSERT(frame);
++    GF_ASSERT(this);
++    GF_ASSERT(inode);
++
++    ctr_local = frame->local;
++    if (!ctr_local) {
++        goto out;
++    }
++
++    ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
++    if (!ctr_xlator_ctx) {
++        gf_msg(this->name, GF_LOG_ERROR, 0,
++               CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
++               "Failed accessing ctr inode context");
++        goto out;
++    }
++
++    ret = ctr_update_hard_link(
++        this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
++        CTR_DB_REC(ctr_local).file_name, CTR_DB_REC(ctr_local).old_pargfid,
++        CTR_DB_REC(ctr_local).old_file_name);
++    if (ret) {
++        /* This is the update path: report it as such */
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED,
++               "Failed to update hard link");
++        goto out;
++    }
++
++    ret = 0;
++
++out:
++    return ret;
++}
++
++/******************************************************************************
++ *
++ *                      CTR xlator init related functions
++ *
++ *
++ * ****************************************************************************/
++int
++extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type);
++
++int
++extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv);
++
++#endif
+diff --git a/xlators/features/changetimerecorder/src/ctr-messages.h b/xlators/features/changetimerecorder/src/ctr-messages.h
+new file mode 100644
+index 0000000..23adf0a
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-messages.h
+@@ -0,0 +1,61 @@
++/*
++  Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
++  This file is part of GlusterFS.
++
++  This file is licensed to you under your choice of the GNU Lesser
++  General Public License, version 3 or any later version (LGPLv3 or
++  later), or the GNU General Public License, version 2 (GPLv2), in all
++  cases as published by the Free Software Foundation.
++ */
++
++#ifndef _CTR_MESSAGES_H_
++#define _CTR_MESSAGES_H_
++
++#include <glusterfs/glfs-message-id.h>
++
++/* To add new message IDs, append new identifiers at the end of the list.
++ *
++ * Never remove a message ID. If it's not used anymore, you can rename it or
++ * leave it as it is, but not delete it. This is to prevent reutilization of
++ * IDs by other messages.
++ *
++ * The component name must match one of the entries defined in
++ * glfs-message-id.h.
++ */
++
++GLFS_MSGID(
++    CTR, CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++    CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND, CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND,
++    CTR_MSG_INSERT_LINK_WIND_FAILED, CTR_MSG_INSERT_WRITEV_WIND_FAILED,
++    CTR_MSG_INSERT_WRITEV_UNWIND_FAILED, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++    CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
++    CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED,
++    CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED,
++    CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED,
++    CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED,
++    CTR_MSG_INSERT_TRUNCATE_WIND_FAILED, CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED,
++    CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED,
++    CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED, CTR_MSG_INSERT_RENAME_WIND_FAILED,
++    CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
++    CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED, CTR_MSG_ADD_HARDLINK_FAILED,
++    CTR_MSG_DELETE_HARDLINK_FAILED, CTR_MSG_UPDATE_HARDLINK_FAILED,
++    CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++    CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++    CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, CTR_MSG_INSERT_UNLINK_WIND_FAILED,
++    CTR_MSG_XDATA_NULL, CTR_MSG_INSERT_FSYNC_WIND_FAILED,
++    CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED,
++    CTR_MSG_INSERT_MKNOD_WIND_FAILED, CTR_MSG_INSERT_CREATE_WIND_FAILED,
++    CTR_MSG_INSERT_CREATE_UNWIND_FAILED, CTR_MSG_INSERT_RECORD_WIND_FAILED,
++    CTR_MSG_INSERT_READV_WIND_FAILED, CTR_MSG_GET_GFID_FROM_DICT_FAILED,
++    CTR_MSG_SET, CTR_MSG_FATAL_ERROR, CTR_MSG_DANGLING_VOLUME,
++    CTR_MSG_CALLOC_FAILED, CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED,
++    CTR_MSG_INIT_DB_PARAMS_FAILED, CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED,
++    CTR_MSG_MEM_ACC_INIT_FAILED, CTR_MSG_CLOSE_DB_CONN_FAILED,
++    CTR_MSG_FILL_UNWIND_TIME_REC_ERROR, CTR_MSG_WRONG_FOP_PATH,
++    CTR_MSG_CONSTRUCT_DB_PATH_FAILED, CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
++    CTR_MSG_XLATOR_DISABLED, CTR_MSG_HARDLINK_MISSING_IN_LIST,
++    CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED, CTR_MSG_INIT_LOCK_FAILED,
++    CTR_MSG_COPY_FAILED, CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED,
++    CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED, CTR_MSG_NULL_LOCAL);
++
++#endif /* !_CTR_MESSAGES_H_ */
+diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
+new file mode 100644
+index 0000000..b6b66d5
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
+@@ -0,0 +1,362 @@
++/*
++   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++   This file is part of GlusterFS.
++
++   This file is licensed to you under your choice of the GNU Lesser
++   General Public License, version 3 or any later version (LGPLv3 or
++   later), or the GNU General Public License, version 2 (GPLv2), in all
++   cases as published by the Free Software Foundation.
++*/
++
++#include "ctr-xlator-ctx.h"
++#include "ctr-messages.h"
++#include <time.h>
++#include <sys/time.h>
++
++/* Argument is parenthesised so any pointer expression can be passed */
++#define IS_THE_ONLY_HARDLINK(ctr_hard_link)                                    \
++    ((ctr_hard_link)->list.next == (ctr_hard_link)->list.prev)
++
++/* Free a hard link object and its base_name, then NULL the caller's pointer.
++ * A NULL *ctr_hard_link is a no-op. The entry must already be off any list. */
++static void
++fini_ctr_hard_link(ctr_hard_link_t **ctr_hard_link)
++{
++    GF_ASSERT(ctr_hard_link);
++
++    /* Nothing to release; the inverse test would leak every live entry
++     * and dereference NULL for an empty one. */
++    if (!*ctr_hard_link)
++        return;
++    GF_FREE((*ctr_hard_link)->base_name);
++    GF_FREE(*ctr_hard_link);
++    *ctr_hard_link = NULL;
++}
++
++/* Linear search of the hard link list for (pgfid, base_name).
++ * Returns the matching entry or NULL.
++ * Please lock the ctr_xlator_ctx before using this function */
++ctr_hard_link_t *
++ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                         uuid_t pgfid, const char *base_name)
++{
++    ctr_hard_link_t *_hard_link = NULL;
++    ctr_hard_link_t *searched_hardlink = NULL;
++
++    GF_ASSERT(this);
++    GF_ASSERT(ctr_xlator_ctx);
++
++    if (pgfid == NULL || base_name == NULL)
++        goto out;
++
++    /*linear search*/
++    list_for_each_entry(_hard_link, &ctr_xlator_ctx->hardlink_list, list)
++    {
++        if (gf_uuid_compare(_hard_link->pgfid, pgfid) == 0 &&
++            _hard_link->base_name &&
++            strcmp(_hard_link->base_name, base_name) == 0) {
++            searched_hardlink = _hard_link;
++            break;
++        }
++    }
++
++out:
++    return searched_hardlink;
++}
++
++/* Allocate a hard link entry for (pgfid, base_name), stamp it with the
++ * current time and append it to the context's list.
++ * Returns 0 on success, negative on failure (nothing is left allocated).
++ * Please lock the ctr_xlator_ctx before using this function */
++int
++ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                  uuid_t pgfid, const char *base_name)
++{
++    int ret = -1;
++    ctr_hard_link_t *ctr_hard_link = NULL;
++    struct timeval current_time = {0};
++
++    GF_ASSERT(this);
++    GF_ASSERT(ctr_xlator_ctx);
++
++    if (pgfid == NULL || base_name == NULL)
++        goto out;
++
++    ctr_hard_link = GF_CALLOC(1, sizeof(*ctr_hard_link), gf_ctr_mt_hard_link_t);
++    if (!ctr_hard_link) {
++        gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
++               "Failed allocating ctr_hard_link");
++        goto out;
++    }
++
++    /*Initialize the ctr_hard_link object and
++     * Assign the values : parent GFID and basename*/
++    INIT_LIST_HEAD(&ctr_hard_link->list);
++    gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
++    ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
++    if (ret < 0) {
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
++               "Failed copying basename to ctr_hard_link");
++        goto error;
++    }
++
++    ret = gettimeofday(&current_time, NULL);
++    if (ret == -1) {
++        gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++        goto error;
++    }
++
++    /*Add the hard link to the list*/
++    list_add_tail(&ctr_hard_link->list, &ctr_xlator_ctx->hardlink_list);
++
++    ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++
++    /*aal izz well!*/
++    ret = 0;
++    goto out;
++error:
++    /* Releases base_name as well (it may already be allocated here) */
++    fini_ctr_hard_link(&ctr_hard_link);
++out:
++    return ret;
++}
++
++/* Unlink the entry from its list and free it. Caller holds the ctx lock. */
++static void
++__delete_hard_link_from_list(ctr_hard_link_t **ctr_hard_link)
++{
++    GF_ASSERT(ctr_hard_link);
++    GF_ASSERT(*ctr_hard_link);
++
++    /*Remove hard link from list*/
++    list_del(&(*ctr_hard_link)->list);
++    fini_ctr_hard_link(ctr_hard_link);
++}
++
++/* Remove the (pgfid, base_name) hard link from the context's list.
++ * Takes the ctx lock itself. Returns 0 on success, -1 if the entry is not
++ * in the list. */
++int
++ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                     uuid_t pgfid, const char *base_name)
++{
++    int ret = -1;
++    ctr_hard_link_t *ctr_hard_link = NULL;
++
++    GF_ASSERT(this);
++    GF_ASSERT(ctr_xlator_ctx);
++
++    LOCK(&ctr_xlator_ctx->lock);
++
++    /*Check if the hard link is present */
++    ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, pgfid,
++                                             base_name);
++    if (!ctr_hard_link) {
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_HARDLINK_MISSING_IN_LIST,
++               "Hard link doesn't exist in the list");
++        goto out;
++    }
++
++    __delete_hard_link_from_list(&ctr_hard_link);
++    ctr_hard_link = NULL;
++
++    ret = 0;
++out:
++    UNLOCK(&ctr_xlator_ctx->lock);
++
++    return ret;
++}
++
++/* Rename a hard link entry from (old_pgfid, old_base_name) to
++ * (pgfid, base_name) and refresh its heal timestamp. If the old entry is not
++ * in the list, the new one is simply added. Takes the ctx lock itself.
++ * Returns 0 on success, non-zero on failure. */
++int
++ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                     uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
++                     const char *old_base_name)
++{
++    int ret = -1;
++    ctr_hard_link_t *ctr_hard_link = NULL;
++    struct timeval current_time = {0};
++
++    GF_ASSERT(this);
++    GF_ASSERT(ctr_xlator_ctx);
++
++    LOCK(&ctr_xlator_ctx->lock);
++
++    /*Check if the hard link is present */
++    ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, old_pgfid,
++                                             old_base_name);
++    if (!ctr_hard_link) {
++        gf_msg_trace(this->name, 0, "Hard link doesn't exist in the list");
++        /* Since the hard link is not present in the list
++         * we add it to the list */
++        ret = ctr_add_hard_link(this, ctr_xlator_ctx, pgfid, base_name);
++        if (ret) {
++            gf_msg(this->name, GF_LOG_ERROR, 0,
++                   CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED,
++                   "Failed adding hard link to the list");
++            goto out;
++        }
++        ret = 0;
++        goto out;
++    }
++
++    /* update the hard link */
++    gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
++    GF_FREE(ctr_hard_link->base_name);
++    /* Clear the stale pointer: if the copy below fails, the entry is torn
++     * down and base_name must not be freed a second time. */
++    ctr_hard_link->base_name = NULL;
++    ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
++    if (ret < 0) {
++        gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
++               "Failed copying basename to ctr_hard_link");
++        /* delete the corrupted entry */
++        __delete_hard_link_from_list(&ctr_hard_link);
++        ctr_hard_link = NULL;
++        goto out;
++    }
++
++    ret = gettimeofday(&current_time, NULL);
++    if (ret == -1) {
++        gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++        ctr_hard_link->hardlink_heal_period = 0;
++    } else {
++        ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++    }
++
++    ret = 0;
++
++out:
++    UNLOCK(&ctr_xlator_ctx->lock);
++
++    return ret;
++}
++
++/* Delete all hardlinks. Takes the ctx lock itself; always returns 0. */
++static int
++ctr_delete_all_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx)
++{
++    int ret = -1;
++    ctr_hard_link_t *ctr_hard_link = NULL;
++    ctr_hard_link_t *tmp = NULL;
++
++    GF_ASSERT(ctr_xlator_ctx);
++
++    LOCK(&ctr_xlator_ctx->lock);
++
++    list_for_each_entry_safe(ctr_hard_link, tmp, &ctr_xlator_ctx->hardlink_list,
++                             list)
++    {
++        /*Remove hard link from list*/
++        __delete_hard_link_from_list(&ctr_hard_link);
++        ctr_hard_link = NULL;
++    }
++
++    UNLOCK(&ctr_xlator_ctx->lock);
++
++    ret = 0;
++
++    return ret;
++}
++
++/* Fetch the ctr context stored on the inode, or NULL if none is set.
++ * Please lock the inode before using this function */
++static ctr_xlator_ctx_t *
++__get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++    int ret = 0;
++    uint64_t _addr = 0;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++    GF_ASSERT(this);
++    GF_ASSERT(inode);
++
++    ret = __inode_ctx_get(inode, this, &_addr);
++    if (ret < 0)
++        _addr = 0;
++    if (_addr != 0) {
++        /* Same integer type as used when the pointer was stored */
++        ctr_xlator_ctx = (ctr_xlator_ctx_t *)(uintptr_t)_addr;
++    }
++
++    return ctr_xlator_ctx;
++}
++
++/* Return the inode's ctr context, creating and attaching one if it does not
++ * exist yet. Returns NULL on failure.
++ *
++ * The new context is fully initialised before it is attached to the inode,
++ * so a late failure can never leave the inode pointing at freed memory. */
++ctr_xlator_ctx_t *
++init_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++    int ret = -1;
++    uint64_t _addr = 0;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++    struct timeval current_time = {0};
++    gf_boolean_t lock_inited = _gf_false;
++
++    GF_ASSERT(this);
++    GF_ASSERT(inode);
++
++    LOCK(&inode->lock);
++    {
++        ctr_xlator_ctx = __get_ctr_xlator_ctx(this, inode);
++        if (ctr_xlator_ctx) {
++            ret = 0;
++            goto out;
++        }
++        ctr_xlator_ctx = GF_CALLOC(1, sizeof(*ctr_xlator_ctx),
++                                   gf_ctr_mt_xlator_ctx);
++        if (!ctr_xlator_ctx)
++            goto out;
++
++        ret = gettimeofday(&current_time, NULL);
++        if (ret == -1) {
++            gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++            goto out;
++        }
++
++        ret = LOCK_INIT(&ctr_xlator_ctx->lock);
++        if (ret) {
++            gf_msg(this->name, GF_LOG_ERROR, ret, CTR_MSG_INIT_LOCK_FAILED,
++                   "Failed init lock %s", strerror(ret));
++            goto out;
++        }
++        lock_inited = _gf_true;
++
++        INIT_LIST_HEAD(&ctr_xlator_ctx->hardlink_list);
++        ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
++
++        /* Attach last: nothing below this point can fail */
++        _addr = (uint64_t)(uintptr_t)ctr_xlator_ctx;
++
++        ret = __inode_ctx_set(inode, this, &_addr);
++        if (ret) {
++            goto out;
++        }
++    }
++    ret = 0;
++out:
++    if (ret) {
++        if (lock_inited)
++            LOCK_DESTROY(&ctr_xlator_ctx->lock);
++        GF_FREE(ctr_xlator_ctx);
++        ctr_xlator_ctx = NULL;
++    }
++
++    UNLOCK(&inode->lock);
++
++    return ctr_xlator_ctx;
++}
++
++/* Detach the ctr context from the inode and release it together with every
++ * hard link entry it holds. No-op when the inode has no ctr context. */
++void
++fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++    int ret = 0;
++    uint64_t _addr = 0;
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++    inode_ctx_del(inode, this, &_addr);
++    if (!_addr)
++        return;
++
++    ctr_xlator_ctx = (ctr_xlator_ctx_t *)(uintptr_t)_addr;
++
++    ret = ctr_delete_all_hard_link(this, ctr_xlator_ctx);
++    if (ret) {
++        gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
++               "Failed deleting all hard links from inode context");
++    }
++
++    LOCK_DESTROY(&ctr_xlator_ctx->lock);
++
++    GF_FREE(ctr_xlator_ctx);
++}
++
++/* Locked lookup of the inode's ctr context; returns NULL if none is set. */
++ctr_xlator_ctx_t *
++get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++    ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++    LOCK(&inode->lock);
++    ctr_xlator_ctx = __get_ctr_xlator_ctx(this, inode);
++    UNLOCK(&inode->lock);
++
++    return ctr_xlator_ctx;
++}
+diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
+new file mode 100644
+index 0000000..4e3bf7e
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
+@@ -0,0 +1,68 @@
++/*
++   Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++   This file is part of GlusterFS.
++
++   This file is licensed to you under your choice of the GNU Lesser
++   General Public License, version 3 or any later version (LGPLv3 or
++   later), or the GNU General Public License, version 2 (GPLv2), in all
++   cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_XLATOR_CTX_H
++#define __CTR_XLATOR_CTX_H
++
++#include <glusterfs/xlator.h>
++#include "ctr_mem_types.h"
++#include <glusterfs/iatt.h>
++#include <glusterfs/glusterfs.h>
++#include <glusterfs/xlator.h>
++#include <glusterfs/logging.h>
++#include <glusterfs/locking.h>
++#include <glusterfs/common-utils.h>
++#include <time.h>
++#include <sys/time.h>
++
++/* One looked-up hard link of an inode: (parent gfid, basename). */
++typedef struct ctr_hard_link {
++    /* GFID of the parent directory */
++    uuid_t pgfid;
++    /* Heap-allocated basename, owned by this entry */
++    char *base_name;
++    /* Hardlink expiry : Defines the expiry period after which a
++     * database heal is attempted. Holds the tv_sec of the last update. */
++    uint64_t hardlink_heal_period;
++    /* Linkage into ctr_xlator_ctx_t.hardlink_list */
++    struct list_head list;
++} ctr_hard_link_t;
++
++/* Per-inode context kept by the CTR xlator. */
++typedef struct ctr_xlator_ctx {
++    /* This represents the looked up hardlinks
++     * NOTE: This doesn't represent all physical hardlinks of the inode*/
++    struct list_head hardlink_list;
++    /* tv_sec of the last inode heal; compared against the inode timeout */
++    uint64_t inode_heal_period;
++    /* Protects hardlink_list and the heal timestamps */
++    gf_lock_t lock;
++} ctr_xlator_ctx_t;
++
++/* Find (pgfid, base_name) in the list; NULL if absent.
++ * Caller must hold ctr_xlator_ctx->lock. */
++ctr_hard_link_t *
++ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                         uuid_t pgfid, const char *base_name);
++
++/* Append a new (pgfid, base_name) entry; 0 on success.
++ * Caller must hold ctr_xlator_ctx->lock. */
++int
++ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                  uuid_t pgfid, const char *base_name);
++
++/* Remove the (pgfid, base_name) entry; takes the ctx lock internally. */
++int
++ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                     uuid_t pgfid, const char *base_name);
++
++/* Rename the (old_pgfid, old_base_name) entry to (pgfid, base_name), adding
++ * it if the old entry is absent; takes the ctx lock internally. */
++int
++ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++                     uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
++                     const char *old_base_name);
++
++/* Return the inode's ctr context, or NULL if none has been created. */
++ctr_xlator_ctx_t *
++get_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
++/* Return the inode's ctr context, creating it on first use; NULL on error. */
++ctr_xlator_ctx_t *
++init_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
++/* Detach and free the inode's ctr context and all of its hard links. */
++void
++fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
++#endif
+diff --git a/xlators/features/changetimerecorder/src/ctr_mem_types.h b/xlators/features/changetimerecorder/src/ctr_mem_types.h
+new file mode 100644
+index 0000000..7b8f531
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr_mem_types.h
+@@ -0,0 +1,22 @@
++/*
++  Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com>
++  This file is part of GlusterFS.
++
++  This file is licensed to you under your choice of the GNU Lesser
++  General Public License, version 3 or any later version (LGPLv3 or
++  later), or the GNU General Public License, version 2 (GPLv2), in all
++  cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_MEM_TYPES_H__
++#define __CTR_MEM_TYPES_H__
++
++#include "gfdb_mem-types.h"
++
++/* Allocation type tags for the CTR xlator; numbering continues after the
++ * gfdb range so the two sets do not collide. */
++enum gf_ctr_mem_types_ {
++    gf_ctr_mt_private_t = gfdb_mt_end + 1,
++    gf_ctr_mt_xlator_ctx,
++    gf_ctr_mt_hard_link_t,
++    gf_ctr_mt_end
++};
++#endif
+--
+1.8.3.1
+
 |