diff --git a/0085-Revert-all-remove-code-which-is-not-being-considered.patch b/0085-Revert-all-remove-code-which-is-not-being-considered.patch
new file mode 100644
index 0000000..6addaff
--- /dev/null
+++ b/0085-Revert-all-remove-code-which-is-not-being-considered.patch
@@ -0,0 +1,8976 @@
+From 379b9f7247a4daac9545e3dec79d3c2660111d8d Mon Sep 17 00:00:00 2001
+From: Hari Gowtham <hgowtham@redhat.com>
+Date: Mon, 8 Apr 2019 11:32:09 +0530
+Subject: [PATCH 085/124] Revert "all: remove code which is not being
+ considered in build"
+
+This reverts most of commit 8293d21280fd6ddfc9bb54068cf87794fc6be207.
+It adds back the changes for tier and CTR, with the changes necessary for building them.
+
+Label: DOWNSTREAM ONLY
+
+Change-Id: I8f7978618f2a6a949b09dbcfd25722494cb8f1cd
+Signed-off-by: Hari Gowtham <hgowtham@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/166245
+Reviewed-by: Nithya Balachandran <nbalacha@redhat.com>
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ Makefile.am | 8 +-
+ configure.ac | 34 +
+ glusterfs.spec.in | 19 +
+ libglusterfs/Makefile.am | 4 +-
+ libglusterfs/src/glusterfs/mem-types.h | 1 +
+ xlators/cluster/dht/src/Makefile.am | 14 +-
+ xlators/cluster/dht/src/dht-rebalance.c | 12 +
+ xlators/cluster/dht/src/tier-common.c | 1199 ++++++++
+ xlators/cluster/dht/src/tier-common.h | 55 +
+ xlators/cluster/dht/src/tier.c | 3105 ++++++++++++++++++++
+ xlators/cluster/dht/src/tier.h | 110 +
+ xlators/features/Makefile.am | 2 +-
+ xlators/features/changetimerecorder/Makefile.am | 3 +
+ .../features/changetimerecorder/src/Makefile.am | 26 +
+ .../changetimerecorder/src/changetimerecorder.c | 2371 +++++++++++++++
+ .../changetimerecorder/src/changetimerecorder.h | 21 +
+ .../features/changetimerecorder/src/ctr-helper.c | 293 ++
+ .../features/changetimerecorder/src/ctr-helper.h | 854 ++++++
+ .../features/changetimerecorder/src/ctr-messages.h | 61 +
+ .../changetimerecorder/src/ctr-xlator-ctx.c | 362 +++
+ .../changetimerecorder/src/ctr-xlator-ctx.h | 68 +
+ .../changetimerecorder/src/ctr_mem_types.h | 22 +
+ 22 files changed, 8637 insertions(+), 7 deletions(-)
+ create mode 100644 xlators/cluster/dht/src/tier-common.c
+ create mode 100644 xlators/cluster/dht/src/tier-common.h
+ create mode 100644 xlators/cluster/dht/src/tier.c
+ create mode 100644 xlators/cluster/dht/src/tier.h
+ create mode 100644 xlators/features/changetimerecorder/Makefile.am
+ create mode 100644 xlators/features/changetimerecorder/src/Makefile.am
+ create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.c
+ create mode 100644 xlators/features/changetimerecorder/src/changetimerecorder.h
+ create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.c
+ create mode 100644 xlators/features/changetimerecorder/src/ctr-helper.h
+ create mode 100644 xlators/features/changetimerecorder/src/ctr-messages.h
+ create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
+ create mode 100644 xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
+ create mode 100644 xlators/features/changetimerecorder/src/ctr_mem_types.h
+
+diff --git a/Makefile.am b/Makefile.am
+index e0c795f..613382f 100644
+--- a/Makefile.am
++++ b/Makefile.am
+@@ -3,7 +3,7 @@ SOURCES = site.h
+ EXTRA_DIST = autogen.sh \
+ COPYING-GPLV2 COPYING-LGPLV3 COMMITMENT \
+ INSTALL README.md AUTHORS THANKS NEWS \
+- glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in \
++ glusterfs.spec glusterfs-api.pc.in libgfchangelog.pc.in libgfdb.pc.in \
+ run-tests.sh \
+ build-aux/pkg-version \
+ contrib/umountd \
+@@ -15,8 +15,12 @@ SUBDIRS = $(ARGP_STANDALONE_DIR) rpc/xdr/gen libglusterfs rpc api xlators \
+
+ pkgconfigdir = @pkgconfigdir@
+ pkgconfig_DATA = glusterfs-api.pc libgfchangelog.pc
++if USE_GFDB
++pkgconfig_DATA += libgfdb.pc
++endif
+
+-CLEANFILES = glusterfs-api.pc libgfchangelog.pc contrib/umountd/Makefile
++CLEANFILES = glusterfs-api.pc libgfchangelog.pc libgfdb.pc \
++ contrib/umountd/Makefile
+
+ gitclean: distclean
+ find . -name Makefile.in -exec rm -f {} \;
+diff --git a/configure.ac b/configure.ac
+index baa811a..633e850 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -30,6 +30,7 @@ AC_CONFIG_HEADERS([config.h site.h])
+ AC_CONFIG_FILES([Makefile
+ libglusterfs/Makefile
+ libglusterfs/src/Makefile
++ libglusterfs/src/gfdb/Makefile
+ geo-replication/src/peer_gsec_create
+ geo-replication/src/peer_mountbroker
+ geo-replication/src/peer_mountbroker.py
+@@ -121,6 +122,8 @@ AC_CONFIG_FILES([Makefile
+ xlators/features/changelog/src/Makefile
+ xlators/features/changelog/lib/Makefile
+ xlators/features/changelog/lib/src/Makefile
++ xlators/features/changetimerecorder/Makefile
++ xlators/features/changetimerecorder/src/Makefile
+ xlators/features/locks/Makefile
+ xlators/features/locks/src/Makefile
+ xlators/features/quota/Makefile
+@@ -237,6 +240,7 @@ AC_CONFIG_FILES([Makefile
+ contrib/umountd/Makefile
+ glusterfs-api.pc
+ libgfchangelog.pc
++ libgfdb.pc
+ api/Makefile
+ api/src/Makefile
+ api/examples/Makefile
+@@ -866,6 +870,33 @@ AM_CONDITIONAL([USE_FIREWALLD],test ["x${BUILD_FIREWALLD}" = "xyes"])
+
+ #endof firewald section
+
++# Data tiering requires sqlite
++AC_ARG_ENABLE([tiering],
++ AC_HELP_STRING([--disable-tiering],
++ [Disable data classification/tiering]),
++ [BUILD_GFDB="${enableval}"], [BUILD_GFDB="yes"])
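++dnl e.g. "./configure --disable-tiering" sets BUILD_GFDB=no, which skips
++dnl the sqlite check below and defines USE_GFDB to 0 instead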
++
++case $host_os in
++ darwin*)
++ SQLITE_LIBS="-lsqlite3"
++ AC_CHECK_HEADERS([sqlite3.h], AC_DEFINE(USE_GFDB, 1))
++ ;;
++ *)
++ if test "x${BUILD_GFDB}" = "xyes"; then
++ PKG_CHECK_MODULES([SQLITE], [sqlite3],
++ AC_DEFINE(USE_GFDB, 1),
++ AC_MSG_ERROR([pass --disable-tiering to build without sqlite]))
++ else
++ AC_DEFINE(USE_GFDB, 0, [no sqlite, gfdb is disabled])
++ fi
++ ;;
++esac
++
++AC_SUBST(SQLITE_CFLAGS)
++AC_SUBST(SQLITE_LIBS)
++AM_CONDITIONAL(BUILD_GFDB, test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes")
++AM_CONDITIONAL(USE_GFDB, test "x${with_server}" = "xyes" -a "x${BUILD_GFDB}" = "xyes")
++
+ # xml-output
+ AC_ARG_ENABLE([xml-output],
+ AC_HELP_STRING([--disable-xml-output],
+@@ -1544,6 +1575,8 @@ GFAPI_VERSION="7."${PACKAGE_VERSION}
+ LIBGFCHANGELOG_VERSION="0.0.1"
+ AC_SUBST(GFAPI_VERSION)
+ AC_SUBST(LIBGFCHANGELOG_VERSION)
++LIBGFDB_VERSION="0.0.1"
++AC_SUBST(LIBGFDB_VERSION)
+
+ dnl libtool versioning
+ LIBGFXDR_LT_VERSION="0:1:0"
+@@ -1584,6 +1617,7 @@ echo "XML output : $BUILD_XML_OUTPUT"
+ echo "Unit Tests : $BUILD_UNITTEST"
+ echo "Track priv ports : $TRACK_PRIVPORTS"
+ echo "POSIX ACLs : $BUILD_POSIX_ACLS"
++echo "Data Classification : $BUILD_GFDB"
+ echo "firewalld-config : $BUILD_FIREWALLD"
+ echo "Events : $BUILD_EVENTS"
+ echo "EC dynamic support : $EC_DYNAMIC_SUPPORT"
+diff --git a/glusterfs.spec.in b/glusterfs.spec.in
+index 2149f86..e0607ba 100644
+--- a/glusterfs.spec.in
++++ b/glusterfs.spec.in
+@@ -154,6 +154,7 @@
+ %global _without_events --disable-events
+ %global _without_georeplication --disable-georeplication
+ %global _with_gnfs %{nil}
++%global _without_tiering --disable-tiering
+ %global _without_ocf --without-ocf
+ %endif
+
+@@ -287,6 +288,9 @@ BuildRequires: libuuid-devel
+ %if ( 0%{?_with_cmocka:1} )
+ BuildRequires: libcmocka-devel >= 1.0.1
+ %endif
++%if ( 0%{!?_without_tiering:1} )
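++# sqlite headers are needed at build time only when tiering is enabled,
++# i.e. when _without_tiering is left undefined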
++BuildRequires: sqlite-devel
++%endif
+ %if ( 0%{!?_without_georeplication:1} )
+ BuildRequires: libattr-devel
+ %endif
+@@ -797,6 +801,7 @@ export LDFLAGS
+ %{?_without_rdma} \
+ %{?_without_server} \
+ %{?_without_syslog} \
++ %{?_without_tiering} \
+ %{?_with_ipv6default} \
+ %{?_without_libtirpc}
+
+@@ -1232,9 +1237,15 @@ exit 0
+ %if ( 0%{?_without_server:1} )
+ %exclude %{_libdir}/pkgconfig/libgfchangelog.pc
+ %exclude %{_libdir}/libgfchangelog.so
++%if ( 0%{!?_without_tiering:1} )
++%{_libdir}/pkgconfig/libgfdb.pc
++%endif
+ %else
+ %{_libdir}/pkgconfig/libgfchangelog.pc
+ %{_libdir}/libgfchangelog.so
++%if ( 0%{!?_without_tiering:1} )
++%{_libdir}/pkgconfig/libgfdb.pc
++%endif
+ %endif
+
+ %files client-xlators
+@@ -1330,6 +1341,10 @@ exit 0
+ %files libs
+ %{_libdir}/*.so.*
+ %exclude %{_libdir}/libgfapi.*
++%if ( 0%{!?_without_tiering:1} )
++# libgfdb is only needed server-side
++%exclude %{_libdir}/libgfdb.*
++%endif
+
+ %files -n python%{_pythonver}-gluster
+ # introducing glusterfs module in site packages.
+@@ -1417,6 +1432,10 @@ exit 0
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bit-rot.so
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/bitrot-stub.so
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/sdfs.so
++%if ( 0%{!?_without_tiering:1} )
++ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/changetimerecorder.so
++ %{_libdir}/libgfdb.so.*
++%endif
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/index.so
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/locks.so
+ %{_libdir}/glusterfs/%{version}%{?prereltag}/xlator/features/posix*
+diff --git a/libglusterfs/Makefile.am b/libglusterfs/Makefile.am
+index d471a3f..7e72f61 100644
+--- a/libglusterfs/Makefile.am
++++ b/libglusterfs/Makefile.am
+@@ -1,3 +1,3 @@
+-SUBDIRS = src
++SUBDIRS = src src/gfdb
+
+-CLEANFILES =
++CLEANFILES =
+diff --git a/libglusterfs/src/glusterfs/mem-types.h b/libglusterfs/src/glusterfs/mem-types.h
+index 832f68c..92730a9 100644
+--- a/libglusterfs/src/glusterfs/mem-types.h
++++ b/libglusterfs/src/glusterfs/mem-types.h
+@@ -138,6 +138,7 @@ enum gf_common_mem_types_ {
+ gf_common_volfile_t,
+ gf_common_mt_mgmt_v3_lock_timer_t, /* used only in one location */
+ gf_common_mt_server_cmdline_t, /* used only in one location */
++ gf_mt_gfdb_query_record_t,
+ gf_common_mt_end
+ };
+ #endif
+diff --git a/xlators/cluster/dht/src/Makefile.am b/xlators/cluster/dht/src/Makefile.am
+index 56f1f2a..5532047 100644
+--- a/xlators/cluster/dht/src/Makefile.am
++++ b/xlators/cluster/dht/src/Makefile.am
+@@ -1,4 +1,7 @@
+ xlator_LTLIBRARIES = dht.la nufa.la switch.la
++if BUILD_GFDB
++ xlator_LTLIBRARIES += tier.la
++endif
+
+ AM_CFLAGS = -Wall $(GF_CFLAGS)
+
+@@ -13,6 +16,7 @@ dht_la_SOURCES = $(dht_common_source) dht.c
+
+ nufa_la_SOURCES = $(dht_common_source) nufa.c
+ switch_la_SOURCES = $(dht_common_source) switch.c
++tier_la_SOURCES = $(dht_common_source) tier.c tier-common.c
+
+ dht_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+ dht_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+@@ -23,15 +27,21 @@ nufa_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+ switch_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
+ switch_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
+
++tier_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS) $(LIB_DL)
++tier_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la
++
+ noinst_HEADERS = dht-common.h dht-mem-types.h dht-messages.h \
+- dht-lock.h $(top_builddir)/xlators/lib/src/libxlator.h
++ dht-lock.h tier-common.h tier.h \
++ $(top_builddir)/xlators/lib/src/libxlator.h
+
+ AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
++ -I$(top_srcdir)/libglusterfs/src/gfdb \
+ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
+ -I$(top_srcdir)/rpc/rpc-lib/src \
+ -I$(top_srcdir)/xlators/lib/src \
+ -DDATADIR=\"$(localstatedir)\" \
+- -DLIBDIR=\"$(libdir)\"
++ -DLIBDIR=\"$(libdir)\" \
++ -DLIBGFDB_VERSION=\"$(LIBGFDB_VERSION)\"
+
+ CLEANFILES =
+
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index e0f25b1..efbe8a4 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -8,6 +8,7 @@
+ cases as published by the Free Software Foundation.
+ */
+
++#include "tier.h"
+ #include "dht-common.h"
+ #include <glusterfs/xlator.h>
+ #include <glusterfs/syscall.h>
+@@ -2134,6 +2135,17 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ }
+ }
+
++ /* store the size of the most recently migrated file */
++ if (defrag && defrag->tier_conf.is_tier) {
++ if (from != TIER_HASHED_SUBVOL) {
++ defrag->tier_conf.st_last_promoted_size = stbuf.ia_size;
++ } else {
++ /* Don't delete the linkto file on the hashed subvol */
++ delete_src_linkto = _gf_false;
++ defrag->tier_conf.st_last_demoted_size = stbuf.ia_size;
++ }
++ }
++
+ /* The src file is being unlinked after this so we don't need
+ to clean it up */
+ clean_src = _gf_false;
+diff --git a/xlators/cluster/dht/src/tier-common.c b/xlators/cluster/dht/src/tier-common.c
+new file mode 100644
+index 0000000..b22f477
+--- /dev/null
++++ b/xlators/cluster/dht/src/tier-common.c
+@@ -0,0 +1,1199 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#include <glusterfs/glusterfs.h>
++#include <glusterfs/xlator.h>
++#include "libxlator.h"
++#include "dht-common.h"
++#include <glusterfs/defaults.h>
++#include "tier-common.h"
++#include "tier.h"
++
++int
++dht_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
++
++int
++tier_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ loc_t *oldloc = NULL;
++ loc_t *newloc = NULL;
++
++ local = frame->local;
++
++ oldloc = &local->loc;
++ newloc = &local->loc2;
++
++ if (op_ret == -1) {
++ /* No continuation on DHT inode missing errors, as we should
++ * then have a good stbuf that states P2 happened. We would
++ * get inode missing if the file completed migrating between
++ * the lookup and the link call */
++ goto out;
++ }
++
++ if (local->call_cnt != 1) {
++ goto out;
++ }
++
++ local->call_cnt = 2;
++
++ /* Do this on the hot tier now */
++
++ STACK_WIND(frame, tier_link_cbk, local->cached_subvol,
++ local->cached_subvol->fops->link, oldloc, newloc, xdata);
++
++ return 0;
++
++out:
++ DHT_STRIP_PHASE1_FLAGS(stbuf);
++
++ DHT_STACK_UNWIND(link, frame, op_ret, op_errno, inode, stbuf, preparent,
++ postparent, NULL);
++
++ return 0;
++}
++
++int
++tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
++ dict_t *xdata)
++{
++ xlator_t *cached_subvol = NULL;
++ xlator_t *hashed_subvol = NULL;
++ int op_errno = -1;
++ int ret = -1;
++ dht_local_t *local = NULL;
++ dht_conf_t *conf = NULL;
++
++ VALIDATE_OR_GOTO(frame, err);
++ VALIDATE_OR_GOTO(this, err);
++ VALIDATE_OR_GOTO(oldloc, err);
++ VALIDATE_OR_GOTO(newloc, err);
++
++ conf = this->private;
++
++ local = dht_local_init(frame, oldloc, NULL, GF_FOP_LINK);
++ if (!local) {
++ op_errno = ENOMEM;
++ goto err;
++ }
++ local->call_cnt = 1;
++
++ cached_subvol = local->cached_subvol;
++
++ if (!cached_subvol) {
++ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
++ oldloc->path);
++ op_errno = ENOENT;
++ goto err;
++ }
++
++ hashed_subvol = TIER_HASHED_SUBVOL;
++
++ ret = loc_copy(&local->loc2, newloc);
++ if (ret == -1) {
++ op_errno = ENOMEM;
++ goto err;
++ }
++
++ if (hashed_subvol == cached_subvol) {
++ STACK_WIND(frame, dht_link_cbk, cached_subvol,
++ cached_subvol->fops->link, oldloc, newloc, xdata);
++ return 0;
++ }
++
++ /* Create hardlinks to both the data file on the hot tier
++ and the linkto file on the cold tier */
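++ /* The link is taken first on the cold (hashed) tier; tier_link_cbk
++ * then repeats it on the hot (cached) tier where the data file lives. */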
++
++ gf_uuid_copy(local->gfid, oldloc->inode->gfid);
++
++ STACK_WIND(frame, tier_link_cbk, hashed_subvol, hashed_subvol->fops->link,
++ oldloc, newloc, xdata);
++
++ return 0;
++err:
++ op_errno = (op_errno == -1) ? errno : op_errno;
++ DHT_STACK_UNWIND(link, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL);
++ return 0;
++}
++
++int
++tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
++ xlator_t *this, int op_ret, int op_errno,
++ struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++
++ local = frame->local;
++
++ if (local->params) {
++ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
++ }
++
++ DHT_STACK_UNWIND(create, frame, -1, local->op_errno, NULL, NULL, NULL, NULL,
++ NULL, NULL);
++
++ return 0;
++}
++
++int
++tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
++{
++ xlator_t *prev = NULL;
++ int ret = -1;
++ dht_local_t *local = NULL;
++ xlator_t *hashed_subvol = NULL;
++ dht_conf_t *conf = NULL;
++
++ local = frame->local;
++ conf = this->private;
++
++ hashed_subvol = TIER_HASHED_SUBVOL;
++
++ if (!local) {
++ op_ret = -1;
++ op_errno = EINVAL;
++ goto out;
++ }
++
++ if (op_ret == -1) {
++ if (local->linked == _gf_true && local->xattr_req) {
++ local->op_errno = op_errno;
++ local->op_ret = op_ret;
++ ret = dht_fill_dict_to_avoid_unlink_of_migrating_file(
++ local->xattr_req);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
++ "Failed to set dictionary value to "
++ "unlink of migrating file");
++ goto out;
++ }
++
++ STACK_WIND(frame, tier_create_unlink_stale_linkto_cbk,
++ hashed_subvol, hashed_subvol->fops->unlink, &local->loc,
++ 0, local->xattr_req);
++ return 0;
++ }
++ goto out;
++ }
++
++ prev = cookie;
++
++ if (local->loc.parent) {
++ dht_inode_ctx_time_update(local->loc.parent, this, preparent, 0);
++
++ dht_inode_ctx_time_update(local->loc.parent, this, postparent, 1);
++ }
++
++ ret = dht_layout_preset(this, prev, inode);
++ if (ret != 0) {
++ gf_msg_debug(this->name, 0, "could not set preset layout for subvol %s",
++ prev->name);
++ op_ret = -1;
++ op_errno = EINVAL;
++ goto out;
++ }
++
++ local->op_errno = op_errno;
++
++ if (local->linked == _gf_true) {
++ local->stbuf = *stbuf;
++ dht_linkfile_attr_heal(frame, this);
++ }
++out:
++ if (local) {
++ if (local->xattr_req) {
++ dict_del(local->xattr_req, TIER_LINKFILE_GFID);
++ }
++ }
++
++ DHT_STRIP_PHASE1_FLAGS(stbuf);
++
++ DHT_STACK_UNWIND(create, frame, op_ret, op_errno, fd, inode, stbuf,
++ preparent, postparent, xdata);
++
++ return 0;
++}
++
++int
++tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
++ xlator_t *this, int32_t op_ret,
++ int32_t op_errno, inode_t *inode,
++ struct iatt *stbuf, struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ xlator_t *cached_subvol = NULL;
++ dht_conf_t *conf = NULL;
++ int ret = -1;
++ unsigned char *gfid = NULL;
++
++ local = frame->local;
++ if (!local) {
++ op_errno = EINVAL;
++ goto err;
++ }
++
++ if (op_ret == -1) {
++ local->op_errno = op_errno;
++ goto err;
++ }
++
++ conf = this->private;
++ if (!conf) {
++ local->op_errno = EINVAL;
++ op_errno = EINVAL;
++ goto err;
++ }
++
++ cached_subvol = TIER_UNHASHED_SUBVOL;
++
++ if (local->params) {
++ dict_del(local->params, conf->link_xattr_name);
++ dict_del(local->params, GLUSTERFS_INTERNAL_FOP_KEY);
++ }
++
++ /*
++ * We will delete the linkfile if data file creation fails.
++ * When deleting this stale linkfile, there is a possibility
++ * for a race between this linkfile deletion and a stale
++ * linkfile deletion triggered by another lookup from different
++ * client.
++ *
++ * For eg:
++ *
++ * Client 1 Client 2
++ *
++ * 1 linkfile created for foo
++ *
++ * 2 data file creation failed
++ *
++ * 3 creating a file with same name
++ *
++ * 4 lookup before creation deleted
++ * the linkfile created by client1
++ * considering as a stale linkfile.
++ *
++ * 5 New linkfile created for foo
++ * with different gfid.
++ *
++ * 6 Trigger linkfile deletion as
++ * data file creation failed.
++ *
++ * 7 Linkfile deleted which is
++ * created by client2.
++ *
++ * 8 Data file created.
++ *
++ * With this race, we will end up having a file in a non-hashed subvol
++ * without a linkfile in hashed subvol.
++ *
++ * To avoid this, we store the gfid of the linkfile created by the
++ * client. So if we delete the linkfile, we validate the gfid of the
++ * existing file against the value stored at the posix layer.
++ *
++ * Storing this value in local->xattr_req as local->params was also used
++ * to create the data file. During the linkfile deletion we will use
++ * local->xattr_req dictionary.
++ */
++ if (!local->xattr_req) {
++ local->xattr_req = dict_new();
++ if (!local->xattr_req) {
++ local->op_errno = ENOMEM;
++ op_errno = ENOMEM;
++ goto err;
++ }
++ }
++
++ gfid = GF_MALLOC(sizeof(uuid_t), gf_common_mt_char);
++ if (!gfid) {
++ local->op_errno = ENOMEM;
++ op_errno = ENOMEM;
++ goto err;
++ }
++
++ gf_uuid_copy(gfid, stbuf->ia_gfid);
++ ret = dict_set_dynptr(local->xattr_req, TIER_LINKFILE_GFID, gfid,
++ sizeof(uuid_t));
++ if (ret) {
++ GF_FREE(gfid);
++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
++ "Failed to set dictionary value"
++ " : key = %s",
++ TIER_LINKFILE_GFID);
++ }
++
++ STACK_WIND_COOKIE(frame, tier_create_cbk, cached_subvol, cached_subvol,
++ cached_subvol->fops->create, &local->loc, local->flags,
++ local->mode, local->umask, local->fd, local->params);
++
++ return 0;
++err:
++ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
++ NULL);
++ return 0;
++}
++
++gf_boolean_t
++tier_is_hot_tier_decommissioned(xlator_t *this)
++{
++ dht_conf_t *conf = NULL;
++ xlator_t *hot_tier = NULL;
++ int i = 0;
++
++ conf = this->private;
++ hot_tier = conf->subvolumes[1];
++
++ if (conf->decommission_subvols_cnt) {
++ for (i = 0; i < conf->subvolume_cnt; i++) {
++ if (conf->decommissioned_bricks[i] &&
++ conf->decommissioned_bricks[i] == hot_tier)
++ return _gf_true;
++ }
++ }
++
++ return _gf_false;
++}
++
++int
++tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
++ mode_t mode, mode_t umask, fd_t *fd, dict_t *params)
++{
++ int op_errno = -1;
++ dht_local_t *local = NULL;
++ dht_conf_t *conf = NULL;
++ xlator_t *hot_subvol = NULL;
++ xlator_t *cold_subvol = NULL;
++
++ VALIDATE_OR_GOTO(frame, err);
++ VALIDATE_OR_GOTO(this, err);
++ VALIDATE_OR_GOTO(loc, err);
++
++ conf = this->private;
++
++ dht_get_du_info(frame, this, loc);
++
++ local = dht_local_init(frame, loc, fd, GF_FOP_CREATE);
++ if (!local) {
++ op_errno = ENOMEM;
++ goto err;
++ }
++
++ cold_subvol = TIER_HASHED_SUBVOL;
++ hot_subvol = TIER_UNHASHED_SUBVOL;
++
++ if (conf->subvolumes[0] != cold_subvol) {
++ hot_subvol = conf->subvolumes[0];
++ }
++ /*
++ * if the hot tier is full or being decommissioned,
++ * create the file on the cold tier instead
++ */
++ if (dht_is_subvol_filled(this, hot_subvol) ||
++ tier_is_hot_tier_decommissioned(this)) {
++ gf_msg_debug(this->name, 0, "creating %s on %s", loc->path,
++ cold_subvol->name);
++
++ STACK_WIND_COOKIE(frame, tier_create_cbk, cold_subvol, cold_subvol,
++ cold_subvol->fops->create, loc, flags, mode, umask,
++ fd, params);
++ } else {
++ local->params = dict_ref(params);
++ local->flags = flags;
++ local->mode = mode;
++ local->umask = umask;
++ local->cached_subvol = hot_subvol;
++ local->hashed_subvol = cold_subvol;
++
++ gf_msg_debug(this->name, 0, "creating %s on %s (link at %s)", loc->path,
++ hot_subvol->name, cold_subvol->name);
++
++ dht_linkfile_create(frame, tier_create_linkfile_create_cbk, this,
++ hot_subvol, cold_subvol, loc);
++
++ goto out;
++ }
++out:
++ return 0;
++
++err:
++
++ op_errno = (op_errno == -1) ? errno : op_errno;
++ DHT_STACK_UNWIND(create, frame, -1, op_errno, NULL, NULL, NULL, NULL, NULL,
++ NULL);
++
++ return 0;
++}
++
++int
++tier_unlink_nonhashed_linkfile_cbk(call_frame_t *frame, void *cookie,
++ xlator_t *this, int op_ret, int op_errno,
++ struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ xlator_t *prev = NULL;
++
++ local = frame->local;
++ prev = cookie;
++
++ LOCK(&frame->lock);
++ {
++ if ((op_ret == -1) && (op_errno != ENOENT)) {
++ local->op_errno = op_errno;
++ local->op_ret = op_ret;
++ gf_msg_debug(this->name, op_errno,
++ "Unlink link: subvolume %s"
++ " returned -1",
++ prev->name);
++ goto unlock;
++ }
++
++ local->op_ret = 0;
++ }
++unlock:
++ UNLOCK(&frame->lock);
++
++ if (local->op_ret == -1)
++ goto err;
++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
++ &local->preparent, &local->postparent, NULL);
++
++ return 0;
++
++err:
++ DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
++ return 0;
++}
++
++int
++tier_unlink_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int op_ret, int op_errno, inode_t *inode,
++ struct iatt *preparent, dict_t *xdata,
++ struct iatt *postparent)
++{
++ dht_local_t *local = NULL;
++ xlator_t *prev = NULL;
++ dht_conf_t *conf = NULL;
++ xlator_t *hot_subvol = NULL;
++
++ local = frame->local;
++ prev = cookie;
++ conf = this->private;
++ hot_subvol = TIER_UNHASHED_SUBVOL;
++
++ if (!op_ret) {
++ /*
++ * linkfile is present on the hot tier; unlink it
++ */
++ STACK_WIND_COOKIE(frame, tier_unlink_nonhashed_linkfile_cbk, hot_subvol,
++ hot_subvol, hot_subvol->fops->unlink, &local->loc,
++ local->flags, NULL);
++ return 0;
++ }
++
++ LOCK(&frame->lock);
++ {
++ if (op_errno == ENOENT) {
++ local->op_ret = 0;
++ local->op_errno = op_errno;
++ } else {
++ local->op_ret = op_ret;
++ local->op_errno = op_errno;
++ }
++ gf_msg_debug(this->name, op_errno, "Lookup : subvolume %s returned -1",
++ prev->name);
++ }
++
++ UNLOCK(&frame->lock);
++
++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
++ &local->preparent, &local->postparent, xdata);
++
++ return 0;
++}
++
++int
++tier_unlink_linkfile_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int op_ret, int op_errno, struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ xlator_t *prev = NULL;
++
++ local = frame->local;
++ prev = cookie;
++
++ LOCK(&frame->lock);
++ {
++ /* Also ignore EINVAL (in addition to ENOENT) so that the unlink
++ does not fail when the file does not exist on the other tier */
++ if ((op_ret == -1) && !((op_errno == ENOENT) || (op_errno == EINVAL))) {
++ local->op_errno = op_errno;
++ local->op_ret = op_ret;
++ gf_msg_debug(this->name, op_errno,
++ "Unlink link: subvolume %s"
++ " returned -1",
++ prev->name);
++ goto unlock;
++ }
++
++ local->op_ret = 0;
++ }
++unlock:
++ UNLOCK(&frame->lock);
++
++ if (local->op_ret == -1)
++ goto err;
++
++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
++ &local->preparent, &local->postparent, xdata);
++
++ return 0;
++
++err:
++ DHT_STACK_UNWIND(unlink, frame, -1, local->op_errno, NULL, NULL, NULL);
++ return 0;
++}
++
++int32_t
++tier_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, struct iatt *preparent, struct iatt *postparent,
++ dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ xlator_t *prev = NULL;
++ struct iatt *stbuf = NULL;
++ dht_conf_t *conf = NULL;
++ int ret = -1;
++ xlator_t *hot_tier = NULL;
++ xlator_t *cold_tier = NULL;
++
++ local = frame->local;
++ prev = cookie;
++ conf = this->private;
++
++ cold_tier = TIER_HASHED_SUBVOL;
++ hot_tier = TIER_UNHASHED_SUBVOL;
++
++ LOCK(&frame->lock);
++ {
++ if (op_ret == -1) {
++ if (op_errno == ENOENT) {
++ local->op_ret = 0;
++ } else {
++ local->op_ret = -1;
++ local->op_errno = op_errno;
++ }
++ gf_msg_debug(this->name, op_errno,
++ "Unlink: subvolume %s returned -1"
++ " with errno = %d",
++ prev->name, op_errno);
++ goto unlock;
++ }
++
++ local->op_ret = 0;
++
++ local->postparent = *postparent;
++ local->preparent = *preparent;
++
++ if (local->loc.parent) {
++ dht_inode_ctx_time_update(local->loc.parent, this,
++ &local->preparent, 0);
++ dht_inode_ctx_time_update(local->loc.parent, this,
++ &local->postparent, 1);
++ }
++ }
++unlock:
++ UNLOCK(&frame->lock);
++
++ if (local->op_ret)
++ goto out;
++
++ if (cold_tier != local->cached_subvol) {
++ /*
++ * The file is present on the hot tier, so there is
++ * a linkfile on the cold tier; delete that linkfile
++ * from the cold tier.
++ */
++ STACK_WIND_COOKIE(frame, tier_unlink_linkfile_cbk, cold_tier, cold_tier,
++ cold_tier->fops->unlink, &local->loc, local->flags,
++ xdata);
++ return 0;
++ }
++
++ ret = dict_get_bin(xdata, DHT_IATT_IN_XDATA_KEY, (void **)&stbuf);
++ if (!ret && stbuf &&
++ ((IS_DHT_MIGRATION_PHASE2(stbuf)) || IS_DHT_MIGRATION_PHASE1(stbuf))) {
++ /*
++ * File is migrating from cold to hot tier.
++ * Delete the destination linkfile.
++ */
++ STACK_WIND_COOKIE(frame, tier_unlink_lookup_cbk, hot_tier, hot_tier,
++ hot_tier->fops->lookup, &local->loc, NULL);
++ return 0;
++ }
++
++out:
++ DHT_STACK_UNWIND(unlink, frame, local->op_ret, local->op_errno,
++ &local->preparent, &local->postparent, xdata);
++
++ return 0;
++}
++
++int
++tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
++ dict_t *xdata)
++{
++ xlator_t *cached_subvol = NULL;
++ xlator_t *hashed_subvol = NULL;
++ dht_conf_t *conf = NULL;
++ int op_errno = -1;
++ dht_local_t *local = NULL;
++ int ret = -1;
++
++ VALIDATE_OR_GOTO(frame, err);
++ VALIDATE_OR_GOTO(this, err);
++ VALIDATE_OR_GOTO(loc, err);
++
++ conf = this->private;
++
++ local = dht_local_init(frame, loc, NULL, GF_FOP_UNLINK);
++ if (!local) {
++ op_errno = ENOMEM;
++
++ goto err;
++ }
++
++ hashed_subvol = TIER_HASHED_SUBVOL;
++
++ cached_subvol = local->cached_subvol;
++ if (!cached_subvol) {
++ gf_msg_debug(this->name, 0, "no cached subvolume for path=%s",
++ loc->path);
++ op_errno = EINVAL;
++ goto err;
++ }
++
++ local->flags = xflag;
++ if (IA_ISREG(loc->inode->ia_type) && (hashed_subvol == cached_subvol)) {
++ /*
++ * File resides in cold tier. We need to stat
++ * the file to see if it is being promoted.
++ * If yes we need to delete the destination
++ * file as well.
++ *
++ * Currently we are doing this check only for
++ * regular files.
++ */
++ xdata = xdata ? dict_ref(xdata) : dict_new();
++ if (xdata) {
++ ret = dict_set_int8(xdata, DHT_IATT_IN_XDATA_KEY, 1);
++ if (ret) {
++ gf_msg_debug(this->name, 0, "Failed to set dictionary key %s",
++ DHT_IATT_IN_XDATA_KEY);
++ }
++ }
++ }
++
++ /*
++ * File is on hot tier, delete the data file first, then
++ * linkfile from cold.
++ */
++ STACK_WIND_COOKIE(frame, tier_unlink_cbk, cached_subvol, cached_subvol,
++ cached_subvol->fops->unlink, loc, xflag, xdata);
++ if (xdata)
++ dict_unref(xdata);
++ return 0;
++err:
++ op_errno = (op_errno == -1) ? errno : op_errno;
++ DHT_STACK_UNWIND(unlink, frame, -1, op_errno, NULL, NULL, NULL);
++
++ return 0;
++}
++
++int
++tier_readdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
++{
++ gf_dirent_t entries;
++ gf_dirent_t *orig_entry = NULL;
++ gf_dirent_t *entry = NULL;
++ int count = 0;
++
++ INIT_LIST_HEAD(&entries.list);
++
++ if (op_ret < 0)
++ goto unwind;
++
++ list_for_each_entry(orig_entry, (&orig_entries->list), list)
++ {
++ entry = gf_dirent_for_name(orig_entry->d_name);
++ if (!entry) {
++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
++ "Memory allocation failed ");
++ goto unwind;
++ }
++
++ entry->d_off = orig_entry->d_off;
++ entry->d_ino = orig_entry->d_ino;
++ entry->d_type = orig_entry->d_type;
++ entry->d_len = orig_entry->d_len;
++
++ list_add_tail(&entry->list, &entries.list);
++ count++;
++ }
++ op_ret = count;
++
++unwind:
++ if (op_ret < 0)
++ op_ret = 0;
++
++ DHT_STACK_UNWIND(readdir, frame, op_ret, op_errno, &entries, NULL);
++
++ gf_dirent_free(&entries);
++
++ return 0;
++}
++
++int
++tier_readdirp_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, gf_dirent_t *orig_entries, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ gf_dirent_t entries;
++ gf_dirent_t *orig_entry = NULL;
++ gf_dirent_t *entry = NULL;
++ xlator_t *prev = NULL;
++ xlator_t *next_subvol = NULL;
++ off_t next_offset = 0;
++ int count = 0;
++ dht_conf_t *conf = NULL;
++ int ret = 0;
++ inode_table_t *itable = NULL;
++ inode_t *inode = NULL;
++
++ INIT_LIST_HEAD(&entries.list);
++ prev = cookie;
++ local = frame->local;
++ itable = local->fd ? local->fd->inode->table : NULL;
++
++ conf = this->private;
++ GF_VALIDATE_OR_GOTO(this->name, conf, unwind);
++
++ if (op_ret < 0)
++ goto done;
++
++ list_for_each_entry(orig_entry, (&orig_entries->list), list)
++ {
++ next_offset = orig_entry->d_off;
++
++ if (IA_ISINVAL(orig_entry->d_stat.ia_type)) {
++ /*stat failed somewhere- ignore this entry*/
++ continue;
++ }
++
++ entry = gf_dirent_for_name(orig_entry->d_name);
++ if (!entry) {
++ goto unwind;
++ }
++
++ entry->d_off = orig_entry->d_off;
++ entry->d_stat = orig_entry->d_stat;
++ entry->d_ino = orig_entry->d_ino;
++ entry->d_type = orig_entry->d_type;
++ entry->d_len = orig_entry->d_len;
++
++ if (orig_entry->dict)
++ entry->dict = dict_ref(orig_entry->dict);
++
++ if (check_is_linkfile(NULL, (&orig_entry->d_stat), orig_entry->dict,
++ conf->link_xattr_name)) {
++ goto entries;
++
++ } else if (IA_ISDIR(entry->d_stat.ia_type)) {
++ if (orig_entry->inode) {
++ dht_inode_ctx_time_update(orig_entry->inode, this,
++ &entry->d_stat, 1);
++ }
++ } else {
++ if (orig_entry->inode) {
++ ret = dht_layout_preset(this, prev, orig_entry->inode);
++ if (ret)
++ gf_msg(this->name, GF_LOG_WARNING, 0,
++ DHT_MSG_LAYOUT_SET_FAILED,
++ "failed to link the layout "
++ "in inode");
++
++ entry->inode = inode_ref(orig_entry->inode);
++ } else if (itable) {
++ /*
++ * orig_entry->inode might be null if an upper-layer
++ * xlator below the client set it to null, to force
++ * a lookup on the inode even if the inode is present
++ * in the inode table. In that case we just update the
++ * ctx to make sure we didn't miss anything.
++ */
++ inode = inode_find(itable, orig_entry->d_stat.ia_gfid);
++ if (inode) {
++ ret = dht_layout_preset(this, TIER_HASHED_SUBVOL, inode);
++ if (ret)
++ gf_msg(this->name, GF_LOG_WARNING, 0,
++ DHT_MSG_LAYOUT_SET_FAILED,
++ "failed to link the layout"
++ " in inode");
++ inode_unref(inode);
++ inode = NULL;
++ }
++ }
++ }
++
++ entries:
++ list_add_tail(&entry->list, &entries.list);
++ count++;
++ }
++ op_ret = count;
++
++done:
++ if (count == 0) {
++ /* non-zero next_offset means that
++ EOF is not yet hit on the current subvol
++ */
++ if (next_offset != 0) {
++ next_subvol = prev;
++ } else {
++ goto unwind;
++ }
++
++ STACK_WIND_COOKIE(frame, tier_readdirp_cbk, next_subvol, next_subvol,
++ next_subvol->fops->readdirp, local->fd, local->size,
++ next_offset, local->xattr);
++ return 0;
++ }
++
++unwind:
++ if (op_ret < 0)
++ op_ret = 0;
++
++ DHT_STACK_UNWIND(readdirp, frame, op_ret, op_errno, &entries, NULL);
++
++ gf_dirent_free(&entries);
++
++ return 0;
++}
++
++int
++tier_do_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
++ off_t yoff, int whichop, dict_t *dict)
++{
++ dht_local_t *local = NULL;
++ int op_errno = -1;
++ xlator_t *hashed_subvol = NULL;
++ int ret = 0;
++ dht_conf_t *conf = NULL;
++
++ VALIDATE_OR_GOTO(frame, err);
++ VALIDATE_OR_GOTO(this, err);
++ VALIDATE_OR_GOTO(fd, err);
++ VALIDATE_OR_GOTO(this->private, err);
++
++ conf = this->private;
++
++ local = dht_local_init(frame, NULL, NULL, whichop);
++ if (!local) {
++ op_errno = ENOMEM;
++ goto err;
++ }
++
++ local->fd = fd_ref(fd);
++ local->size = size;
++ local->xattr_req = (dict) ? dict_ref(dict) : NULL;
++
++ hashed_subvol = TIER_HASHED_SUBVOL;
++
++ /* TODO: do proper readdir */
++ if (whichop == GF_FOP_READDIRP) {
++ if (dict)
++ local->xattr = dict_ref(dict);
++ else
++ local->xattr = dict_new();
++
++ if (local->xattr) {
++ ret = dict_set_uint32(local->xattr, conf->link_xattr_name, 256);
++ if (ret)
++ gf_msg(this->name, GF_LOG_WARNING, 0, DHT_MSG_DICT_SET_FAILED,
++ "Failed to set dictionary value"
++ " : key = %s",
++ conf->link_xattr_name);
++ }
++
++ STACK_WIND_COOKIE(frame, tier_readdirp_cbk, hashed_subvol,
++ hashed_subvol, hashed_subvol->fops->readdirp, fd,
++ size, yoff, local->xattr);
++
++ } else {
++ STACK_WIND_COOKIE(frame, tier_readdir_cbk, hashed_subvol, hashed_subvol,
++ hashed_subvol->fops->readdir, fd, size, yoff,
++ local->xattr);
++ }
++
++ return 0;
++
++err:
++ op_errno = (op_errno == -1) ? errno : op_errno;
++ DHT_STACK_UNWIND(readdir, frame, -1, op_errno, NULL, NULL);
++
++ return 0;
++}
++
++int
++tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
++ off_t yoff, dict_t *xdata)
++{
++ int op = GF_FOP_READDIR;
++ dht_conf_t *conf = NULL;
++ int i = 0;
++
++ conf = this->private;
++ if (!conf)
++ goto out;
++
++ for (i = 0; i < conf->subvolume_cnt; i++) {
++ if (!conf->subvolume_status[i]) {
++ op = GF_FOP_READDIRP;
++ break;
++ }
++ }
++
++ if (conf->use_readdirp)
++ op = GF_FOP_READDIRP;
++
++out:
++ tier_do_readdir(frame, this, fd, size, yoff, op, 0);
++ return 0;
++}
++
++int
++tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
++ off_t yoff, dict_t *dict)
++{
++ tier_do_readdir(frame, this, fd, size, yoff, GF_FOP_READDIRP, dict);
++ return 0;
++}
++
++int
++tier_statfs_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, struct statvfs *statvfs, dict_t *xdata)
++{
++ gf_boolean_t event = _gf_false;
++ qdstatfs_action_t action = qdstatfs_action_OFF;
++ dht_local_t *local = NULL;
++ int this_call_cnt = 0;
++ int bsize = 0;
++ int frsize = 0;
++ GF_UNUSED int ret = 0;
++ unsigned long new_usage = 0;
++ unsigned long cur_usage = 0;
++ xlator_t *prev = NULL;
++ dht_conf_t *conf = NULL;
++ tier_statvfs_t *tier_stat = NULL;
++
++ prev = cookie;
++ local = frame->local;
++ GF_ASSERT(local);
++
++ conf = this->private;
++
++ if (xdata)
++ ret = dict_get_int8(xdata, "quota-deem-statfs", (int8_t *)&event);
++
++ tier_stat = &local->tier_statvfs;
++
++ LOCK(&frame->lock);
++ {
++ if (op_ret == -1) {
++ local->op_errno = op_errno;
++ goto unlock;
++ }
++ if (!statvfs) {
++ op_errno = EINVAL;
++ local->op_ret = -1;
++ goto unlock;
++ }
++ local->op_ret = 0;
++
++ if (local->quota_deem_statfs) {
++ if (event == _gf_true) {
++ action = qdstatfs_action_COMPARE;
++ } else {
++ action = qdstatfs_action_NEGLECT;
++ }
++ } else {
++ if (event == _gf_true) {
++ action = qdstatfs_action_REPLACE;
++ local->quota_deem_statfs = _gf_true;
++ }
++ }
++
++ if (local->quota_deem_statfs) {
++ switch (action) {
++ case qdstatfs_action_NEGLECT:
++ goto unlock;
++
++ case qdstatfs_action_REPLACE:
++ local->statvfs = *statvfs;
++ goto unlock;
++
++ case qdstatfs_action_COMPARE:
++ new_usage = statvfs->f_blocks - statvfs->f_bfree;
++ cur_usage = local->statvfs.f_blocks -
++ local->statvfs.f_bfree;
++
++ /* Take the max of the usage from subvols */
++ if (new_usage >= cur_usage)
++ local->statvfs = *statvfs;
++ goto unlock;
++
++ default:
++ break;
++ }
++ }
++
++ if (local->statvfs.f_bsize != 0) {
++ bsize = max(local->statvfs.f_bsize, statvfs->f_bsize);
++ frsize = max(local->statvfs.f_frsize, statvfs->f_frsize);
++ dht_normalize_stats(&local->statvfs, bsize, frsize);
++ dht_normalize_stats(statvfs, bsize, frsize);
++ } else {
++ local->statvfs.f_bsize = statvfs->f_bsize;
++ local->statvfs.f_frsize = statvfs->f_frsize;
++ }
++
++ if (prev == TIER_HASHED_SUBVOL) {
++ local->statvfs.f_blocks = statvfs->f_blocks;
++ local->statvfs.f_files = statvfs->f_files;
++ local->statvfs.f_fsid = statvfs->f_fsid;
++ local->statvfs.f_flag = statvfs->f_flag;
++ local->statvfs.f_namemax = statvfs->f_namemax;
++ tier_stat->blocks_used = (statvfs->f_blocks - statvfs->f_bfree);
++ tier_stat->pblocks_used = (statvfs->f_blocks - statvfs->f_bavail);
++ tier_stat->files_used = (statvfs->f_files - statvfs->f_ffree);
++ tier_stat->pfiles_used = (statvfs->f_files - statvfs->f_favail);
++ tier_stat->hashed_fsid = statvfs->f_fsid;
++ } else {
++ tier_stat->unhashed_fsid = statvfs->f_fsid;
++ tier_stat->unhashed_blocks_used = (statvfs->f_blocks -
++ statvfs->f_bfree);
++ tier_stat->unhashed_pblocks_used = (statvfs->f_blocks -
++ statvfs->f_bavail);
++ tier_stat->unhashed_files_used = (statvfs->f_files -
++ statvfs->f_ffree);
++ tier_stat->unhashed_pfiles_used = (statvfs->f_files -
++ statvfs->f_favail);
++ }
++ }
++unlock:
++ UNLOCK(&frame->lock);
++
++ this_call_cnt = dht_frame_return(frame);
++ if (is_last_call(this_call_cnt)) {
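++ /* Add in the hot tier's usage only when the two tiers report
++ * different fsids; if they share one backend filesystem the
++ * same blocks would otherwise be counted twice */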
++ if (tier_stat->unhashed_fsid != tier_stat->hashed_fsid) {
++ tier_stat->blocks_used += tier_stat->unhashed_blocks_used;
++ tier_stat->pblocks_used += tier_stat->unhashed_pblocks_used;
++ tier_stat->files_used += tier_stat->unhashed_files_used;
++ tier_stat->pfiles_used += tier_stat->unhashed_pfiles_used;
++ }
++ local->statvfs.f_bfree = local->statvfs.f_blocks -
++ tier_stat->blocks_used;
++ local->statvfs.f_bavail = local->statvfs.f_blocks -
++ tier_stat->pblocks_used;
++ local->statvfs.f_ffree = local->statvfs.f_files - tier_stat->files_used;
++ local->statvfs.f_favail = local->statvfs.f_files -
++ tier_stat->pfiles_used;
++ DHT_STACK_UNWIND(statfs, frame, local->op_ret, local->op_errno,
++ &local->statvfs, xdata);
++ }
++
++ return 0;
++}
++
++int
++tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
++{
++ dht_local_t *local = NULL;
++ dht_conf_t *conf = NULL;
++ int op_errno = -1;
++ int i = -1;
++ inode_t *inode = NULL;
++ inode_table_t *itable = NULL;
++ uuid_t root_gfid = {
++ 0,
++ };
++ loc_t newloc = {
++ 0,
++ };
++
++ VALIDATE_OR_GOTO(frame, err);
++ VALIDATE_OR_GOTO(this, err);
++ VALIDATE_OR_GOTO(loc, err);
++ VALIDATE_OR_GOTO(this->private, err);
++
++ conf = this->private;
++
++ local = dht_local_init(frame, NULL, NULL, GF_FOP_STATFS);
++ if (!local) {
++ op_errno = ENOMEM;
++ goto err;
++ }
++
++ if (loc->inode && !IA_ISDIR(loc->inode->ia_type)) {
++ itable = loc->inode->table;
++ if (!itable) {
++ op_errno = EINVAL;
++ goto err;
++ }
++
++ loc = &local->loc2;
++ root_gfid[15] = 1;
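++ /* the volume root always carries the fixed gfid 00..01, so a
++ * statfs on a non-directory is redirected to the volume root */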
++
++ inode = inode_find(itable, root_gfid);
++ if (!inode) {
++ op_errno = EINVAL;
++ goto err;
++ }
++
++ dht_build_root_loc(inode, &newloc);
++ loc = &newloc;
++ }
++
++ local->call_cnt = conf->subvolume_cnt;
++
++ for (i = 0; i < conf->subvolume_cnt; i++) {
++ STACK_WIND_COOKIE(frame, tier_statfs_cbk, conf->subvolumes[i],
++ conf->subvolumes[i],
++ conf->subvolumes[i]->fops->statfs, loc, xdata);
++ }
++
++ return 0;
++
++err:
++ op_errno = (op_errno == -1) ? errno : op_errno;
++ DHT_STACK_UNWIND(statfs, frame, -1, op_errno, NULL, NULL);
++
++ return 0;
++}
+diff --git a/xlators/cluster/dht/src/tier-common.h b/xlators/cluster/dht/src/tier-common.h
+new file mode 100644
+index 0000000..b1ebaa8
+--- /dev/null
++++ b/xlators/cluster/dht/src/tier-common.h
+@@ -0,0 +1,55 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef _TIER_COMMON_H_
++#define _TIER_COMMON_H_
++/* Function declarations */
++int
++tier_create_unlink_stale_linkto_cbk(call_frame_t *frame, void *cookie,
++ xlator_t *this, int op_ret, int op_errno,
++ struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata);
++
++int
++tier_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata);
++
++int
++tier_create_linkfile_create_cbk(call_frame_t *frame, void *cookie,
++ xlator_t *this, int32_t op_ret,
++ int32_t op_errno, inode_t *inode,
++ struct iatt *stbuf, struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata);
++
++int
++tier_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
++ mode_t mode, mode_t umask, fd_t *fd, dict_t *params);
++
++int32_t
++tier_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
++ dict_t *xdata);
++
++int32_t
++tier_readdirp(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
++ off_t off, dict_t *dict);
++
++int
++tier_readdir(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size,
++ off_t yoff, dict_t *xdata);
++
++int
++tier_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
++ dict_t *xdata);
++
++int
++tier_statfs(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata);
++
++#endif
+diff --git a/xlators/cluster/dht/src/tier.c b/xlators/cluster/dht/src/tier.c
+new file mode 100644
+index 0000000..94b4c63
+--- /dev/null
++++ b/xlators/cluster/dht/src/tier.c
+@@ -0,0 +1,3105 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#include <dlfcn.h>
++
++#include "dht-common.h"
++#include "tier.h"
++#include "tier-common.h"
++#include <glusterfs/syscall.h>
++#include <glusterfs/events.h>
++#include "tier-ctr-interface.h"
++
++/*Hard coded DB info*/
++static gfdb_db_type_t dht_tier_db_type = GFDB_SQLITE3;
++/*Hard coded DB info*/
++
++/*Mutex for updating the data movement stats*/
++static pthread_mutex_t dm_stat_mutex = PTHREAD_MUTEX_INITIALIZER;
++
++/* Stores the path location of promotion query files */
++static char *promotion_qfile;
++/* Stores the path location of demotion query files */
++static char *demotion_qfile;
++
++static void *libhandle;
++static gfdb_methods_t gfdb_methods;
++
++#define DB_QUERY_RECORD_SIZE 4096
++
++/*
++ * Closes all the fds and frees the qfile_array.
++ */
++static void
++qfile_array_free(tier_qfile_array_t *qfile_array)
++{
++ ssize_t i = 0;
++
++ if (qfile_array) {
++ if (qfile_array->fd_array) {
++ for (i = 0; i < qfile_array->array_size; i++) {
++ if (qfile_array->fd_array[i] != -1) {
++ sys_close(qfile_array->fd_array[i]);
++ }
++ }
++ }
++ GF_FREE(qfile_array->fd_array);
++ }
++ GF_FREE(qfile_array);
++}
++
++/* Create a new query file list with given size */
++static tier_qfile_array_t *
++qfile_array_new(ssize_t array_size)
++{
++ int ret = -1;
++ tier_qfile_array_t *qfile_array = NULL;
++ ssize_t i = 0;
++
++ GF_VALIDATE_OR_GOTO("tier", (array_size > 0), out);
++
++ qfile_array = GF_CALLOC(1, sizeof(tier_qfile_array_t),
++ gf_tier_mt_qfile_array_t);
++ if (!qfile_array) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to allocate memory for tier_qfile_array_t");
++ goto out;
++ }
++
++ qfile_array->fd_array = GF_MALLOC(array_size * sizeof(int),
++ gf_dht_mt_int32_t);
++ if (!qfile_array->fd_array) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to allocate memory for "
++ "tier_qfile_array_t->fd_array");
++ goto out;
++ }
++
++ /* Init all the fds to -1 */
++ for (i = 0; i < array_size; i++) {
++ qfile_array->fd_array[i] = -1;
++ }
++
++ qfile_array->array_size = array_size;
++ qfile_array->next_index = 0;
++
++ /* Set exhausted count to list size as the list is empty */
++ qfile_array->exhausted_count = qfile_array->array_size;
++
++ ret = 0;
++out:
++ if (ret) {
++ qfile_array_free(qfile_array);
++ qfile_array = NULL;
++ }
++ return qfile_array;
++}
++
++/* Checks if the query file list is empty or totally exhausted. */
++static gf_boolean_t
++is_qfile_array_empty(tier_qfile_array_t *qfile_array)
++{
++ return (qfile_array->exhausted_count == qfile_array->array_size)
++ ? _gf_true
++ : _gf_false;
++}
++
++/* Shifts the next_fd pointer to the next available fd in the list */
++static void
++shift_next_index(tier_qfile_array_t *qfile_array)
++{
++ int qfile_fd = 0;
++ int spin_count = 0;
++
++ if (is_qfile_array_empty(qfile_array)) {
++ return;
++ }
++
++ do {
++ /* change next_index in a rotational manner */
++ (qfile_array->next_index == (qfile_array->array_size - 1))
++ ? qfile_array->next_index = 0
++ : qfile_array->next_index++;
++
++ qfile_fd = (qfile_array->fd_array[qfile_array->next_index]);
++
++ spin_count++;
++
++ } while ((qfile_fd == -1) && (spin_count < qfile_array->array_size));
++}
++
++/*
++ * This is a non-thread-safe function that reads query records
++ * from a list of query files in a round-robin manner.
++ * As and when the query files are exhausted, they are closed.
++ * Returns:
++ * 0 if all the query records in all the query files of the list are
++ * exhausted.
++ * > 0 if a query record is successfully read. Indicates the size of the query
++ * record read.
++ * < 0 if there was a failure
++ */
++static int
++read_query_record_list(tier_qfile_array_t *qfile_array,
++ gfdb_query_record_t **query_record)
++{
++ int ret = -1;
++ int qfile_fd = 0;
++
++ GF_VALIDATE_OR_GOTO("tier", qfile_array, out);
++ GF_VALIDATE_OR_GOTO("tier", qfile_array->fd_array, out);
++
++ do {
++ if (is_qfile_array_empty(qfile_array)) {
++ ret = 0;
++ break;
++ }
++
++ qfile_fd = qfile_array->fd_array[qfile_array->next_index];
++ ret = gfdb_methods.gfdb_read_query_record(qfile_fd, query_record);
++ if (ret <= 0) {
++ /* The qfile_fd has reached EOF or
++ * there was an error.
++ * 1. Close the exhausted fd
++ * 2. Increment the exhausted count
++ * 3. Shift next_qfile to the next qfile
++ */
++ sys_close(qfile_fd);
++ qfile_array->fd_array[qfile_array->next_index] = -1;
++ qfile_array->exhausted_count++;
++ /* shift next_qfile to next qfile */
++ shift_next_index(qfile_array);
++ continue;
++ } else {
++ /* shift next_qfile to next qfile */
++ shift_next_index(qfile_array);
++ break;
++ }
++ } while (1);
++out:
++ return ret;
++}
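++/* Illustrative usage sketch (assumed, not a verbatim caller): after
++ * qfile_array_new(), open one query-file fd per slot in fd_array and
++ * decrement exhausted_count accordingly, then drain the records:
++ *
++ *   gfdb_query_record_t *record = NULL;
++ *   while (read_query_record_list(qfile_array, &record) > 0) {
++ *       (process record, then release it)
++ *   }
++ *   qfile_array_free(qfile_array);
++ */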
++
++/* Check and update the watermark every WM_INTERVAL seconds */
++#define WM_INTERVAL 5
++#define WM_INTERVAL_EMERG 1
++
++static int
++tier_check_same_node(xlator_t *this, loc_t *loc, gf_defrag_info_t *defrag)
++{
++ int ret = -1;
++ dict_t *dict = NULL;
++ char *uuid_str = NULL;
++ uuid_t node_uuid = {
++ 0,
++ };
++
++ GF_VALIDATE_OR_GOTO("tier", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, loc, out);
++ GF_VALIDATE_OR_GOTO(this->name, defrag, out);
++
++ if (syncop_getxattr(this, loc, &dict, GF_XATTR_NODE_UUID_KEY, NULL, NULL)) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Unable to get NODE_UUID_KEY %s %s\n", loc->name, loc->path);
++ goto out;
++ }
++
++ if (dict_get_str(dict, GF_XATTR_NODE_UUID_KEY, &uuid_str) < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to get node-uuids for %s", loc->path);
++ goto out;
++ }
++
++ if (gf_uuid_parse(uuid_str, node_uuid)) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "uuid_parse failed for %s", loc->path);
++ goto out;
++ }
++
++ if (gf_uuid_compare(node_uuid, defrag->node_uuid)) {
++ gf_msg_debug(this->name, 0, "%s does not belong to this node",
++ loc->path);
++ ret = 1;
++ goto out;
++ }
++
++ ret = 0;
++out:
++ if (dict)
++ dict_unref(dict);
++
++ return ret;
++}
++
++int
++tier_get_fs_stat(xlator_t *this, loc_t *root_loc)
++{
++ int ret = 0;
++ gf_defrag_info_t *defrag = NULL;
++ dht_conf_t *conf = NULL;
++ dict_t *xdata = NULL;
++ struct statvfs statfs = {
++ 0,
++ };
++ gf_tier_conf_t *tier_conf = NULL;
++
++ conf = this->private;
++ if (!conf) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
++ "conf is NULL");
++ ret = -1;
++ goto exit;
++ }
++
++ defrag = conf->defrag;
++ if (!defrag) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
++ "defrag is NULL");
++ ret = -1;
++ goto exit;
++ }
++
++ tier_conf = &defrag->tier_conf;
++
++ xdata = dict_new();
++ if (!xdata) {
++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, DHT_MSG_NO_MEMORY,
++ "failed to allocate dictionary");
++ ret = -1;
++ goto exit;
++ }
++
++ ret = dict_set_int8(xdata, GF_INTERNAL_IGNORE_DEEM_STATFS, 1);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_DICT_SET_FAILED,
++ "Failed to set " GF_INTERNAL_IGNORE_DEEM_STATFS " in dict");
++ ret = -1;
++ goto exit;
++ }
++
++ /* Find how much free space is on the hot subvolume. Then see if
++ * that value is less than or greater than the user-defined
++ * watermarks. Stash the results in the tier_conf data structure. */
++
++ ret = syncop_statfs(conf->subvolumes[1], root_loc, &statfs, xdata, NULL);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_STATUS,
++ "Unable to obtain statfs.");
++ goto exit;
++ }
++
++ pthread_mutex_lock(&dm_stat_mutex);
++
++ tier_conf->block_size = statfs.f_bsize;
++ tier_conf->blocks_total = statfs.f_blocks;
++ tier_conf->blocks_used = statfs.f_blocks - statfs.f_bfree;
++
++ tier_conf->percent_full = GF_PERCENTAGE(tier_conf->blocks_used,
++ statfs.f_blocks);
++ pthread_mutex_unlock(&dm_stat_mutex);
++
++exit:
++ if (xdata)
++ dict_unref(xdata);
++ return ret;
++}
++
++static void
++tier_send_watermark_event(const char *volname, tier_watermark_op_t old_wm,
++ tier_watermark_op_t new_wm)
++{
++ if (old_wm == TIER_WM_LOW || old_wm == TIER_WM_NONE) {
++ if (new_wm == TIER_WM_MID) {
++ gf_event(EVENT_TIER_WATERMARK_RAISED_TO_MID, "vol=%s", volname);
++ } else if (new_wm == TIER_WM_HI) {
++ gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname);
++ }
++ } else if (old_wm == TIER_WM_MID) {
++ if (new_wm == TIER_WM_LOW) {
++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname);
++ } else if (new_wm == TIER_WM_HI) {
++ gf_event(EVENT_TIER_WATERMARK_HI, "vol=%s", volname);
++ }
++ } else if (old_wm == TIER_WM_HI) {
++ if (new_wm == TIER_WM_MID) {
++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_MID, "vol=%s", volname);
++ } else if (new_wm == TIER_WM_LOW) {
++ gf_event(EVENT_TIER_WATERMARK_DROPPED_TO_LOW, "vol=%s", volname);
++ }
++ }
++}
++
++int
++tier_check_watermark(xlator_t *this)
++{
++ int ret = -1;
++ gf_defrag_info_t *defrag = NULL;
++ dht_conf_t *conf = NULL;
++ gf_tier_conf_t *tier_conf = NULL;
++ tier_watermark_op_t wm = TIER_WM_NONE;
++
++ conf = this->private;
++ if (!conf)
++ goto exit;
++
++ defrag = conf->defrag;
++ if (!defrag)
++ goto exit;
++
++ tier_conf = &defrag->tier_conf;
++
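++ /* classify fullness against the user-configured watermarks: e.g.
++ * with watermark_low = 60 and watermark_hi = 90, a percent_full of
++ * 75 yields TIER_WM_MID */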
++ if (tier_conf->percent_full < tier_conf->watermark_low) {
++ wm = TIER_WM_LOW;
++
++ } else if (tier_conf->percent_full < tier_conf->watermark_hi) {
++ wm = TIER_WM_MID;
++
++ } else {
++ wm = TIER_WM_HI;
++ }
++
++ if (wm != tier_conf->watermark_last) {
++ tier_send_watermark_event(tier_conf->volname, tier_conf->watermark_last,
++ wm);
++
++ tier_conf->watermark_last = wm;
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Tier watermark now %d", wm);
++ }
++
++ ret = 0;
++
++exit:
++ return ret;
++}
++
++static gf_boolean_t
++is_hot_tier_full(gf_tier_conf_t *tier_conf)
++{
++ if (tier_conf && (tier_conf->mode == TIER_MODE_WM) &&
++ (tier_conf->watermark_last == TIER_WM_HI))
++ return _gf_true;
++
++ return _gf_false;
++}
++
++int
++tier_do_migration(xlator_t *this, int promote)
++{
++ gf_defrag_info_t *defrag = NULL;
++ dht_conf_t *conf = NULL;
++ long rand = 0;
++ int migrate = 0;
++ gf_tier_conf_t *tier_conf = NULL;
++
++ conf = this->private;
++ if (!conf)
++ goto exit;
++
++ defrag = conf->defrag;
++ if (!defrag)
++ goto exit;
++
++ if (tier_check_watermark(this) != 0) {
++ gf_msg(this->name, GF_LOG_CRITICAL, errno, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to get watermark");
++ goto exit;
++ }
++
++ tier_conf = &defrag->tier_conf;
++
++ switch (tier_conf->watermark_last) {
++ case TIER_WM_LOW:
++ migrate = promote ? 1 : 0;
++ break;
++ case TIER_WM_HI:
++ migrate = promote ? 0 : 1;
++ break;
++ case TIER_WM_MID:
++ /* coverity[DC.WEAK_CRYPTO] */
++ rand = random() % 100;
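++ /* e.g. at percent_full == 70, a promotion proceeds only when
++ * rand > 70 (~30% of the time) while a demotion proceeds when
++ * rand <= 70 (~70%): the fuller the hot tier, the fewer
++ * promotions and the more demotions */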
++ if (promote) {
++ migrate = (rand > tier_conf->percent_full);
++ } else {
++ migrate = (rand <= tier_conf->percent_full);
++ }
++ break;
++ }
++
++exit:
++ return migrate;
++}
++
++int
++tier_migrate(xlator_t *this, int is_promotion, dict_t *migrate_data, loc_t *loc,
++ gf_tier_conf_t *tier_conf)
++{
++ int ret = -1;
++
++ pthread_mutex_lock(&tier_conf->pause_mutex);
++ if (is_promotion)
++ tier_conf->promote_in_progress = 1;
++ else
++ tier_conf->demote_in_progress = 1;
++ pthread_mutex_unlock(&tier_conf->pause_mutex);
++
++ /* Data migration */
++ ret = syncop_setxattr(this, loc, migrate_data, 0, NULL, NULL);
++
++ pthread_mutex_lock(&tier_conf->pause_mutex);
++ if (is_promotion)
++ tier_conf->promote_in_progress = 0;
++ else
++ tier_conf->demote_in_progress = 0;
++ pthread_mutex_unlock(&tier_conf->pause_mutex);
++
++ return ret;
++}
++
++/* returns _gf_true: if file can be promoted
++ * returns _gf_false: if file cannot be promoted
++ */
++static gf_boolean_t
++tier_can_promote_file(xlator_t *this, char const *file_name,
++ struct iatt *current, gf_defrag_info_t *defrag)
++{
++ gf_boolean_t ret = _gf_false;
++ fsblkcnt_t estimated_usage = 0;
++
++ if (defrag->tier_conf.tier_max_promote_size &&
++ (current->ia_size > defrag->tier_conf.tier_max_promote_size)) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "File %s (gfid:%s) with size (%" PRIu64
++ ") exceeds maxsize "
++ "(%d) for promotion. File will not be promoted.",
++ file_name, uuid_utoa(current->ia_gfid), current->ia_size,
++ defrag->tier_conf.tier_max_promote_size);
++ goto err;
++ }
++
++ /* bypass further validations for TEST mode */
++ if (defrag->tier_conf.mode != TIER_MODE_WM) {
++ ret = _gf_true;
++ goto err;
++ }
++
++ /* convert the file size to blocks as per the block size of the
++ * destination tier
++ * NOTE: add (block_size - 1) to round up and get the correct block
++ * count when the size is not an exact multiple of block_size
++ */
++ estimated_usage = ((current->ia_size + defrag->tier_conf.block_size - 1) /
++ defrag->tier_conf.block_size) +
++ defrag->tier_conf.blocks_used;
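++ /* Worked example (numbers are hypothetical): with block_size = 4096
++ * and blocks_used = 100, a 10000-byte file contributes
++ * (10000 + 4095) / 4096 = 3 blocks, so estimated_usage = 103. */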
++
++ /* test if the estimated block usage goes above HI watermark */
++ if (GF_PERCENTAGE(estimated_usage, defrag->tier_conf.blocks_total) >=
++ defrag->tier_conf.watermark_hi) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Estimated block count consumption on "
++ "hot tier (%" PRIu64
++ ") exceeds hi watermark (%d%%). "
++ "File will not be promoted.",
++ estimated_usage, defrag->tier_conf.watermark_hi);
++ goto err;
++ }
++ ret = _gf_true;
++err:
++ return ret;
++}
++
++static int
++tier_set_migrate_data(dict_t *migrate_data)
++{
++ int failed = 1;
++
++ failed = dict_set_str(migrate_data, GF_XATTR_FILE_MIGRATE_KEY, "force");
++ if (failed) {
++ goto bail_out;
++ }
++
++ /* Flag to suggest the xattr call is from migrator */
++ failed = dict_set_str(migrate_data, "from.migrator", "yes");
++ if (failed) {
++ goto bail_out;
++ }
++
++ /* Flag to suggest it's a tiering migration.
++ * The reason for this dict key-value is that
++ * promotions and demotions are multithreaded,
++ * so the original frame from gf_defrag_start()
++ * is not carried. A new frame will be created when
++ * we do syncop_setxattr(). This does not have the
++ * frame->root->pid of the original frame. So we pass
++ * this dict key-value when we do syncop_setxattr() to do
++ * data migration and set the frame->root->pid to
++ * GF_CLIENT_PID_TIER_DEFRAG in dht_setxattr() just before
++ * calling dht_start_rebalance_task() */
++ failed = dict_set_str(migrate_data, TIERING_MIGRATION_KEY, "yes");
++ if (failed) {
++ goto bail_out;
++ }
++
++ failed = 0;
++
++bail_out:
++ return failed;
++}
++
++static char *
++tier_get_parent_path(xlator_t *this, loc_t *p_loc, struct iatt *par_stbuf,
++ int *per_link_status)
++{
++ int ret = -1;
++ char *parent_path = NULL;
++ dict_t *xdata_request = NULL;
++ dict_t *xdata_response = NULL;
++
++ xdata_request = dict_new();
++ if (!xdata_request) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to create xdata_request dict");
++ goto err;
++ }
++ ret = dict_set_int32(xdata_request, GET_ANCESTRY_PATH_KEY, 42);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to set value to dict : key %s \n",
++ GET_ANCESTRY_PATH_KEY);
++ goto err;
++ }
++
++ ret = syncop_lookup(this, p_loc, par_stbuf, NULL, xdata_request,
++ &xdata_response);
++ /* When the parent gfid is a stale entry, the lookup
++ * will fail and stop the demotion process.
++ * The parent gfid can be stale when a huge folder is
++ * deleted while the files within it are being migrated
++ */
++ if (ret == -ESTALE) {
++ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
++ "Stale entry in parent lookup for %s", uuid_utoa(p_loc->gfid));
++ *per_link_status = 1;
++ goto err;
++ } else if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
++ "Error in parent lookup for %s", uuid_utoa(p_loc->gfid));
++ *per_link_status = -1;
++ goto err;
++ }
++ ret = dict_get_str(xdata_response, GET_ANCESTRY_PATH_KEY, &parent_path);
++ if (ret || !parent_path) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to get parent path for %s", uuid_utoa(p_loc->gfid));
++ *per_link_status = -1;
++ goto err;
++ }
++
++err:
++ if (xdata_request) {
++ dict_unref(xdata_request);
++ }
++
++ if (xdata_response) {
++ dict_unref(xdata_response);
++ xdata_response = NULL;
++ }
++
++ return parent_path;
++}
++
++static int
++tier_get_file_name_and_path(xlator_t *this, uuid_t gfid,
++ gfdb_link_info_t *link_info,
++ char const *parent_path, loc_t *loc,
++ int *per_link_status)
++{
++ int ret = -1;
++
++ loc->name = gf_strdup(link_info->file_name);
++ if (!loc->name) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Memory "
++ "allocation failed for %s",
++ uuid_utoa(gfid));
++ *per_link_status = -1;
++ goto err;
++ }
++ ret = gf_asprintf((char **)&(loc->path), "%s/%s", parent_path, loc->name);
++ if (ret < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to "
++ "construct file path for %s %s\n",
++ parent_path, loc->name);
++ *per_link_status = -1;
++ goto err;
++ }
++
++ ret = 0;
++
++err:
++ return ret;
++}
++
++static int
++tier_lookup_file(xlator_t *this, loc_t *p_loc, loc_t *loc, struct iatt *current,
++ int *per_link_status)
++{
++ int ret = -1;
++
++ ret = syncop_lookup(this, loc, current, NULL, NULL, NULL);
++
++ /* The file may be deleted even when the parent
++ * is available, and the lookup will
++ * return a stale entry, which would stop the
++ * migration. So if it's a stale entry, skip
++ * the file and keep migrating.
++ */
++ if (ret == -ESTALE) {
++ gf_msg(this->name, GF_LOG_WARNING, -ret, DHT_MSG_STALE_LOOKUP,
++ "Stale lookup for %s", uuid_utoa(p_loc->gfid));
++ *per_link_status = 1;
++ goto err;
++ } else if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to "
++ "lookup file %s\n",
++ loc->name);
++ *per_link_status = -1;
++ goto err;
++ }
++ ret = 0;
++
++err:
++ return ret;
++}
++
++static gf_boolean_t
++tier_is_file_already_at_destination(xlator_t *src_subvol,
++ query_cbk_args_t *query_cbk_args,
++ dht_conf_t *conf, int *per_link_status)
++{
++ gf_boolean_t at_destination = _gf_true;
++
++ if (src_subvol == NULL) {
++ *per_link_status = 1;
++ goto err;
++ }
++ if (query_cbk_args->is_promotion && src_subvol == conf->subvolumes[1]) {
++ *per_link_status = 1;
++ goto err;
++ }
++
++ if (!query_cbk_args->is_promotion && src_subvol == conf->subvolumes[0]) {
++ *per_link_status = 1;
++ goto err;
++ }
++ at_destination = _gf_false;
++
++err:
++ return at_destination;
++}
++
++static void
++tier_update_migration_counters(query_cbk_args_t *query_cbk_args,
++ gf_defrag_info_t *defrag,
++ uint64_t *total_migrated_bytes, int *total_files)
++{
++ if (query_cbk_args->is_promotion) {
++ defrag->total_files_promoted++;
++ *total_migrated_bytes += defrag->tier_conf.st_last_promoted_size;
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->tier_conf.blocks_used += defrag->tier_conf
++ .st_last_promoted_size;
++ pthread_mutex_unlock(&dm_stat_mutex);
++ } else {
++ defrag->total_files_demoted++;
++ *total_migrated_bytes += defrag->tier_conf.st_last_demoted_size;
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->tier_conf.blocks_used -= defrag->tier_conf.st_last_demoted_size;
++ pthread_mutex_unlock(&dm_stat_mutex);
++ }
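++ /* blocks_used was just adjusted above; refresh percent_full so that
++ * watermark checks later in the same cycle already see the
++ * post-migration fullness of the hot tier. */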
++ if (defrag->tier_conf.blocks_total) {
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->tier_conf.percent_full = GF_PERCENTAGE(
++ defrag->tier_conf.blocks_used, defrag->tier_conf.blocks_total);
++ pthread_mutex_unlock(&dm_stat_mutex);
++ }
++
++ (*total_files)++;
++}
++
++static int
++tier_migrate_link(xlator_t *this, dht_conf_t *conf, uuid_t gfid,
++ gfdb_link_info_t *link_info, gf_defrag_info_t *defrag,
++ query_cbk_args_t *query_cbk_args, dict_t *migrate_data,
++ int *per_link_status, int *total_files,
++ uint64_t *total_migrated_bytes)
++{
++ int ret = -1;
++ struct iatt current = {
++ 0,
++ };
++ struct iatt par_stbuf = {
++ 0,
++ };
++ loc_t p_loc = {
++ 0,
++ };
++ loc_t loc = {
++ 0,
++ };
++ xlator_t *src_subvol = NULL;
++ inode_t *linked_inode = NULL;
++ char *parent_path = NULL;
++
++ /* Lookup for parent and get the path of parent */
++ gf_uuid_copy(p_loc.gfid, link_info->pargfid);
++ p_loc.inode = inode_new(defrag->root_inode->table);
++ if (!p_loc.inode) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to create reference to inode"
++ " for %s",
++ uuid_utoa(p_loc.gfid));
++
++ *per_link_status = -1;
++ goto err;
++ }
++
++ parent_path = tier_get_parent_path(this, &p_loc, &par_stbuf,
++ per_link_status);
++ if (!parent_path) {
++ goto err;
++ }
++
++ linked_inode = inode_link(p_loc.inode, NULL, NULL, &par_stbuf);
++ inode_unref(p_loc.inode);
++ p_loc.inode = linked_inode;
++
++ /* Preparing File Inode */
++ gf_uuid_copy(loc.gfid, gfid);
++ loc.inode = inode_new(defrag->root_inode->table);
++ gf_uuid_copy(loc.pargfid, link_info->pargfid);
++ loc.parent = inode_ref(p_loc.inode);
++
++ /* Get filename and Construct file path */
++ if (tier_get_file_name_and_path(this, gfid, link_info, parent_path, &loc,
++ per_link_status) != 0) {
++ goto err;
++ }
++ gf_uuid_copy(loc.parent->gfid, link_info->pargfid);
++
++ /* lookup file inode */
++ if (tier_lookup_file(this, &p_loc, &loc, &current, per_link_status) != 0) {
++ goto err;
++ }
++
++ if (query_cbk_args->is_promotion) {
++ if (!tier_can_promote_file(this, link_info->file_name, &current,
++ defrag)) {
++ *per_link_status = 1;
++ goto err;
++ }
++ }
++
++ linked_inode = inode_link(loc.inode, NULL, NULL, &current);
++ inode_unref(loc.inode);
++ loc.inode = linked_inode;
++
++ /*
++ * Do not promote/demote if the file is already where it
++ * should be. It means another brick moved the file,
++ * so it is not an error. We set per_link_status = 1
++ * so that this file is not counted.
++ */
++ src_subvol = dht_subvol_get_cached(this, loc.inode);
++
++ if (tier_is_file_already_at_destination(src_subvol, query_cbk_args, conf,
++ per_link_status)) {
++ goto err;
++ }
++
++ gf_msg_debug(this->name, 0, "Tier %s: src_subvol %s file %s",
++ (query_cbk_args->is_promotion ? "promote" : "demote"),
++ src_subvol->name, loc.path);
++
++ ret = tier_check_same_node(this, &loc, defrag);
++ if (ret != 0) {
++ if (ret < 0) {
++ *per_link_status = -1;
++ goto err;
++ }
++ ret = 0;
++ /* By setting per_link_status to 1 we are
++ * ignoring this status and will not be counting
++ * this file for migration */
++ *per_link_status = 1;
++ goto err;
++ }
++
++ gf_uuid_copy(loc.gfid, loc.inode->gfid);
++
++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Tiering paused. "
++ "Exiting tier_migrate_link");
++ goto err;
++ }
++
++ ret = tier_migrate(this, query_cbk_args->is_promotion, migrate_data, &loc,
++ &defrag->tier_conf);
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, -ret, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to "
++ "migrate %s ",
++ loc.path);
++ *per_link_status = -1;
++ goto err;
++ }
++
++ tier_update_migration_counters(query_cbk_args, defrag, total_migrated_bytes,
++ total_files);
++
++ ret = 0;
++
++err:
++ GF_FREE((char *)loc.name);
++ loc.name = NULL;
++ loc_wipe(&loc);
++ loc_wipe(&p_loc);
++
++ if ((*total_files >= defrag->tier_conf.max_migrate_files) ||
++ (*total_migrated_bytes > defrag->tier_conf.max_migrate_bytes)) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Reached cycle migration limit."
++ "migrated bytes %" PRId64 " files %d",
++ *total_migrated_bytes, *total_files);
++ ret = -1;
++ }
++
++ return ret;
++}
++
++static int
++tier_migrate_using_query_file(void *_args)
++{
++ int ret = -1;
++ query_cbk_args_t *query_cbk_args = (query_cbk_args_t *)_args;
++ xlator_t *this = NULL;
++ gf_defrag_info_t *defrag = NULL;
++ gfdb_query_record_t *query_record = NULL;
++ gfdb_link_info_t *link_info = NULL;
++ dict_t *migrate_data = NULL;
++ /*
++ * per_file_status and per_link_status
++ * 0 : success
++ * -1 : failure
++ * 1 : ignore the status and don't count for migration
++ * */
++ int per_file_status = 0;
++ int per_link_status = 0;
++ int total_status = 0;
++ dht_conf_t *conf = NULL;
++ uint64_t total_migrated_bytes = 0;
++ int total_files = 0;
++ loc_t root_loc = {0};
++ gfdb_time_t start_time = {0};
++ gfdb_time_t current_time = {0};
++ int total_time = 0;
++ int max_time = 0;
++ gf_boolean_t emergency_demote_mode = _gf_false;
++
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
++ this = query_cbk_args->this;
++ GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->defrag, out);
++ GF_VALIDATE_OR_GOTO(this->name, query_cbk_args->qfile_array, out);
++ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
++
++ conf = this->private;
++
++ defrag = query_cbk_args->defrag;
++ migrate_data = dict_new();
++ if (!migrate_data)
++ goto out;
++
++ emergency_demote_mode = (!query_cbk_args->is_promotion &&
++ is_hot_tier_full(&defrag->tier_conf));
++
++ if (tier_set_migrate_data(migrate_data) != 0) {
++ goto out;
++ }
++
++ dht_build_root_loc(defrag->root_inode, &root_loc);
++
++ ret = gettimeofday(&start_time, NULL);
++ if (query_cbk_args->is_promotion) {
++ max_time = defrag->tier_conf.tier_promote_frequency;
++ } else {
++ max_time = defrag->tier_conf.tier_demote_frequency;
++ }
++
++ /* Per file */
++ while ((ret = read_query_record_list(query_cbk_args->qfile_array,
++ &query_record)) != 0) {
++ if (ret < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to fetch query record "
++ "from query file");
++ goto out;
++ }
++
++ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
++ ret = -1;
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Exiting tier migration as"
++ "defrag status is not started");
++ goto out;
++ }
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Could not get current time.");
++ goto out;
++ }
++
++ total_time = current_time.tv_sec - start_time.tv_sec;
++ if (total_time > max_time) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Max cycle time reached. Exiting migration.");
++ goto out;
++ }
++
++ per_file_status = 0;
++ per_link_status = 0;
++
++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Tiering paused. "
++ "Exiting tier_migrate_using_query_file");
++ break;
++ }
++
++ if (defrag->tier_conf.mode == TIER_MODE_WM) {
++ ret = tier_get_fs_stat(this, &root_loc);
++ if (ret != 0) {
++ gfdb_methods.gfdb_query_record_free(query_record);
++ query_record = NULL;
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
++ "tier_get_fs_stat() FAILED ... "
++ "skipping file migrations until next cycle");
++ break;
++ }
++
++ if (!tier_do_migration(this, query_cbk_args->is_promotion)) {
++ gfdb_methods.gfdb_query_record_free(query_record);
++ query_record = NULL;
++
++ /* We have crossed the high watermark. Stop processing
++ * files if this is a promotion cycle so demotion gets
++ * a chance to start if not already running*/
++
++ if (query_cbk_args->is_promotion &&
++ is_hot_tier_full(&defrag->tier_conf)) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "High watermark crossed during "
++ "promotion. Exiting "
++ "tier_migrate_using_query_file");
++ break;
++ }
++ continue;
++ }
++ }
++
++ per_link_status = 0;
++
++ /* For now we only support single-link migration and will
++ * ignore other hard links in the query record's link info list.
++ * TODO: Multiple hard links migration */
++ if (!list_empty(&query_record->link_list)) {
++ link_info = list_first_entry(&query_record->link_list,
++ gfdb_link_info_t, list);
++ }
++ if (link_info != NULL) {
++ if (tier_migrate_link(this, conf, query_record->gfid, link_info,
++ defrag, query_cbk_args, migrate_data,
++ &per_link_status, &total_files,
++ &total_migrated_bytes) != 0) {
++ gf_msg(
++ this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "%s failed for %s(gfid:%s)",
++ (query_cbk_args->is_promotion ? "Promotion" : "Demotion"),
++ link_info->file_name, uuid_utoa(query_record->gfid));
++ }
++ }
++ per_file_status = per_link_status;
++
++ if (per_file_status < 0) { /* Failure */
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->total_failures++;
++ pthread_mutex_unlock(&dm_stat_mutex);
++ } else if (per_file_status == 0) { /* Success */
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->total_files++;
++ pthread_mutex_unlock(&dm_stat_mutex);
++ } else if (per_file_status == 1) { /* Ignore */
++ per_file_status = 0;
++ /* Since this attempt was ignored we
++ * decrement the lookup count*/
++ pthread_mutex_lock(&dm_stat_mutex);
++ defrag->num_files_lookedup--;
++ pthread_mutex_unlock(&dm_stat_mutex);
++ }
++ total_status = total_status + per_file_status;
++ per_link_status = 0;
++ per_file_status = 0;
++
++ gfdb_methods.gfdb_query_record_free(query_record);
++ query_record = NULL;
++
++ /* If we are demoting and this cycle was entered at the HI
++ * watermark, emergency demotion is done once the current
++ * watermark has fallen below the hi-watermark level.
++ */
++ if (emergency_demote_mode) {
++ if (tier_check_watermark(this) == 0) {
++ if (!is_hot_tier_full(&defrag->tier_conf)) {
++ break;
++ }
++ }
++ }
++ }
++
++out:
++ if (migrate_data)
++ dict_unref(migrate_data);
++
++ gfdb_methods.gfdb_query_record_free(query_record);
++ query_record = NULL;
++
++ return total_status;
++}
++
++/* This is the callback function invoked per record/file from the database */
++static int
++tier_gf_query_callback(gfdb_query_record_t *gfdb_query_record, void *_args)
++{
++ int ret = -1;
++ query_cbk_args_t *query_cbk_args = _args;
++
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args, out);
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->defrag, out);
++ GF_VALIDATE_OR_GOTO("tier", (query_cbk_args->query_fd > 0), out);
++
++ ret = gfdb_methods.gfdb_write_query_record(query_cbk_args->query_fd,
++ gfdb_query_record);
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed writing query record to query file");
++ goto out;
++ }
++
++ pthread_mutex_lock(&dm_stat_mutex);
++ query_cbk_args->defrag->num_files_lookedup++;
++ pthread_mutex_unlock(&dm_stat_mutex);
++
++ ret = 0;
++out:
++ return ret;
++}
++
++/* Create query file in tier process */
++static int
++tier_process_self_query(tier_brick_list_t *local_brick, void *args)
++{
++ int ret = -1;
++ char *db_path = NULL;
++ query_cbk_args_t *query_cbk_args = NULL;
++ xlator_t *this = NULL;
++ gfdb_conn_node_t *conn_node = NULL;
++ dict_t *params_dict = NULL;
++ dict_t *ctr_ipc_dict = NULL;
++ gfdb_brick_info_t *gfdb_brick_info = args;
++
++ /*Init of all the essentials*/
++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
++ query_cbk_args = gfdb_brick_info->_query_cbk_args;
++
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
++ this = query_cbk_args->this;
++
++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
++
++ db_path = local_brick->brick_db_path;
++
++ /*Preparing DB parameters before init_db i.e getting db connection*/
++ params_dict = dict_new();
++ if (!params_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "DB Params cannot initialized");
++ goto out;
++ }
++ SET_DB_PARAM_TO_DICT(this->name, params_dict,
++ (char *)gfdb_methods.get_db_path_key(), db_path, ret,
++ out);
++
++ /*Get the db connection*/
++ conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
++ if (!conn_node) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "FATAL: Failed initializing db operations");
++ goto out;
++ }
++
++ /* Query for eligible files from db */
++ query_cbk_args->query_fd = open(local_brick->qfile_path,
++ O_WRONLY | O_CREAT | O_APPEND,
++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
++ if (query_cbk_args->query_fd < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to open query file %s", local_brick->qfile_path);
++ goto out;
++ }
++ if (!gfdb_brick_info->_gfdb_promote) {
++ if (query_cbk_args->defrag->tier_conf.watermark_last == TIER_WM_HI) {
++ /* emergency demotion mode */
++ ret = gfdb_methods.find_all(
++ conn_node, tier_gf_query_callback, (void *)query_cbk_args,
++ query_cbk_args->defrag->tier_conf.query_limit);
++ } else {
++ if (query_cbk_args->defrag->write_freq_threshold == 0 &&
++ query_cbk_args->defrag->read_freq_threshold == 0) {
++ ret = gfdb_methods.find_unchanged_for_time(
++ conn_node, tier_gf_query_callback, (void *)query_cbk_args,
++ gfdb_brick_info->time_stamp);
++ } else {
++ ret = gfdb_methods.find_unchanged_for_time_freq(
++ conn_node, tier_gf_query_callback, (void *)query_cbk_args,
++ gfdb_brick_info->time_stamp,
++ query_cbk_args->defrag->write_freq_threshold,
++ query_cbk_args->defrag->read_freq_threshold, _gf_false);
++ }
++ }
++ } else {
++ if (query_cbk_args->defrag->write_freq_threshold == 0 &&
++ query_cbk_args->defrag->read_freq_threshold == 0) {
++ ret = gfdb_methods.find_recently_changed_files(
++ conn_node, tier_gf_query_callback, (void *)query_cbk_args,
++ gfdb_brick_info->time_stamp);
++ } else {
++ ret = gfdb_methods.find_recently_changed_files_freq(
++ conn_node, tier_gf_query_callback, (void *)query_cbk_args,
++ gfdb_brick_info->time_stamp,
++ query_cbk_args->defrag->write_freq_threshold,
++ query_cbk_args->defrag->read_freq_threshold, _gf_false);
++ }
++ }
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "FATAL: query from db failed");
++ goto out;
++ }
++
++ /*Clear the heat on the DB entries*/
++ /*Preparing ctr_ipc_dict*/
++ ctr_ipc_dict = dict_new();
++ if (!ctr_ipc_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "ctr_ipc_dict cannot initialized");
++ goto out;
++ }
++
++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
++ GFDB_IPC_CTR_CLEAR_OPS, ret, out);
++
++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
++ NULL);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed clearing the heat "
++ "on db %s error %d",
++ local_brick->brick_db_path, ret);
++ goto out;
++ }
++
++ ret = 0;
++out:
++ if (params_dict) {
++ dict_unref(params_dict);
++ params_dict = NULL;
++ }
++
++ if (ctr_ipc_dict) {
++ dict_unref(ctr_ipc_dict);
++ ctr_ipc_dict = NULL;
++ }
++
++ if (query_cbk_args && query_cbk_args->query_fd >= 0) {
++ sys_close(query_cbk_args->query_fd);
++ query_cbk_args->query_fd = -1;
++ }
++ gfdb_methods.fini_db(conn_node);
++
++ return ret;
++}
++
++/*Ask CTR to create the query file*/
++static int
++tier_process_ctr_query(tier_brick_list_t *local_brick, void *args)
++{
++ int ret = -1;
++ query_cbk_args_t *query_cbk_args = NULL;
++ xlator_t *this = NULL;
++ dict_t *ctr_ipc_in_dict = NULL;
++ dict_t *ctr_ipc_out_dict = NULL;
++ gfdb_brick_info_t *gfdb_brick_info = args;
++ gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL;
++ int count = 0;
++
++ /*Init of all the essentials*/
++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
++ query_cbk_args = gfdb_brick_info->_query_cbk_args;
++
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
++ this = query_cbk_args->this;
++
++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
++
++ /*Preparing ctr_ipc_in_dict*/
++ ctr_ipc_in_dict = dict_new();
++ if (!ctr_ipc_in_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "ctr_ipc_in_dict cannot initialized");
++ goto out;
++ }
++
++ ipc_ctr_params = GF_CALLOC(1, sizeof(gfdb_ipc_ctr_params_t),
++ gf_tier_mt_ipc_ctr_params_t);
++ if (!ipc_ctr_params) {
++ goto out;
++ }
++
++ /* set all the query params*/
++ ipc_ctr_params->is_promote = gfdb_brick_info->_gfdb_promote;
++
++ ipc_ctr_params->write_freq_threshold = query_cbk_args->defrag
++ ->write_freq_threshold;
++
++ ipc_ctr_params->read_freq_threshold = query_cbk_args->defrag
++ ->read_freq_threshold;
++
++ ipc_ctr_params->query_limit = query_cbk_args->defrag->tier_conf.query_limit;
++
++ ipc_ctr_params->emergency_demote = (!gfdb_brick_info->_gfdb_promote &&
++ query_cbk_args->defrag->tier_conf
++ .watermark_last == TIER_WM_HI);
++
++ memcpy(&ipc_ctr_params->time_stamp, gfdb_brick_info->time_stamp,
++ sizeof(gfdb_time_t));
++
++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
++ GFDB_IPC_CTR_QUERY_OPS, ret, out);
++
++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_in_dict,
++ GFDB_IPC_CTR_GET_QFILE_PATH, local_brick->qfile_path,
++ ret, out);
++
++ ret = dict_set_bin(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,
++ ipc_ctr_params, sizeof(*ipc_ctr_params));
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed setting %s to params dictionary",
++ GFDB_IPC_CTR_GET_QUERY_PARAMS);
++ GF_FREE(ipc_ctr_params);
++ goto out;
++ }
++ ipc_ctr_params = NULL;
++
++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_in_dict,
++ &ctr_ipc_out_dict);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_IPC_TIER_ERROR,
++ "Failed query on %s ret %d", local_brick->brick_db_path, ret);
++ goto out;
++ }
++
++ ret = dict_get_int32(ctr_ipc_out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT,
++ &count);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed getting count "
++ "of records on %s",
++ local_brick->brick_db_path);
++ goto out;
++ }
++
++ if (count < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed query on %s", local_brick->brick_db_path);
++ ret = -1;
++ goto out;
++ }
++
++ pthread_mutex_lock(&dm_stat_mutex);
++ query_cbk_args->defrag->num_files_lookedup = count;
++ pthread_mutex_unlock(&dm_stat_mutex);
++
++ ret = 0;
++out:
++
++ if (ctr_ipc_in_dict) {
++ dict_unref(ctr_ipc_in_dict);
++ ctr_ipc_in_dict = NULL;
++ }
++
++ if (ctr_ipc_out_dict) {
++ dict_unref(ctr_ipc_out_dict);
++ ctr_ipc_out_dict = NULL;
++ }
++
++ GF_FREE(ipc_ctr_params);
++
++ return ret;
++}
++
++/* This is the callback function for each brick from the hot/cold
++ * bricklist. It picks up each brick's db and queries for eligible files
++ * for migration. The eligible files are populated in the appropriate
++ * query files. */
++static int
++tier_process_brick(tier_brick_list_t *local_brick, void *args)
++{
++ int ret = -1;
++ dict_t *ctr_ipc_in_dict = NULL;
++ dict_t *ctr_ipc_out_dict = NULL;
++ char *strval = NULL;
++
++ GF_VALIDATE_OR_GOTO("tier", local_brick, out);
++
++ GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out);
++
++ if (dht_tier_db_type == GFDB_SQLITE3) {
++ /*Preparing ctr_ipc_in_dict*/
++ ctr_ipc_in_dict = dict_new();
++ if (!ctr_ipc_in_dict) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "ctr_ipc_in_dict cannot initialized");
++ goto out;
++ }
++
++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_KEY,
++ GFDB_IPC_CTR_GET_DB_PARAM_OPS);
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed to set %s "
++ "to params dictionary",
++ GFDB_IPC_CTR_KEY);
++ goto out;
++ }
++
++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_PARAM_OPS, "");
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed to set %s "
++ "to params dictionary",
++ GFDB_IPC_CTR_GET_DB_PARAM_OPS);
++ goto out;
++ }
++
++ ret = dict_set_str(ctr_ipc_in_dict, GFDB_IPC_CTR_GET_DB_KEY,
++ "journal_mode");
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed to set %s "
++ "to params dictionary",
++ GFDB_IPC_CTR_GET_DB_KEY);
++ goto out;
++ }
++
++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR,
++ ctr_ipc_in_dict, &ctr_ipc_out_dict);
++ if (ret || ctr_ipc_out_dict == NULL) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to get "
++ "journal_mode of sql db %s",
++ local_brick->brick_db_path);
++ goto out;
++ }
++
++ ret = dict_get_str(ctr_ipc_out_dict, "journal_mode", &strval);
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_GET_PARAM_FAILED,
++ "Failed to get %s "
++ "from params dictionary"
++ "journal_mode",
++ strval);
++ goto out;
++ }
++
++ if (strval && (strncmp(strval, "wal", SLEN("wal")) == 0)) {
++ ret = tier_process_self_query(local_brick, args);
++ if (ret) {
++ goto out;
++ }
++ } else {
++ ret = tier_process_ctr_query(local_brick, args);
++ if (ret) {
++ goto out;
++ }
++ }
++ ret = 0;
++
++ } else {
++ ret = tier_process_self_query(local_brick, args);
++ if (ret) {
++ goto out;
++ }
++ }
++
++ ret = 0;
++out:
++ if (ctr_ipc_in_dict)
++ dict_unref(ctr_ipc_in_dict);
++
++ if (ctr_ipc_out_dict)
++ dict_unref(ctr_ipc_out_dict);
++
++ return ret;
++}
++
++static int
++tier_build_migration_qfile(migration_args_t *args,
++ query_cbk_args_t *query_cbk_args,
++ gf_boolean_t is_promotion)
++{
++ gfdb_time_t current_time;
++ gfdb_brick_info_t gfdb_brick_info;
++ gfdb_time_t time_in_past;
++ int ret = -1;
++ tier_brick_list_t *local_brick = NULL;
++ int i = 0;
++ time_in_past.tv_sec = args->freq_time;
++ time_in_past.tv_usec = 0;
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_msg(args->this->name, GF_LOG_ERROR, errno,
++ DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
++ goto out;
++ }
++ time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec;
++
++ /* The migration daemon may run a varying number of usec after the */
++ /* sleep call triggers. A file may be registered in CTR some number */
++ /* of usec X after the daemon started and missed in the subsequent */
++ /* cycle if the daemon starts Y usec after the period in seconds */
++ /* where Y>X. Normalize away this problem by always setting usec */
++ /* to 0. */
++ time_in_past.tv_usec = 0;
++
++ gfdb_brick_info.time_stamp = &time_in_past;
++ gfdb_brick_info._gfdb_promote = is_promotion;
++ gfdb_brick_info._query_cbk_args = query_cbk_args;
++
++ list_for_each_entry(local_brick, args->brick_list, list)
++ {
++ /* Construct query file path for this brick
++ * i.e
++ * /var/run/gluster/xlator_name/
++ * {promote/demote}-brickname-indexinbricklist
++ * So that no two query files will have the same path even
++ * if bricks have the same name
++ * */
++ snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d",
++ GET_QFILE_PATH(gfdb_brick_info._gfdb_promote),
++ local_brick->brick_name, i);
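++ /* e.g. (illustrative values) the promotion query file for a brick
++ * named "brick1" at index 0 would be
++ * /var/run/gluster/<xlator-name>/promote-brick1-0 */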
++
++ /* Delete any old query files for this brick */
++ sys_unlink(local_brick->qfile_path);
++
++ ret = tier_process_brick(local_brick, &gfdb_brick_info);
++ if (ret) {
++ gf_msg(args->this->name, GF_LOG_ERROR, 0,
++ DHT_MSG_BRICK_QUERY_FAILED, "Brick %s query failed\n",
++ local_brick->brick_db_path);
++ }
++ i++;
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
++static int
++tier_migrate_files_using_qfile(migration_args_t *comp,
++ query_cbk_args_t *query_cbk_args)
++{
++ int ret = -1;
++ tier_brick_list_t *local_brick = NULL;
++ tier_brick_list_t *temp = NULL;
++ gfdb_time_t current_time = {
++ 0,
++ };
++ ssize_t qfile_array_size = 0;
++ int count = 0;
++ int temp_fd = 0;
++ gf_tier_conf_t *tier_conf = NULL;
++
++ tier_conf = &(query_cbk_args->defrag->tier_conf);
++
++ /* Time for error query files */
++ gettimeofday(&current_time, NULL);
++
++ /* Build the qfile list */
++ list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
++ {
++ qfile_array_size++;
++ }
++ query_cbk_args->qfile_array = qfile_array_new(qfile_array_size);
++ if (!query_cbk_args->qfile_array) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to create new "
++ "qfile_array");
++ goto out;
++ }
++
++ /*Open all qfiles*/
++ count = 0;
++ query_cbk_args->qfile_array->exhausted_count = 0;
++ list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
++ {
++ temp_fd = query_cbk_args->qfile_array->fd_array[count];
++ temp_fd = open(local_brick->qfile_path, O_RDONLY,
++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
++ if (temp_fd < 0) {
++ gf_msg("tier", GF_LOG_ERROR, errno, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to open "
++ "%s to the query file",
++ local_brick->qfile_path);
++ query_cbk_args->qfile_array->exhausted_count++;
++ }
++ query_cbk_args->qfile_array->fd_array[count] = temp_fd;
++ count++;
++ }
++
++ /* Move the query file index to the next one, so that we don't pick
++ * the same query file as the first one every cycle */
++ query_cbk_args->qfile_array
++ ->next_index = (query_cbk_args->is_promotion)
++ ? tier_conf->last_promote_qfile_index
++ : tier_conf->last_demote_qfile_index;
++ shift_next_index(query_cbk_args->qfile_array);
++ if (query_cbk_args->is_promotion) {
++ tier_conf->last_promote_qfile_index = query_cbk_args->qfile_array
++ ->next_index;
++ } else {
++ tier_conf->last_demote_qfile_index = query_cbk_args->qfile_array
++ ->next_index;
++ }
++
++ /* Migrate files using query file list */
++ ret = tier_migrate_using_query_file((void *)query_cbk_args);
++out:
++ qfile_array_free(query_cbk_args->qfile_array);
++
++ /* If there is an error, rename all the query files to .err files
++ * with a timestamp for better debugging */
++ if (ret) {
++ struct tm tm = {
++ 0,
++ };
++ char time_str[128] = {
++ 0,
++ };
++ char query_file_path_err[PATH_MAX] = {
++ 0,
++ };
++ int32_t len = 0;
++
++ /* Time format for error query files */
++ gmtime_r(&current_time.tv_sec, &tm);
++ strftime(time_str, sizeof(time_str), "%F-%T", &tm);
++
++ list_for_each_entry_safe(local_brick, temp, comp->brick_list, list)
++ {
++ /* rename error qfile*/
++ len = snprintf(query_file_path_err, sizeof(query_file_path_err),
++ "%s-%s.err", local_brick->qfile_path, time_str);
++ if ((len >= 0) && (len < sizeof(query_file_path_err))) {
++ if (sys_rename(local_brick->qfile_path, query_file_path_err) ==
++ -1)
++ gf_msg_debug("tier", 0,
++ "rename "
++ "failed");
++ }
++ }
++ }
++
++ query_cbk_args->qfile_array = NULL;
++
++ return ret;
++}
++
++int
++tier_demote(migration_args_t *demotion_args)
++{
++ query_cbk_args_t query_cbk_args;
++ int ret = -1;
++
++ GF_VALIDATE_OR_GOTO("tier", demotion_args, out);
++ GF_VALIDATE_OR_GOTO("tier", demotion_args->this, out);
++ GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->brick_list,
++ out);
++ GF_VALIDATE_OR_GOTO(demotion_args->this->name, demotion_args->defrag, out);
++
++ THIS = demotion_args->this;
++
++ query_cbk_args.this = demotion_args->this;
++ query_cbk_args.defrag = demotion_args->defrag;
++ query_cbk_args.is_promotion = 0;
++
++ /*Build the query file using bricklist*/
++ ret = tier_build_migration_qfile(demotion_args, &query_cbk_args, _gf_false);
++ if (ret)
++ goto out;
++
++ /* Migrate files using the query file */
++ ret = tier_migrate_files_using_qfile(demotion_args, &query_cbk_args);
++ if (ret)
++ goto out;
++
++out:
++ demotion_args->return_value = ret;
++ return ret;
++}
++
++int
++tier_promote(migration_args_t *promotion_args)
++{
++ int ret = -1;
++ query_cbk_args_t query_cbk_args;
++
++ GF_VALIDATE_OR_GOTO("tier", promotion_args->this, out);
++ GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->brick_list,
++ out);
++ GF_VALIDATE_OR_GOTO(promotion_args->this->name, promotion_args->defrag,
++ out);
++
++ THIS = promotion_args->this;
++
++ query_cbk_args.this = promotion_args->this;
++ query_cbk_args.defrag = promotion_args->defrag;
++ query_cbk_args.is_promotion = 1;
++
++ /*Build the query file using bricklist*/
++ ret = tier_build_migration_qfile(promotion_args, &query_cbk_args, _gf_true);
++ if (ret)
++ goto out;
++
++ /* Migrate files using the query file */
++ ret = tier_migrate_files_using_qfile(promotion_args, &query_cbk_args);
++ if (ret)
++ goto out;
++
++out:
++ promotion_args->return_value = ret;
++ return ret;
++}
++
++/*
++ * Command the CTR on a brick to compact the local database using an IPC
++ */
++static int
++tier_process_self_compact(tier_brick_list_t *local_brick, void *args)
++{
++ int ret = -1;
++ char *db_path = NULL;
++ query_cbk_args_t *query_cbk_args = NULL;
++ xlator_t *this = NULL;
++ gfdb_conn_node_t *conn_node = NULL;
++ dict_t *params_dict = NULL;
++ dict_t *ctr_ipc_dict = NULL;
++ gfdb_brick_info_t *gfdb_brick_info = args;
++
++ /*Init of all the essentials*/
++ GF_VALIDATE_OR_GOTO("tier", gfdb_brick_info, out);
++ query_cbk_args = gfdb_brick_info->_query_cbk_args;
++
++ GF_VALIDATE_OR_GOTO("tier", query_cbk_args->this, out);
++ this = query_cbk_args->this;
++
++ GF_VALIDATE_OR_GOTO(this->name, gfdb_brick_info->_query_cbk_args, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->xlator, out);
++
++ GF_VALIDATE_OR_GOTO(this->name, local_brick->brick_db_path, out);
++
++ db_path = local_brick->brick_db_path;
++
++ /*Preparing DB parameters before init_db i.e getting db connection*/
++ params_dict = dict_new();
++ if (!params_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "DB Params cannot initialized");
++ goto out;
++ }
++ SET_DB_PARAM_TO_DICT(this->name, params_dict,
++ (char *)gfdb_methods.get_db_path_key(), db_path, ret,
++ out);
++
++ /*Get the db connection*/
++ conn_node = gfdb_methods.init_db((void *)params_dict, dht_tier_db_type);
++ if (!conn_node) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "FATAL: Failed initializing db operations");
++ goto out;
++ }
++
++ ret = 0;
++
++ /*Preparing ctr_ipc_dict*/
++ ctr_ipc_dict = dict_new();
++ if (!ctr_ipc_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "ctr_ipc_dict cannot initialized");
++ goto out;
++ }
++
++ ret = dict_set_int32(ctr_ipc_dict, "compact_active",
++ query_cbk_args->defrag->tier_conf.compact_active);
++
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed to set %s "
++ "to params dictionary",
++ "compact_active");
++ goto out;
++ }
++
++ ret = dict_set_int32(
++ ctr_ipc_dict, "compact_mode_switched",
++ query_cbk_args->defrag->tier_conf.compact_mode_switched);
++
++ if (ret) {
++ gf_msg("tier", GF_LOG_ERROR, 0, LG_MSG_SET_PARAM_FAILED,
++ "Failed to set %s "
++ "to params dictionary",
++ "compact_mode_switched");
++ goto out;
++ }
++
++ SET_DB_PARAM_TO_DICT(this->name, ctr_ipc_dict, GFDB_IPC_CTR_KEY,
++ GFDB_IPC_CTR_SET_COMPACT_PRAGMA, ret, out);
++
++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Starting Compaction IPC");
++
++ ret = syncop_ipc(local_brick->xlator, GF_IPC_TARGET_CTR, ctr_ipc_dict,
++ NULL);
++
++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Ending Compaction IPC");
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed compaction "
++ "on db %s error %d",
++ local_brick->brick_db_path, ret);
++ goto out;
++ }
++
++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "SUCCESS: %s Compaction", local_brick->brick_name);
++
++ ret = 0;
++out:
++ if (params_dict) {
++ dict_unref(params_dict);
++ params_dict = NULL;
++ }
++
++ if (ctr_ipc_dict) {
++ dict_unref(ctr_ipc_dict);
++ ctr_ipc_dict = NULL;
++ }
++
++ gfdb_methods.fini_db(conn_node);
++
++ return ret;
++}
++
++/*
++ * This is the call back function for each brick from hot/cold bricklist.
++ * It determines the database type on each brick and calls the corresponding
++ * function to prepare the compaction IPC.
++ */
++static int
++tier_compact_db_brick(tier_brick_list_t *local_brick, void *args)
++{
++ int ret = -1;
++
++ GF_VALIDATE_OR_GOTO("tier", local_brick, out);
++
++ GF_VALIDATE_OR_GOTO("tier", local_brick->xlator, out);
++
++ ret = tier_process_self_compact(local_brick, args);
++ if (ret) {
++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Brick %s did not compact", local_brick->brick_name);
++ goto out;
++ }
++
++ ret = 0;
++
++out:
++
++ return ret;
++}
++
++static int
++tier_send_compact(migration_args_t *args, query_cbk_args_t *query_cbk_args)
++{
++ gfdb_time_t current_time;
++ gfdb_brick_info_t gfdb_brick_info;
++ gfdb_time_t time_in_past;
++ int ret = -1;
++ tier_brick_list_t *local_brick = NULL;
++
++ time_in_past.tv_sec = args->freq_time;
++ time_in_past.tv_usec = 0;
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_msg(args->this->name, GF_LOG_ERROR, errno,
++ DHT_MSG_SYS_CALL_GET_TIME_FAILED, "Failed to get current time");
++ goto out;
++ }
++ time_in_past.tv_sec = current_time.tv_sec - time_in_past.tv_sec;
++
++ /* The migration daemon may run a varying number of usec after the sleep
++ call triggers. A file may be registered in CTR some number of usec X
++ after the daemon started and missed in the subsequent cycle if the
++ daemon starts Y usec after the period in seconds where Y>X. Normalize
++ away this problem by always setting usec to 0. */
++ time_in_past.tv_usec = 0;
++
++ gfdb_brick_info.time_stamp = &time_in_past;
++
++ /* This is meant to say we are always compacting at this point */
++ /* We simply borrow the promotion flag to do this */
++ gfdb_brick_info._gfdb_promote = 1;
++
++ gfdb_brick_info._query_cbk_args = query_cbk_args;
++
++ list_for_each_entry(local_brick, args->brick_list, list)
++ {
++ gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Start compaction for %s", local_brick->brick_name);
++
++ ret = tier_compact_db_brick(local_brick, &gfdb_brick_info);
++ if (ret) {
++ gf_msg(args->this->name, GF_LOG_ERROR, 0,
++ DHT_MSG_BRICK_QUERY_FAILED, "Brick %s compaction failed\n",
++ local_brick->brick_db_path);
++ }
++
++ gf_msg(args->this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "End compaction for %s", local_brick->brick_name);
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
++static int
++tier_compact(void *args)
++{
++ int ret = -1;
++ query_cbk_args_t query_cbk_args;
++ migration_args_t *compaction_args = args;
++
++ GF_VALIDATE_OR_GOTO("tier", compaction_args->this, out);
++ GF_VALIDATE_OR_GOTO(compaction_args->this->name,
++ compaction_args->brick_list, out);
++ GF_VALIDATE_OR_GOTO(compaction_args->this->name, compaction_args->defrag,
++ out);
++
++ THIS = compaction_args->this;
++
++ query_cbk_args.this = compaction_args->this;
++ query_cbk_args.defrag = compaction_args->defrag;
++ query_cbk_args.is_compaction = 1;
++
++ /* Send the compaction pragma out to all the bricks on the bricklist. */
++ /* tier_get_bricklist ensures all bricks on the list are local to */
++ /* this node. */
++ ret = tier_send_compact(compaction_args, &query_cbk_args);
++ if (ret)
++ goto out;
++
++ ret = 0;
++out:
++ compaction_args->return_value = ret;
++ return ret;
++}
++
++static int
++tier_get_bricklist(xlator_t *xl, struct list_head *local_bricklist_head)
++{
++ xlator_list_t *child = NULL;
++ char *rv = NULL;
++ char *rh = NULL;
++ char *brickname = NULL;
++ char db_name[PATH_MAX] = "";
++ int ret = 0;
++ tier_brick_list_t *local_brick = NULL;
++ int32_t len = 0;
++
++ GF_VALIDATE_OR_GOTO("tier", xl, out);
++ GF_VALIDATE_OR_GOTO("tier", local_bricklist_head, out);
++
++ /*
++ * This function obtains remote subvolumes and keeps only
++ * those running on the same node as the tier daemon.
++ */
++ if (strcmp(xl->type, "protocol/client") == 0) {
++ ret = dict_get_str(xl->options, "remote-host", &rh);
++ if (ret < 0)
++ goto out;
++
++ if (gf_is_local_addr(rh)) {
++ local_brick = GF_CALLOC(1, sizeof(tier_brick_list_t),
++ gf_tier_mt_bricklist_t);
++ if (!local_brick) {
++ goto out;
++ }
++
++ ret = dict_get_str(xl->options, "remote-subvolume", &rv);
++ if (ret < 0)
++ goto out;
++
++ brickname = strrchr(rv, '/') + 1;
++ snprintf(db_name, sizeof(db_name), "%s.db", brickname);
++
++ local_brick->brick_db_path = GF_MALLOC(PATH_MAX, gf_common_mt_char);
++ if (!local_brick->brick_db_path) {
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Failed to allocate memory for"
++ " bricklist.");
++ ret = -1;
++ goto out;
++ }
++
++ len = snprintf(local_brick->brick_db_path, PATH_MAX, "%s/%s/%s", rv,
++ GF_HIDDEN_PATH, db_name);
++ if ((len < 0) || (len >= PATH_MAX)) {
++ gf_msg("tier", GF_LOG_ERROR, EINVAL, DHT_MSG_LOG_TIER_STATUS,
++ "DB path too long");
++ ret = -1;
++ goto out;
++ }
++
++ local_brick->xlator = xl;
++
++ snprintf(local_brick->brick_name, NAME_MAX, "%s", brickname);
++
++ list_add_tail(&(local_brick->list), local_bricklist_head);
++
++ ret = 0;
++ goto out;
++ }
++ }
++
++ for (child = xl->children; child; child = child->next) {
++ ret = tier_get_bricklist(child->xlator, local_bricklist_head);
++ if (ret) {
++ goto out;
++ }
++ }
++
++ ret = 0;
++out:
++
++ if (ret) {
++ if (local_brick) {
++ GF_FREE(local_brick->brick_db_path);
++ }
++ GF_FREE(local_brick);
++ }
++
++ return ret;
++}
++
++int
++tier_get_freq_demote(gf_tier_conf_t *tier_conf)
++{
++ if ((tier_conf->mode == TIER_MODE_WM) &&
++ (tier_conf->watermark_last == TIER_WM_HI))
++ return DEFAULT_DEMOTE_DEGRADED;
++ else
++ return tier_conf->tier_demote_frequency;
++}
++
++int
++tier_get_freq_promote(gf_tier_conf_t *tier_conf)
++{
++ return tier_conf->tier_promote_frequency;
++}
++
++int
++tier_get_freq_compact_hot(gf_tier_conf_t *tier_conf)
++{
++ return tier_conf->tier_compact_hot_frequency;
++}
++
++int
++tier_get_freq_compact_cold(gf_tier_conf_t *tier_conf)
++{
++ return tier_conf->tier_compact_cold_frequency;
++}
++
++static int
++tier_check_demote(gfdb_time_t current_time, int freq)
++{
++ return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false;
++}
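++ /* Illustrative: with a (hypothetical) frequency of 3600 s, the modulo
++ * checks in tier_check_demote() above and tier_check_promote() below
++ * fire only in the one-second window where current_time.tv_sec is an
++ * exact multiple of 3600, so nodes with synchronized clocks start
++ * their cycles together. */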
++
++static gf_boolean_t
++tier_check_promote(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
++ int freq)
++{
++ if ((tier_conf->mode == TIER_MODE_WM) &&
++ (tier_conf->watermark_last == TIER_WM_HI))
++ return _gf_false;
++
++ else
++ return ((current_time.tv_sec % freq) == 0) ? _gf_true : _gf_false;
++}
++
++static gf_boolean_t
++tier_check_compact(gf_tier_conf_t *tier_conf, gfdb_time_t current_time,
++ int freq_compact)
++{
++ if (!(tier_conf->compact_active || tier_conf->compact_mode_switched))
++ return _gf_false;
++
++ return ((current_time.tv_sec % freq_compact) == 0) ? _gf_true : _gf_false;
++}
++
++void
++clear_bricklist(struct list_head *brick_list)
++{
++ tier_brick_list_t *local_brick = NULL;
++ tier_brick_list_t *temp = NULL;
++
++ if (list_empty(brick_list)) {
++ return;
++ }
++
++ list_for_each_entry_safe(local_brick, temp, brick_list, list)
++ {
++ list_del(&local_brick->list);
++ GF_FREE(local_brick->brick_db_path);
++ GF_FREE(local_brick);
++ }
++}
++
++static void
++set_brick_list_qpath(struct list_head *brick_list, gf_boolean_t is_cold)
++{
++ tier_brick_list_t *local_brick = NULL;
++ int i = 0;
++
++ GF_VALIDATE_OR_GOTO("tier", brick_list, out);
++
++ list_for_each_entry(local_brick, brick_list, list)
++ {
++ /* Construct query file path for this brick
++ * i.e
++ * /var/run/gluster/xlator_name/
++ * {promote/demote}-brickname-indexinbricklist
++ * So that no two query files will have the same path even
++ * if bricks have the same name
++ * */
++ snprintf(local_brick->qfile_path, PATH_MAX, "%s-%s-%d",
++ GET_QFILE_PATH(is_cold), local_brick->brick_name, i);
++ i++;
++ }
++out:
++ return;
++}
++
++static int
++tier_prepare_compact(migration_args_t *args, gfdb_time_t current_time)
++{
++ xlator_t *this = NULL;
++ dht_conf_t *conf = NULL;
++ gf_defrag_info_t *defrag = NULL;
++ gf_tier_conf_t *tier_conf = NULL;
++ gf_boolean_t is_hot_tier = args->is_hot_tier;
++ int freq = 0;
++ int ret = -1;
++ const char *tier_type = is_hot_tier ? "hot" : "cold";
++
++ this = args->this;
++
++ conf = this->private;
++
++ defrag = conf->defrag;
++
++ tier_conf = &defrag->tier_conf;
++
++ freq = is_hot_tier ? tier_get_freq_compact_hot(tier_conf)
++ : tier_get_freq_compact_cold(tier_conf);
++
++ defrag->tier_conf.compact_mode_switched =
++ is_hot_tier ? defrag->tier_conf.compact_mode_switched_hot
++ : defrag->tier_conf.compact_mode_switched_cold;
++
++ gf_msg(this->name, GF_LOG_TRACE, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Compact mode %i", defrag->tier_conf.compact_mode_switched);
++
++ if (tier_check_compact(tier_conf, current_time, freq)) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Start compaction on %s tier", tier_type);
++
++ args->freq_time = freq;
++ ret = tier_compact(args);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Compaction failed on "
++ "%s tier",
++ tier_type);
++ goto out;
++ }
++
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "End compaction on %s tier", tier_type);
++
++ if (is_hot_tier) {
++ defrag->tier_conf.compact_mode_switched_hot = _gf_false;
++ } else {
++ defrag->tier_conf.compact_mode_switched_cold = _gf_false;
++ }
++ }
++
++out:
++ return ret;
++}
++
++static int
++tier_get_wm_interval(tier_mode_t mode, tier_watermark_op_t wm)
++{
++ if (mode == TIER_MODE_WM && wm == TIER_WM_HI)
++ return WM_INTERVAL_EMERG;
++
++ return WM_INTERVAL;
++}
++
++/*
++ * Main tiering loop. This is called from the promotion and the
++ * demotion threads spawned in tier_start().
++ *
++ * Every second, wake from sleep to perform tasks.
++ * 1. Check trigger to migrate data.
++ * 2. Check for state changes (pause, unpause, stop).
++ */
++static void *
++tier_run(void *in_args)
++{
++ dht_conf_t *conf = NULL;
++ gfdb_time_t current_time = {0};
++ int freq = 0;
++ int ret = 0;
++ xlator_t *any = NULL;
++ xlator_t *xlator = NULL;
++ gf_tier_conf_t *tier_conf = NULL;
++ loc_t root_loc = {0};
++ int check_watermark = 0;
++ gf_defrag_info_t *defrag = NULL;
++ xlator_t *this = NULL;
++ migration_args_t *args = in_args;
++ GF_VALIDATE_OR_GOTO("tier", args, out);
++ GF_VALIDATE_OR_GOTO("tier", args->brick_list, out);
++
++ this = args->this;
++ GF_VALIDATE_OR_GOTO("tier", this, out);
++
++ conf = this->private;
++ GF_VALIDATE_OR_GOTO("tier", conf, out);
++
++ defrag = conf->defrag;
++ GF_VALIDATE_OR_GOTO("tier", defrag, out);
++
++ if (list_empty(args->brick_list)) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Brick list for tier is empty. Exiting.");
++ goto out;
++ }
++
++ defrag->defrag_status = GF_DEFRAG_STATUS_STARTED;
++ tier_conf = &defrag->tier_conf;
++
++ dht_build_root_loc(defrag->root_inode, &root_loc);
++
++ while (1) {
++ /*
++ * Check if a graph switch occurred. If so, stop migration
++ * thread. It will need to be restarted manually.
++ */
++ any = THIS->ctx->active->first;
++ xlator = xlator_search_by_name(any, this->name);
++
++ if (xlator != this) {
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Detected graph switch. Exiting migration "
++ "daemon.");
++ goto out;
++ }
++
++ gf_defrag_check_pause_tier(tier_conf);
++
++ sleep(1);
++
++ if (defrag->defrag_status != GF_DEFRAG_STATUS_STARTED) {
++ ret = 1;
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "defrag->defrag_status != "
++ "GF_DEFRAG_STATUS_STARTED");
++ goto out;
++ }
++
++ if (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER ||
++ defrag->cmd == GF_DEFRAG_CMD_DETACH_START) {
++ ret = 0;
++ defrag->defrag_status = GF_DEFRAG_STATUS_COMPLETE;
++ gf_msg(this->name, GF_LOG_DEBUG, 0, DHT_MSG_LOG_TIER_ERROR,
++ "defrag->defrag_cmd == "
++ "GF_DEFRAG_CMD_START_DETACH_TIER");
++ goto out;
++ }
++
++ if (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)
++ continue;
++
++ /* To have proper synchronization amongst all
++ * brick-holding nodes, so that promotions and demotions
++ * start atomically w.r.t. the promotion/demotion frequency
++ * period, all nodes should have their system time
++ * in sync with each other, either set manually or
++ * using an NTP server */
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_msg(this->name, GF_LOG_ERROR, errno,
++ DHT_MSG_SYS_CALL_GET_TIME_FAILED,
++ "Failed to get current time");
++ goto out;
++ }
++
++ check_watermark++;
++
++ /* emergency demotion requires frequent watermark monitoring */
++ if (check_watermark >=
++ tier_get_wm_interval(tier_conf->mode, tier_conf->watermark_last)) {
++ check_watermark = 0;
++ if (tier_conf->mode == TIER_MODE_WM) {
++ ret = tier_get_fs_stat(this, &root_loc);
++ if (ret != 0) {
++ continue;
++ }
++ ret = tier_check_watermark(this);
++ if (ret != 0) {
++ gf_msg(this->name, GF_LOG_CRITICAL, errno,
++ DHT_MSG_LOG_TIER_ERROR, "Failed to get watermark");
++ continue;
++ }
++ }
++ }
++
++ if (args->is_promotion) {
++ freq = tier_get_freq_promote(tier_conf);
++
++ if (tier_check_promote(tier_conf, current_time, freq)) {
++ args->freq_time = freq;
++ ret = tier_promote(args);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Promotion failed");
++ }
++ }
++ } else if (args->is_compaction) {
++ tier_prepare_compact(args, current_time);
++ } else {
++ freq = tier_get_freq_demote(tier_conf);
++
++ if (tier_check_demote(current_time, freq)) {
++ args->freq_time = freq;
++ ret = tier_demote(args);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Demotion failed");
++ }
++ }
++ }
++
++ /* Check the statfs immediately after the processing threads
++ return */
++ check_watermark = WM_INTERVAL;
++ }
++
++ ret = 0;
++out:
++
++ args->return_value = ret;
++
++ return NULL;
++}
++
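++ /*
++ * tier_start() spawns four tier_run() workers: demotion, promotion,
++ * hot-tier compaction and cold-tier compaction. The goto-chained join
++ * labels at the bottom ensure that when a later spawn fails, every
++ * thread created before it is still joined before cleanup.
++ */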
++int
++tier_start(xlator_t *this, gf_defrag_info_t *defrag)
++{
++ pthread_t promote_thread;
++ pthread_t demote_thread;
++ pthread_t hot_compact_thread;
++ pthread_t cold_compact_thread;
++ int ret = -1;
++ struct list_head bricklist_hot = {0};
++ struct list_head bricklist_cold = {0};
++ migration_args_t promotion_args = {0};
++ migration_args_t demotion_args = {0};
++ migration_args_t hot_compaction_args = {0};
++ migration_args_t cold_compaction_args = {0};
++ dht_conf_t *conf = NULL;
++
++ INIT_LIST_HEAD((&bricklist_hot));
++ INIT_LIST_HEAD((&bricklist_cold));
++
++ conf = this->private;
++
++ tier_get_bricklist(conf->subvolumes[1], &bricklist_hot);
++ set_brick_list_qpath(&bricklist_hot, _gf_false);
++
++ demotion_args.this = this;
++ demotion_args.brick_list = &bricklist_hot;
++ demotion_args.defrag = defrag;
++ demotion_args.is_promotion = _gf_false;
++ demotion_args.is_compaction = _gf_false;
++
++ ret = gf_thread_create(&demote_thread, NULL, &tier_run, &demotion_args,
++ "tierdem");
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to start demotion thread.");
++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
++ goto cleanup;
++ }
++
++ tier_get_bricklist(conf->subvolumes[0], &bricklist_cold);
++ set_brick_list_qpath(&bricklist_cold, _gf_true);
++
++ promotion_args.this = this;
++ promotion_args.brick_list = &bricklist_cold;
++ promotion_args.defrag = defrag;
++ promotion_args.is_promotion = _gf_true;
++
++ ret = gf_thread_create(&promote_thread, NULL, &tier_run, &promotion_args,
++ "tierpro");
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to start promotion thread.");
++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
++ goto waitforspawned;
++ }
++
++ hot_compaction_args.this = this;
++ hot_compaction_args.brick_list = &bricklist_hot;
++ hot_compaction_args.defrag = defrag;
++ hot_compaction_args.is_promotion = _gf_false;
++ hot_compaction_args.is_compaction = _gf_true;
++ hot_compaction_args.is_hot_tier = _gf_true;
++
++ ret = gf_thread_create(&hot_compact_thread, NULL, &tier_run,
++ &hot_compaction_args, "tierhcom");
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to start compaction thread.");
++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
++ goto waitforspawnedpromote;
++ }
++
++ cold_compaction_args.this = this;
++ cold_compaction_args.brick_list = &bricklist_cold;
++ cold_compaction_args.defrag = defrag;
++ cold_compaction_args.is_promotion = _gf_false;
++ cold_compaction_args.is_compaction = _gf_true;
++ cold_compaction_args.is_hot_tier = _gf_false;
++
++ ret = gf_thread_create(&cold_compact_thread, NULL, &tier_run,
++ &cold_compaction_args, "tierccom");
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Failed to start compaction thread.");
++ defrag->defrag_status = GF_DEFRAG_STATUS_FAILED;
++ goto waitforspawnedhotcompact;
++ }
++ pthread_join(cold_compact_thread, NULL);
++
++waitforspawnedhotcompact:
++ pthread_join(hot_compact_thread, NULL);
++
++waitforspawnedpromote:
++ pthread_join(promote_thread, NULL);
++
++waitforspawned:
++ pthread_join(demote_thread, NULL);
++
++cleanup:
++ clear_bricklist(&bricklist_cold);
++ clear_bricklist(&bricklist_hot);
++ return ret;
++}
++
++int32_t
++tier_migration_needed(xlator_t *this)
++{
++ gf_defrag_info_t *defrag = NULL;
++ dht_conf_t *conf = NULL;
++ int ret = 0;
++
++ conf = this->private;
++
++ GF_VALIDATE_OR_GOTO(this->name, conf, out);
++ GF_VALIDATE_OR_GOTO(this->name, conf->defrag, out);
++
++ defrag = conf->defrag;
++
++ if ((defrag->cmd == GF_DEFRAG_CMD_START_TIER) ||
++ (defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER))
++ ret = 1;
++out:
++ return ret;
++}
++
++int32_t
++tier_migration_get_dst(xlator_t *this, dht_local_t *local)
++{
++ dht_conf_t *conf = NULL;
++ int32_t ret = -1;
++ gf_defrag_info_t *defrag = NULL;
++
++ GF_VALIDATE_OR_GOTO("tier", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
++
++ conf = this->private;
++
++ defrag = conf->defrag;
++
++ if (defrag && defrag->cmd == GF_DEFRAG_CMD_START_DETACH_TIER) {
++ local->rebalance.target_node = conf->subvolumes[0];
++
++ } else if (conf->subvolumes[0] == local->cached_subvol)
++ local->rebalance.target_node = conf->subvolumes[1];
++ else
++ local->rebalance.target_node = conf->subvolumes[0];
++
++ if (local->rebalance.target_node)
++ ret = 0;
++
++out:
++ return ret;
++}
++
++xlator_t *
++tier_search(xlator_t *this, dht_layout_t *layout, const char *name)
++{
++ xlator_t *subvol = NULL;
++ dht_conf_t *conf = NULL;
++
++ GF_VALIDATE_OR_GOTO("tier", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
++
++ conf = this->private;
++
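++ /* Every name on a tiered volume resolves to the hashed subvolume,
++ * i.e. the cold tier; the layout and name arguments are unused. */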
++ subvol = TIER_HASHED_SUBVOL;
++
++out:
++ return subvol;
++}
++
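++/* libgfdb is resolved at runtime with dlopen()/dlsym() so that only the
++ * server-side rebalance process pays the dependency; tier_init() calls
++ * this only when conf->defrag is set, so clients never load the
++ * database library. */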
++static int
++tier_load_externals(xlator_t *this)
++{
++ int ret = -1;
++ char *libpathfull = (LIBDIR "/libgfdb.so.0");
++ get_gfdb_methods_t get_gfdb_methods;
++
++ GF_VALIDATE_OR_GOTO("this", this, out);
++
++ libhandle = dlopen(libpathfull, RTLD_NOW);
++ if (!libhandle) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Error loading libgfdb.so %s\n", dlerror());
++ ret = -1;
++ goto out;
++ }
++
++ get_gfdb_methods = dlsym(libhandle, "get_gfdb_methods");
++ if (!get_gfdb_methods) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Error loading get_gfdb_methods()");
++ ret = -1;
++ goto out;
++ }
++
++ get_gfdb_methods(&gfdb_methods);
++
++ ret = 0;
++
++out:
++ if (ret && libhandle)
++ dlclose(libhandle);
++
++ return ret;
++}
++
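++/* Only "test" selects TIER_MODE_TEST; any other value (the CLI value is
++ * normally "cache", though only "test" is checked here) falls back to
++ * watermark-driven migration, TIER_MODE_WM. */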
++static tier_mode_t
++tier_validate_mode(char *mode)
++{
++ tier_mode_t ret = DEFAULT_TIER_MODE;
++
++ if (strcmp(mode, "test") == 0) {
++ ret = TIER_MODE_TEST;
++ } else {
++ ret = TIER_MODE_WM;
++ }
++
++ return ret;
++}
++
++static gf_boolean_t
++tier_validate_compact_mode(char *mode)
++{
++ gf_boolean_t ret = _gf_false;
++
++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "tier_validate_compact_mode: mode = %s", mode);
++
++ if (!strcmp(mode, "on")) {
++ ret = _gf_true;
++ } else {
++ ret = _gf_false;
++ }
++
++ gf_msg("tier", GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_STATUS,
++ "tier_validate_compact_mode: ret = %i", ret);
++
++ return ret;
++}
++
++int
++tier_init_methods(xlator_t *this)
++{
++ int ret = -1;
++ dht_conf_t *conf = NULL;
++ dht_methods_t *methods = NULL;
++
++ GF_VALIDATE_OR_GOTO("tier", this, err);
++
++ conf = this->private;
++
++ methods = &(conf->methods);
++
++ methods->migration_get_dst_subvol = tier_migration_get_dst;
++ methods->migration_other = tier_start;
++ methods->migration_needed = tier_migration_needed;
++ methods->layout_search = tier_search;
++
++ ret = 0;
++err:
++ return ret;
++}
++
++static void
++tier_save_vol_name(xlator_t *this)
++{
++ dht_conf_t *conf = NULL;
++ gf_defrag_info_t *defrag = NULL;
++ char *suffix = NULL;
++ int name_len = 0;
++
++ conf = this->private;
++ defrag = conf->defrag;
++
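++ /* Strip the "-tier-dht" suffix from this->name; e.g. a hypothetical
++ * xlator name "myvol-tier-dht" yields a tier_conf.volname of "myvol". */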
++ suffix = strstr(this->name, "-tier-dht");
++
++ if (suffix)
++ name_len = suffix - this->name;
++ else
++ name_len = strlen(this->name);
++
++ if (name_len > GD_VOLUME_NAME_MAX)
++ name_len = GD_VOLUME_NAME_MAX;
++
++ strncpy(defrag->tier_conf.volname, this->name, name_len);
++ defrag->tier_conf.volname[name_len] = 0;
++}
++
++int
++tier_init(xlator_t *this)
++{
++ int ret = -1;
++ int freq = 0;
++ int maxsize = 0;
++ dht_conf_t *conf = NULL;
++ gf_defrag_info_t *defrag = NULL;
++ char *voldir = NULL;
++ char *mode = NULL;
++ char *paused = NULL;
++ tier_mode_t tier_mode = DEFAULT_TIER_MODE;
++ gf_boolean_t compact_mode = _gf_false;
++
++ ret = dht_init(this);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "tier_init failed");
++ goto out;
++ }
++
++ conf = this->private;
++
++ ret = tier_init_methods(this);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "tier_init_methods failed");
++ goto out;
++ }
++
++ if (conf->subvolume_cnt != 2) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Invalid number of subvolumes %d", conf->subvolume_cnt);
++ goto out;
++ }
++
++ /* if instantiated from the client side, initialization is complete. */
++ if (!conf->defrag) {
++ ret = 0;
++ goto out;
++ }
++
++ /* if instantiated from the server side, load the db libraries */
++ ret = tier_load_externals(this);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "Could not load externals. Aborting");
++ goto out;
++ }
++
++ defrag = conf->defrag;
++
++ defrag->tier_conf.last_demote_qfile_index = 0;
++ defrag->tier_conf.last_promote_qfile_index = 0;
++
++ defrag->tier_conf.is_tier = 1;
++ defrag->this = this;
++
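++ /* The options read below are typically driven by "gluster volume set"
++ * keys such as cluster.tier-promote-frequency or cluster.watermark-hi
++ * (illustrative; the exact CLI names live in glusterd's option table). */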
++ ret = dict_get_int32(this->options, "tier-max-promote-file-size", &maxsize);
++ if (ret) {
++ maxsize = 0;
++ }
++
++ defrag->tier_conf.tier_max_promote_size = maxsize;
++
++ ret = dict_get_int32(this->options, "tier-promote-frequency", &freq);
++ if (ret) {
++ freq = DEFAULT_PROMOTE_FREQ_SEC;
++ }
++
++ defrag->tier_conf.tier_promote_frequency = freq;
++
++ ret = dict_get_int32(this->options, "tier-demote-frequency", &freq);
++ if (ret) {
++ freq = DEFAULT_DEMOTE_FREQ_SEC;
++ }
++
++ defrag->tier_conf.tier_demote_frequency = freq;
++
++ ret = dict_get_int32(this->options, "tier-hot-compact-frequency", &freq);
++ if (ret) {
++ freq = DEFAULT_HOT_COMPACT_FREQ_SEC;
++ }
++
++ defrag->tier_conf.tier_compact_hot_frequency = freq;
++
++ ret = dict_get_int32(this->options, "tier-cold-compact-frequency", &freq);
++ if (ret) {
++ freq = DEFAULT_COLD_COMPACT_FREQ_SEC;
++ }
++
++ defrag->tier_conf.tier_compact_cold_frequency = freq;
++
++ ret = dict_get_int32(this->options, "watermark-hi", &freq);
++ if (ret) {
++ freq = DEFAULT_WM_HI;
++ }
++
++ defrag->tier_conf.watermark_hi = freq;
++
++ ret = dict_get_int32(this->options, "watermark-low", &freq);
++ if (ret) {
++ freq = DEFAULT_WM_LOW;
++ }
++
++ defrag->tier_conf.watermark_low = freq;
++
++ ret = dict_get_int32(this->options, "write-freq-threshold", &freq);
++ if (ret) {
++ freq = DEFAULT_WRITE_FREQ_SEC;
++ }
++
++ defrag->write_freq_threshold = freq;
++
++ ret = dict_get_int32(this->options, "read-freq-threshold", &freq);
++ if (ret) {
++ freq = DEFAULT_READ_FREQ_SEC;
++ }
++
++ defrag->read_freq_threshold = freq;
++
++ ret = dict_get_int32(this->options, "tier-max-mb", &freq);
++ if (ret) {
++ freq = DEFAULT_TIER_MAX_MIGRATE_MB;
++ }
++
++ defrag->tier_conf.max_migrate_bytes = (uint64_t)freq * 1024 * 1024;
++
++ ret = dict_get_int32(this->options, "tier-max-files", &freq);
++ if (ret) {
++ freq = DEFAULT_TIER_MAX_MIGRATE_FILES;
++ }
++
++ defrag->tier_conf.max_migrate_files = freq;
++
++ ret = dict_get_int32(this->options, "tier-query-limit",
++ &(defrag->tier_conf.query_limit));
++ if (ret) {
++ defrag->tier_conf.query_limit = DEFAULT_TIER_QUERY_LIMIT;
++ }
++
++ ret = dict_get_str(this->options, "tier-compact", &mode);
++
++ if (ret) {
++ defrag->tier_conf.compact_active = DEFAULT_COMP_MODE;
++ } else {
++ compact_mode = tier_validate_compact_mode(mode);
++ /* If compaction is now active, we need to inform the bricks on
++ the hot and cold tier of this. See dht-common.h for more. */
++ defrag->tier_conf.compact_active = compact_mode;
++ if (compact_mode) {
++ defrag->tier_conf.compact_mode_switched_hot = _gf_true;
++ defrag->tier_conf.compact_mode_switched_cold = _gf_true;
++ }
++ }
++
++ ret = dict_get_str(this->options, "tier-mode", &mode);
++ if (ret) {
++ defrag->tier_conf.mode = DEFAULT_TIER_MODE;
++ } else {
++ tier_mode = tier_validate_mode(mode);
++ defrag->tier_conf.mode = tier_mode;
++ }
++
++ pthread_mutex_init(&defrag->tier_conf.pause_mutex, 0);
++
++ gf_defrag_set_pause_state(&defrag->tier_conf, TIER_RUNNING);
++
++ ret = dict_get_str(this->options, "tier-pause", &paused);
++
++ if (paused && strcmp(paused, "on") == 0)
++ gf_defrag_set_pause_state(&defrag->tier_conf, TIER_REQUEST_PAUSE);
++
++ ret = gf_asprintf(&voldir, "%s/%s", DEFAULT_VAR_RUN_DIRECTORY, this->name);
++ if (ret < 0)
++ goto out;
++
++ ret = mkdir_p(voldir, 0777, _gf_true);
++ if (ret == -1 && errno != EEXIST) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "tier_init failed");
++
++ GF_FREE(voldir);
++ goto out;
++ }
++
++ GF_FREE(voldir);
++
++ ret = gf_asprintf(&promotion_qfile, "%s/%s/promote",
++ DEFAULT_VAR_RUN_DIRECTORY, this->name);
++ if (ret < 0)
++ goto out;
++
++ ret = gf_asprintf(&demotion_qfile, "%s/%s/demote",
++ DEFAULT_VAR_RUN_DIRECTORY, this->name);
++ if (ret < 0) {
++ GF_FREE(promotion_qfile);
++ goto out;
++ }
++
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "Promote/demote frequency %d/%d "
++ "Write/Read freq thresholds %d/%d",
++ defrag->tier_conf.tier_promote_frequency,
++ defrag->tier_conf.tier_demote_frequency,
++ defrag->write_freq_threshold, defrag->read_freq_threshold);
++
++ tier_save_vol_name(this);
++
++ ret = 0;
++
++out:
++
++ return ret;
++}
++
++int
++tier_cli_pause_done(int op_ret, call_frame_t *sync_frame, void *data)
++{
++ gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
++ "Migrate file paused with op_ret %d", op_ret);
++
++ return op_ret;
++}
++
++int
++tier_cli_pause(void *data)
++{
++ gf_defrag_info_t *defrag = NULL;
++ xlator_t *this = NULL;
++ dht_conf_t *conf = NULL;
++ int ret = -1;
++
++ this = data;
++
++ conf = this->private;
++ GF_VALIDATE_OR_GOTO(this->name, conf, exit);
++
++ defrag = conf->defrag;
++ GF_VALIDATE_OR_GOTO(this->name, defrag, exit);
++
++ gf_defrag_pause_tier(this, defrag);
++
++ ret = 0;
++exit:
++ return ret;
++}
++
++int
++tier_reconfigure(xlator_t *this, dict_t *options)
++{
++ dht_conf_t *conf = NULL;
++ gf_defrag_info_t *defrag = NULL;
++ char *mode = NULL;
++ int migrate_mb = 0;
++ gf_boolean_t req_pause = _gf_false;
++ int ret = 0;
++ call_frame_t *frame = NULL;
++ gf_boolean_t last_compact_setting = _gf_false;
++
++ conf = this->private;
++
++ if (conf->defrag) {
++ defrag = conf->defrag;
++ GF_OPTION_RECONF("tier-max-promote-file-size",
++ defrag->tier_conf.tier_max_promote_size, options,
++ int32, out);
++
++ GF_OPTION_RECONF("tier-promote-frequency",
++ defrag->tier_conf.tier_promote_frequency, options,
++ int32, out);
++
++ GF_OPTION_RECONF("tier-demote-frequency",
++ defrag->tier_conf.tier_demote_frequency, options,
++ int32, out);
++
++ GF_OPTION_RECONF("write-freq-threshold", defrag->write_freq_threshold,
++ options, int32, out);
++
++ GF_OPTION_RECONF("read-freq-threshold", defrag->read_freq_threshold,
++ options, int32, out);
++
++ GF_OPTION_RECONF("watermark-hi", defrag->tier_conf.watermark_hi,
++ options, int32, out);
++
++ GF_OPTION_RECONF("watermark-low", defrag->tier_conf.watermark_low,
++ options, int32, out);
++
++ last_compact_setting = defrag->tier_conf.compact_active;
++
++ GF_OPTION_RECONF("tier-compact", defrag->tier_conf.compact_active,
++ options, bool, out);
++
++ if (last_compact_setting != defrag->tier_conf.compact_active) {
++ defrag->tier_conf.compact_mode_switched_hot = _gf_true;
++ defrag->tier_conf.compact_mode_switched_cold = _gf_true;
++ gf_msg(this->name, GF_LOG_INFO, 0, DHT_MSG_LOG_TIER_STATUS,
++ "compact mode switched");
++ }
++
++ GF_OPTION_RECONF("tier-hot-compact-frequency",
++ defrag->tier_conf.tier_compact_hot_frequency, options,
++ int32, out);
++
++ GF_OPTION_RECONF("tier-cold-compact-frequency",
++ defrag->tier_conf.tier_compact_cold_frequency, options,
++ int32, out);
++
++ GF_OPTION_RECONF("tier-mode", mode, options, str, out);
++ defrag->tier_conf.mode = tier_validate_mode(mode);
++
++ GF_OPTION_RECONF("tier-max-mb", migrate_mb, options, int32, out);
++ defrag->tier_conf.max_migrate_bytes = (uint64_t)migrate_mb * 1024 *
++ 1024;
++
++ GF_OPTION_RECONF("tier-max-files", defrag->tier_conf.max_migrate_files,
++ options, int32, out);
++
++ GF_OPTION_RECONF("tier-query-limit", defrag->tier_conf.query_limit,
++ options, int32, out);
++
++ GF_OPTION_RECONF("tier-pause", req_pause, options, bool, out);
++
++ if (req_pause == _gf_true) {
++ frame = create_frame(this, this->ctx->pool);
++ if (!frame)
++ goto out;
++
++ frame->root->pid = GF_CLIENT_PID_DEFRAG;
++
++ ret = synctask_new(this->ctx->env, tier_cli_pause,
++ tier_cli_pause_done, frame, this);
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "pause tier failed on reconfigure");
++ }
++ } else {
++ ret = gf_defrag_resume_tier(this, defrag);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, DHT_MSG_LOG_TIER_ERROR,
++ "resume tier failed on reconfigure");
++ }
++ }
++ }
++
++out:
++ return dht_reconfigure(this, options);
++}
++
++void
++tier_fini(xlator_t *this)
++{
++ if (libhandle)
++ dlclose(libhandle);
++
++ GF_FREE(demotion_qfile);
++ GF_FREE(promotion_qfile);
++
++ dht_fini(this);
++}
++
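++/* The tier xlator reuses the dht implementation for most fops and
++ * overrides only the ones that must be tier-aware: create, statfs,
++ * readdir/readdirp, unlink and link. */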
++struct xlator_fops fops = {
++
++ .lookup = dht_lookup,
++ .create = tier_create,
++ .mknod = dht_mknod,
++
++ .open = dht_open,
++ .statfs = tier_statfs,
++ .opendir = dht_opendir,
++ .readdir = tier_readdir,
++ .readdirp = tier_readdirp,
++ .fsyncdir = dht_fsyncdir,
++ .symlink = dht_symlink,
++ .unlink = tier_unlink,
++ .link = tier_link,
++ .mkdir = dht_mkdir,
++ .rmdir = dht_rmdir,
++ .rename = dht_rename,
++ .entrylk = dht_entrylk,
++ .fentrylk = dht_fentrylk,
++
++ /* Inode read operations */
++ .stat = dht_stat,
++ .fstat = dht_fstat,
++ .access = dht_access,
++ .readlink = dht_readlink,
++ .getxattr = dht_getxattr,
++ .fgetxattr = dht_fgetxattr,
++ .readv = dht_readv,
++ .flush = dht_flush,
++ .fsync = dht_fsync,
++ .inodelk = dht_inodelk,
++ .finodelk = dht_finodelk,
++ .lk = dht_lk,
++
++ /* Inode write operations */
++ .fremovexattr = dht_fremovexattr,
++ .removexattr = dht_removexattr,
++ .setxattr = dht_setxattr,
++ .fsetxattr = dht_fsetxattr,
++ .truncate = dht_truncate,
++ .ftruncate = dht_ftruncate,
++ .writev = dht_writev,
++ .xattrop = dht_xattrop,
++ .fxattrop = dht_fxattrop,
++ .setattr = dht_setattr,
++ .fsetattr = dht_fsetattr,
++ .fallocate = dht_fallocate,
++ .discard = dht_discard,
++ .zerofill = dht_zerofill,
++};
++
++struct xlator_cbks cbks = {.release = dht_release, .forget = dht_forget};
++
++extern int32_t
++mem_acct_init(xlator_t *this);
++
++extern struct volume_options dht_options[];
++
++xlator_api_t xlator_api = {
++ .init = tier_init,
++ .fini = tier_fini,
++ .notify = dht_notify,
++ .reconfigure = tier_reconfigure,
++ .mem_acct_init = mem_acct_init,
++ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial version */
++ .fops = &fops,
++ .cbks = &cbks,
++ .options = dht_options,
++ .identifier = "tier",
++ .category = GF_MAINTAINED,
++};
++
+diff --git a/xlators/cluster/dht/src/tier.h b/xlators/cluster/dht/src/tier.h
+new file mode 100644
+index 0000000..a20b1db
+--- /dev/null
++++ b/xlators/cluster/dht/src/tier.h
+@@ -0,0 +1,110 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef _TIER_H_
++#define _TIER_H_
++
++/******************************************************************************/
++/* This is from dht-rebalancer.c as we don't have dht-rebalancer.h */
++#include "dht-common.h"
++#include <glusterfs/xlator.h>
++#include <signal.h>
++#include <fnmatch.h>
++#include <signal.h>
++
++/*
++ * Size of timer wheel. We would not promote or demote less
++ * frequently than this number.
++ */
++#define TIMER_SECS 3600
++
++#include "gfdb_data_store.h"
++#include <ctype.h>
++#include <sys/stat.h>
++
++#define PROMOTION_QFILE "promotequeryfile"
++#define DEMOTION_QFILE "demotequeryfile"
++
++#define TIER_HASHED_SUBVOL conf->subvolumes[0]
++#define TIER_UNHASHED_SUBVOL conf->subvolumes[1]
++
++#define GET_QFILE_PATH(is_promotion) \
++ ((is_promotion) ? promotion_qfile : demotion_qfile)
++
++typedef struct tier_qfile_array {
++ int *fd_array;
++ ssize_t array_size;
++ ssize_t next_index;
++ /* Indicates the number of exhausted FDs */
++ ssize_t exhausted_count;
++} tier_qfile_array_t;
++
++typedef struct _query_cbk_args {
++ xlator_t *this;
++ gf_defrag_info_t *defrag;
++ /* This is for write */
++ int query_fd;
++ int is_promotion;
++ int is_compaction;
++ /* This is for read */
++ tier_qfile_array_t *qfile_array;
++} query_cbk_args_t;
++
++int
++gf_run_tier(xlator_t *this, gf_defrag_info_t *defrag);
++
++typedef struct gfdb_brick_info {
++ gfdb_time_t *time_stamp;
++ gf_boolean_t _gfdb_promote;
++ query_cbk_args_t *_query_cbk_args;
++} gfdb_brick_info_t;
++
++typedef struct brick_list {
++ xlator_t *xlator;
++ char *brick_db_path;
++ char brick_name[NAME_MAX];
++ char qfile_path[PATH_MAX];
++ struct list_head list;
++} tier_brick_list_t;
++
++typedef struct _dm_thread_args {
++ xlator_t *this;
++ gf_defrag_info_t *defrag;
++ struct list_head *brick_list;
++ int freq_time;
++ int return_value;
++ int is_promotion;
++ int is_compaction;
++ gf_boolean_t is_hot_tier;
++} migration_args_t;
++
++typedef enum tier_watermark_op_ {
++ TIER_WM_NONE = 0,
++ TIER_WM_LOW,
++ TIER_WM_HI,
++ TIER_WM_MID
++} tier_watermark_op_t;
++
++#define DEFAULT_PROMOTE_FREQ_SEC 120
++#define DEFAULT_DEMOTE_FREQ_SEC 120
++#define DEFAULT_HOT_COMPACT_FREQ_SEC 604800
++#define DEFAULT_COLD_COMPACT_FREQ_SEC 604800
++#define DEFAULT_DEMOTE_DEGRADED 1
++#define DEFAULT_WRITE_FREQ_SEC 0
++#define DEFAULT_READ_FREQ_SEC 0
++#define DEFAULT_WM_LOW 75
++#define DEFAULT_WM_HI 90
++#define DEFAULT_TIER_MODE TIER_MODE_TEST
++#define DEFAULT_COMP_MODE _gf_true
++#define DEFAULT_TIER_MAX_MIGRATE_MB 1000
++#define DEFAULT_TIER_MAX_MIGRATE_FILES 5000
++#define DEFAULT_TIER_QUERY_LIMIT 100
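++
++/* Sketch of how the watermarks are meant to be read (assuming the usual
++ * watermark semantics): with DEFAULT_WM_LOW 75 and DEFAULT_WM_HI 90,
++ * hot-tier usage below 75% leaves room for promotion, usage above 90%
++ * forces demotion, and the band in between throttles migration. */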
++
++#endif
+diff --git a/xlators/features/Makefile.am b/xlators/features/Makefile.am
+index 194634b..545c02b 100644
+--- a/xlators/features/Makefile.am
++++ b/xlators/features/Makefile.am
+@@ -5,6 +5,6 @@ endif
+ SUBDIRS = locks quota read-only quiesce marker index barrier arbiter upcall \
+ compress changelog gfid-access snapview-client snapview-server trash \
+ shard bit-rot leases selinux sdfs namespace $(CLOUDSYNC_DIR) thin-arbiter \
+- utime
++ utime changetimerecorder
+
+ CLEANFILES =
+diff --git a/xlators/features/changetimerecorder/Makefile.am b/xlators/features/changetimerecorder/Makefile.am
+new file mode 100644
+index 0000000..a985f42
+--- /dev/null
++++ b/xlators/features/changetimerecorder/Makefile.am
+@@ -0,0 +1,3 @@
++SUBDIRS = src
++
++CLEANFILES =
+diff --git a/xlators/features/changetimerecorder/src/Makefile.am b/xlators/features/changetimerecorder/src/Makefile.am
+new file mode 100644
+index 0000000..620017e
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/Makefile.am
+@@ -0,0 +1,26 @@
++xlatordir = $(libdir)/glusterfs/$(PACKAGE_VERSION)/xlator/features
++
++# changetimerecorder can only get build when libgfdb is enabled
++if BUILD_GFDB
++ xlator_LTLIBRARIES = changetimerecorder.la
++endif
++
++changetimerecorder_la_LDFLAGS = -module $(GF_XLATOR_DEFAULT_LDFLAGS)
++
++changetimerecorder_la_SOURCES = changetimerecorder.c \
++ ctr-helper.c ctr-xlator-ctx.c
++
++changetimerecorder_la_LIBADD = $(top_builddir)/libglusterfs/src/libglusterfs.la\
++ $(top_builddir)/libglusterfs/src/gfdb/libgfdb.la
++
++noinst_HEADERS = ctr-messages.h changetimerecorder.h ctr_mem_types.h \
++ ctr-helper.h ctr-xlator-ctx.h
++
++AM_CPPFLAGS = $(GF_CPPFLAGS) -I$(top_srcdir)/libglusterfs/src \
++ -I$(top_srcdir)/libglusterfs/src/gfdb \
++ -I$(top_srcdir)/rpc/xdr/src -I$(top_builddir)/rpc/xdr/src \
++ -DDATADIR=\"$(localstatedir)\"
++
++AM_CFLAGS = -Wall $(GF_CFLAGS) $(SQLITE_CFLAGS)
++
++CLEANFILES =
+diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.c b/xlators/features/changetimerecorder/src/changetimerecorder.c
+new file mode 100644
+index 0000000..f2aa4a9
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/changetimerecorder.c
+@@ -0,0 +1,2371 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++#include <ctype.h>
++#include <sys/uio.h>
++
++#include "gfdb_sqlite3.h"
++#include "ctr-helper.h"
++#include "ctr-messages.h"
++#include <glusterfs/syscall.h>
++
++#include "changetimerecorder.h"
++#include "tier-ctr-interface.h"
++
++/*******************************inode forget***********************************/
++int
++ctr_forget(xlator_t *this, inode_t *inode)
++{
++ fini_ctr_xlator_ctx(this, inode);
++ return 0;
++}
++
++/************************** Look up heal **************************************/
++/*
++Problem: The CTR xlator records file metadata (heat/hardlinks)
++into the database. This works fine for files created after the
++ctr xlator is switched ON, but for files created before the CTR
++xlator was ON, it cannot record either piece of metadata, i.e.
++heat or hardlinks, which makes those files immune to
++promotions/demotions.
++
++Solution: The solution implemented in this patch is to heal the
++ctr database for all those pre-existing files using named lookups.
++For this purpose we use the inode-xlator context variable facility
++in gluster.
++The inode-xlator context variable for the ctr xlator holds:
++ a. A lock for the context variable
++ b. A hardlink list: this list represents the successfully
++ looked-up hardlinks.
++These are the scenarios in which the hardlink list is updated:
++1) Named lookup: whenever a named lookup happens on a file, in the
++ wind path we copy all required hardlink and inode information into
++ the ctr_db_record structure, which resides in frame->local.
++ We don't update the database in the wind. During the unwind, we
++ read the information back from the ctr_db_record and:
++ - check whether the inode context variable exists; if not, we
++ create it;
++ - check whether the hard link is already in the hardlink list.
++ If it is not there, we add it to the list and send an update to
++ the database using libgfdb.
++ Please note: the database transaction can fail (and we ignore the
++ failure) as a record might already exist in the db. This update
++ heals the db only if the record is missing.
++ If the hard link is already in the list, we ignore it.
++2) Inode forget: whenever an inode forget hits, we clear the hardlink
++ list in the inode context variable and delete the context variable.
++ Please note: an inode forget may happen for two reasons,
++ a. the inode is deleted;
++ b. the in-memory inode is evicted from the inode table due to cache
++ limits.
++3) create: whenever a create happens, we create the inode context
++ variable and add the hardlink. The database update is done as usual
++ by ctr.
++4) link: whenever a hardlink is created for the inode, we create the
++ inode context variable, if not present, and add the hardlink to the
++ list.
++5) unlink: whenever an unlink happens, we delete the hardlink from
++ the list.
++6) mknod: same as create.
++7) rename: whenever a rename happens, we update the hardlink in the
++ list; if the hardlink was not present, we add it to the list.
++
++What is pending:
++1) This solution only works for named lookups.
++2) We don't track afr-self-heal/dht-rebalancer traffic for healing.
++
++*/
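++
++/* Illustrative heal sequence for a file created before CTR was
++ * enabled (the file name "f1" is hypothetical):
++ *   lookup("f1") -> wind: ctr_lookup_wind() stages a ctr_db_record
++ *                   in frame->local (no db write yet)
++ *                -> unwind: ctr_lookup_cbk() finds no inode ctx,
++ *                   marks GFDB_FOP_CREATE_WRITE and calls
++ *                   ctr_lookup_unwind() to insert the record. */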
++
++/* This function does not write anything to the db;
++ * it just creates the local variable for the frame
++ * and sets values in the ctr_db_record */
++static int
++ctr_lookup_wind(call_frame_t *frame, xlator_t *this,
++ gf_ctr_inode_context_t *ctr_inode_cx)
++{
++ int ret = -1;
++ gf_ctr_private_t *_priv = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++ GF_ASSERT(this);
++ IS_CTR_INODE_CX_SANE(ctr_inode_cx);
++
++ _priv = this->private;
++ GF_ASSERT(_priv);
++
++ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) {
++ frame->local = init_ctr_local_t(this);
++ if (!frame->local) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++ "WIND: Error while creating ctr local");
++ goto out;
++ }
++ ctr_local = frame->local;
++ /*Definitely no internal fops will reach here*/
++ ctr_local->is_internal_fop = _gf_false;
++ /*Don't record counters*/
++ CTR_DB_REC(ctr_local).do_record_counters = _gf_false;
++ /*Don't record time at all*/
++ CTR_DB_REC(ctr_local).do_record_times = _gf_false;
++
++ /* Copy gfid into db record*/
++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid));
++
++ /* Set fop_path and fop_type, required by libgfdb to make
++ * decision while inserting the record */
++ CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path;
++ CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type;
++
++ /* Copy hard link info*/
++ gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid,
++ *((NEW_LINK_CX(ctr_inode_cx))->pargfid));
++ if (snprintf(CTR_DB_REC(ctr_local).file_name,
++ sizeof(CTR_DB_REC(ctr_local).file_name), "%s",
++ NEW_LINK_CX(ctr_inode_cx)->basename) >=
++ sizeof(CTR_DB_REC(ctr_local).file_name)) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++ "WIND: Error copying filename of ctr local");
++ goto out;
++ }
++ /* Since we are in lookup we can ignore errors while
++ * inserting into the DB, because there may be many
++ * attempts to write to the DB for healing.
++ * We don't want to log all failed attempts and
++ * bloat the log */
++ ctr_local->gfdb_db_record.ignore_errors = _gf_true;
++ }
++
++ ret = 0;
++
++out:
++
++ if (ret) {
++ free_ctr_local(ctr_local);
++ frame->local = NULL;
++ }
++
++ return ret;
++}
++
++/* This function inserts the ctr_db_record populated by ctr_lookup_wind
++ * into the db. It also destroys the frame->local created by ctr_lookup_wind */
++static int
++ctr_lookup_unwind(call_frame_t *frame, xlator_t *this)
++{
++ int ret = -1;
++ gf_ctr_private_t *_priv = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++
++ _priv = this->private;
++ GF_ASSERT(_priv);
++
++ GF_ASSERT(_priv->_db_conn);
++
++ ctr_local = frame->local;
++
++ if (ctr_local && (ctr_local->ia_inode_type != IA_IFDIR)) {
++ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
++ if (ret == -1) {
++ gf_msg(this->name,
++ _gfdb_log_level(GF_LOG_ERROR,
++ ctr_local->gfdb_db_record.ignore_errors),
++ 0, CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
++ "UNWIND: Error filling ctr local");
++ goto out;
++ }
++ }
++ ret = 0;
++out:
++ free_ctr_local(ctr_local);
++ frame->local = NULL;
++ return ret;
++}
++
++/******************************************************************************
++ *
++ * FOPS HANDLING BELOW
++ *
++ * ***************************************************************************/
++
++/****************************LOOKUP********************************************/
++
++int32_t
++ctr_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, inode_t *inode,
++ struct iatt *buf, dict_t *dict, struct iatt *postparent)
++{
++ int ret = -1;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++ ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR;
++ gf_boolean_t _is_heal_needed = _gf_false;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++
++ /* if the lookup failed, don't do anything */
++ if (op_ret == -1) {
++ gf_msg_trace(this->name, 0, "lookup failed with %s",
++ strerror(op_errno));
++ goto out;
++ }
++
++ /* Ignore directory lookups */
++ if (inode->ia_type == IA_IFDIR) {
++ goto out;
++ }
++
++ /* if frame->local was not set by ctr_lookup(),
++ * don't do anything */
++ if (!frame->local) {
++ goto out;
++ }
++
++ /* if the lookup is for a dht link file, do not record it */
++ if (dht_is_linkfile(buf, dict)) {
++ gf_msg_trace(this->name, 0,
++ "Ignoring Lookup "
++ "for dht link file");
++ goto out;
++ }
++
++ ctr_local = frame->local;
++ /*Assign the proper inode type*/
++ ctr_local->ia_inode_type = inode->ia_type;
++
++ /* Copy gfid directly from inode */
++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid);
++
++ /* Checking if gfid and parent gfid is valid */
++ if (gf_uuid_is_null(CTR_DB_REC(ctr_local).gfid) ||
++ gf_uuid_is_null(CTR_DB_REC(ctr_local).pargfid)) {
++ gf_msg_trace(this->name, 0, "Invalid GFID");
++ goto out;
++ }
++
++ /* if this is the first entry,
++ * mark the ctr_record for create.
++ * A create will attempt to insert both a file record and a
++ * hard link record in the db */
++ ctr_xlator_ctx = get_ctr_xlator_ctx(this, inode);
++ if (!ctr_xlator_ctx) {
++ /* This marks inode heal */
++ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE;
++ _is_heal_needed = _gf_true;
++ }
++
++ /* Copy the correct gfid from resolved inode */
++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, inode->gfid);
++
++ /* Add hard link to the list */
++ ret_val = add_hard_link_ctx(frame, this, inode);
++ if (ret_val == CTR_CTX_ERROR) {
++ gf_msg_trace(this->name, 0, "Failed adding hardlink to list");
++ goto out;
++ }
++ /* If inode needs healing then heal the hardlink also */
++ else if (ret_val & CTR_TRY_INODE_HEAL) {
++ /* This marks inode heal */
++ CTR_DB_REC(ctr_local).gfdb_fop_type = GFDB_FOP_CREATE_WRITE;
++ _is_heal_needed = _gf_true;
++ }
++ /* If hardlink needs healing */
++ else if (ret_val & CTR_TRY_HARDLINK_HEAL) {
++ _is_heal_needed = _gf_true;
++ }
++
++ /* If no heal is needed, skip the db insert */
++ if (!_is_heal_needed)
++ goto out;
++
++ /* FINALLY HEAL : insert the ctr_db_record populated by ctr_lookup_wind
++ * into the db. This also destroys the frame->local
++ * created by ctr_lookup_wind */
++ ret = ctr_lookup_unwind(frame, this);
++ if (ret) {
++ gf_msg_trace(this->name, 0, "Failed healing/inserting link");
++ }
++
++out:
++ free_ctr_local((gf_ctr_local_t *)frame->local);
++ frame->local = NULL;
++
++ STACK_UNWIND_STRICT(lookup, frame, op_ret, op_errno, inode, buf, dict,
++ postparent);
++
++ return 0;
++}
++
++int32_t
++ctr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
++{
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t ctr_link_cx;
++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++
++ /* Don't handle nameless lookups*/
++ if (!loc->parent || !loc->name)
++ goto out;
++
++ /*fill ctr link context*/
++ FILL_CTR_LINK_CX(_link_cx, loc->parent->gfid, loc->name, out);
++
++ /* Fill ctr inode context*/
++ /* IA_IFREG : we assume it is a file in the wind,
++ * but in the unwind we know whether the inode is a
++ * file or a directory.
++ * gfid: we are just filling loc->gfid, which is not correct.
++ * In the unwind we fill in the correct gfid on a successful lookup */
++ FILL_CTR_INODE_CONTEXT(_inode_cx, IA_IFREG, loc->gfid, _link_cx, NULL,
++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
++
++ /* Create the frame->local and populate ctr_db_record
++ * No writing to the db yet */
++ ret = ctr_lookup_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED,
++ "Failed to insert link wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_lookup_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->lookup, loc, xdata);
++ return 0;
++}
++
++/****************************WRITEV********************************************/
++int32_t
++ctr_writev_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
++ struct iatt *postbuf, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_UNWIND_FAILED,
++ "Failed to insert writev unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(writev, frame, op_ret, op_errno, prebuf, postbuf,
++ xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *vector,
++ int32_t count, off_t off, uint32_t flags, struct iobref *iobref,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_WRITEV_WIND_FAILED,
++ "Failed to insert writev wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_writev_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->writev, fd, vector, count, off, flags,
++ iobref, xdata);
++
++ return 0;
++}
++
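++/* The inode-write fops that follow (setattr, fsetattr, fremovexattr,
++ * removexattr, truncate, ftruncate, fsync, setxattr, fsetxattr) all
++ * repeat the writev pattern above: ctr_insert_wind() records a
++ * GFDB_FOP_INODE_WRITE in the wind and ctr_insert_unwind() commits it
++ * in the callback on success. */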
++/******************************setattr*****************************************/
++
++int32_t
++ctr_setattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf,
++ struct iatt *postop_stbuf, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
++ "Failed to insert setattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(setattr, frame, op_ret, op_errno, preop_stbuf,
++ postop_stbuf, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf,
++ int32_t valid, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++ "Failed to insert setattr wind");
++ }
++out:
++
++ STACK_WIND(frame, ctr_setattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->setattr, loc, stbuf, valid, xdata);
++
++ return 0;
++}
++
++/*************************** fsetattr ***************************************/
++int32_t
++ctr_fsetattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *preop_stbuf,
++ struct iatt *postop_stbuf, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
++ "Failed to insert fsetattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(fsetattr, frame, op_ret, op_errno, preop_stbuf,
++ postop_stbuf, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *stbuf,
++ int32_t valid, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++ "Failed to insert fsetattr wind");
++ }
++out:
++ STACK_WIND(frame, ctr_fsetattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->fsetattr, fd, stbuf, valid, xdata);
++
++ return 0;
++}
++/****************************fremovexattr************************************/
++
++int32_t
++ctr_fremovexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED,
++ "Failed to insert fremovexattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(fremovexattr, frame, op_ret, op_errno, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd,
++ const char *name, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED,
++ "Failed to insert fremovexattr wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_fremovexattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->fremovexattr, fd, name, xdata);
++ return 0;
++}
++
++/****************************removexattr*************************************/
++
++int32_t
++ctr_removexattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED,
++ "Failed to insert removexattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(removexattr, frame, op_ret, op_errno, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_removexattr(call_frame_t *frame, xlator_t *this, loc_t *loc,
++ const char *name, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED,
++ "Failed to insert removexattr wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_removexattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->removexattr, loc, name, xdata);
++ return 0;
++}
++
++/****************************truncate****************************************/
++
++int32_t
++ctr_truncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
++ struct iatt *postbuf, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED,
++ "Failed to insert truncate unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(truncate, frame, op_ret, op_errno, prebuf, postbuf,
++ xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_truncate(call_frame_t *frame, xlator_t *this, loc_t *loc, off_t offset,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_TRUNCATE_WIND_FAILED,
++ "Failed to insert truncate wind");
++ }
++out:
++ STACK_WIND(frame, ctr_truncate_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->truncate, loc, offset, xdata);
++ return 0;
++}
++
++/****************************ftruncate***************************************/
++
++int32_t
++ctr_ftruncate_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *prebuf,
++ struct iatt *postbuf, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED,
++ "Failed to insert ftruncate unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(ftruncate, frame, op_ret, op_errno, prebuf, postbuf,
++ xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED,
++ "Failed to insert ftruncate wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_ftruncate_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->ftruncate, fd, offset, xdata);
++ return 0;
++}
++
++/****************************rename******************************************/
++int32_t
++ctr_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *buf,
++ struct iatt *preoldparent, struct iatt *postoldparent,
++ struct iatt *prenewparent, struct iatt *postnewparent,
++ dict_t *xdata)
++{
++ int ret = -1;
++ uint32_t remaining_links = -1;
++ gf_ctr_local_t *ctr_local = NULL;
++ gfdb_fop_type_t fop_type = GFDB_FOP_INVALID_OP;
++ gfdb_fop_path_t fop_path = GFDB_FOP_INVALID;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
++ GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
++ "Failed to insert rename unwind");
++ goto out;
++ }
++
++ if (!xdata)
++ goto out;
++ /*
++ *
++ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator
++ * This is only set when we are overwriting hardlinks.
++ *
++ * */
++ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA,
++ &remaining_links);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA");
++ remaining_links = -1;
++ goto out;
++ }
++
++ ctr_local = frame->local;
++ if (!ctr_local) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_NULL_LOCAL,
++ "ctr_local is NULL.");
++ goto out;
++ }
++
++ /* This is not the only link */
++ if (remaining_links > 1) {
++ fop_type = GFDB_FOP_DENTRY_WRITE;
++ fop_path = GFDB_FOP_UNDEL;
++ }
++ /* Last link that was deleted */
++ else if (remaining_links == 1) {
++ fop_type = GFDB_FOP_DENTRY_WRITE;
++ fop_path = GFDB_FOP_UNDEL_ALL;
++ } else {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
++ "Invalid link count from posix");
++ goto out;
++ }
++
++ ret = ctr_delete_hard_link_from_db(
++ this, CTR_DB_REC(ctr_local).old_gfid, CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name, fop_type, fop_path);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
++ "Failed to delete records of %s",
++ CTR_DB_REC(ctr_local).old_file_name);
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(rename, frame, op_ret, op_errno, buf, preoldparent,
++ postoldparent, prenewparent, postnewparent, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t new_link_cx, old_link_cx;
++ gf_ctr_link_context_t *_nlink_cx = &new_link_cx;
++ gf_ctr_link_context_t *_olink_cx = &old_link_cx;
++ int is_dict_created = 0;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill old link context*/
++ FILL_CTR_LINK_CX(_olink_cx, oldloc->pargfid, oldloc->name, out);
++
++ /*Fill new link context*/
++ FILL_CTR_LINK_CX(_nlink_cx, newloc->pargfid, newloc->name, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type,
++ oldloc->inode->gfid, _nlink_cx, _olink_cx,
++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
++
++ /* If the rename is an overwrite of a hardlink:
++ * rename ("file1", "file2")
++ * file1 is a hardlink for gfid, say, 00000000-0000-0000-0000-00000000000A
++ * file2 is a hardlink for gfid, say, 00000000-0000-0000-0000-00000000000B
++ * We save file2's gfid in old_gfid so that we can delete its entries
++ * from the db during the rename callback if the fop is successful
++ * */
++ if (newloc->inode) {
++ /* This is the GFID from where the newloc hardlink will be
++ * unlinked */
++ _inode_cx->old_gfid = &newloc->inode->gfid;
++ }
++
++ /* Is a metadata fop */
++ _inode_cx->is_metadata_fop = _gf_true;
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RENAME_WIND_FAILED,
++ "Failed to insert rename wind");
++ } else {
++ /* We update the hard link in the inode context in the wind,
++ * as we don't get the "inode" in the callback for rename */
++ ret = update_hard_link_ctx(frame, this, oldloc->inode);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED,
++ "Failed "
++ "updating hard link in ctr inode context");
++ goto out;
++ }
++
++ /* If the newloc has an inode, i.e. we are acquiring a hardlink of an
++ * existing file, i.e. overwriting a file.
++ * */
++ if (newloc->inode) {
++ /* Getting the ctr inode context variable for
++ * inode whose hardlink will be acquired during
++ * the rename
++ * */
++ ctr_xlator_ctx = get_ctr_xlator_ctx(this, newloc->inode);
++ if (!ctr_xlator_ctx) {
++ /* Since there is no ctr inode context
++ * so nothing more to do */
++ ret = 0;
++ goto out;
++ }
++
++ /* Deleting hardlink from context variable */
++ ret = ctr_delete_hard_link(this, ctr_xlator_ctx, newloc->pargfid,
++ newloc->name);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_DELETE_HARDLINK_FAILED,
++ "Failed to delete hard link");
++ goto out;
++ }
++
++ /* Requesting the number of hardlinks on the newloc
++ * inode from POSIX.
++ * */
++ is_dict_created = set_posix_link_request(this, &xdata);
++ if (is_dict_created == -1) {
++ ret = -1;
++ goto out;
++ }
++ }
++ }
++
++out:
++ STACK_WIND(frame, ctr_rename_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
++
++ if (is_dict_created == 1) {
++ dict_unref(xdata);
++ }
++
++ return 0;
++}
++
++/****************************unlink******************************************/
++int32_t
++ctr_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, struct iatt *preparent,
++ struct iatt *postparent, dict_t *xdata)
++{
++ int ret = -1;
++ uint32_t remaining_links = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ if (!xdata)
++ goto out;
++
++ /*
++ *
++ * Extracting GF_RESPONSE_LINK_COUNT_XDATA from POSIX Xlator
++ *
++ * */
++ ret = dict_get_uint32(xdata, GF_RESPONSE_LINK_COUNT_XDATA,
++ &remaining_links);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ "Failed to getting GF_RESPONSE_LINK_COUNT_XDATA");
++ remaining_links = -1;
++ }
++
++ /*This is not the only link*/
++ if (remaining_links != 1) {
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
++ GFDB_FOP_UNDEL);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
++ "Failed to insert unlink unwind");
++ }
++ }
++ /*Last link that was deleted*/
++ else if (remaining_links == 1) {
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
++ GFDB_FOP_UNDEL_ALL);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
++ "Failed to insert unlink unwind");
++ }
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(unlink, frame, op_ret, op_errno, preparent, postparent,
++ xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t ctr_link_cx;
++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
++ gf_boolean_t is_xdata_created = _gf_false;
++ struct iatt dummy_stat = {0};
++
++ GF_ASSERT(frame);
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill link context*/
++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
++ _link_cx, NULL, GFDB_FOP_DENTRY_WRITE,
++ GFDB_FOP_WDEL);
++
++ /*Internal FOP*/
++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
++
++ /* Is a metadata FOP */
++ _inode_cx->is_metadata_fop = _gf_true;
++
++ /* If it's an internal FOP on a dht link file, do not record it */
++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
++ goto out;
++ }
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_UNLINK_UNWIND_FAILED,
++ "Failed to insert unlink wind");
++ } else {
++ /* We delete the hard link from the inode context in the wind,
++ * as we don't get the "inode" in the callback for unlink */
++ ret = delete_hard_link_ctx(frame, this, loc->inode);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
++ "Failed "
++ "deleting hard link from ctr inode context");
++ }
++ }
++
++ /*
++ *
++ * Sending GF_REQUEST_LINK_COUNT_XDATA
++ * to the POSIX xlator so that it returns the link count in the unwind path
++ *
++ * */
++ /*create xdata if NULL*/
++ if (!xdata) {
++ xdata = dict_new();
++ is_xdata_created = (xdata) ? _gf_true : _gf_false;
++ }
++ if (!xdata) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL,
++ "xdata is NULL :Cannot send "
++ "GF_REQUEST_LINK_COUNT_XDATA to posix");
++ goto out;
++ }
++
++ ret = dict_set_int32(xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ "Failed setting GF_REQUEST_LINK_COUNT_XDATA");
++ if (is_xdata_created) {
++ dict_unref(xdata);
++ }
++ goto out;
++ }
++
++out:
++ STACK_WIND(frame, ctr_unlink_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
++
++ if (is_xdata_created)
++ dict_unref(xdata);
++
++ return 0;
++}
++
++/****************************fsync******************************************/
++int32_t
++ctr_fsync_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
++ int32_t op_errno, struct iatt *prebuf, struct iatt *postbuf,
++ dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
++ "Failed to insert fsync unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(fsync, frame, op_ret, op_errno, prebuf, postbuf, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t flags,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_WIND_FAILED,
++ "Failed to insert fsync wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_fsync_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->fsync, fd, flags, xdata);
++ return 0;
++}
++
++/****************************setxattr****************************************/
++
++int
++ctr_setxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
++ "Failed to insert setxattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(setxattr, frame, op_ret, op_errno, xdata);
++
++ return 0;
++}
++
++int
++ctr_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr,
++ int flags, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, loc->inode->gfid,
++ NULL, NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++ "Failed to insert setxattr wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_setxattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->setxattr, loc, xattr, flags, xdata);
++ return 0;
++}
++/**************************** fsetxattr *************************************/
++int32_t
++ctr_fsetxattr_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
++ int32_t op_ret, int32_t op_errno, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_WRITE, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_FSYNC_UNWIND_FAILED,
++ "Failed to insert fsetxattr unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(fsetxattr, frame, op_ret, op_errno, xdata);
++
++ return 0;
++}
++
++int32_t
++ctr_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *dict,
++ int32_t flags, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++ CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++ "Failed to insert fsetxattr wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_fsetxattr_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->fsetxattr, fd, dict, flags, xdata);
++ return 0;
++}
++/****************************mknod*******************************************/
++
++int32_t
++ctr_mknod_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
++ int32_t op_errno, inode_t *inode, struct iatt *buf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
++{
++ int ret = -1;
++ ctr_heal_ret_val_t ret_val = CTR_CTX_ERROR;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ /* Add hard link to the list */
++ ret_val = add_hard_link_ctx(frame, this, inode);
++ if (ret_val == CTR_CTX_ERROR) {
++ gf_msg_trace(this->name, 0, "Failed adding hard link");
++ }
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE,
++ GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED,
++ "Failed to insert mknod unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(mknod, frame, op_ret, op_errno, inode, buf, preparent,
++ postparent, xdata);
++
++ return 0;
++}
++
++int
++ctr_mknod(call_frame_t *frame, xlator_t *this, loc_t *loc, mode_t mode,
++ dev_t rdev, mode_t umask, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t ctr_link_cx;
++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
++ uuid_t gfid = {
++ 0,
++ };
++ uuid_t *ptr_gfid = &gfid;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++
++ /*get gfid from xdata dict*/
++ ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
++ if (ret) {
++ gf_msg_debug(this->name, 0, "failed to get gfid from dict");
++ goto out;
++ }
++
++ /*fill ctr link context*/
++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx,
++ NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_MKNOD_WIND_FAILED,
++ "Failed to insert mknod wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_mknod_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->mknod, loc, mode, rdev, umask, xdata);
++ return 0;
++}
++
++/****************************create******************************************/
++int
++ctr_create_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, fd_t *fd, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = add_hard_link_ctx(frame, this, inode);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_ADD_HARDLINK_FAILED,
++ "Failed adding hard link");
++ }
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_CREATE_WRITE,
++ GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
++ "Failed to insert create unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(create, frame, op_ret, op_errno, fd, inode, stbuf,
++ preparent, postparent, xdata);
++
++ return 0;
++}
++
++int
++ctr_create(call_frame_t *frame, xlator_t *this, loc_t *loc, int32_t flags,
++ mode_t mode, mode_t umask, fd_t *fd, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t ctr_link_cx;
++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
++ uuid_t gfid = {
++ 0,
++ };
++ uuid_t *ptr_gfid = &gfid;
++ struct iatt dummy_stat = {0};
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++
++ /*Get GFID from Xdata dict*/
++ ret = dict_get_gfuuid(xdata, "gfid-req", &gfid);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_GET_GFID_FROM_DICT_FAILED,
++ "failed to get gfid from dict");
++ goto out;
++ }
++
++ /*fill ctr link context*/
++ FILL_CTR_LINK_CX(_link_cx, loc->pargfid, loc->name, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, loc->inode->ia_type, *ptr_gfid, _link_cx,
++ NULL, GFDB_FOP_CREATE_WRITE, GFDB_FOP_WIND);
++
++ /*Internal FOP*/
++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
++
++ /* If it's an internal FOP on a dht link file, do not record */
++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
++ goto out;
++ }
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, &ctr_inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_WIND_FAILED,
++ "Failed to insert create wind");
++ }
++out:
++ STACK_WIND(frame, ctr_create_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->create, loc, flags, mode, umask, fd,
++ xdata);
++ return 0;
++}
++
++/****************************link********************************************/
++
++int
++ctr_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, inode_t *inode, struct iatt *stbuf,
++ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ /* Add hard link to the list */
++ ret = add_hard_link_ctx(frame, this, inode);
++ if (ret) {
++ gf_msg_trace(this->name, 0, "Failed adding hard link");
++ }
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE,
++ GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
++ "Failed to insert create unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(link, frame, op_ret, op_errno, inode, stbuf, preparent,
++ postparent, xdata);
++ return 0;
++}
++
++int
++ctr_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
++ dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++ gf_ctr_link_context_t ctr_link_cx;
++ gf_ctr_link_context_t *_link_cx = &ctr_link_cx;
++ struct iatt dummy_stat = {0};
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++
++ /*fill ctr link context*/
++ FILL_CTR_LINK_CX(_link_cx, newloc->pargfid, newloc->name, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, oldloc->inode->ia_type,
++ oldloc->inode->gfid, _link_cx, NULL,
++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
++
++ /*Internal FOP*/
++ _inode_cx->is_internal_fop = is_internal_fop(frame, xdata);
++
++ /* Is a metadata fop */
++ _inode_cx->is_metadata_fop = _gf_true;
++
++ /* If it's an internal FOP on a dht link file, do not record */
++ if (_inode_cx->is_internal_fop && dht_is_linkfile(&dummy_stat, xdata)) {
++ goto out;
++ }
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_LINK_WIND_FAILED,
++ "Failed to insert link wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_link_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
++ return 0;
++}
++
++/******************************readv*****************************************/
++int
++ctr_readv_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret,
++ int op_errno, struct iovec *vector, int count, struct iatt *stbuf,
++ struct iobref *iobref, dict_t *xdata)
++{
++ int ret = -1;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, out);
++
++ ret = ctr_insert_unwind(frame, this, GFDB_FOP_INODE_READ, GFDB_FOP_UNWIND);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_CREATE_UNWIND_FAILED,
++ "Failed to insert create unwind");
++ }
++
++out:
++ ctr_free_frame_local(frame);
++
++ STACK_UNWIND_STRICT(readv, frame, op_ret, op_errno, vector, count, stbuf,
++ iobref, xdata);
++ return 0;
++}
++
++int
++ctr_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, off_t off,
++ uint32_t flags, dict_t *xdata)
++{
++ int ret = -1;
++ gf_ctr_inode_context_t ctr_inode_cx;
++ gf_ctr_inode_context_t *_inode_cx = &ctr_inode_cx;
++
++ CTR_IS_DISABLED_THEN_GOTO(this, out);
++ CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, xdata, out);
++
++ /*Fill ctr inode context*/
++ FILL_CTR_INODE_CONTEXT(_inode_cx, fd->inode->ia_type, fd->inode->gfid, NULL,
++ NULL, GFDB_FOP_INODE_READ, GFDB_FOP_WIND);
++
++ /*record into the database*/
++ ret = ctr_insert_wind(frame, this, _inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_READV_WIND_FAILED,
++ "Failed to insert readv wind");
++ }
++
++out:
++ STACK_WIND(frame, ctr_readv_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->readv, fd, size, off, flags, xdata);
++ return 0;
++}
++
++/*******************************ctr_ipc****************************************/
++
++/* This is the callback function invoked per record/file from the database */
++static int
++ctr_db_query_callback(gfdb_query_record_t *gfdb_query_record, void *args)
++{
++ int ret = -1;
++ ctr_query_cbk_args_t *query_cbk_args = args;
++
++ GF_VALIDATE_OR_GOTO("ctr", query_cbk_args, out);
++
++ ret = gfdb_write_query_record(query_cbk_args->query_fd, gfdb_query_record);
++ if (ret) {
++ gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "Failed to write to query file");
++ goto out;
++ }
++
++ query_cbk_args->count++;
++
++ ret = 0;
++out:
++ return ret;
++}
++
++/* This function runs all the db queries related to tiering and
++ * generates/populates a new or existing query file
++ * inputs:
++ * xlator_t *this : CTR Translator
++ * void *conn_node : Database connection
++ * char *query_file: the query file that needs to be updated
++ * gfdb_ipc_ctr_params_t *ipc_ctr_params: the query parameters
++ * Return:
++ * On success 0
++ * On failure -1
++ * */
++int
++ctr_db_query(xlator_t *this, void *conn_node, char *query_file,
++ gfdb_ipc_ctr_params_t *ipc_ctr_params)
++{
++ int ret = -1;
++ ctr_query_cbk_args_t query_cbk_args = {0};
++
++ GF_VALIDATE_OR_GOTO("ctr", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, conn_node, out);
++ GF_VALIDATE_OR_GOTO(this->name, query_file, out);
++ GF_VALIDATE_OR_GOTO(this->name, ipc_ctr_params, out);
++
++ /*Query for eligible files from db*/
++ query_cbk_args.query_fd = open(query_file, O_WRONLY | O_CREAT | O_APPEND,
++ S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
++ if (query_cbk_args.query_fd < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, errno, CTR_MSG_FATAL_ERROR,
++ "Failed to open query file %s", query_file);
++ goto out;
++ }
++ if (!ipc_ctr_params->is_promote) {
++ if (ipc_ctr_params->emergency_demote) {
++ /* emergency demotion mode */
++ ret = find_all(conn_node, ctr_db_query_callback,
++ (void *)&query_cbk_args,
++ ipc_ctr_params->query_limit);
++ } else {
++ if (ipc_ctr_params->write_freq_threshold == 0 &&
++ ipc_ctr_params->read_freq_threshold == 0) {
++ ret = find_unchanged_for_time(conn_node, ctr_db_query_callback,
++ (void *)&query_cbk_args,
++ &ipc_ctr_params->time_stamp);
++ } else {
++ ret = find_unchanged_for_time_freq(
++ conn_node, ctr_db_query_callback, (void *)&query_cbk_args,
++ &ipc_ctr_params->time_stamp,
++ ipc_ctr_params->write_freq_threshold,
++ ipc_ctr_params->read_freq_threshold, _gf_false);
++ }
++ }
++ } else {
++ if (ipc_ctr_params->write_freq_threshold == 0 &&
++ ipc_ctr_params->read_freq_threshold == 0) {
++ ret = find_recently_changed_files(conn_node, ctr_db_query_callback,
++ (void *)&query_cbk_args,
++ &ipc_ctr_params->time_stamp);
++ } else {
++ ret = find_recently_changed_files_freq(
++ conn_node, ctr_db_query_callback, (void *)&query_cbk_args,
++ &ipc_ctr_params->time_stamp,
++ ipc_ctr_params->write_freq_threshold,
++ ipc_ctr_params->read_freq_threshold, _gf_false);
++ }
++ }
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: query from db failed");
++ goto out;
++ }
++
++ ret = clear_files_heat(conn_node);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: Failed to clear db entries");
++ goto out;
++ }
++
++ ret = 0;
++out:
++
++ if (!ret)
++ ret = query_cbk_args.count;
++
++ if (query_cbk_args.query_fd >= 0) {
++ sys_close(query_cbk_args.query_fd);
++ query_cbk_args.query_fd = -1;
++ }
++
++ return ret;
++}
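++
++/*
++ * Illustrative sketch (not part of the original patch): one plausible way a
++ * demotion scan could parameterize ctr_db_query() above. The field names
++ * follow the gfdb_ipc_ctr_params_t usage in that function; the helper name
++ * example_run_demote_query and the zero thresholds are hypothetical.
++ */
++static int
++example_run_demote_query(xlator_t *this, void *conn_node, char *query_file)
++{
++ gfdb_ipc_ctr_params_t params = {0};
++
++ params.is_promote = _gf_false;       /* demotion scan */
++ params.emergency_demote = _gf_false; /* normal, non-emergency mode */
++ params.write_freq_threshold = 0;     /* 0/0 => pure time-based query */
++ params.read_freq_threshold = 0;
++ gettimeofday(&params.time_stamp, NULL); /* files unchanged since now */
++
++ /* On success ctr_db_query() returns the number of matching records */
++ return ctr_db_query(this, conn_node, query_file, &params);
++}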
++
++void *
++ctr_compact_thread(void *args)
++{
++ int ret = -1;
++ void *db_conn = NULL;
++
++ xlator_t *this = NULL;
++ gf_ctr_private_t *priv = NULL;
++ gf_boolean_t compact_active = _gf_false;
++ gf_boolean_t compact_mode_switched = _gf_false;
++
++ this = (xlator_t *)args;
++
++ GF_VALIDATE_OR_GOTO("ctr", this, out);
++
++ priv = this->private;
++
++ db_conn = priv->_db_conn;
++ compact_active = priv->compact_active;
++ compact_mode_switched = priv->compact_mode_switched;
++
++ gf_msg("ctr-compact", GF_LOG_INFO, 0, CTR_MSG_SET, "Starting compaction");
++
++ ret = compact_db(db_conn, compact_active, compact_mode_switched);
++
++ if (ret) {
++ gf_msg("ctr-compact", GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to perform the compaction");
++ }
++
++ ret = pthread_mutex_lock(&priv->compact_lock);
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to acquire lock");
++ goto out;
++ }
++
++ /* We are done compaction on this brick. Set all flags to false */
++ priv->compact_active = _gf_false;
++ priv->compact_mode_switched = _gf_false;
++
++ ret = pthread_mutex_unlock(&priv->compact_lock);
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to release lock");
++ goto out;
++ }
++
++out:
++ return NULL;
++}
++
++int
++ctr_ipc_helper(xlator_t *this, dict_t *in_dict, dict_t *out_dict)
++{
++ int ret = -1;
++ char *ctr_ipc_ops = NULL;
++ gf_ctr_private_t *priv = NULL;
++ char *db_version = NULL;
++ char *db_param_key = NULL;
++ char *db_param = NULL;
++ char *query_file = NULL;
++ gfdb_ipc_ctr_params_t *ipc_ctr_params = NULL;
++ int result = 0;
++ pthread_t compact_thread;
++
++ GF_VALIDATE_OR_GOTO("ctr", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, this->private, out);
++ priv = this->private;
++ GF_VALIDATE_OR_GOTO(this->name, priv->_db_conn, out);
++ GF_VALIDATE_OR_GOTO(this->name, in_dict, out);
++ GF_VALIDATE_OR_GOTO(this->name, out_dict, out);
++
++ GET_DB_PARAM_FROM_DICT(this->name, in_dict, GFDB_IPC_CTR_KEY, ctr_ipc_ops,
++ out);
++
++ /* if it's a db clear operation */
++ if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_CLEAR_OPS,
++ SLEN(GFDB_IPC_CTR_CLEAR_OPS)) == 0) {
++ ret = clear_files_heat(priv->_db_conn);
++ if (ret)
++ goto out;
++
++ } /* if it's a query operation, in which case it's query + clear db */
++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_QUERY_OPS,
++ SLEN(GFDB_IPC_CTR_QUERY_OPS)) == 0) {
++ ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_QFILE_PATH, &query_file);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed extracting query file path");
++ goto out;
++ }
++
++ ret = dict_get_bin(in_dict, GFDB_IPC_CTR_GET_QUERY_PARAMS,
++ (void *)&ipc_ctr_params);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed extracting query parameters");
++ goto out;
++ }
++
++ ret = ctr_db_query(this, priv->_db_conn, query_file, ipc_ctr_params);
++
++ ret = dict_set_int32(out_dict, GFDB_IPC_CTR_RET_QUERY_COUNT, ret);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed setting query reply");
++ goto out;
++ }
++
++ } /* if it's a query for the db version */
++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_VERSION_OPS,
++ SLEN(GFDB_IPC_CTR_GET_DB_VERSION_OPS)) == 0) {
++ ret = get_db_version(priv->_db_conn, &db_version);
++ if (ret == -1 || !db_version) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed extracting db version ");
++ goto out;
++ }
++
++ SET_DB_PARAM_TO_DICT(this->name, out_dict, GFDB_IPC_CTR_RET_DB_VERSION,
++ db_version, ret, error);
++
++ } /* if it's a query for a db setting */
++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_GET_DB_PARAM_OPS,
++ SLEN(GFDB_IPC_CTR_GET_DB_PARAM_OPS)) == 0) {
++ ret = dict_get_str(in_dict, GFDB_IPC_CTR_GET_DB_KEY, &db_param_key);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed extracting db param key");
++ goto out;
++ }
++
++ ret = get_db_params(priv->_db_conn, db_param_key, &db_param);
++ if (ret == -1 || !db_param) {
++ goto out;
++ }
++
++ SET_DB_PARAM_TO_DICT(this->name, out_dict, db_param_key, db_param, ret,
++ error);
++ } /* if it's an attempt to compact the database */
++ else if (strncmp(ctr_ipc_ops, GFDB_IPC_CTR_SET_COMPACT_PRAGMA,
++ SLEN(GFDB_IPC_CTR_SET_COMPACT_PRAGMA)) == 0) {
++ ret = pthread_mutex_lock(&priv->compact_lock);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to acquire lock for compaction");
++ goto out;
++ }
++
++ if ((priv->compact_active || priv->compact_mode_switched)) {
++ /* Compaction in progress. LEAVE */
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Compaction already in progress.");
++ pthread_mutex_unlock(&priv->compact_lock);
++ goto out;
++ }
++ /* At this point, we should be the only one compacting on this brick */
++
++ /* Grab the arguments from the dictionary */
++ ret = dict_get_int32(in_dict, "compact_active", &result);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to get compaction type");
++ goto out;
++ }
++
++ if (result) {
++ priv->compact_active = _gf_true;
++ }
++
++ ret = dict_get_int32(in_dict, "compact_mode_switched", &result);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to see if compaction switched");
++ goto out;
++ }
++
++ if (result) {
++ priv->compact_mode_switched = _gf_true;
++ gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET,
++ "Pre-thread: Compact mode switch is true");
++ } else {
++ gf_msg("ctr-compact", GF_LOG_TRACE, 0, CTR_MSG_SET,
++ "Pre-thread: Compact mode switch is false");
++ }
++
++ ret = pthread_mutex_unlock(&priv->compact_lock);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to release lock for compaction");
++ goto out;
++ }
++
++ ret = gf_thread_create(&compact_thread, NULL, ctr_compact_thread,
++ (void *)this, "ctrcomp");
++
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed to spawn compaction thread");
++ goto out;
++ }
++
++ goto out;
++ } /* default case */
++ else {
++ goto out;
++ }
++
++ ret = 0;
++ goto out;
++error:
++ GF_FREE(db_param_key);
++ GF_FREE(db_param);
++ GF_FREE(db_version);
++out:
++ return ret;
++}
++
++/* IPC call from the tier migrator: query, clear heat, get db params/version,
++ * or trigger compaction */
++int32_t
++ctr_ipc(call_frame_t *frame, xlator_t *this, int32_t op, dict_t *in_dict)
++{
++ int ret = -1;
++ gf_ctr_private_t *priv = NULL;
++ dict_t *out_dict = NULL;
++
++ GF_ASSERT(this);
++ priv = this->private;
++ GF_ASSERT(priv);
++ GF_ASSERT(priv->_db_conn);
++ GF_VALIDATE_OR_GOTO(this->name, in_dict, wind);
++
++ if (op != GF_IPC_TARGET_CTR)
++ goto wind;
++
++ out_dict = dict_new();
++ if (!out_dict) {
++ goto out;
++ }
++
++ ret = ctr_ipc_helper(this, in_dict, out_dict);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_SET,
++ "Failed in ctr_ipc_helper");
++ }
++out:
++
++ STACK_UNWIND_STRICT(ipc, frame, ret, 0, out_dict);
++
++ if (out_dict)
++ dict_unref(out_dict);
++
++ return 0;
++
++wind:
++ STACK_WIND(frame, default_ipc_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->ipc, op, in_dict);
++
++ return 0;
++}
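++
++/*
++ * Illustrative sketch (not part of the original patch): the dict layout a
++ * caller such as the tier migrator might use to request a heat clear. The
++ * keys come from the ops table in ctr_ipc_helper(); the function name
++ * example_build_clear_ipc_dict is hypothetical.
++ */
++static dict_t *
++example_build_clear_ipc_dict(void)
++{
++ dict_t *in_dict = dict_new();
++
++ if (!in_dict)
++ return NULL;
++
++ /* ctr_ipc_helper() matches this key against its ops table */
++ if (dict_set_str(in_dict, GFDB_IPC_CTR_KEY, GFDB_IPC_CTR_CLEAR_OPS)) {
++ dict_unref(in_dict);
++ return NULL;
++ }
++
++ /* The caller would send this dict in an ipc fop with
++ * op == GF_IPC_TARGET_CTR, which ctr_ipc() above intercepts */
++ return in_dict;
++}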
++
++/* Call to initialize db for ctr xlator while ctr is enabled */
++int32_t
++initialize_ctr_resource(xlator_t *this, gf_ctr_private_t *priv)
++{
++ int ret_db = -1;
++ dict_t *params_dict = NULL;
++
++ if (!priv)
++ goto error;
++
++ /* For compaction */
++ priv->compact_active = _gf_false;
++ priv->compact_mode_switched = _gf_false;
++ ret_db = pthread_mutex_init(&priv->compact_lock, NULL);
++
++ if (ret_db) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: Failed initializing compaction mutex");
++ goto error;
++ }
++
++ params_dict = dict_new();
++ if (!params_dict) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INIT_DB_PARAMS_FAILED,
++ "DB Params cannot initialized!");
++ goto error;
++ }
++
++ /*Extract db params options*/
++ ret_db = extract_db_params(this, params_dict, priv->gfdb_db_type);
++ if (ret_db) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED,
++ "Failed extracting db params options");
++ goto error;
++ }
++
++ /*Create a memory pool for ctr xlator*/
++ this->local_pool = mem_pool_new(gf_ctr_local_t, 64);
++ if (!this->local_pool) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED,
++ "failed to create local memory pool");
++ goto error;
++ }
++
++ /*Initialize Database Connection*/
++ priv->_db_conn = init_db(params_dict, priv->gfdb_db_type);
++ if (!priv->_db_conn) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: Failed initializing data base");
++ goto error;
++ }
++
++ ret_db = 0;
++ goto out;
++
++error:
++ if (this)
++ mem_pool_destroy(this->local_pool);
++
++ if (priv) {
++ GF_FREE(priv->ctr_db_path);
++ }
++ GF_FREE(priv);
++ ret_db = -1;
++out:
++ if (params_dict)
++ dict_unref(params_dict);
++
++ return ret_db;
++}
++
++/******************************************************************************/
++int
++reconfigure(xlator_t *this, dict_t *options)
++{
++ char *temp_str = NULL;
++ int ret = 0;
++ gf_ctr_private_t *priv = NULL;
++
++ priv = this->private;
++
++ if (dict_get_str(options, "changetimerecorder.frequency", &temp_str)) {
++ gf_msg(this->name, GF_LOG_TRACE, 0, CTR_MSG_SET, "set");
++ }
++
++ GF_OPTION_RECONF("ctr-enabled", priv->enabled, options, bool, out);
++ if (!priv->enabled) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
++ "CTR Xlator is not enabled so skip ctr reconfigure");
++ goto out;
++ }
++
++ /* If ctr was enabled after init skipped resource allocation for the
++ ctr xlator, call initialize_ctr_resource during the reconfigure
++ phase to allocate resources for the xlator.
++ */
++ if (priv->enabled && !priv->_db_conn) {
++ ret = initialize_ctr_resource(this, priv);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: Failed ctr initialize resource");
++ goto out;
++ }
++ }
++
++ GF_OPTION_RECONF("record-counters", priv->ctr_record_counter, options, bool,
++ out);
++
++ GF_OPTION_RECONF("ctr-record-metadata-heat", priv->ctr_record_metadata_heat,
++ options, bool, out);
++
++ GF_OPTION_RECONF("ctr_link_consistency", priv->ctr_link_consistency,
++ options, bool, out);
++
++ GF_OPTION_RECONF("ctr_lookupheal_inode_timeout",
++ priv->ctr_lookupheal_inode_timeout, options, uint64, out);
++
++ GF_OPTION_RECONF("ctr_lookupheal_link_timeout",
++ priv->ctr_lookupheal_link_timeout, options, uint64, out);
++
++ GF_OPTION_RECONF("record-exit", priv->ctr_record_unwind, options, bool,
++ out);
++
++ GF_OPTION_RECONF("record-entry", priv->ctr_record_wind, options, bool, out);
++
++ /* If database is sqlite */
++ if (priv->gfdb_db_type == GFDB_SQLITE3) {
++ /* AUTOCHECKPOINT */
++ if (dict_get_str(options, GFDB_SQL_PARAM_WAL_AUTOCHECK, &temp_str) ==
++ 0) {
++ ret = set_db_params(priv->_db_conn, "wal_autocheckpoint", temp_str);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
++ "Failed to set %s", GFDB_SQL_PARAM_WAL_AUTOCHECK);
++ }
++ }
++
++ /* CACHE_SIZE */
++ if (dict_get_str(options, GFDB_SQL_PARAM_CACHE_SIZE, &temp_str) == 0) {
++ ret = set_db_params(priv->_db_conn, "cache_size", temp_str);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
++ "Failed to set %s", GFDB_SQL_PARAM_CACHE_SIZE);
++ }
++ }
++ }
++
++ ret = 0;
++
++out:
++
++ return ret;
++}
++
++/****************************init********************************************/
++
++int32_t
++init(xlator_t *this)
++{
++ gf_ctr_private_t *priv = NULL;
++ int ret_db = -1;
++
++ if (!this) {
++ gf_msg("ctr", GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: ctr this is not initialized");
++ return -1;
++ }
++
++ if (!this->children || this->children->next) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: ctr should have exactly one child");
++ return -1;
++ }
++
++ if (!this->parents) {
++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DANGLING_VOLUME,
++ "dangling volume. check volfile ");
++ }
++
++ priv = GF_CALLOC(1, sizeof(*priv), gf_ctr_mt_private_t);
++ if (!priv) {
++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
++ "Calloc did not work!!!");
++ return -1;
++ }
++
++ /*Default values for the translator*/
++ priv->ctr_record_wind = _gf_true;
++ priv->ctr_record_unwind = _gf_false;
++ priv->ctr_hot_brick = _gf_false;
++ priv->gfdb_db_type = GFDB_SQLITE3;
++ priv->gfdb_sync_type = GFDB_DB_SYNC;
++ priv->_db_conn = NULL;
++ priv->ctr_lookupheal_link_timeout = CTR_DEFAULT_HARDLINK_EXP_PERIOD;
++ priv->ctr_lookupheal_inode_timeout = CTR_DEFAULT_INODE_EXP_PERIOD;
++
++ /*Extract ctr xlator options*/
++ ret_db = extract_ctr_options(this, priv);
++ if (ret_db) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED,
++ "Failed extracting ctr xlator options");
++ GF_FREE(priv);
++ return -1;
++ }
++
++ if (!priv->enabled) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
++ "CTR Xlator is not enabled so skip ctr init");
++ goto out;
++ }
++
++ ret_db = initialize_ctr_resource(this, priv);
++ if (ret_db) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_FATAL_ERROR,
++ "FATAL: Failed ctr initialize resource");
++ return -1;
++ }
++
++out:
++ this->private = (void *)priv;
++ return 0;
++}
++
++int
++notify(xlator_t *this, int event, void *data, ...)
++{
++ gf_ctr_private_t *priv = NULL;
++ int ret = 0;
++
++ priv = this->private;
++
++ if (!priv)
++ goto out;
++
++ ret = default_notify(this, event, data);
++
++out:
++ return ret;
++}
++
++int32_t
++mem_acct_init(xlator_t *this)
++{
++ int ret = -1;
++
++ GF_VALIDATE_OR_GOTO("ctr", this, out);
++
++ ret = xlator_mem_acct_init(this, gf_ctr_mt_end + 1);
++
++ if (ret != 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_MEM_ACC_INIT_FAILED,
++ "Memory accounting init"
++ "failed");
++ return ret;
++ }
++out:
++ return ret;
++}
++
++void
++fini(xlator_t *this)
++{
++ gf_ctr_private_t *priv = NULL;
++
++ priv = this->private;
++
++ if (priv && priv->enabled) {
++ if (fini_db(priv->_db_conn)) {
++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED,
++ "Failed closing "
++ "db connection");
++ }
++
++ if (priv->_db_conn)
++ priv->_db_conn = NULL;
++
++ GF_FREE(priv->ctr_db_path);
++ if (pthread_mutex_destroy(&priv->compact_lock)) {
++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_CLOSE_DB_CONN_FAILED,
++ "Failed to "
++ "destroy the compaction mutex");
++ }
++ }
++ GF_FREE(priv);
++ mem_pool_destroy(this->local_pool);
++ this->local_pool = NULL;
++
++ return;
++}
++
++struct xlator_fops fops = {
++ /*lookup*/
++ .lookup = ctr_lookup,
++ /*write fops */
++ .mknod = ctr_mknod,
++ .create = ctr_create,
++ .truncate = ctr_truncate,
++ .ftruncate = ctr_ftruncate,
++ .setxattr = ctr_setxattr,
++ .fsetxattr = ctr_fsetxattr,
++ .removexattr = ctr_removexattr,
++ .fremovexattr = ctr_fremovexattr,
++ .unlink = ctr_unlink,
++ .link = ctr_link,
++ .rename = ctr_rename,
++ .writev = ctr_writev,
++ .setattr = ctr_setattr,
++ .fsetattr = ctr_fsetattr,
++ /*read fops*/
++ .readv = ctr_readv,
++ /* IPC call*/
++ .ipc = ctr_ipc};
++
++struct xlator_cbks cbks = {.forget = ctr_forget};
++
++struct volume_options options[] = {
++ {.key =
++ {
++ "ctr-enabled",
++ },
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off",
++ .description = "Enables the CTR",
++ .flags = OPT_FLAG_SETTABLE},
++ {.key = {"record-entry"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "on"},
++ {.key = {"record-exit"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off"},
++ {.key = {"record-counters"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off",
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .flags = OPT_FLAG_SETTABLE,
++ .tags = {}},
++ {.key = {"ctr-record-metadata-heat"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {"ctr_link_consistency"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {"ctr_lookupheal_link_timeout"},
++ .type = GF_OPTION_TYPE_INT,
++ .default_value = "300",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_2},
++ .tags = {}},
++ {.key = {"ctr_lookupheal_inode_timeout"},
++ .type = GF_OPTION_TYPE_INT,
++ .default_value = "300",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_2},
++ .tags = {}},
++ {.key = {"hot-brick"},
++ .type = GF_OPTION_TYPE_BOOL,
++ .value = {"on", "off"},
++ .default_value = "off"},
++ {.key = {"db-type"},
++ .type = GF_OPTION_TYPE_STR,
++ .value = {"hashfile", "rocksdb", "changelog", "sqlite3", "hyperdex"},
++ .default_value = "sqlite3",
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .flags = OPT_FLAG_SETTABLE,
++ .tags = {}},
++ {.key = {"db-sync"},
++ .type = GF_OPTION_TYPE_STR,
++ .value = {"sync", "async"},
++ .default_value = "sync"},
++ {.key = {"db-path"}, .type = GF_OPTION_TYPE_PATH},
++ {.key = {"db-name"}, .type = GF_OPTION_TYPE_STR},
++ {.key = {GFDB_SQL_PARAM_SYNC},
++ .type = GF_OPTION_TYPE_STR,
++ .value = {"off", "normal", "full"},
++ .default_value = "normal"},
++ {.key = {GFDB_SQL_PARAM_JOURNAL_MODE},
++ .type = GF_OPTION_TYPE_STR,
++ .value = {"delete", "truncate", "persist", "memory", "wal", "off"},
++ .default_value = "wal",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {GFDB_SQL_PARAM_AUTO_VACUUM},
++ .type = GF_OPTION_TYPE_STR,
++ .value = {"off", "full", "incr"},
++ .default_value = "off",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {GFDB_SQL_PARAM_WAL_AUTOCHECK},
++ .type = GF_OPTION_TYPE_INT,
++ .default_value = "25000",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {GFDB_SQL_PARAM_CACHE_SIZE},
++ .type = GF_OPTION_TYPE_INT,
++ .default_value = "12500",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {GFDB_SQL_PARAM_PAGE_SIZE},
++ .type = GF_OPTION_TYPE_INT,
++ .default_value = "4096",
++ .flags = OPT_FLAG_SETTABLE,
++ .op_version = {GD_OP_VERSION_3_7_0},
++ .tags = {}},
++ {.key = {NULL}},
++};
++
++xlator_api_t xlator_api = {
++ .init = init,
++ .fini = fini,
++ .notify = notify,
++ .reconfigure = reconfigure,
++ .mem_acct_init = mem_acct_init,
++ .op_version = {GD_OP_VERSION_3_7_0}, /* Present from the initial version */
++ .fops = &fops,
++ .cbks = &cbks,
++ .identifier = "changetimerecorder",
++ .category = GF_MAINTAINED,
++ .options = options,
++};
+diff --git a/xlators/features/changetimerecorder/src/changetimerecorder.h b/xlators/features/changetimerecorder/src/changetimerecorder.h
+new file mode 100644
+index 0000000..0150a1c
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/changetimerecorder.h
+@@ -0,0 +1,21 @@
++/*
++ Copyright (c) 2006-2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_H
++#define __CTR_H
++
++#include <glusterfs/glusterfs.h>
++#include <glusterfs/xlator.h>
++#include <glusterfs/logging.h>
++#include <glusterfs/common-utils.h>
++#include "ctr_mem_types.h"
++#include "ctr-helper.h"
++
++#endif /* __CTR_H */
+diff --git a/xlators/features/changetimerecorder/src/ctr-helper.c b/xlators/features/changetimerecorder/src/ctr-helper.c
+new file mode 100644
+index 0000000..e1e6573
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-helper.c
+@@ -0,0 +1,293 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#include "gfdb_sqlite3.h"
++#include "ctr-helper.h"
++#include "ctr-messages.h"
++
++/*******************************************************************************
++ *
++ * Fill unwind into db record
++ *
++ ******************************************************************************/
++int
++fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local,
++ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path)
++{
++ int ret = -1;
++ gfdb_time_t *ctr_uwtime = NULL;
++ gf_ctr_private_t *_priv = NULL;
++
++ GF_ASSERT(this);
++ _priv = this->private;
++ GF_ASSERT(_priv);
++
++ GF_ASSERT(ctr_local);
++
++ /* If not an unwind path, error out */
++ if (!isunwindpath(fop_path)) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH,
++ "Wrong fop_path. Should be unwind");
++ goto out;
++ }
++
++ ctr_uwtime = &CTR_DB_REC(ctr_local).gfdb_unwind_change_time;
++ CTR_DB_REC(ctr_local).gfdb_fop_path = fop_path;
++ CTR_DB_REC(ctr_local).gfdb_fop_type = fop_type;
++
++ ret = gettimeofday(ctr_uwtime, NULL);
++ if (ret == -1) {
++ gf_msg(this->name, GF_LOG_ERROR, errno,
++ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR,
++ "Error "
++ "filling unwind time record %s",
++ strerror(errno));
++ goto out;
++ }
++
++ /* Special case: if it's a tier rebalance
++ * + cold tier brick
++ * + a create/mknod FOP
++ * we record the unwind time as zero */
++ if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG &&
++ (!_priv->ctr_hot_brick) && isdentrycreatefop(fop_type)) {
++ memset(ctr_uwtime, 0, sizeof(*ctr_uwtime));
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
++/*******************************************************************************
++ *
++ * Fill wind into db record
++ *
++ ******************************************************************************/
++int
++fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local,
++ gf_ctr_inode_context_t *ctr_inode_cx)
++{
++ int ret = -1;
++ gfdb_time_t *ctr_wtime = NULL;
++ gf_ctr_private_t *_priv = NULL;
++
++ GF_ASSERT(this);
++ _priv = this->private;
++ GF_ASSERT(_priv);
++ GF_ASSERT(ctr_local);
++ IS_CTR_INODE_CX_SANE(ctr_inode_cx);
++
++ /* If not a wind path, error out! */
++ if (!iswindpath(ctr_inode_cx->fop_path)) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_WRONG_FOP_PATH,
++ "Wrong fop_path. Should be wind");
++ goto out;
++ }
++
++ ctr_wtime = &CTR_DB_REC(ctr_local).gfdb_wind_change_time;
++ CTR_DB_REC(ctr_local).gfdb_fop_path = ctr_inode_cx->fop_path;
++ CTR_DB_REC(ctr_local).gfdb_fop_type = ctr_inode_cx->fop_type;
++ CTR_DB_REC(ctr_local).link_consistency = _priv->ctr_link_consistency;
++
++ ret = gettimeofday(ctr_wtime, NULL);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, errno,
++ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR,
++ "Error filling wind time record %s", strerror(errno));
++ goto out;
++ }
++
++ /* Special case: if it's a tier rebalance
++ * + cold tier brick
++ * + a create/mknod FOP
++ * we record the wind time as zero */
++ if (ctr_local->client_pid == GF_CLIENT_PID_TIER_DEFRAG &&
++ (!_priv->ctr_hot_brick) && isdentrycreatefop(ctr_inode_cx->fop_type)) {
++ memset(ctr_wtime, 0, sizeof(*ctr_wtime));
++ }
++
++ /* Copy gfid into db record */
++ gf_uuid_copy(CTR_DB_REC(ctr_local).gfid, *(ctr_inode_cx->gfid));
++
++ /* Copy older gfid if any */
++ if (ctr_inode_cx->old_gfid &&
++ (!gf_uuid_is_null(*(ctr_inode_cx->old_gfid)))) {
++ gf_uuid_copy(CTR_DB_REC(ctr_local).old_gfid, *(ctr_inode_cx->old_gfid));
++ }
++
++ /*Hard Links*/
++ if (isdentryfop(ctr_inode_cx->fop_type)) {
++ /*new link fop*/
++ if (NEW_LINK_CX(ctr_inode_cx)) {
++ gf_uuid_copy(CTR_DB_REC(ctr_local).pargfid,
++ *((NEW_LINK_CX(ctr_inode_cx))->pargfid));
++ strcpy(CTR_DB_REC(ctr_local).file_name,
++ NEW_LINK_CX(ctr_inode_cx)->basename);
++ }
++ /*rename fop*/
++ if (OLD_LINK_CX(ctr_inode_cx)) {
++ gf_uuid_copy(CTR_DB_REC(ctr_local).old_pargfid,
++ *((OLD_LINK_CX(ctr_inode_cx))->pargfid));
++ strcpy(CTR_DB_REC(ctr_local).old_file_name,
++ OLD_LINK_CX(ctr_inode_cx)->basename);
++ }
++ }
++
++ ret = 0;
++out:
++ /*On error roll back and clean the record*/
++ if (ret == -1) {
++ CLEAR_CTR_DB_RECORD(ctr_local);
++ }
++ return ret;
++}
++
++/******************************************************************************
++ *
++ * CTR xlator init related functions
++ *
++ *
++ * ****************************************************************************/
++static int
++extract_sql_params(xlator_t *this, dict_t *params_dict)
++{
++ int ret = -1;
++ char *db_path = NULL;
++ char *db_name = NULL;
++ char *db_full_path = NULL;
++
++ GF_ASSERT(this);
++ GF_ASSERT(params_dict);
++
++ /*Extract the path of the db*/
++ db_path = NULL;
++ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-path",
++ db_path, "/var/run/gluster/");
++
++ /*Extract the name of the db*/
++ db_name = NULL;
++ GET_DB_PARAM_FROM_DICT_DEFAULT(this->name, this->options, "db-name",
++ db_name, "gf_ctr_db.db");
++
++ /*Construct full path of the db*/
++ ret = gf_asprintf(&db_full_path, "%s/%s", db_path, db_name);
++ if (ret < 0) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
++ CTR_MSG_CONSTRUCT_DB_PATH_FAILED,
++ "Construction of full db path failed!");
++ goto out;
++ }
++
++ /*Setting the SQL DB Path*/
++ SET_DB_PARAM_TO_DICT(this->name, params_dict, GFDB_SQL_PARAM_DBPATH,
++ db_full_path, ret, out);
++
++ /*Extract rest of the sql params*/
++ ret = gfdb_set_sql_params(this->name, this->options, params_dict);
++ if (ret) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
++ CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
++ "Failed setting values to sql param dict!");
++ }
++
++ ret = 0;
++
++out:
++ if (ret)
++ GF_FREE(db_full_path);
++ return ret;
++}
++
++int
++extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type)
++{
++ int ret = -1;
++
++ GF_ASSERT(this);
++ GF_ASSERT(params_dict);
++
++ switch (db_type) {
++ case GFDB_SQLITE3:
++ ret = extract_sql_params(this, params_dict);
++ if (ret)
++ goto out;
++ break;
++ case GFDB_ROCKS_DB:
++ case GFDB_HYPERDEX:
++ case GFDB_HASH_FILE_STORE:
++ case GFDB_INVALID_DB:
++ case GFDB_DB_END:
++ goto out;
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
++int
++extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv)
++{
++ int ret = -1;
++ char *_val_str = NULL;
++
++ GF_ASSERT(this);
++ GF_ASSERT(_priv);
++
++ /* Check if the CTR translator is enabled. By default it's disabled. */
++ _priv->enabled = _gf_false;
++ GF_OPTION_INIT("ctr-enabled", _priv->enabled, bool, out);
++ if (!_priv->enabled) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_INFO, 0, CTR_MSG_XLATOR_DISABLED,
++ "CTR Xlator is disabled.");
++ ret = 0;
++ goto out;
++ }
++
++ /*Extract db type*/
++ GF_OPTION_INIT("db-type", _val_str, str, out);
++ _priv->gfdb_db_type = gf_string2gfdbdbtype(_val_str);
++
++ /*Extract flag for record on wind*/
++ GF_OPTION_INIT("record-entry", _priv->ctr_record_wind, bool, out);
++
++ /*Extract flag for record on unwind*/
++ GF_OPTION_INIT("record-exit", _priv->ctr_record_unwind, bool, out);
++
++ /*Extract flag for record on counters*/
++ GF_OPTION_INIT("record-counters", _priv->ctr_record_counter, bool, out);
++
++ /* Extract flag for record metadata heat */
++ GF_OPTION_INIT("ctr-record-metadata-heat", _priv->ctr_record_metadata_heat,
++ bool, out);
++
++ /*Extract flag for link consistency*/
++ GF_OPTION_INIT("ctr_link_consistency", _priv->ctr_link_consistency, bool,
++ out);
++
++ /*Extract ctr_lookupheal_inode_timeout */
++ GF_OPTION_INIT("ctr_lookupheal_inode_timeout",
++ _priv->ctr_lookupheal_inode_timeout, uint64, out);
++
++ /*Extract ctr_lookupheal_link_timeout*/
++ GF_OPTION_INIT("ctr_lookupheal_link_timeout",
++ _priv->ctr_lookupheal_link_timeout, uint64, out);
++
++ /*Extract flag for hot tier brick*/
++ GF_OPTION_INIT("hot-brick", _priv->ctr_hot_brick, bool, out);
++
++ /*Extract flag for sync mode*/
++ GF_OPTION_INIT("db-sync", _val_str, str, out);
++ _priv->gfdb_sync_type = gf_string2gfdbdbsync(_val_str);
++
++ ret = 0;
++
++out:
++ return ret;
++}
+diff --git a/xlators/features/changetimerecorder/src/ctr-helper.h b/xlators/features/changetimerecorder/src/ctr-helper.h
+new file mode 100644
+index 0000000..517fbb0
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-helper.h
+@@ -0,0 +1,854 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_HELPER_H
++#define __CTR_HELPER_H
++
++#include <glusterfs/xlator.h>
++#include "ctr_mem_types.h"
++#include <glusterfs/iatt.h>
++#include <glusterfs/glusterfs.h>
++#include <glusterfs/xlator.h>
++#include <glusterfs/defaults.h>
++#include <glusterfs/logging.h>
++#include <glusterfs/common-utils.h>
++#include <time.h>
++#include <sys/time.h>
++#include <pthread.h>
++
++#include "gfdb_data_store.h"
++#include "ctr-xlator-ctx.h"
++#include "ctr-messages.h"
++
++#define CTR_DEFAULT_HARDLINK_EXP_PERIOD 300 /* Five mins */
++#define CTR_DEFAULT_INODE_EXP_PERIOD 300 /* Five mins */
++
++typedef struct ctr_query_cbk_args {
++ int query_fd;
++ int count;
++} ctr_query_cbk_args_t;
++
++/*CTR Xlator Private structure*/
++typedef struct gf_ctr_private {
++ gf_boolean_t enabled;
++ char *ctr_db_path;
++ gf_boolean_t ctr_hot_brick;
++ gf_boolean_t ctr_record_wind;
++ gf_boolean_t ctr_record_unwind;
++ gf_boolean_t ctr_record_counter;
++ gf_boolean_t ctr_record_metadata_heat;
++ gf_boolean_t ctr_link_consistency;
++ gfdb_db_type_t gfdb_db_type;
++ gfdb_sync_type_t gfdb_sync_type;
++ gfdb_conn_node_t *_db_conn;
++ uint64_t ctr_lookupheal_link_timeout;
++ uint64_t ctr_lookupheal_inode_timeout;
++ gf_boolean_t compact_active;
++ gf_boolean_t compact_mode_switched;
++ pthread_mutex_t compact_lock;
++} gf_ctr_private_t;
++
++/*
++ * gf_ctr_local_t is the ctr xlator local data structure that is stored in
++ * the call_frame of each FOP.
++ *
++ * gfdb_db_record: The gf_ctr_local contains a gfdb_db_record object, which is
++ * used by the insert_record() api from libgfdb. The gfdb_db_record object
++ * contains all the inode and hardlink information (the latter only for dentry
++ * fops: create, mknod, link, unlink, rename). The ctr_local is kept alive
++ * until the unwind call and is released during the unwind. The same
++ * gfdb_db_record is used by the unwind insert_record() call to record the
++ * unwind in the database.
++ *
++ * ia_inode_type in gf_ctr_local records the type of the inode. This matters
++ * on the unwind path, since the inode is not available there. We would have
++ * included this in the gfdb_db_record itself, but currently we record only
++ * file inode information.
++ *
++ * is_internal_fop in gf_ctr_local tells us whether this is an internal fop,
++ * so that we can take special or no action. We don't record change/access
++ * times or increment the heat counter for internal fops from the rebalancer.
++ * */
++typedef struct gf_ctr_local {
++ gfdb_db_record_t gfdb_db_record;
++ ia_type_t ia_inode_type;
++ gf_boolean_t is_internal_fop;
++ gf_special_pid_t client_pid;
++} gf_ctr_local_t;
++/*
++ * Easy access of gfdb_db_record of ctr_local
++ * */
++#define CTR_DB_REC(ctr_local) (ctr_local->gfdb_db_record)
++
++/*Clear db record*/
++#define CLEAR_CTR_DB_RECORD(ctr_local) \
++ do { \
++ ctr_local->gfdb_db_record.gfdb_fop_path = GFDB_FOP_INVALID; \
++ memset(&(ctr_local->gfdb_db_record.gfdb_wind_change_time), 0, \
++ sizeof(gfdb_time_t)); \
++ memset(&(ctr_local->gfdb_db_record.gfdb_unwind_change_time), 0, \
++ sizeof(gfdb_time_t)); \
++ gf_uuid_clear(ctr_local->gfdb_db_record.gfid); \
++ gf_uuid_clear(ctr_local->gfdb_db_record.pargfid); \
++ memset(ctr_local->gfdb_db_record.file_name, 0, GF_NAME_MAX + 1); \
++ memset(ctr_local->gfdb_db_record.old_file_name, 0, GF_NAME_MAX + 1); \
++ ctr_local->gfdb_db_record.gfdb_fop_type = GFDB_FOP_INVALID_OP; \
++ ctr_local->ia_inode_type = IA_INVAL; \
++ } while (0)
++
++static gf_ctr_local_t *
++init_ctr_local_t(xlator_t *this)
++{
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(this);
++
++ ctr_local = mem_get0(this->local_pool);
++ if (!ctr_local) {
++ gf_msg(GFDB_DATA_STORE, GF_LOG_ERROR, 0,
++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++ "Error while creating ctr local");
++ goto out;
++ }
++
++ CLEAR_CTR_DB_RECORD(ctr_local);
++out:
++ return ctr_local;
++}
++
++static void
++free_ctr_local(gf_ctr_local_t *ctr_local)
++{
++ if (ctr_local)
++ mem_put(ctr_local);
++}
++
++/******************************************************************************
++ *
++ *
++ * Context Carrier Structures
++ *
++ *
++ * ****************************************************************************/
++
++/*
++ * Context Carrier structures are used to carry relevant information about
++ * inodes and links from the fops calls to the ctr_insert_wind.
++ * These structures just hold pointers to the original data and do not
++ * deep-copy any of it. The info is deep-copied into
++ * ctr_local->gfdb_db_record and passed to the insert_record() api of
++ * libgfdb. It persists in ctr_local->gfdb_db_record for the unwind
++ * and is destroyed once used.
++ *
++ * gf_ctr_link_context_t : Context structure for hard links
++ * gf_ctr_inode_context_t : Context structure for inodes
++ *
++ * */
++
++/*Context Carrier Structure for hard links*/
++typedef struct gf_ctr_link_context {
++ uuid_t *pargfid;
++ const char *basename;
++} gf_ctr_link_context_t;
++
++/*Context Carrier Structure for inodes*/
++typedef struct gf_ctr_inode_context {
++ ia_type_t ia_type;
++ uuid_t *gfid;
++ uuid_t *old_gfid;
++ gf_ctr_link_context_t *new_link_cx;
++ gf_ctr_link_context_t *old_link_cx;
++ gfdb_fop_type_t fop_type;
++ gfdb_fop_path_t fop_path;
++ gf_boolean_t is_internal_fop;
++ /* Indicating metadata fops */
++ gf_boolean_t is_metadata_fop;
++} gf_ctr_inode_context_t;
++
++/*******************Util Macros for Context Carrier Structures*****************/
++
++/*Checks if ctr_link_cx is sane!*/
++#define IS_CTR_LINK_CX_SANE(ctr_link_cx) \
++ do { \
++ if (ctr_link_cx) { \
++ if (ctr_link_cx->pargfid) \
++ GF_ASSERT(*(ctr_link_cx->pargfid)); \
++ GF_ASSERT(ctr_link_cx->basename); \
++ }; \
++ } while (0)
++
++/*Clear and fill the ctr_link_context with values*/
++#define FILL_CTR_LINK_CX(ctr_link_cx, _pargfid, _basename, label) \
++ do { \
++ GF_VALIDATE_OR_GOTO("ctr", ctr_link_cx, label); \
++ GF_VALIDATE_OR_GOTO("ctr", _pargfid, label); \
++ GF_VALIDATE_OR_GOTO("ctr", _basename, label); \
++ memset(ctr_link_cx, 0, sizeof(*ctr_link_cx)); \
++ ctr_link_cx->pargfid = &_pargfid; \
++ ctr_link_cx->basename = _basename; \
++ } while (0)
++
++#define NEW_LINK_CX(ctr_inode_cx) ctr_inode_cx->new_link_cx
++
++#define OLD_LINK_CX(ctr_inode_cx) ctr_inode_cx->old_link_cx
++
++/*Checks if ctr_inode_cx is sane!*/
++#define IS_CTR_INODE_CX_SANE(ctr_inode_cx) \
++ do { \
++ GF_ASSERT(ctr_inode_cx); \
++ GF_ASSERT(ctr_inode_cx->gfid); \
++ GF_ASSERT(*(ctr_inode_cx->gfid)); \
++ GF_ASSERT(ctr_inode_cx->fop_type != GFDB_FOP_INVALID_OP); \
++ GF_ASSERT(ctr_inode_cx->fop_path != GFDB_FOP_INVALID); \
++ IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \
++ IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \
++ } while (0)
++
++/*Clear and fill the ctr_inode_context with values*/
++#define FILL_CTR_INODE_CONTEXT(ctr_inode_cx, _ia_type, _gfid, _new_link_cx, \
++ _old_link_cx, _fop_type, _fop_path) \
++ do { \
++ GF_ASSERT(ctr_inode_cx); \
++ GF_ASSERT(_gfid); \
++ GF_ASSERT(_fop_type != GFDB_FOP_INVALID_OP); \
++ GF_ASSERT(_fop_path != GFDB_FOP_INVALID); \
++ memset(ctr_inode_cx, 0, sizeof(*ctr_inode_cx)); \
++ ctr_inode_cx->ia_type = _ia_type; \
++ ctr_inode_cx->gfid = &_gfid; \
++ IS_CTR_LINK_CX_SANE(NEW_LINK_CX(ctr_inode_cx)); \
++ if (_new_link_cx) \
++ NEW_LINK_CX(ctr_inode_cx) = _new_link_cx; \
++ IS_CTR_LINK_CX_SANE(OLD_LINK_CX(ctr_inode_cx)); \
++ if (_old_link_cx) \
++ OLD_LINK_CX(ctr_inode_cx) = _old_link_cx; \
++ ctr_inode_cx->fop_type = _fop_type; \
++ ctr_inode_cx->fop_path = _fop_path; \
++ } while (0)
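++
++/*
++ * Illustrative sketch (not part of the original patch): filling both the
++ * old and new link contexts for a rename-style dentry fop, mirroring how
++ * the fop handlers use the macros above. The wrapper name
++ * example_fill_rename_cx is hypothetical.
++ */
++static inline void
++example_fill_rename_cx(gf_ctr_inode_context_t *inode_cx,
++ gf_ctr_link_context_t *new_cx,
++ gf_ctr_link_context_t *old_cx, loc_t *oldloc,
++ loc_t *newloc)
++{
++ /* New name under the new parent */
++ FILL_CTR_LINK_CX(new_cx, newloc->pargfid, newloc->name, out);
++ /* Old name under the old parent, so the stale entry can be removed */
++ FILL_CTR_LINK_CX(old_cx, oldloc->pargfid, oldloc->name, out);
++
++ FILL_CTR_INODE_CONTEXT(inode_cx, oldloc->inode->ia_type,
++ oldloc->inode->gfid, new_cx, old_cx,
++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_WIND);
++out:
++ return;
++}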
++
++/******************************************************************************
++ *
++ * Util functions or macros used by
++ * insert wind and insert unwind
++ *
++ * ****************************************************************************/
++/* Free ctr frame local */
++static inline void
++ctr_free_frame_local(call_frame_t *frame)
++{
++ if (frame) {
++ free_ctr_local((gf_ctr_local_t *)frame->local);
++ frame->local = NULL;
++ }
++}
++
++/* Set GF_REQUEST_LINK_COUNT_XDATA in the dict
++ * that is sent to the POSIX xlator, asking it to
++ * return the link count in the unwind path.
++ * Returns 0 on success without creating a dict,
++ * 1 on success after creating the dict,
++ * and -1 on failure.
++ * */
++static inline int
++set_posix_link_request(xlator_t *this, dict_t **xdata)
++{
++ int ret = -1;
++ gf_boolean_t is_created = _gf_false;
++
++ GF_VALIDATE_OR_GOTO("ctr", this, out);
++ GF_VALIDATE_OR_GOTO(this->name, xdata, out);
++
++ /*create xdata if NULL*/
++ if (!*xdata) {
++ *xdata = dict_new();
++ is_created = _gf_true;
++ ret = 1;
++ } else {
++ ret = 0;
++ }
++
++ if (!*xdata) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_XDATA_NULL,
++ "xdata is NULL :Cannot send "
++ "GF_REQUEST_LINK_COUNT_XDATA to posix");
++ ret = -1;
++ goto out;
++ }
++
++ ret = dict_set_int32(*xdata, GF_REQUEST_LINK_COUNT_XDATA, 1);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ "Failed setting GF_REQUEST_LINK_COUNT_XDATA");
++ ret = -1;
++ goto out;
++ }
++ ret = 0;
++out:
++ if (ret == -1) {
++ if (*xdata && is_created) {
++ dict_unref(*xdata);
++ }
++ }
++ return ret;
++}
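++
++/*
++ * Illustrative sketch (not part of the original patch): consuming the
++ * three-way return of set_posix_link_request() in a fop handler, matching
++ * the is_xdata_created pattern used by ctr_unlink(). The helper name
++ * example_request_link_count is hypothetical.
++ */
++static inline int
++example_request_link_count(xlator_t *this, dict_t **xdata,
++ gf_boolean_t *is_xdata_created)
++{
++ int ret = set_posix_link_request(this, xdata);
++
++ if (ret < 0)
++ return -1; /* failure: nothing to clean up */
++
++ /* ret == 1 means the helper created *xdata; the caller must
++ * dict_unref() it after STACK_WIND, as ctr_unlink() does */
++ *is_xdata_created = (ret == 1);
++ return 0;
++}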
++
++/*
++ * If a bitrot fop
++ * */
++#define BITROT_FOP(frame) \
++ (frame->root->pid == GF_CLIENT_PID_BITD || \
++ frame->root->pid == GF_CLIENT_PID_SCRUB)
++
++/*
++ * If a rebalancer fop
++ * */
++#define REBALANCE_FOP(frame) (frame->root->pid == GF_CLIENT_PID_DEFRAG)
++
++/*
++ * If it's a tiering rebalancer fop
++ * */
++#define TIER_REBALANCE_FOP(frame) \
++ (frame->root->pid == GF_CLIENT_PID_TIER_DEFRAG)
++
++/*
++ * If it's an AFR self-heal fop
++ * */
++#define AFR_SELF_HEAL_FOP(frame) (frame->root->pid == GF_CLIENT_PID_SELF_HEALD)
++
++/*
++ * If it's a rebalancer fop, goto the given label
++ * */
++#define CTR_IF_REBALANCE_FOP_THEN_GOTO(frame, label) \
++ do { \
++ if (REBALANCE_FOP(frame)) \
++ goto label; \
++ } while (0)
++
++/*
++ * Internal fop
++ *
++ * */
++static inline gf_boolean_t
++is_internal_fop(call_frame_t *frame, dict_t *xdata)
++{
++ gf_boolean_t ret = _gf_false;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++
++ if (AFR_SELF_HEAL_FOP(frame)) {
++ ret = _gf_true;
++ }
++ if (BITROT_FOP(frame)) {
++ ret = _gf_true;
++ }
++ if (REBALANCE_FOP(frame) || TIER_REBALANCE_FOP(frame)) {
++ ret = _gf_true;
++ if (xdata && dict_get(xdata, CTR_ATTACH_TIER_LOOKUP)) {
++ ret = _gf_false;
++ }
++ }
++ if (xdata && dict_get(xdata, GLUSTERFS_INTERNAL_FOP_KEY)) {
++ ret = _gf_true;
++ }
++
++ return ret;
++}
++
++#define CTR_IF_INTERNAL_FOP_THEN_GOTO(frame, dict, label) \
++ do { \
++ if (is_internal_fop(frame, dict)) \
++ goto label; \
++ } while (0)
++
++/* if fop has failed exit */
++#define CTR_IF_FOP_FAILED_THEN_GOTO(this, op_ret, op_errno, label) \
++ do { \
++ if (op_ret == -1) { \
++ gf_msg_trace(this->name, 0, "Failed fop with %s", \
++ strerror(op_errno)); \
++ goto label; \
++ }; \
++ } while (0)
++
++/*
++ * If the CTR xlator is disabled, goto the given label
++ * */
++#define CTR_IS_DISABLED_THEN_GOTO(this, label) \
++ do { \
++ gf_ctr_private_t *_priv = NULL; \
++ GF_ASSERT(this); \
++ GF_ASSERT(this->private); \
++ _priv = this->private; \
++ if (!_priv->_db_conn) \
++ goto label; \
++ } while (0)
++
++/*
++ * If CTR record-metadata-heat is disabled, goto the given label
++ * */
++#define CTR_RECORD_METADATA_HEAT_IS_DISABLED_THEN_GOTO(this, label) \
++ do { \
++ gf_ctr_private_t *_priv = NULL; \
++ GF_ASSERT(this); \
++ GF_ASSERT(this->private); \
++ _priv = this->private; \
++ if (!_priv->ctr_record_metadata_heat) \
++ goto label; \
++ } while (0)
++
++int
++fill_db_record_for_unwind(xlator_t *this, gf_ctr_local_t *ctr_local,
++ gfdb_fop_type_t fop_type, gfdb_fop_path_t fop_path);
++
++int
++fill_db_record_for_wind(xlator_t *this, gf_ctr_local_t *ctr_local,
++ gf_ctr_inode_context_t *ctr_inode_cx);
++
++/*******************************************************************************
++ * CTR INSERT WIND
++ * *****************************************************************************
++ * Function used to insert/update a record in the database during the wind
++ * of a fop. It creates the ctr_local structure in the frame of the fop
++ * call.
++ * ****************************************************************************/
++
++static inline int
++ctr_insert_wind(call_frame_t *frame, xlator_t *this,
++ gf_ctr_inode_context_t *ctr_inode_cx)
++{
++ int ret = -1;
++ gf_ctr_private_t *_priv = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(frame->root);
++ GF_ASSERT(this);
++ IS_CTR_INODE_CX_SANE(ctr_inode_cx);
++
++ _priv = this->private;
++ GF_ASSERT(_priv);
++
++ GF_ASSERT(_priv->_db_conn);
++
++ /* If the record_wind option of CTR is on, record the wind for
++ * regular files only */
++ if (_priv->ctr_record_wind && ctr_inode_cx->ia_type != IA_IFDIR) {
++ frame->local = init_ctr_local_t(this);
++ if (!frame->local) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++ "WIND: Error while creating ctr local");
++ goto out;
++ };
++ ctr_local = frame->local;
++ ctr_local->client_pid = frame->root->pid;
++ ctr_local->is_internal_fop = ctr_inode_cx->is_internal_fop;
++
++ /* Decide whether to record counters or not */
++ CTR_DB_REC(ctr_local).do_record_counters = _gf_false;
++ /* If record counter is enabled */
++ if (_priv->ctr_record_counter) {
++ /* If not an internal fop */
++ if (!(ctr_local->is_internal_fop)) {
++ /* If its a metadata fop AND
++ * record metadata heat
++ * OR
++ * its NOT a metadata fop */
++ if ((ctr_inode_cx->is_metadata_fop &&
++ _priv->ctr_record_metadata_heat) ||
++ (!ctr_inode_cx->is_metadata_fop)) {
++ CTR_DB_REC(ctr_local).do_record_counters = _gf_true;
++ }
++ }
++ }
++
++ /* Decide whether to record times or not
++ * For non internal FOPS record times as usual*/
++ CTR_DB_REC(ctr_local).do_record_times = _gf_false;
++ if (!ctr_local->is_internal_fop) {
++ /* If its a metadata fop AND
++ * record metadata heat
++ * OR
++ * its NOT a metadata fop */
++ if ((ctr_inode_cx->is_metadata_fop &&
++ _priv->ctr_record_metadata_heat) ||
++ (!ctr_inode_cx->is_metadata_fop)) {
++ CTR_DB_REC(ctr_local).do_record_times =
++ (_priv->ctr_record_wind || _priv->ctr_record_unwind);
++ }
++ }
++ /* when it's an internal fop */
++ else {
++ /* Record times only for create,
++ * i.e. when the inode is created */
++ CTR_DB_REC(ctr_local).do_record_times = (isdentrycreatefop(
++ ctr_inode_cx->fop_type))
++ ? _gf_true
++ : _gf_false;
++ }
++
++ /*Fill the db record for insertion*/
++ ret = fill_db_record_for_wind(this, ctr_local, ctr_inode_cx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND,
++ "WIND: Error filling ctr local");
++ goto out;
++ }
++
++ /*Insert the db record*/
++ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_INSERT_RECORD_WIND_FAILED,
++ "WIND: Inserting of record failed!");
++ goto out;
++ }
++ }
++ ret = 0;
++out:
++
++ if (ret) {
++ free_ctr_local(ctr_local);
++ frame->local = NULL;
++ }
++
++ return ret;
++}
++
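++/* A minimal usage sketch (an assumption, not part of the original patch):
++ * a wind-side fop handler would fill a gf_ctr_inode_context_t and hand it
++ * to ctr_insert_wind(); only the fields consumed above are shown, and
++ * GFDB_FOP_DENTRY_WRITE is reused from the delete path further below. */
++#if 0
++gf_ctr_inode_context_t ctr_inode_cx = {0};
++int ret = -1;
++
++ctr_inode_cx.ia_type = IA_IFREG; /* regular file, so recording happens */
++ctr_inode_cx.fop_type = GFDB_FOP_DENTRY_WRITE;
++ctr_inode_cx.is_internal_fop = _gf_false;
++ctr_inode_cx.is_metadata_fop = _gf_false;
++ret = ctr_insert_wind(frame, this, &ctr_inode_cx);
++if (ret)
++ gf_msg_trace(this->name, 0, "wind-side record insertion failed");
++#endif
++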
++/*******************************************************************************
++ * CTR INSERT UNWIND
++ * *****************************************************************************
++ * Function used to insert/update a record in the database during an unwind
++ * fop. This function destroys the ctr_local structure in the frame of the
++ * fop call at the end.
++ * ****************************************************************************/
++static inline int
++ctr_insert_unwind(call_frame_t *frame, xlator_t *this, gfdb_fop_type_t fop_type,
++ gfdb_fop_path_t fop_path)
++{
++ int ret = -1;
++ gf_ctr_private_t *_priv = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++
++ _priv = this->private;
++ GF_ASSERT(_priv);
++
++ GF_ASSERT(_priv->_db_conn);
++
++ ctr_local = frame->local;
++
++ if (ctr_local && (_priv->ctr_record_unwind || isdentryfop(fop_type)) &&
++ (ctr_local->ia_inode_type != IA_IFDIR)) {
++ CTR_DB_REC(ctr_local).do_record_uwind_time = _priv->ctr_record_unwind;
++
++ ret = fill_db_record_for_unwind(this, ctr_local, fop_type, fop_path);
++ if (ret == -1) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
++ "UNWIND: Error filling ctr local");
++ goto out;
++ }
++
++ ret = insert_record(_priv->_db_conn, &ctr_local->gfdb_db_record);
++ if (ret == -1) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND,
++ "UNWIND: Error inserting the record");
++ goto out;
++ }
++ }
++ ret = 0;
++out:
++ return ret;
++}
++
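++/* A minimal usage sketch (an assumption, not part of the original patch):
++ * an unwind callback would record via ctr_insert_unwind(). GFDB_FOP_UNWIND
++ * is assumed to be a member of gfdb_fop_path_t alongside the GFDB_FOP_UNDEL*
++ * values used below. */
++#if 0
++ret = ctr_insert_unwind(frame, this, GFDB_FOP_DENTRY_WRITE, GFDB_FOP_UNWIND);
++if (ret)
++ gf_msg_trace(this->name, 0, "unwind-side record insertion failed");
++#endif
++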
++/******************************************************************************
++ * Delete file/hard-link record(s) from the db
++ * ****************************************************************************/
++static inline int
++ctr_delete_hard_link_from_db(xlator_t *this, uuid_t gfid, uuid_t pargfid,
++ char *basename, gfdb_fop_type_t fop_type,
++ gfdb_fop_path_t fop_path)
++{
++ int ret = -1;
++ gfdb_db_record_t gfdb_db_record;
++ gf_ctr_private_t *_priv = NULL;
++
++ _priv = this->private;
++ GF_VALIDATE_OR_GOTO(this->name, _priv, out);
++ GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(gfid)), out);
++ GF_VALIDATE_OR_GOTO(this->name, (!gf_uuid_is_null(pargfid)), out);
++ GF_VALIDATE_OR_GOTO(this->name, (fop_type == GFDB_FOP_DENTRY_WRITE), out);
++ GF_VALIDATE_OR_GOTO(
++ this->name,
++ (fop_path == GFDB_FOP_UNDEL || fop_path == GFDB_FOP_UNDEL_ALL), out);
++
++ /* Set gfdb_db_record to 0 */
++ memset(&gfdb_db_record, 0, sizeof(gfdb_db_record));
++
++ /* Copy basename */
++ if (snprintf(gfdb_db_record.file_name, GF_NAME_MAX, "%s", basename) >=
++ GF_NAME_MAX)
++ goto out;
++
++ /* Copy gfid into db record */
++ gf_uuid_copy(gfdb_db_record.gfid, gfid);
++
++ /* Copy pargfid into db record */
++ gf_uuid_copy(gfdb_db_record.pargfid, pargfid);
++
++ gfdb_db_record.gfdb_fop_path = fop_path;
++ gfdb_db_record.gfdb_fop_type = fop_type;
++
++ /*send delete request to db*/
++ ret = insert_record(_priv->_db_conn, &gfdb_db_record);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_INSERT_RECORD_WIND_FAILED,
++ "Failed to delete record. %s", basename);
++ goto out;
++ }
++
++ ret = 0;
++out:
++ return ret;
++}
++
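++/* A minimal usage sketch (not part of the original patch): an unlink-style
++ * unwind path could drop one name of a file from the database like this,
++ * reusing the record fields populated earlier in the fop: */
++#if 0
++ret = ctr_delete_hard_link_from_db(this, CTR_DB_REC(ctr_local).gfid,
++ CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name,
++ GFDB_FOP_DENTRY_WRITE, GFDB_FOP_UNDEL);
++#endif
++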
++/******************************* Hard link function ***************************/
++
++static inline gf_boolean_t
++__is_inode_expired(ctr_xlator_ctx_t *ctr_xlator_ctx, gf_ctr_private_t *_priv,
++ gfdb_time_t *current_time)
++{
++ gf_boolean_t ret = _gf_false;
++ uint64_t time_diff = 0;
++
++ GF_ASSERT(ctr_xlator_ctx);
++ GF_ASSERT(_priv);
++ GF_ASSERT(current_time);
++
++ time_diff = current_time->tv_sec - ctr_xlator_ctx->inode_heal_period;
++
++ ret = (time_diff >= _priv->ctr_lookupheal_inode_timeout) ? _gf_true
++ : _gf_false;
++ return ret;
++}
++
++static inline gf_boolean_t
++__is_hardlink_expired(ctr_hard_link_t *ctr_hard_link, gf_ctr_private_t *_priv,
++ gfdb_time_t *current_time)
++{
++ gf_boolean_t ret = _gf_false;
++ uint64_t time_diff = 0;
++
++ GF_ASSERT(ctr_hard_link);
++ GF_ASSERT(_priv);
++ GF_ASSERT(current_time);
++
++ time_diff = current_time->tv_sec - ctr_hard_link->hardlink_heal_period;
++
++ ret = (time_diff >= _priv->ctr_lookupheal_link_timeout) ? _gf_true
++ : _gf_false;
++
++ return ret;
++}
++
++/* Return values of heal*/
++typedef enum ctr_heal_ret_val {
++ CTR_CTX_ERROR = -1,
++ /* No healing required */
++ CTR_TRY_NO_HEAL = 0,
++ /* Try healing hard link */
++ CTR_TRY_HARDLINK_HEAL = 1,
++ /* Try healing inode */
++ CTR_TRY_INODE_HEAL = 2,
++} ctr_heal_ret_val_t;
++
++/**
++ * @brief Function to add a hard link to the inode context variable.
++ * The inode context maintains an in-memory list of hard links. This is
++ * used for smart healing of the database.
++ * @param frame of the FOP
++ * @param this is the xlator instance
++ * @param inode
++ * @return ctr_heal_ret_val_t
++ */
++
++static inline ctr_heal_ret_val_t
++add_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++ ctr_heal_ret_val_t ret_val = CTR_TRY_NO_HEAL;
++ int ret = -1;
++ gf_ctr_local_t *ctr_local = NULL;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++ ctr_hard_link_t *ctr_hard_link = NULL;
++ gf_ctr_private_t *_priv = NULL;
++ gfdb_time_t current_time = {0};
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++ GF_ASSERT(inode);
++ GF_ASSERT(this->private);
++
++ _priv = this->private;
++
++ ctr_local = frame->local;
++ if (!ctr_local) {
++ goto out;
++ }
++
++ ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
++ if (!ctr_xlator_ctx) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
++ "Failed accessing ctr inode context");
++ goto out;
++ }
++
++ LOCK(&ctr_xlator_ctx->lock);
++
++ /* Check if the hard link already exists
++ * in the ctr inode context*/
++ ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx,
++ CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name);
++ /* if already present, only refresh the heal periods */
++ if (ctr_hard_link) {
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++ ret_val = CTR_CTX_ERROR;
++ goto unlock;
++ }
++
++ if (__is_hardlink_expired(ctr_hard_link, _priv, &current_time)) {
++ ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++ ret_val = ret_val | CTR_TRY_HARDLINK_HEAL;
++ }
++
++ if (__is_inode_expired(ctr_xlator_ctx, _priv, &current_time)) {
++ ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
++ ret_val = ret_val | CTR_TRY_INODE_HEAL;
++ }
++
++ goto unlock;
++ }
++
++ /* Add the hard link to the list*/
++ ret = ctr_add_hard_link(this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED,
++ "Failed to add hardlink to the ctr inode context");
++ ret_val = CTR_CTX_ERROR;
++ goto unlock;
++ }
++
++ ret_val = CTR_TRY_NO_HEAL;
++unlock:
++ UNLOCK(&ctr_xlator_ctx->lock);
++out:
++ return ret_val;
++}
++
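++/* A minimal usage sketch (not part of the original patch): the heal hints
++ * returned above are bit flags, so a caller (e.g. a lookup path) can test
++ * CTR_TRY_HARDLINK_HEAL and CTR_TRY_INODE_HEAL independently: */
++#if 0
++ctr_heal_ret_val_t ret_val = add_hard_link_ctx(frame, this, inode);
++
++if (ret_val == CTR_CTX_ERROR)
++ goto out;
++if (ret_val & CTR_TRY_HARDLINK_HEAL) {
++ /* re-record this hard link in the database */
++}
++if (ret_val & CTR_TRY_INODE_HEAL) {
++ /* re-record the inode times in the database */
++}
++#endif
++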
++static inline int
++delete_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++ int ret = -1;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++ GF_ASSERT(inode);
++
++ ctr_local = frame->local;
++ if (!ctr_local) {
++ goto out;
++ }
++
++ ctr_xlator_ctx = get_ctr_xlator_ctx(this, inode);
++ if (!ctr_xlator_ctx) {
++ /* Since there is no ctr inode context, there is nothing more to do */
++ ret = 0;
++ goto out;
++ }
++
++ ret = ctr_delete_hard_link(this, ctr_xlator_ctx,
++ CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
++ "Failed to delete hard link");
++ goto out;
++ }
++
++ ret = 0;
++
++out:
++ return ret;
++}
++
++static inline int
++update_hard_link_ctx(call_frame_t *frame, xlator_t *this, inode_t *inode)
++{
++ int ret = -1;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++ gf_ctr_local_t *ctr_local = NULL;
++
++ GF_ASSERT(frame);
++ GF_ASSERT(this);
++ GF_ASSERT(inode);
++
++ ctr_local = frame->local;
++ if (!ctr_local) {
++ goto out;
++ }
++
++ ctr_xlator_ctx = init_ctr_xlator_ctx(this, inode);
++ if (!ctr_xlator_ctx) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED,
++ "Failed accessing ctr inode context");
++ goto out;
++ }
++
++ ret = ctr_update_hard_link(
++ this, ctr_xlator_ctx, CTR_DB_REC(ctr_local).pargfid,
++ CTR_DB_REC(ctr_local).file_name, CTR_DB_REC(ctr_local).old_pargfid,
++ CTR_DB_REC(ctr_local).old_file_name);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_UPDATE_HARDLINK_FAILED,
++ "Failed to update hard link");
++ goto out;
++ }
++
++ ret = 0;
++
++out:
++ return ret;
++}
++
++/******************************************************************************
++ *
++ * CTR xlator init related functions
++ *
++ *
++ * ****************************************************************************/
++int
++extract_db_params(xlator_t *this, dict_t *params_dict, gfdb_db_type_t db_type);
++
++int
++extract_ctr_options(xlator_t *this, gf_ctr_private_t *_priv);
++
++#endif
+diff --git a/xlators/features/changetimerecorder/src/ctr-messages.h b/xlators/features/changetimerecorder/src/ctr-messages.h
+new file mode 100644
+index 0000000..23adf0a
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-messages.h
+@@ -0,0 +1,61 @@
++/*
++ Copyright (c) 2013 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++ */
++
++#ifndef _CTR_MESSAGES_H_
++#define _CTR_MESSAGES_H_
++
++#include <glusterfs/glfs-message-id.h>
++
++/* To add new message IDs, append new identifiers at the end of the list.
++ *
++ * Never remove a message ID. If it's not used anymore, you can rename it or
++ * leave it as it is, but not delete it. This is to prevent reutilization of
++ * IDs by other messages.
++ *
++ * The component name must match one of the entries defined in
++ * glfs-message-id.h.
++ */
++
++GLFS_MSGID(
++ CTR, CTR_MSG_CREATE_CTR_LOCAL_ERROR_WIND,
++ CTR_MSG_FILL_CTR_LOCAL_ERROR_UNWIND, CTR_MSG_FILL_CTR_LOCAL_ERROR_WIND,
++ CTR_MSG_INSERT_LINK_WIND_FAILED, CTR_MSG_INSERT_WRITEV_WIND_FAILED,
++ CTR_MSG_INSERT_WRITEV_UNWIND_FAILED, CTR_MSG_INSERT_SETATTR_WIND_FAILED,
++ CTR_MSG_INSERT_SETATTR_UNWIND_FAILED,
++ CTR_MSG_INSERT_FREMOVEXATTR_UNWIND_FAILED,
++ CTR_MSG_INSERT_FREMOVEXATTR_WIND_FAILED,
++ CTR_MSG_INSERT_REMOVEXATTR_WIND_FAILED,
++ CTR_MSG_INSERT_REMOVEXATTR_UNWIND_FAILED,
++ CTR_MSG_INSERT_TRUNCATE_WIND_FAILED, CTR_MSG_INSERT_TRUNCATE_UNWIND_FAILED,
++ CTR_MSG_INSERT_FTRUNCATE_UNWIND_FAILED,
++ CTR_MSG_INSERT_FTRUNCATE_WIND_FAILED, CTR_MSG_INSERT_RENAME_WIND_FAILED,
++ CTR_MSG_INSERT_RENAME_UNWIND_FAILED,
++ CTR_MSG_ACCESS_CTR_INODE_CONTEXT_FAILED, CTR_MSG_ADD_HARDLINK_FAILED,
++ CTR_MSG_DELETE_HARDLINK_FAILED, CTR_MSG_UPDATE_HARDLINK_FAILED,
++ CTR_MSG_GET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ CTR_MSG_SET_CTR_RESPONSE_LINK_COUNT_XDATA_FAILED,
++ CTR_MSG_INSERT_UNLINK_UNWIND_FAILED, CTR_MSG_INSERT_UNLINK_WIND_FAILED,
++ CTR_MSG_XDATA_NULL, CTR_MSG_INSERT_FSYNC_WIND_FAILED,
++ CTR_MSG_INSERT_FSYNC_UNWIND_FAILED, CTR_MSG_INSERT_MKNOD_UNWIND_FAILED,
++ CTR_MSG_INSERT_MKNOD_WIND_FAILED, CTR_MSG_INSERT_CREATE_WIND_FAILED,
++ CTR_MSG_INSERT_CREATE_UNWIND_FAILED, CTR_MSG_INSERT_RECORD_WIND_FAILED,
++ CTR_MSG_INSERT_READV_WIND_FAILED, CTR_MSG_GET_GFID_FROM_DICT_FAILED,
++ CTR_MSG_SET, CTR_MSG_FATAL_ERROR, CTR_MSG_DANGLING_VOLUME,
++ CTR_MSG_CALLOC_FAILED, CTR_MSG_EXTRACT_CTR_XLATOR_OPTIONS_FAILED,
++ CTR_MSG_INIT_DB_PARAMS_FAILED, CTR_MSG_CREATE_LOCAL_MEMORY_POOL_FAILED,
++ CTR_MSG_MEM_ACC_INIT_FAILED, CTR_MSG_CLOSE_DB_CONN_FAILED,
++ CTR_MSG_FILL_UNWIND_TIME_REC_ERROR, CTR_MSG_WRONG_FOP_PATH,
++ CTR_MSG_CONSTRUCT_DB_PATH_FAILED, CTR_MSG_SET_VALUE_TO_SQL_PARAM_FAILED,
++ CTR_MSG_XLATOR_DISABLED, CTR_MSG_HARDLINK_MISSING_IN_LIST,
++ CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED, CTR_MSG_INIT_LOCK_FAILED,
++ CTR_MSG_COPY_FAILED, CTR_MSG_EXTRACT_DB_PARAM_OPTIONS_FAILED,
++ CTR_MSG_ADD_HARDLINK_TO_CTR_INODE_CONTEXT_FAILED, CTR_MSG_NULL_LOCAL);
++
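++/* A minimal usage sketch (not part of the original header): these IDs are
++ * passed as the message-id argument of gf_msg(), for example: */
++#if 0
++gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
++ "Failed allocating ctr_hard_link");
++#endif
++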
++#endif /* !_CTR_MESSAGES_H_ */
+diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
+new file mode 100644
+index 0000000..b6b66d5
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.c
+@@ -0,0 +1,362 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#include "ctr-xlator-ctx.h"
++#include "ctr-messages.h"
++#include <time.h>
++#include <sys/time.h>
++
++#define IS_THE_ONLY_HARDLINK(ctr_hard_link) \
++ (ctr_hard_link->list.next == ctr_hard_link->list.prev)
++
++static void
++fini_ctr_hard_link(ctr_hard_link_t **ctr_hard_link)
++{
++ GF_ASSERT(ctr_hard_link);
++
++ if (!*ctr_hard_link)
++ return;
++ GF_FREE((*ctr_hard_link)->base_name);
++ GF_FREE(*ctr_hard_link);
++ *ctr_hard_link = NULL;
++}
++
++/* Please lock the ctr_xlator_ctx before using this function */
++ctr_hard_link_t *
++ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name)
++{
++ ctr_hard_link_t *_hard_link = NULL;
++ ctr_hard_link_t *searched_hardlink = NULL;
++
++ GF_ASSERT(this);
++ GF_ASSERT(ctr_xlator_ctx);
++
++ if (pgfid == NULL || base_name == NULL)
++ goto out;
++
++ /*linear search*/
++ list_for_each_entry(_hard_link, &ctr_xlator_ctx->hardlink_list, list)
++ {
++ if (gf_uuid_compare(_hard_link->pgfid, pgfid) == 0 &&
++ _hard_link->base_name &&
++ strcmp(_hard_link->base_name, base_name) == 0) {
++ searched_hardlink = _hard_link;
++ break;
++ }
++ }
++
++out:
++ return searched_hardlink;
++}
++
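++/* A minimal usage sketch (not part of the original patch): the search
++ * helper takes no lock itself, so a caller must hold ctr_xlator_ctx->lock
++ * around it, as ctr_delete_hard_link() below does: */
++#if 0
++LOCK(&ctr_xlator_ctx->lock);
++ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, pgfid,
++ base_name);
++/* ... use ctr_hard_link while the lock is held ... */
++UNLOCK(&ctr_xlator_ctx->lock);
++#endif
++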
++/* Please lock the ctr_xlator_ctx before using this function */
++int
++ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name)
++{
++ int ret = -1;
++ ctr_hard_link_t *ctr_hard_link = NULL;
++ struct timeval current_time = {0};
++
++ GF_ASSERT(this);
++ GF_ASSERT(ctr_xlator_ctx);
++
++ if (pgfid == NULL || base_name == NULL)
++ goto out;
++
++ ctr_hard_link = GF_CALLOC(1, sizeof(*ctr_hard_link), gf_ctr_mt_hard_link_t);
++ if (!ctr_hard_link) {
++ gf_msg(this->name, GF_LOG_ERROR, ENOMEM, CTR_MSG_CALLOC_FAILED,
++ "Failed allocating "
++ "ctr_hard_link");
++ goto out;
++ }
++
++ /*Initialize the ctr_hard_link object and
++ * Assign the values : parent GFID and basename*/
++ INIT_LIST_HEAD(&ctr_hard_link->list);
++ gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
++ ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
++ if (ret < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
++ "Failed copying basename"
++ "to ctr_hard_link");
++ goto error;
++ }
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++ goto error;
++ }
++
++ /*Add the hard link to the list*/
++ list_add_tail(&ctr_hard_link->list, &ctr_xlator_ctx->hardlink_list);
++
++ ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++
++ /* all is well */
++ ret = 0;
++ goto out;
++error:
++ GF_FREE(ctr_hard_link);
++out:
++ return ret;
++}
++
++static void
++__delete_hard_link_from_list(ctr_hard_link_t **ctr_hard_link)
++{
++ GF_ASSERT(ctr_hard_link);
++ GF_ASSERT(*ctr_hard_link);
++
++ /*Remove hard link from list*/
++ list_del(&(*ctr_hard_link)->list);
++ fini_ctr_hard_link(ctr_hard_link);
++}
++
++int
++ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name)
++{
++ int ret = -1;
++ ctr_hard_link_t *ctr_hard_link = NULL;
++
++ GF_ASSERT(this);
++ GF_ASSERT(ctr_xlator_ctx);
++
++ LOCK(&ctr_xlator_ctx->lock);
++
++ /*Check if the hard link is present */
++ ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, pgfid,
++ base_name);
++ if (!ctr_hard_link) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_HARDLINK_MISSING_IN_LIST,
++ "Hard link doesn't exist in the list");
++ goto out;
++ }
++
++ __delete_hard_link_from_list(&ctr_hard_link);
++ ctr_hard_link = NULL;
++
++ ret = 0;
++out:
++ UNLOCK(&ctr_xlator_ctx->lock);
++
++ return ret;
++}
++
++int
++ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
++ const char *old_base_name)
++{
++ int ret = -1;
++ ctr_hard_link_t *ctr_hard_link = NULL;
++ struct timeval current_time = {0};
++
++ GF_ASSERT(this);
++ GF_ASSERT(ctr_xlator_ctx);
++
++ LOCK(&ctr_xlator_ctx->lock);
++
++ /*Check if the hard link is present */
++ ctr_hard_link = ctr_search_hard_link_ctx(this, ctr_xlator_ctx, old_pgfid,
++ old_base_name);
++ if (!ctr_hard_link) {
++ gf_msg_trace(this->name, 0,
++ "Hard link doesn't exist"
++ " in the list");
++ /* Since the hard link is not present in the list
++ * we add it to the list */
++ ret = ctr_add_hard_link(this, ctr_xlator_ctx, pgfid, base_name);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, 0,
++ CTR_MSG_ADD_HARDLINK_TO_LIST_FAILED,
++ "Failed adding hard link to the list");
++ goto out;
++ }
++ ret = 0;
++ goto out;
++ }
++
++ /* update the hard link */
++ gf_uuid_copy(ctr_hard_link->pgfid, pgfid);
++ GF_FREE(ctr_hard_link->base_name);
++ ret = gf_asprintf(&ctr_hard_link->base_name, "%s", base_name);
++ if (ret < 0) {
++ gf_msg(this->name, GF_LOG_ERROR, 0, CTR_MSG_COPY_FAILED,
++ "Failed copying basename"
++ "to ctr_hard_link");
++ /* delete the corrupted entry */
++ __delete_hard_link_from_list(&ctr_hard_link);
++ ctr_hard_link = NULL;
++ goto out;
++ }
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++ ctr_hard_link->hardlink_heal_period = 0;
++ } else {
++ ctr_hard_link->hardlink_heal_period = current_time.tv_sec;
++ }
++
++ ret = 0;
++
++out:
++ UNLOCK(&ctr_xlator_ctx->lock);
++
++ return ret;
++}
++
++/* Delete all hardlinks */
++static int
++ctr_delete_all_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx)
++{
++ int ret = -1;
++ ctr_hard_link_t *ctr_hard_link = NULL;
++ ctr_hard_link_t *tmp = NULL;
++
++ GF_ASSERT(ctr_xlator_ctx);
++
++ LOCK(&ctr_xlator_ctx->lock);
++
++ list_for_each_entry_safe(ctr_hard_link, tmp, &ctr_xlator_ctx->hardlink_list,
++ list)
++ {
++ /*Remove hard link from list*/
++ __delete_hard_link_from_list(&ctr_hard_link);
++ ctr_hard_link = NULL;
++ }
++
++ UNLOCK(&ctr_xlator_ctx->lock);
++
++ ret = 0;
++
++ return ret;
++}
++
++/* Please lock the inode before using this function */
++static ctr_xlator_ctx_t *
++__get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++ int ret = 0;
++ uint64_t _addr = 0;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++ GF_ASSERT(this);
++ GF_ASSERT(inode);
++
++ ret = __inode_ctx_get(inode, this, &_addr);
++ if (ret < 0)
++ _addr = 0;
++ if (_addr != 0) {
++ ctr_xlator_ctx = (ctr_xlator_ctx_t *)(long)_addr;
++ }
++
++ return ctr_xlator_ctx;
++}
++
++ctr_xlator_ctx_t *
++init_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++ int ret = -1;
++ uint64_t _addr = 0;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++ struct timeval current_time = {0};
++
++ GF_ASSERT(this);
++ GF_ASSERT(inode);
++
++ LOCK(&inode->lock);
++ {
++ ctr_xlator_ctx = __get_ctr_xlator_ctx(this, inode);
++ if (ctr_xlator_ctx) {
++ ret = 0;
++ goto out;
++ }
++ ctr_xlator_ctx = GF_CALLOC(1, sizeof(*ctr_xlator_ctx),
++ gf_ctr_mt_xlator_ctx);
++ if (!ctr_xlator_ctx)
++ goto out;
++
++ ret = LOCK_INIT(&ctr_xlator_ctx->lock);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_ERROR, ret, CTR_MSG_INIT_LOCK_FAILED,
++ "Failed init lock %s", strerror(ret));
++ goto out;
++ }
++ _addr = (uint64_t)(uintptr_t)ctr_xlator_ctx;
++
++ ret = __inode_ctx_set(inode, this, &_addr);
++ if (ret) {
++ goto out;
++ }
++
++ INIT_LIST_HEAD(&ctr_xlator_ctx->hardlink_list);
++
++ ret = gettimeofday(&current_time, NULL);
++ if (ret == -1) {
++ gf_log(this->name, GF_LOG_ERROR, "Failed to get current time");
++ goto out;
++ }
++
++ ctr_xlator_ctx->inode_heal_period = current_time.tv_sec;
++ }
++ ret = 0;
++out:
++ if (ret) {
++ GF_FREE(ctr_xlator_ctx);
++ ctr_xlator_ctx = NULL;
++ }
++
++ UNLOCK(&inode->lock);
++
++ return ctr_xlator_ctx;
++}
++
++void
++fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++ int ret = 0;
++ uint64_t _addr = 0;
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++ inode_ctx_del(inode, this, &_addr);
++ if (!_addr)
++ return;
++
++ ctr_xlator_ctx = (ctr_xlator_ctx_t *)(long)_addr;
++
++ ret = ctr_delete_all_hard_link(this, ctr_xlator_ctx);
++ if (ret) {
++ gf_msg(this->name, GF_LOG_WARNING, 0, CTR_MSG_DELETE_HARDLINK_FAILED,
++ "Failed deleting all "
++ "hard links from inode context");
++ }
++
++ LOCK_DESTROY(&ctr_xlator_ctx->lock);
++
++ GF_FREE(ctr_xlator_ctx);
++}
++
++ctr_xlator_ctx_t *
++get_ctr_xlator_ctx(xlator_t *this, inode_t *inode)
++{
++ ctr_xlator_ctx_t *ctr_xlator_ctx = NULL;
++
++ LOCK(&inode->lock);
++ ctr_xlator_ctx = __get_ctr_xlator_ctx(this, inode);
++ UNLOCK(&inode->lock);
++
++ return ctr_xlator_ctx;
++}
+diff --git a/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
+new file mode 100644
+index 0000000..4e3bf7e
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr-xlator-ctx.h
+@@ -0,0 +1,68 @@
++/*
++ Copyright (c) 2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_XLATOR_CTX_H
++#define __CTR_XLATOR_CTX_H
++
++#include <glusterfs/xlator.h>
++#include "ctr_mem_types.h"
++#include <glusterfs/iatt.h>
++#include <glusterfs/glusterfs.h>
++#include <glusterfs/logging.h>
++#include <glusterfs/locking.h>
++#include <glusterfs/common-utils.h>
++#include <time.h>
++#include <sys/time.h>
++
++typedef struct ctr_hard_link {
++ uuid_t pgfid;
++ char *base_name;
++ /* Hardlink expiry : Defines the expiry period after which a
++ * database heal is attempted. */
++ uint64_t hardlink_heal_period;
++ struct list_head list;
++} ctr_hard_link_t;
++
++typedef struct ctr_xlator_ctx {
++ /* This represents the looked up hardlinks
++ * NOTE: This doesn't represent all physical hardlinks of the inode*/
++ struct list_head hardlink_list;
++ uint64_t inode_heal_period;
++ gf_lock_t lock;
++} ctr_xlator_ctx_t;
++
++ctr_hard_link_t *
++ctr_search_hard_link_ctx(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name);
++
++int
++ctr_add_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name);
++
++int
++ctr_delete_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name);
++
++int
++ctr_update_hard_link(xlator_t *this, ctr_xlator_ctx_t *ctr_xlator_ctx,
++ uuid_t pgfid, const char *base_name, uuid_t old_pgfid,
++ const char *old_base_name);
++
++ctr_xlator_ctx_t *
++get_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
++ctr_xlator_ctx_t *
++init_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
++void
++fini_ctr_xlator_ctx(xlator_t *this, inode_t *inode);
++
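++/* A minimal usage sketch (not part of the original header) of the typical
++ * lifecycle of the per-inode context, matching ctr-xlator-ctx.c above:
++ * init creates (or returns) the context, get only looks it up, and fini
++ * is called when the inode is forgotten. */
++#if 0
++ctr_xlator_ctx_t *ctx = init_ctr_xlator_ctx(this, inode);
++if (ctx) {
++ LOCK(&ctx->lock);
++ /* search / add / delete hard links here */
++ UNLOCK(&ctx->lock);
++}
++/* on inode forget: */
++fini_ctr_xlator_ctx(this, inode);
++#endif
++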
++#endif
+diff --git a/xlators/features/changetimerecorder/src/ctr_mem_types.h b/xlators/features/changetimerecorder/src/ctr_mem_types.h
+new file mode 100644
+index 0000000..7b8f531
+--- /dev/null
++++ b/xlators/features/changetimerecorder/src/ctr_mem_types.h
+@@ -0,0 +1,22 @@
++/*
++ Copyright (c) 2008-2015 Red Hat, Inc. <http://www.redhat.com>
++ This file is part of GlusterFS.
++
++ This file is licensed to you under your choice of the GNU Lesser
++ General Public License, version 3 or any later version (LGPLv3 or
++ later), or the GNU General Public License, version 2 (GPLv2), in all
++ cases as published by the Free Software Foundation.
++*/
++
++#ifndef __CTR_MEM_TYPES_H__
++#define __CTR_MEM_TYPES_H__
++
++#include "gfdb_mem-types.h"
++
++enum gf_ctr_mem_types_ {
++ gf_ctr_mt_private_t = gfdb_mt_end + 1,
++ gf_ctr_mt_xlator_ctx,
++ gf_ctr_mt_hard_link_t,
++ gf_ctr_mt_end
++};
++#endif
+--
+1.8.3.1
+