Diffstat (limited to '0457-cluster-ec-Improve-detection-of-new-heals.patch')
 0457-cluster-ec-Improve-detection-of-new-heals.patch | 409 +++++++++++++++++++
 1 file changed, 409 insertions(+), 0 deletions(-)
diff --git a/0457-cluster-ec-Improve-detection-of-new-heals.patch b/0457-cluster-ec-Improve-detection-of-new-heals.patch
new file mode 100644
index 0000000..be9202a
--- /dev/null
+++ b/0457-cluster-ec-Improve-detection-of-new-heals.patch
@@ -0,0 +1,409 @@
+From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez <xhernandez@redhat.com>
+Date: Thu, 2 Jul 2020 18:08:52 +0200
+Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
+
+When EC successfully healed a directory, it assumed that other entries
+inside that directory might have been created and could require
+additional heal cycles. For this reason, when the heal happened as part
+of an index heal iteration, it triggered a new iteration.
+
+The problem happened when the directory was healthy, so no new entries
+were added, but its index entry was not removed for some reason. In
+this case self-heal entered an endless loop, healing the same directory
+continuously and causing high CPU utilization.
+
+This patch improves detection of new files added to the heal index so
+that a new index heal iteration is only triggered if there is new work
+to do.
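+
+A minimal sketch of the idea (illustrative only; the names below are
+hypothetical, not the real gluster functions): the heal pass reports
+how many entries it actually created, and the index scan is rerun only
+when that count is non-zero, instead of unconditionally:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    /* Pretend heal of one directory: returns the number of entries
+     * that were newly created and therefore still need healing. */
+    static uint32_t heal_directory_once(bool dir_is_healthy)
+    {
+        return dir_is_healthy ? 0 : 1;
+    }
+
+    int main(void)
+    {
+        bool rerun = true;
+
+        while (rerun) {
+            /* A healthy directory with a stale index entry creates
+             * nothing new. Old behavior: rerun unconditionally and
+             * loop forever. New behavior: stop when pending == 0. */
+            uint32_t pending = heal_directory_once(true);
+            rerun = (pending > 0);
+        }
+        return 0;
+    }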
+
+>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
+>Fixes: #1354
+
+Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
+BUG: 1852736
+Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
+Tested-by: Ashish Pandey <aspandey@redhat.com>
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Ashish Pandey <aspandey@redhat.com>
+---
+ xlators/cluster/ec/src/ec-common.c | 2 +-
+ xlators/cluster/ec/src/ec-heal.c | 58 +++++++++++++++++++++++-----------
+ xlators/cluster/ec/src/ec-heald.c | 24 ++++++++++----
+ xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
+ xlators/cluster/ec/src/ec-types.h | 4 +--
+ xlators/cluster/ec/src/ec.h | 1 +
+ 6 files changed, 86 insertions(+), 30 deletions(-)
+
+diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
+index e580bfb..e3f8769 100644
+--- a/xlators/cluster/ec/src/ec-common.c
++++ b/xlators/cluster/ec/src/ec-common.c
+@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+ int32_t
+ ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
+- uintptr_t bad, dict_t *xdata)
++ uintptr_t bad, uint32_t pending, dict_t *xdata)
+ {
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
+diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
+index 06a7016..e2de879 100644
+--- a/xlators/cluster/ec/src/ec-heal.c
++++ b/xlators/cluster/ec/src/ec-heal.c
+@@ -72,6 +72,7 @@ struct ec_name_data {
+ char *name;
+ inode_t *parent;
+ default_args_cbk_t *replies;
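++ /* Entries newly created by name heal that still need healing. */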
++ uint32_t heal_pending;
+ };
+
+ static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
+ ret = -ENOTCONN;
+ goto out;
+ }
++
+ out:
+ if (xattr)
+ dict_unref(xattr);
+@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ dict_t *xdata = NULL;
+ char *linkname = NULL;
+ ec_config_t config;
++
+ /* There should be just one gfid key */
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (gfid_db->count != 1) {
+@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+
+ ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
+ participants);
++ if (ret >= 0) {
++ /* If ec_create_name() succeeded, we return 1 to indicate that a new
++ * file has been created and it will need to be healed. */
++ ret = 1;
++ }
+ out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
+ name_on);
+
+- if (ret < 0)
++ if (ret < 0) {
+ memset(name_on, 0, ec->nodes);
++ } else {
++ name_data->heal_pending += ret;
++ }
+
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data->participants[i] && !name_on[i])
+ name_data->failed_on[i] = 1;
++
+ return 0;
+ }
+
+ int
+ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+- unsigned char *participants)
++ unsigned char *participants, uint32_t *pending)
+ {
+ int i = 0;
+ int j = 0;
+@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ name_data.frame = frame;
+ name_data.participants = participants;
+ name_data.failed_on = alloca0(ec->nodes);
+- ;
++ name_data.heal_pending = 0;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i])
+@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ break;
+ }
+ }
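++ /* Report how many new entries were created so the caller can
++ * decide whether another index heal iteration is needed. */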
++ *pending += name_data.heal_pending;
++
+ loc_wipe(&loc);
+ return ret;
+ }
+@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ int
+ __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *heal_on, unsigned char *sources,
+- unsigned char *healed_sinks)
++ unsigned char *healed_sinks, uint32_t *pending)
+ {
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+@@ -1580,7 +1594,7 @@ unlock:
+ if (sources[i] || healed_sinks[i])
+ participants[i] = 1;
+ }
+- ret = ec_heal_names(frame, ec, inode, participants);
++ ret = ec_heal_names(frame, ec, inode, participants, pending);
+
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
+ goto out;
+@@ -1601,7 +1615,8 @@ out:
+
+ int
+ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+- unsigned char *sources, unsigned char *healed_sinks)
++ unsigned char *sources, unsigned char *healed_sinks,
++ uint32_t *pending)
+ {
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ goto unlock;
+ }
+ ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
+- healed_sinks);
++ healed_sinks, pending);
+ }
+ unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
+ (heal->good | heal->bad), heal->good, heal->bad,
+- NULL);
++ 0, NULL);
+ }
+
+ return EC_STATE_END;
+ case -EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+- fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
+- 0, 0, NULL);
++ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
++ fop->error, 0, 0, 0, 0, NULL);
+ }
+
+ return EC_STATE_END;
+@@ -1997,14 +2012,15 @@ out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+- func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
++ func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
+ }
+ }
+
+ int32_t
+ ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+- uintptr_t good, uintptr_t bad, dict_t *xdata)
++ uintptr_t good, uintptr_t bad, uint32_t pending,
++ dict_t *xdata)
+ {
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ intptr_t mbad = 0;
+ intptr_t good = 0;
+ intptr_t bad = 0;
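++ /* Count of new entries created while healing this inode. */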
++ uint32_t pending = 0;
+ ec_fop_data_t *fop = data;
+ gf_boolean_t blocking = _gf_false;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ if (loc->name && strlen(loc->name)) {
+ ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
+ participants);
+- if (ret == 0) {
++ if (ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: name heal "
+ "successful on %" PRIXPTR,
+@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+
+ /* Mount triggers heal only when it detects that it must need heal, shd
+ * triggers heals periodically which need not be thorough*/
+- if (ec->shd.iamshd) {
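++ /* A positive ret from name heal means a new entry was just created,
++ * so we already know more heal work exists and the inspection below
++ * can be skipped. */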
++ if (ec->shd.iamshd && (ret <= 0)) {
+ ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+ &need_heal);
+
+@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ goto out;
+ }
+ }
++
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ if (IA_ISREG(loc->inode->ia_type)) {
+ ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
+ healed_sinks);
+ } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
+- ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
++ ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
++ &pending);
+ } else {
+ ret = 0;
+ memcpy(sources, participants, ec->nodes);
+@@ -2588,10 +2607,11 @@ out:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
+ ec_char_array_to_mask(participants, ec->nodes),
+- mgood & good, mbad & bad, NULL);
++ mgood & good, mbad & bad, pending, NULL);
+ }
+ if (frame)
+ STACK_DESTROY(frame->root);
++
+ return;
+ }
+
+@@ -2638,8 +2658,8 @@ void
+ ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
+ {
+ if (fop->cbks.heal) {
+- fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
+- NULL);
++ fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
++ 0, 0, NULL);
+ }
+ ec_fop_data_release(fop);
+ }
+@@ -2826,7 +2846,7 @@ fail:
+ if (fop)
+ ec_fop_data_release(fop);
+ if (func)
+- func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
++ func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
+ }
+
+ int
+diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
+index cba111a..4f4b6aa 100644
+--- a/xlators/cluster/ec/src/ec-heald.c
++++ b/xlators/cluster/ec/src/ec-heald.c
+@@ -156,15 +156,27 @@ int
+ ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
+ gf_boolean_t full)
+ {
++ dict_t *xdata = NULL;
++ uint32_t count;
+ int32_t ret;
+
+- ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
+- if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
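++ /* Requesting the virtual EC_XATTR_HEAL xattr triggers the heal. The
++ * reply xdata may carry EC_XATTR_HEAL_NEW with the number of newly
++ * created entries. */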
++ ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
++ if (!full && (loc->inode->ia_type == IA_IFDIR)) {
+ /* If we have just healed a directory, it's possible that
+- * other index entries have appeared to be healed. We put a
+- * mark so that we can check it later and restart a scan
+- * without delay. */
+- healer->rerun = _gf_true;
++ * other index entries have appeared that need to be healed. */
++ if ((xdata != NULL) &&
++ (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
++ (count > 0)) {
++ /* Force a rerun of the index healer. */
++ gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
++ count);
++
++ healer->rerun = _gf_true;
++ }
++ }
++
++ if (xdata != NULL) {
++ dict_unref(xdata);
+ }
+
+ return ret;
+diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
+index f87a94a..e82e8f6 100644
+--- a/xlators/cluster/ec/src/ec-inode-read.c
++++ b/xlators/cluster/ec/src/ec-inode-read.c
+@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
+ int32_t
+ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+- uintptr_t good, uintptr_t bad, dict_t *xdata)
++ uintptr_t good, uintptr_t bad, uint32_t pending,
++ dict_t *xdata)
+ {
+ ec_fop_data_t *fop = cookie;
+ fop_getxattr_cbk_t func = fop->data;
+@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ char *str;
+ char bin1[65], bin2[65];
+
++ /* We try to return the 'pending' information in xdata, but if this cannot
++ * be set, we will ignore it silently. We prefer to report the success or
++ * failure of the heal itself. */
++ if (xdata == NULL) {
++ xdata = dict_new();
++ } else {
++ dict_ref(xdata);
++ }
++ if (xdata != NULL) {
++ if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
++ /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
++ * enforces checking the result in this case. However, we don't
++ * really care whether it succeeded or not, so we proceed the same
++ * way regardless.
++ *
++ * This empty 'if' avoids the warning, and it will be removed by
++ * the optimizer. */
++ }
++ }
++
+ if (op_ret >= 0) {
+ dict = dict_new();
+ if (dict == NULL) {
+@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ }
+
+ out:
+- func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
++ func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
+
+ if (dict != NULL) {
+ dict_unref(dict);
+ }
++ if (xdata != NULL) {
++ dict_unref(xdata);
++ }
+
+ return 0;
+ }
+diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
+index 34a9768..f15429d 100644
+--- a/xlators/cluster/ec/src/ec-types.h
++++ b/xlators/cluster/ec/src/ec-types.h
+@@ -186,10 +186,10 @@ struct _ec_inode {
+
+ typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+- dict_t *);
++ uint32_t, dict_t *);
+ typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+- dict_t *);
++ uint32_t, dict_t *);
+
+ union _ec_cbk {
+ fop_access_cbk_t access;
+diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
+index 1b210d9..6f6de6d 100644
+--- a/xlators/cluster/ec/src/ec.h
++++ b/xlators/cluster/ec/src/ec.h
+@@ -18,6 +18,7 @@
+ #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
+ #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
+ #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
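++/* xdata key used to report how many new entries a heal has created. */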
++#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
+ #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
+ #define EC_STRIPE_CACHE_MAX_SIZE 10
+ #define EC_VERSION_SIZE 2
+--
+1.8.3.1
+