Diffstat (limited to '0457-cluster-ec-Improve-detection-of-new-heals.patch')
 0457-cluster-ec-Improve-detection-of-new-heals.patch | 409 +++++++++++++++++++
 1 file changed, 409 insertions(+), 0 deletions(-)
diff --git a/0457-cluster-ec-Improve-detection-of-new-heals.patch b/0457-cluster-ec-Improve-detection-of-new-heals.patch
new file mode 100644
index 0000000..be9202a
--- /dev/null
+++ b/0457-cluster-ec-Improve-detection-of-new-heals.patch
@@ -0,0 +1,409 @@
+From 3e8b3a2c2c6f83635486035fc8040c87d89813d2 Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez <xhernandez@redhat.com>
+Date: Thu, 2 Jul 2020 18:08:52 +0200
+Subject: [PATCH 457/465] cluster/ec: Improve detection of new heals
+
+When EC successfully healed a directory, it assumed that other entries
+inside that directory might have been created and could require
+additional heal cycles. For this reason, when the heal happened as part
+of an index heal iteration, it triggered a new iteration.
+
+The problem happened when the directory was healthy, so no new entries
+were added, but its index entry was not removed for some reason. In
+this case self-heal entered an endless loop, healing the same directory
+continuously and causing high CPU utilization.
+
+This patch improves detection of new files added to the heal index so
+that a new index heal iteration is only triggered if there is new work
+to do.
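+
+A minimal sketch of the idea (illustrative only; the names below are
+hypothetical, not the real gluster functions): the heal pass reports
+how many entries it actually created, and the index scan is rerun only
+when that count is non-zero, instead of unconditionally:
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    /* Pretend heal of one directory: returns the number of entries
+     * that were newly created and therefore still need healing. */
+    static uint32_t heal_directory_once(bool dir_is_healthy)
+    {
+        return dir_is_healthy ? 0 : 1;
+    }
+
+    int main(void)
+    {
+        bool rerun = true;
+
+        while (rerun) {
+            /* A healthy directory with a stale index entry creates
+             * nothing new. Old behavior: rerun unconditionally and
+             * loop forever. New behavior: stop when pending == 0. */
+            uint32_t pending = heal_directory_once(true);
+            rerun = (pending > 0);
+        }
+        return 0;
+    }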
+
+>Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24665/
+>Fixes: #1354
+
+Change-Id: I2355742b85fbfa6de758bccc5d2e1a283c82b53f
+BUG: 1852736
+Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/208041
+Tested-by: Ashish Pandey <aspandey@redhat.com>
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Ashish Pandey <aspandey@redhat.com>
+---
+ xlators/cluster/ec/src/ec-common.c | 2 +-
+ xlators/cluster/ec/src/ec-heal.c | 58 +++++++++++++++++++++++-----------
+ xlators/cluster/ec/src/ec-heald.c | 24 ++++++++++----
+ xlators/cluster/ec/src/ec-inode-read.c | 27 ++++++++++++++--
+ xlators/cluster/ec/src/ec-types.h | 4 +--
+ xlators/cluster/ec/src/ec.h | 1 +
+ 6 files changed, 86 insertions(+), 30 deletions(-)
+
+diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
+index e580bfb..e3f8769 100644
+--- a/xlators/cluster/ec/src/ec-common.c
++++ b/xlators/cluster/ec/src/ec-common.c
+@@ -230,7 +230,7 @@ ec_child_next(ec_t *ec, ec_fop_data_t *fop, uint32_t idx)
+ int32_t
+ ec_heal_report(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask, uintptr_t good,
+- uintptr_t bad, dict_t *xdata)
++ uintptr_t bad, uint32_t pending, dict_t *xdata)
+ {
+ if (op_ret < 0) {
+ gf_msg(this->name, GF_LOG_DEBUG, op_errno, EC_MSG_HEAL_FAIL,
+diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
+index 06a7016..e2de879 100644
+--- a/xlators/cluster/ec/src/ec-heal.c
++++ b/xlators/cluster/ec/src/ec-heal.c
+@@ -72,6 +72,7 @@ struct ec_name_data {
+ char *name;
+ inode_t *parent;
+ default_args_cbk_t *replies;
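++ /* Entries newly created by name heal that still need healing. */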
++ uint32_t heal_pending;
+ };
+
+ static char *ec_ignore_xattrs[] = {GF_SELINUX_XATTR_KEY, QUOTA_SIZE_KEY, NULL};
+@@ -996,6 +997,7 @@ ec_set_new_entry_dirty(ec_t *ec, loc_t *loc, struct iatt *ia,
+ ret = -ENOTCONN;
+ goto out;
+ }
++
+ out:
+ if (xattr)
+ dict_unref(xattr);
+@@ -1164,6 +1166,7 @@ ec_create_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+ dict_t *xdata = NULL;
+ char *linkname = NULL;
+ ec_config_t config;
++
+ /* There should be just one gfid key */
+ EC_REPLIES_ALLOC(replies, ec->nodes);
+ if (gfid_db->count != 1) {
+@@ -1408,6 +1411,11 @@ __ec_heal_name(call_frame_t *frame, ec_t *ec, inode_t *parent, char *name,
+
+ ret = ec_create_name(frame, ec, parent, name, replies, gfid_db, enoent,
+ participants);
++ if (ret >= 0) {
++ /* If ec_create_name() succeeded, we return 1 to indicate that a new
++ * file has been created and it will need to be healed. */
++ ret = 1;
++ }
+ out:
+ cluster_replies_wipe(replies, ec->nodes);
+ loc_wipe(&loc);
+@@ -1485,18 +1493,22 @@ ec_name_heal_handler(xlator_t *subvol, gf_dirent_t *entry, loc_t *parent,
+ ret = ec_heal_name(name_data->frame, ec, parent->inode, entry->d_name,
+ name_on);
+
+- if (ret < 0)
++ if (ret < 0) {
+ memset(name_on, 0, ec->nodes);
++ } else {
++ name_data->heal_pending += ret;
++ }
+
+ for (i = 0; i < ec->nodes; i++)
+ if (name_data->participants[i] && !name_on[i])
+ name_data->failed_on[i] = 1;
++
+ return 0;
+ }
+
+ int
+ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+- unsigned char *participants)
++ unsigned char *participants, uint32_t *pending)
+ {
+ int i = 0;
+ int j = 0;
+@@ -1509,7 +1521,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ name_data.frame = frame;
+ name_data.participants = participants;
+ name_data.failed_on = alloca0(ec->nodes);
+- ;
++ name_data.heal_pending = 0;
+
+ for (i = 0; i < ec->nodes; i++) {
+ if (!participants[i])
+@@ -1528,6 +1540,8 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ break;
+ }
+ }
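++ /* Report how many new entries were created so the caller can
++ * decide whether another index heal iteration is needed. */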
++ *pending += name_data.heal_pending;
++
+ loc_wipe(&loc);
+ return ret;
+ }
+@@ -1535,7 +1549,7 @@ ec_heal_names(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ int
+ __ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ unsigned char *heal_on, unsigned char *sources,
+- unsigned char *healed_sinks)
++ unsigned char *healed_sinks, uint32_t *pending)
+ {
+ unsigned char *locked_on = NULL;
+ unsigned char *output = NULL;
+@@ -1580,7 +1594,7 @@ unlock:
+ if (sources[i] || healed_sinks[i])
+ participants[i] = 1;
+ }
+- ret = ec_heal_names(frame, ec, inode, participants);
++ ret = ec_heal_names(frame, ec, inode, participants, pending);
+
+ if (EC_COUNT(participants, ec->nodes) <= ec->fragments)
+ goto out;
+@@ -1601,7 +1615,8 @@ out:
+
+ int
+ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+- unsigned char *sources, unsigned char *healed_sinks)
++ unsigned char *sources, unsigned char *healed_sinks,
++ uint32_t *pending)
+ {
+ unsigned char *locked_on = NULL;
+ unsigned char *up_subvols = NULL;
+@@ -1632,7 +1647,7 @@ ec_heal_entry(call_frame_t *frame, ec_t *ec, inode_t *inode,
+ goto unlock;
+ }
+ ret = __ec_heal_entry(frame, ec, inode, locked_on, sources,
+- healed_sinks);
++ healed_sinks, pending);
+ }
+ unlock:
+ cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
+@@ -1953,14 +1968,14 @@ ec_manager_heal_block(ec_fop_data_t *fop, int32_t state)
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop, fop->xl, 0, 0,
+ (heal->good | heal->bad), heal->good, heal->bad,
+- NULL);
++ 0, NULL);
+ }
+
+ return EC_STATE_END;
+ case -EC_STATE_REPORT:
+ if (fop->cbks.heal) {
+- fop->cbks.heal(fop->req_frame, fop, fop->xl, -1, fop->error, 0,
+- 0, 0, NULL);
++ fop->cbks.heal(fop->req_frame, fop->data, fop->xl, -1,
++ fop->error, 0, 0, 0, 0, NULL);
+ }
+
+ return EC_STATE_END;
+@@ -1997,14 +2012,15 @@ out:
+ if (fop != NULL) {
+ ec_manager(fop, error);
+ } else {
+- func(frame, NULL, this, -1, error, 0, 0, 0, NULL);
++ func(frame, heal, this, -1, error, 0, 0, 0, 0, NULL);
+ }
+ }
+
+ int32_t
+ ec_heal_block_done(call_frame_t *frame, void *cookie, xlator_t *this,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+- uintptr_t good, uintptr_t bad, dict_t *xdata)
++ uintptr_t good, uintptr_t bad, uint32_t pending,
++ dict_t *xdata)
+ {
+ ec_fop_data_t *fop = cookie;
+ ec_heal_t *heal = fop->data;
+@@ -2489,6 +2505,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ intptr_t mbad = 0;
+ intptr_t good = 0;
+ intptr_t bad = 0;
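++ /* Count of new entries created while healing this inode. */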
++ uint32_t pending = 0;
+ ec_fop_data_t *fop = data;
+ gf_boolean_t blocking = _gf_false;
+ ec_heal_need_t need_heal = EC_HEAL_NONEED;
+@@ -2524,7 +2541,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ if (loc->name && strlen(loc->name)) {
+ ret = ec_heal_name(frame, ec, loc->parent, (char *)loc->name,
+ participants);
+- if (ret == 0) {
++ if (ret >= 0) {
+ gf_msg_debug(this->name, 0,
+ "%s: name heal "
+ "successful on %" PRIXPTR,
+@@ -2542,7 +2559,7 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+
+ /* Mount triggers heal only when it detects that it must need heal, shd
+ * triggers heals periodically which need not be thorough*/
+- if (ec->shd.iamshd) {
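++ /* A positive ret from name heal means a new entry was just created,
++ * so we already know more heal work exists and the inspection below
++ * can be skipped. */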
++ if (ec->shd.iamshd && (ret <= 0)) {
+ ec_heal_inspect(frame, ec, loc->inode, up_subvols, _gf_false, _gf_false,
+ &need_heal);
+
+@@ -2552,13 +2569,15 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+ goto out;
+ }
+ }
++
+ sources = alloca0(ec->nodes);
+ healed_sinks = alloca0(ec->nodes);
+ if (IA_ISREG(loc->inode->ia_type)) {
+ ret = ec_heal_data(frame, ec, blocking, loc->inode, sources,
+ healed_sinks);
+ } else if (IA_ISDIR(loc->inode->ia_type) && !partial) {
+- ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks);
++ ret = ec_heal_entry(frame, ec, loc->inode, sources, healed_sinks,
++ &pending);
+ } else {
+ ret = 0;
+ memcpy(sources, participants, ec->nodes);
+@@ -2588,10 +2607,11 @@ out:
+ if (fop->cbks.heal) {
+ fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
+ ec_char_array_to_mask(participants, ec->nodes),
+- mgood & good, mbad & bad, NULL);
++ mgood & good, mbad & bad, pending, NULL);
+ }
+ if (frame)
+ STACK_DESTROY(frame->root);
++
+ return;
+ }
+
+@@ -2638,8 +2658,8 @@ void
+ ec_heal_fail(ec_t *ec, ec_fop_data_t *fop)
+ {
+ if (fop->cbks.heal) {
+- fop->cbks.heal(fop->req_frame, NULL, ec->xl, -1, fop->error, 0, 0, 0,
+- NULL);
++ fop->cbks.heal(fop->req_frame, fop->data, ec->xl, -1, fop->error, 0, 0,
++ 0, 0, NULL);
+ }
+ ec_fop_data_release(fop);
+ }
+@@ -2826,7 +2846,7 @@ fail:
+ if (fop)
+ ec_fop_data_release(fop);
+ if (func)
+- func(frame, NULL, this, -1, err, 0, 0, 0, NULL);
++ func(frame, data, this, -1, err, 0, 0, 0, 0, NULL);
+ }
+
+ int
+diff --git a/xlators/cluster/ec/src/ec-heald.c b/xlators/cluster/ec/src/ec-heald.c
+index cba111a..4f4b6aa 100644
+--- a/xlators/cluster/ec/src/ec-heald.c
++++ b/xlators/cluster/ec/src/ec-heald.c
+@@ -156,15 +156,27 @@ int
+ ec_shd_selfheal(struct subvol_healer *healer, int child, loc_t *loc,
+ gf_boolean_t full)
+ {
++ dict_t *xdata = NULL;
++ uint32_t count;
+ int32_t ret;
+
+- ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, NULL);
+- if (!full && (ret >= 0) && (loc->inode->ia_type == IA_IFDIR)) {
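++ /* Requesting the virtual EC_XATTR_HEAL xattr triggers the heal. The
++ * reply xdata may carry EC_XATTR_HEAL_NEW with the number of newly
++ * created entries. */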
++ ret = syncop_getxattr(healer->this, loc, NULL, EC_XATTR_HEAL, NULL, &xdata);
++ if (!full && (loc->inode->ia_type == IA_IFDIR)) {
+ /* If we have just healed a directory, it's possible that
+- * other index entries have appeared to be healed. We put a
+- * mark so that we can check it later and restart a scan
+- * without delay. */
+- healer->rerun = _gf_true;
++ * other index entries have appeared that need to be healed. */
++ if ((xdata != NULL) &&
++ (dict_get_uint32(xdata, EC_XATTR_HEAL_NEW, &count) == 0) &&
++ (count > 0)) {
++ /* Force a rerun of the index healer. */
++ gf_msg_debug(healer->this->name, 0, "%d more entries to heal",
++ count);
++
++ healer->rerun = _gf_true;
++ }
++ }
++
++ if (xdata != NULL) {
++ dict_unref(xdata);
+ }
+
+ return ret;
+diff --git a/xlators/cluster/ec/src/ec-inode-read.c b/xlators/cluster/ec/src/ec-inode-read.c
+index f87a94a..e82e8f6 100644
+--- a/xlators/cluster/ec/src/ec-inode-read.c
++++ b/xlators/cluster/ec/src/ec-inode-read.c
+@@ -393,7 +393,8 @@ ec_manager_getxattr(ec_fop_data_t *fop, int32_t state)
+ int32_t
+ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ int32_t op_ret, int32_t op_errno, uintptr_t mask,
+- uintptr_t good, uintptr_t bad, dict_t *xdata)
++ uintptr_t good, uintptr_t bad, uint32_t pending,
++ dict_t *xdata)
+ {
+ ec_fop_data_t *fop = cookie;
+ fop_getxattr_cbk_t func = fop->data;
+@@ -402,6 +403,25 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ char *str;
+ char bin1[65], bin2[65];
+
++ /* We try to return the 'pending' information in xdata, but if this cannot
++ * be set, we will ignore it silently. We prefer to report the success or
++ * failure of the heal itself. */
++ if (xdata == NULL) {
++ xdata = dict_new();
++ } else {
++ dict_ref(xdata);
++ }
++ if (xdata != NULL) {
++ if (dict_set_uint32(xdata, EC_XATTR_HEAL_NEW, pending) != 0) {
++ /* dict_set_uint32() is marked as 'warn_unused_result' and gcc
++ * enforces checking the result in this case. However, we don't
++ * really care whether it succeeded or not, so we proceed the same
++ * way regardless.
++ *
++ * This empty 'if' avoids the warning, and it will be removed by
++ * the optimizer. */
++ }
++ }
++
+ if (op_ret >= 0) {
+ dict = dict_new();
+ if (dict == NULL) {
+@@ -435,11 +455,14 @@ ec_getxattr_heal_cbk(call_frame_t *frame, void *cookie, xlator_t *xl,
+ }
+
+ out:
+- func(frame, NULL, xl, op_ret, op_errno, dict, NULL);
++ func(frame, NULL, xl, op_ret, op_errno, dict, xdata);
+
+ if (dict != NULL) {
+ dict_unref(dict);
+ }
++ if (xdata != NULL) {
++ dict_unref(xdata);
++ }
+
+ return 0;
+ }
+diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
+index 34a9768..f15429d 100644
+--- a/xlators/cluster/ec/src/ec-types.h
++++ b/xlators/cluster/ec/src/ec-types.h
+@@ -186,10 +186,10 @@ struct _ec_inode {
+
+ typedef int32_t (*fop_heal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+- dict_t *);
++ uint32_t, dict_t *);
+ typedef int32_t (*fop_fheal_cbk_t)(call_frame_t *, void *, xlator_t *, int32_t,
+ int32_t, uintptr_t, uintptr_t, uintptr_t,
+- dict_t *);
++ uint32_t, dict_t *);
+
+ union _ec_cbk {
+ fop_access_cbk_t access;
+diff --git a/xlators/cluster/ec/src/ec.h b/xlators/cluster/ec/src/ec.h
+index 1b210d9..6f6de6d 100644
+--- a/xlators/cluster/ec/src/ec.h
++++ b/xlators/cluster/ec/src/ec.h
+@@ -18,6 +18,7 @@
+ #define EC_XATTR_SIZE EC_XATTR_PREFIX "size"
+ #define EC_XATTR_VERSION EC_XATTR_PREFIX "version"
+ #define EC_XATTR_HEAL EC_XATTR_PREFIX "heal"
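++/* xdata key used to report how many new entries a heal has created. */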
++#define EC_XATTR_HEAL_NEW EC_XATTR_PREFIX "heal-new"
+ #define EC_XATTR_DIRTY EC_XATTR_PREFIX "dirty"
+ #define EC_STRIPE_CACHE_MAX_SIZE 10
+ #define EC_VERSION_SIZE 2
+--
+1.8.3.1
+