1 files changed, 360 insertions, 0 deletions
diff --git a/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch b/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch
new file mode 100644
index 0000000..30ab28b
--- /dev/null
+++ b/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch
@@ -0,0 +1,360 @@
+From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001
+From: Ashish Pandey <aspandey@redhat.com>
+Date: Wed, 28 Nov 2018 11:22:52 +0530
+Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already
+ healing
+
+Problem:
+1 - heal-wait-qlength is by default 128. If shd is disabled
+and we need to heal files, client side heal is needed.
+If we access these files that will trigger the heal.
+However, it has been observed that a file will be enqueued
+multiple times in the heal wait queue, which in turn causes
+queue to be filled and prevent other files to be enqueued.
+
+2 - While a file is going through healing and a write fop from
+mount comes on that file, it sends write on all the bricks including
+healing one. At the end it updates version and size on all the
+bricks. However, it does not unset dirty flag on all the bricks,
+even if this write fop was successful on all the bricks.
+After healing completion this dirty flag remain set and never
+gets cleaned up if SHD is disabled.
+
+Solution:
+1 - If an entry is already in queue or going through heal process,
+don't enqueue next client side request to heal the same file.
+
+2 - Unset dirty on all the bricks at the end if fop has succeeded on
+all the bricks even if some of the bricks are going through heal.
+
+backport of : https://review.gluster.org/#/c/glusterfs/+/21744/
+
+Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727
+BUG: 1600918
+Signed-off-by: Ashish Pandey <aspandey@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/166296
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ tests/bugs/ec/bug-1236065.t         |   1 -
+ xlators/cluster/ec/src/ec-common.c  |  43 +++++++++------
+ xlators/cluster/ec/src/ec-common.h  |   8 +++
+ xlators/cluster/ec/src/ec-heal.c    | 104 +++++++++++++++++++++++++++++++-----
+ xlators/cluster/ec/src/ec-helpers.c |   1 +
+ xlators/cluster/ec/src/ec-types.h   |   1 +
+ 6 files changed, 127 insertions(+), 31 deletions(-)
+
+diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t
+index 76d25d7..9181e73 100644
+--- a/tests/bugs/ec/bug-1236065.t
++++ b/tests/bugs/ec/bug-1236065.t
+@@ -85,7 +85,6 @@ TEST pidof glusterd
+ EXPECT "$V0" volinfo_field $V0 'Volume Name'
+ EXPECT 'Started' volinfo_field $V0 'Status'
+ EXPECT '7' online_brick_count
+-
+ ## cleanup
+ cd
+ EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c
+index 8d65670..5183680 100644
+--- a/xlators/cluster/ec/src/ec-common.c
++++ b/xlators/cluster/ec/src/ec-common.c
+@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop)
+ 
+     gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS,
+            "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, "
+-           "remaining=%s, good=%s, bad=%s)",
++           "remaining=%s, good=%s, bad=%s, %s)",
+            gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes,
+            ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes),
+            ec_bin(str2, sizeof(str2), fop->mask, ec->nodes),
+            ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes),
+            ec_bin(str4, sizeof(str4), fop->good, ec->nodes),
+            ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good),
+-                  ec->nodes));
++                  ec->nodes),
++           ec_msg_str(fop));
+     if (fop->use_fd) {
+         if (fop->fd != NULL) {
+             ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL,
+@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link)
+     uint64_t dirty[2] = {0, 0};
+     uint64_t size;
+     ec_t *ec = NULL;
++    uintptr_t mask;
+ 
+     lock = link->lock;
+     ctx = lock->ctx;
+     ec = link->fop->xl->private;
+ 
+     /* pre_version[*] will be 0 if have_version is false */
+-    version[0] = ctx->post_version[0] - ctx->pre_version[0];
+-    version[1] = ctx->post_version[1] - ctx->pre_version[1];
++    version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] -
++                           ctx->pre_version[EC_DATA_TXN];
++    version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] -
++                               ctx->pre_version[EC_METADATA_TXN];
+ 
+     size = ctx->post_size - ctx->pre_size;
+     /* If we set the dirty flag for update fop, we have to unset it.
+      * If fop has failed on some bricks, leave the dirty as marked. */
++
+     if (lock->unlock_now) {
++        if (version[EC_DATA_TXN]) {
++            /*A data fop will have difference in post and pre version
++             *and for data fop we send writes on healing bricks also */
++            mask = lock->good_mask | lock->healing;
++        } else {
++            mask = lock->good_mask;
++        }
+         /* Ensure that nodes are up while doing final
+          * metadata update.*/
+-        if (!(ec->node_mask & ~lock->good_mask) &&
+-            !(ec->node_mask & ~ec->xl_up)) {
+-            if (ctx->dirty[0] != 0) {
+-                dirty[0] = -1;
++        if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) {
++            if (ctx->dirty[EC_DATA_TXN] != 0) {
++                dirty[EC_DATA_TXN] = -1;
+             }
+-            if (ctx->dirty[1] != 0) {
+-                dirty[1] = -1;
++            if (ctx->dirty[EC_METADATA_TXN] != 0) {
++                dirty[EC_METADATA_TXN] = -1;
+             }
+             /*If everything is fine and we already
+              *have version xattr set on entry, there
+              *is no need to update version again*/
+-            if (ctx->pre_version[0]) {
+-                version[0] = 0;
++            if (ctx->pre_version[EC_DATA_TXN]) {
++                version[EC_DATA_TXN] = 0;
+             }
+-            if (ctx->pre_version[1]) {
+-                version[1] = 0;
++            if (ctx->pre_version[EC_METADATA_TXN]) {
++                version[EC_METADATA_TXN] = 0;
+             }
+         } else {
+             link->optimistic_changelog = _gf_false;
+@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link)
+         memset(ctx->dirty, 0, sizeof(ctx->dirty));
+     }
+ 
+-    if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) ||
+-        (dirty[1] != 0)) {
++    if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) ||
++        (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) {
+         ec_update_size_version(link, version, size, dirty);
+         return _gf_true;
+     }
+diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h
+index 115e147..54aaa77 100644
+--- a/xlators/cluster/ec/src/ec-common.h
++++ b/xlators/cluster/ec/src/ec-common.h
+@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this,
+ void
+ ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index,
+                     int32_t ret_status);
++gf_boolean_t
++ec_is_entry_healing(ec_fop_data_t *fop);
++void
++ec_set_entry_healing(ec_fop_data_t *fop);
++void
++ec_reset_entry_healing(ec_fop_data_t *fop);
++char *
++ec_msg_str(ec_fop_data_t *fop);
+ #endif /* __EC_COMMON_H__ */
+diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
+index eaf80e0..1ca12c1 100644
+--- a/xlators/cluster/ec/src/ec-heal.c
++++ b/xlators/cluster/ec/src/ec-heal.c
+@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata)
+ }
+ /* FOP: heal */
+ 
++void
++ec_set_entry_healing(ec_fop_data_t *fop)
++{
++    ec_inode_t *ctx = NULL;
++    loc_t *loc = NULL;
++
++    if (!fop)
++        return;
++
++    loc = &fop->loc[0];
++    LOCK(&loc->inode->lock);
++    {
++        ctx = __ec_inode_get(loc->inode, fop->xl);
++        if (ctx) {
++            ctx->heal_count += 1;
++        }
++    }
++    UNLOCK(&loc->inode->lock);
++}
++
++void
++ec_reset_entry_healing(ec_fop_data_t *fop)
++{
++    ec_inode_t *ctx = NULL;
++    loc_t *loc = NULL;
++    int32_t heal_count = 0;
++    if (!fop)
++        return;
++
++    loc = &fop->loc[0];
++    LOCK(&loc->inode->lock);
++    {
++        ctx = __ec_inode_get(loc->inode, fop->xl);
++        if (ctx) {
++            ctx->heal_count += -1;
++            heal_count = ctx->heal_count;
++        }
++    }
++    UNLOCK(&loc->inode->lock);
++    GF_ASSERT(heal_count >= 0);
++}
++
+ uintptr_t
+ ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood)
+ {
+@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+                "Heal is not required for : %s ", uuid_utoa(loc->gfid));
+         goto out;
+     }
+-
+-    msources = alloca0(ec->nodes);
+-    mhealed_sinks = alloca0(ec->nodes);
+-    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
+-    if (ret == 0) {
+-        mgood = ec_char_array_to_mask(msources, ec->nodes);
+-        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
+-    } else {
+-        op_ret = -1;
+-        op_errno = -ret;
+-    }
+     sources = alloca0(ec->nodes);
+     healed_sinks = alloca0(ec->nodes);
+     if (IA_ISREG(loc->inode->ia_type)) {
+@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial)
+         op_ret = -1;
+         op_errno = -ret;
+     }
++    msources = alloca0(ec->nodes);
++    mhealed_sinks = alloca0(ec->nodes);
++    ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks);
++    if (ret == 0) {
++        mgood = ec_char_array_to_mask(msources, ec->nodes);
++        mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes);
++    } else {
++        op_ret = -1;
++        op_errno = -ret;
++    }
+ 
+ out:
++    ec_reset_entry_healing(fop);
+     if (fop->cbks.heal) {
+         fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno,
+                        ec_char_array_to_mask(participants, ec->nodes),
+@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop)
+         ec_launch_heal(ec, heal_fop);
+ }
+ 
++gf_boolean_t
++ec_is_entry_healing(ec_fop_data_t *fop)
++{
++    ec_inode_t *ctx = NULL;
++    int32_t heal_count = 0;
++    loc_t *loc = NULL;
++
++    loc = &fop->loc[0];
++
++    LOCK(&loc->inode->lock);
++    {
++        ctx = __ec_inode_get(loc->inode, fop->xl);
++        if (ctx) {
++            heal_count = ctx->heal_count;
++        }
++    }
++    UNLOCK(&loc->inode->lock);
++    GF_ASSERT(heal_count >= 0);
++    return heal_count;
++}
++
+ void
+ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+ {
+     gf_boolean_t can_heal = _gf_true;
+     ec_t *ec = this->private;
++    ec_fop_data_t *fop_rel = NULL;
+ 
+     if (fop->req_frame == NULL) {
+         LOCK(&ec->lock);
+@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+             if ((ec->background_heals > 0) &&
+                 (ec->heal_wait_qlen + ec->background_heals) >
+                     (ec->heal_waiters + ec->healers)) {
+-                list_add_tail(&fop->healer, &ec->heal_waiting);
+-                ec->heal_waiters++;
++                if (!ec_is_entry_healing(fop)) {
++                    list_add_tail(&fop->healer, &ec->heal_waiting);
++                    ec->heal_waiters++;
++                    ec_set_entry_healing(fop);
++                } else {
++                    fop_rel = fop;
++                }
+                 fop = __ec_dequeue_heals(ec);
+             } else {
+                 can_heal = _gf_false;
+@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+     }
+ 
+     if (can_heal) {
+-        if (fop)
++        if (fop) {
++            if (fop->req_frame != NULL) {
++                ec_set_entry_healing(fop);
++            }
+             ec_launch_heal(ec, fop);
++        }
+     } else {
+         gf_msg_debug(this->name, 0,
+                      "Max number of heals are "
+@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop)
+         ec_fop_set_error(fop, EBUSY);
+         ec_heal_fail(ec, fop);
+     }
++    if (fop_rel) {
++        ec_heal_done(0, NULL, fop_rel);
++    }
+ }
+ 
+ void
+diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c
+index e6b0359..43f6e3b 100644
+--- a/xlators/cluster/ec/src/ec-helpers.c
++++ b/xlators/cluster/ec/src/ec-helpers.c
+@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl)
+             memset(ctx, 0, sizeof(*ctx));
+             INIT_LIST_HEAD(&ctx->heal);
+             INIT_LIST_HEAD(&ctx->stripe_cache.lru);
++            ctx->heal_count = 0;
+             value = (uint64_t)(uintptr_t)ctx;
+             if (__inode_ctx_set(inode, xl, &value) != 0) {
+                 GF_FREE(ctx);
+diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
+index f3d63ca..6ae4a2b 100644
+--- a/xlators/cluster/ec/src/ec-types.h
++++ b/xlators/cluster/ec/src/ec-types.h
+@@ -171,6 +171,7 @@ struct _ec_inode {
+     gf_boolean_t have_config;
+     gf_boolean_t have_version;
+     gf_boolean_t have_size;
++    int32_t heal_count;
+     ec_config_t config;
+     uint64_t pre_version[2];
+     uint64_t post_version[2];
+-- 
+1.8.3.1
+