diff options
Diffstat (limited to '0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch')
-rw-r--r-- | 0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch | 360 |
1 files changed, 360 insertions, 0 deletions
diff --git a/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch b/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch new file mode 100644 index 0000000..30ab28b --- /dev/null +++ b/0044-cluster-ec-Don-t-enqueue-an-entry-if-it-is-already-h.patch @@ -0,0 +1,360 @@ +From bc6588890ce94101a63b861178cf38db5549d8a8 Mon Sep 17 00:00:00 2001 +From: Ashish Pandey <aspandey@redhat.com> +Date: Wed, 28 Nov 2018 11:22:52 +0530 +Subject: [PATCH 44/52] cluster/ec: Don't enqueue an entry if it is already + healing + +Problem: +1 - heal-wait-qlength is by default 128. If shd is disabled +and we need to heal files, client side heal is needed. +If we access these files that will trigger the heal. +However, it has been observed that a file will be enqueued +multiple times in the heal wait queue, which in turn causes +queue to be filled and prevent other files to be enqueued. + +2 - While a file is going through healing and a write fop from +mount comes on that file, it sends write on all the bricks including +healing one. At the end it updates version and size on all the +bricks. However, it does not unset dirty flag on all the bricks, +even if this write fop was successful on all the bricks. +After healing completion this dirty flag remain set and never +gets cleaned up if SHD is disabled. + +Solution: +1 - If an entry is already in queue or going through heal process, +don't enqueue next client side request to heal the same file. + +2 - Unset dirty on all the bricks at the end if fop has succeeded on +all the bricks even if some of the bricks are going through heal. + +backport of : https://review.gluster.org/#/c/glusterfs/+/21744/ + +Change-Id: Ia61ffe230c6502ce6cb934425d55e2f40dd1a727 +BUG: 1600918 +Signed-off-by: Ashish Pandey <aspandey@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/166296 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + tests/bugs/ec/bug-1236065.t | 1 - + xlators/cluster/ec/src/ec-common.c | 43 +++++++++------ + xlators/cluster/ec/src/ec-common.h | 8 +++ + xlators/cluster/ec/src/ec-heal.c | 104 +++++++++++++++++++++++++++++++----- + xlators/cluster/ec/src/ec-helpers.c | 1 + + xlators/cluster/ec/src/ec-types.h | 1 + + 6 files changed, 127 insertions(+), 31 deletions(-) + +diff --git a/tests/bugs/ec/bug-1236065.t b/tests/bugs/ec/bug-1236065.t +index 76d25d7..9181e73 100644 +--- a/tests/bugs/ec/bug-1236065.t ++++ b/tests/bugs/ec/bug-1236065.t +@@ -85,7 +85,6 @@ TEST pidof glusterd + EXPECT "$V0" volinfo_field $V0 'Volume Name' + EXPECT 'Started' volinfo_field $V0 'Status' + EXPECT '7' online_brick_count +- + ## cleanup + cd + EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 +diff --git a/xlators/cluster/ec/src/ec-common.c b/xlators/cluster/ec/src/ec-common.c +index 8d65670..5183680 100644 +--- a/xlators/cluster/ec/src/ec-common.c ++++ b/xlators/cluster/ec/src/ec-common.c +@@ -313,14 +313,15 @@ ec_check_status(ec_fop_data_t *fop) + + gf_msg(fop->xl->name, GF_LOG_WARNING, 0, EC_MSG_OP_FAIL_ON_SUBVOLS, + "Operation failed on %d of %d subvolumes.(up=%s, mask=%s, " +- "remaining=%s, good=%s, bad=%s)", ++ "remaining=%s, good=%s, bad=%s, %s)", + gf_bits_count(ec->xl_up & ~(fop->remaining | fop->good)), ec->nodes, + ec_bin(str1, sizeof(str1), ec->xl_up, ec->nodes), + ec_bin(str2, sizeof(str2), fop->mask, ec->nodes), + ec_bin(str3, sizeof(str3), fop->remaining, ec->nodes), + ec_bin(str4, sizeof(str4), fop->good, ec->nodes), + ec_bin(str5, sizeof(str5), ec->xl_up & ~(fop->remaining | fop->good), +- ec->nodes)); ++ ec->nodes), ++ ec_msg_str(fop)); + if (fop->use_fd) { + if (fop->fd != NULL) { + ec_fheal(NULL, fop->xl, -1, EC_MINIMUM_ONE, ec_heal_report, NULL, +@@ -2371,37 +2372,47 @@ ec_update_info(ec_lock_link_t *link) + uint64_t dirty[2] = {0, 0}; + uint64_t size; + ec_t *ec = NULL; ++ uintptr_t mask; + + lock = link->lock; + ctx = lock->ctx; + ec = link->fop->xl->private; + + /* pre_version[*] will be 0 if have_version is false */ +- version[0] = ctx->post_version[0] - ctx->pre_version[0]; +- version[1] = ctx->post_version[1] - ctx->pre_version[1]; ++ version[EC_DATA_TXN] = ctx->post_version[EC_DATA_TXN] - ++ ctx->pre_version[EC_DATA_TXN]; ++ version[EC_METADATA_TXN] = ctx->post_version[EC_METADATA_TXN] - ++ ctx->pre_version[EC_METADATA_TXN]; + + size = ctx->post_size - ctx->pre_size; + /* If we set the dirty flag for update fop, we have to unset it. + * If fop has failed on some bricks, leave the dirty as marked. */ ++ + if (lock->unlock_now) { ++ if (version[EC_DATA_TXN]) { ++ /*A data fop will have difference in post and pre version ++ *and for data fop we send writes on healing bricks also */ ++ mask = lock->good_mask | lock->healing; ++ } else { ++ mask = lock->good_mask; ++ } + /* Ensure that nodes are up while doing final + * metadata update.*/ +- if (!(ec->node_mask & ~lock->good_mask) && +- !(ec->node_mask & ~ec->xl_up)) { +- if (ctx->dirty[0] != 0) { +- dirty[0] = -1; ++ if (!(ec->node_mask & ~(mask)) && !(ec->node_mask & ~ec->xl_up)) { ++ if (ctx->dirty[EC_DATA_TXN] != 0) { ++ dirty[EC_DATA_TXN] = -1; + } +- if (ctx->dirty[1] != 0) { +- dirty[1] = -1; ++ if (ctx->dirty[EC_METADATA_TXN] != 0) { ++ dirty[EC_METADATA_TXN] = -1; + } + /*If everything is fine and we already + *have version xattr set on entry, there + *is no need to update version again*/ +- if (ctx->pre_version[0]) { +- version[0] = 0; ++ if (ctx->pre_version[EC_DATA_TXN]) { ++ version[EC_DATA_TXN] = 0; + } +- if (ctx->pre_version[1]) { +- version[1] = 0; ++ if (ctx->pre_version[EC_METADATA_TXN]) { ++ version[EC_METADATA_TXN] = 0; + } + } else { + link->optimistic_changelog = _gf_false; +@@ -2410,8 +2421,8 @@ ec_update_info(ec_lock_link_t *link) + memset(ctx->dirty, 0, sizeof(ctx->dirty)); + } + +- if ((version[0] != 0) || (version[1] != 0) || (dirty[0] != 0) || +- (dirty[1] != 0)) { ++ if ((version[EC_DATA_TXN] != 0) || (version[EC_METADATA_TXN] != 0) || ++ (dirty[EC_DATA_TXN] != 0) || (dirty[EC_METADATA_TXN] != 0)) { + ec_update_size_version(link, version, size, dirty); + return _gf_true; + } +diff --git a/xlators/cluster/ec/src/ec-common.h b/xlators/cluster/ec/src/ec-common.h +index 115e147..54aaa77 100644 +--- a/xlators/cluster/ec/src/ec-common.h ++++ b/xlators/cluster/ec/src/ec-common.h +@@ -190,4 +190,12 @@ ec_lock_unlocked(call_frame_t *frame, void *cookie, xlator_t *this, + void + ec_update_fd_status(fd_t *fd, xlator_t *xl, int child_index, + int32_t ret_status); ++gf_boolean_t ++ec_is_entry_healing(ec_fop_data_t *fop); ++void ++ec_set_entry_healing(ec_fop_data_t *fop); ++void ++ec_reset_entry_healing(ec_fop_data_t *fop); ++char * ++ec_msg_str(ec_fop_data_t *fop); + #endif /* __EC_COMMON_H__ */ +diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c +index eaf80e0..1ca12c1 100644 +--- a/xlators/cluster/ec/src/ec-heal.c ++++ b/xlators/cluster/ec/src/ec-heal.c +@@ -103,6 +103,48 @@ ec_sh_key_match(dict_t *dict, char *key, data_t *val, void *mdata) + } + /* FOP: heal */ + ++void ++ec_set_entry_healing(ec_fop_data_t *fop) ++{ ++ ec_inode_t *ctx = NULL; ++ loc_t *loc = NULL; ++ ++ if (!fop) ++ return; ++ ++ loc = &fop->loc[0]; ++ LOCK(&loc->inode->lock); ++ { ++ ctx = __ec_inode_get(loc->inode, fop->xl); ++ if (ctx) { ++ ctx->heal_count += 1; ++ } ++ } ++ UNLOCK(&loc->inode->lock); ++} ++ ++void ++ec_reset_entry_healing(ec_fop_data_t *fop) ++{ ++ ec_inode_t *ctx = NULL; ++ loc_t *loc = NULL; ++ int32_t heal_count = 0; ++ if (!fop) ++ return; ++ ++ loc = &fop->loc[0]; ++ LOCK(&loc->inode->lock); ++ { ++ ctx = __ec_inode_get(loc->inode, fop->xl); ++ if (ctx) { ++ ctx->heal_count += -1; ++ heal_count = ctx->heal_count; ++ } ++ } ++ UNLOCK(&loc->inode->lock); ++ GF_ASSERT(heal_count >= 0); ++} ++ + uintptr_t + ec_heal_check(ec_fop_data_t *fop, uintptr_t *pgood) + { +@@ -2507,17 +2549,6 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) + "Heal is not required for : %s ", uuid_utoa(loc->gfid)); + goto out; + } +- +- msources = alloca0(ec->nodes); +- mhealed_sinks = alloca0(ec->nodes); +- ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); +- if (ret == 0) { +- mgood = ec_char_array_to_mask(msources, ec->nodes); +- mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); +- } else { +- op_ret = -1; +- op_errno = -ret; +- } + sources = alloca0(ec->nodes); + healed_sinks = alloca0(ec->nodes); + if (IA_ISREG(loc->inode->ia_type)) { +@@ -2538,8 +2569,19 @@ ec_heal_do(xlator_t *this, void *data, loc_t *loc, int32_t partial) + op_ret = -1; + op_errno = -ret; + } ++ msources = alloca0(ec->nodes); ++ mhealed_sinks = alloca0(ec->nodes); ++ ret = ec_heal_metadata(frame, ec, loc->inode, msources, mhealed_sinks); ++ if (ret == 0) { ++ mgood = ec_char_array_to_mask(msources, ec->nodes); ++ mbad = ec_char_array_to_mask(mhealed_sinks, ec->nodes); ++ } else { ++ op_ret = -1; ++ op_errno = -ret; ++ } + + out: ++ ec_reset_entry_healing(fop); + if (fop->cbks.heal) { + fop->cbks.heal(fop->req_frame, fop, fop->xl, op_ret, op_errno, + ec_char_array_to_mask(participants, ec->nodes), +@@ -2650,11 +2692,33 @@ ec_handle_healers_done(ec_fop_data_t *fop) + ec_launch_heal(ec, heal_fop); + } + ++gf_boolean_t ++ec_is_entry_healing(ec_fop_data_t *fop) ++{ ++ ec_inode_t *ctx = NULL; ++ int32_t heal_count = 0; ++ loc_t *loc = NULL; ++ ++ loc = &fop->loc[0]; ++ ++ LOCK(&loc->inode->lock); ++ { ++ ctx = __ec_inode_get(loc->inode, fop->xl); ++ if (ctx) { ++ heal_count = ctx->heal_count; ++ } ++ } ++ UNLOCK(&loc->inode->lock); ++ GF_ASSERT(heal_count >= 0); ++ return heal_count; ++} ++ + void + ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) + { + gf_boolean_t can_heal = _gf_true; + ec_t *ec = this->private; ++ ec_fop_data_t *fop_rel = NULL; + + if (fop->req_frame == NULL) { + LOCK(&ec->lock); +@@ -2662,8 +2726,13 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) + if ((ec->background_heals > 0) && + (ec->heal_wait_qlen + ec->background_heals) > + (ec->heal_waiters + ec->healers)) { +- list_add_tail(&fop->healer, &ec->heal_waiting); +- ec->heal_waiters++; ++ if (!ec_is_entry_healing(fop)) { ++ list_add_tail(&fop->healer, &ec->heal_waiting); ++ ec->heal_waiters++; ++ ec_set_entry_healing(fop); ++ } else { ++ fop_rel = fop; ++ } + fop = __ec_dequeue_heals(ec); + } else { + can_heal = _gf_false; +@@ -2673,8 +2742,12 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) + } + + if (can_heal) { +- if (fop) ++ if (fop) { ++ if (fop->req_frame != NULL) { ++ ec_set_entry_healing(fop); ++ } + ec_launch_heal(ec, fop); ++ } + } else { + gf_msg_debug(this->name, 0, + "Max number of heals are " +@@ -2682,6 +2755,9 @@ ec_heal_throttle(xlator_t *this, ec_fop_data_t *fop) + ec_fop_set_error(fop, EBUSY); + ec_heal_fail(ec, fop); + } ++ if (fop_rel) { ++ ec_heal_done(0, NULL, fop_rel); ++ } + } + + void +diff --git a/xlators/cluster/ec/src/ec-helpers.c b/xlators/cluster/ec/src/ec-helpers.c +index e6b0359..43f6e3b 100644 +--- a/xlators/cluster/ec/src/ec-helpers.c ++++ b/xlators/cluster/ec/src/ec-helpers.c +@@ -717,6 +717,7 @@ __ec_inode_get(inode_t *inode, xlator_t *xl) + memset(ctx, 0, sizeof(*ctx)); + INIT_LIST_HEAD(&ctx->heal); + INIT_LIST_HEAD(&ctx->stripe_cache.lru); ++ ctx->heal_count = 0; + value = (uint64_t)(uintptr_t)ctx; + if (__inode_ctx_set(inode, xl, &value) != 0) { + GF_FREE(ctx); +diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h +index f3d63ca..6ae4a2b 100644 +--- a/xlators/cluster/ec/src/ec-types.h ++++ b/xlators/cluster/ec/src/ec-types.h +@@ -171,6 +171,7 @@ struct _ec_inode { + gf_boolean_t have_config; + gf_boolean_t have_version; + gf_boolean_t have_size; ++ int32_t heal_count; + ec_config_t config; + uint64_t pre_version[2]; + uint64_t post_version[2]; +-- +1.8.3.1 + |