diff options
Diffstat (limited to '0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch')
-rw-r--r-- | 0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch | 204 |
1 files changed, 204 insertions, 0 deletions
diff --git a/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch b/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch new file mode 100644 index 0000000..0e17c44 --- /dev/null +++ b/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch @@ -0,0 +1,204 @@ +From cca418b78ec976aa69eacd56b0e6127ea7e3dd26 Mon Sep 17 00:00:00 2001 +From: Pranith Kumar K <pkarampu@redhat.com> +Date: Thu, 4 Apr 2019 15:31:56 +0530 +Subject: [PATCH 095/124] cluster/afr: Remove local from owners_list on failure + of lock-acquisition + + Backport of https://review.gluster.org/c/glusterfs/+/22515 + +When eager-lock lock acquisition fails because of say network failures, the +local is not being removed from owners_list, this leads to accumulation of +waiting frames and the application will hang because the waiting frames are +under the assumption that another transaction is in the process of acquiring +lock because owner-list is not empty. Handled this case as well in this patch. +Added asserts to make it easier to find these problems in future. + +Change-Id: I3101393265e9827755725b1f2d94a93d8709e923 +fixes: bz#1688395 +Signed-off-by: Pranith Kumar K <pkarampu@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/167859 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + tests/bugs/replicate/bug-1696599-io-hang.t | 47 ++++++++++++++++++++++++++++++ + xlators/cluster/afr/src/afr-common.c | 8 ++--- + xlators/cluster/afr/src/afr-lk-common.c | 1 - + xlators/cluster/afr/src/afr-transaction.c | 19 +++++------- + xlators/cluster/afr/src/afr.h | 4 +-- + 5 files changed, 61 insertions(+), 18 deletions(-) + create mode 100755 tests/bugs/replicate/bug-1696599-io-hang.t + +diff --git a/tests/bugs/replicate/bug-1696599-io-hang.t b/tests/bugs/replicate/bug-1696599-io-hang.t +new file mode 100755 +index 0000000..869cdb9 +--- /dev/null ++++ b/tests/bugs/replicate/bug-1696599-io-hang.t +@@ -0,0 +1,47 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../fileio.rc ++ ++#Tests that local structures in afr are removed from granted/blocked list of ++#locks when inodelk fails on all bricks ++ ++cleanup; ++ ++TEST glusterd ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3} ++TEST $CLI volume set $V0 performance.quick-read off ++TEST $CLI volume set $V0 performance.write-behind off ++TEST $CLI volume set $V0 performance.io-cache off ++TEST $CLI volume set $V0 performance.stat-prefetch off ++TEST $CLI volume set $V0 performance.client-io-threads off ++TEST $CLI volume set $V0 delay-gen locks ++TEST $CLI volume set $V0 delay-gen.delay-duration 5000000 ++TEST $CLI volume set $V0 delay-gen.delay-percentage 100 ++TEST $CLI volume set $V0 delay-gen.enable finodelk ++ ++TEST $CLI volume start $V0 ++EXPECT 'Started' volinfo_field $V0 'Status' ++ ++TEST $GFS -s $H0 --volfile-id $V0 $M0 ++TEST touch $M0/file ++#Trigger write and stop bricks so inodelks fail on all bricks leading to ++#lock failure condition ++echo abc >> $M0/file & ++ ++TEST $CLI volume stop $V0 ++TEST $CLI volume reset $V0 delay-gen ++wait ++TEST $CLI volume start $V0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1 ++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2 ++#Test that only one write succeeded, this tests that delay-gen worked as ++#expected ++echo abc >> $M0/file ++EXPECT "abc" cat $M0/file ++ ++cleanup; +diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c +index 45b96e3..47a5d3a 100644 +--- a/xlators/cluster/afr/src/afr-common.c ++++ b/xlators/cluster/afr/src/afr-common.c +@@ -5763,6 +5763,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) + afr_private_t *priv = NULL; + + priv = this->private; ++ INIT_LIST_HEAD(&local->transaction.wait_list); ++ INIT_LIST_HEAD(&local->transaction.owner_list); ++ INIT_LIST_HEAD(&local->ta_waitq); ++ INIT_LIST_HEAD(&local->ta_onwireq); + ret = afr_internal_lock_init(&local->internal_lock, priv->child_count); + if (ret < 0) + goto out; +@@ -5800,10 +5804,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this) + goto out; + + ret = 0; +- INIT_LIST_HEAD(&local->transaction.wait_list); +- INIT_LIST_HEAD(&local->transaction.owner_list); +- INIT_LIST_HEAD(&local->ta_waitq); +- INIT_LIST_HEAD(&local->ta_onwireq); + out: + return ret; + } +diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c +index 4091671..bc8eabe 100644 +--- a/xlators/cluster/afr/src/afr-lk-common.c ++++ b/xlators/cluster/afr/src/afr-lk-common.c +@@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this) + int_lock->lk_call_count = call_count; + + if (!call_count) { +- GF_ASSERT(!local->transaction.do_eager_unlock); + gf_msg_trace(this->name, 0, "No internal locks unlocked"); + int_lock->lock_cbk(frame, this); + goto out; +diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c +index 229820b..15f3a7e 100644 +--- a/xlators/cluster/afr/src/afr-transaction.c ++++ b/xlators/cluster/afr/src/afr-transaction.c +@@ -372,6 +372,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this) + } + local->transaction.unwind(frame, this); + ++ GF_ASSERT(list_empty(&local->transaction.owner_list)); ++ GF_ASSERT(list_empty(&local->transaction.wait_list)); + AFR_STACK_DESTROY(frame); + + return 0; +@@ -393,7 +395,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list) + } + + static void +-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) ++afr_handle_lock_acquire_failure(afr_local_t *local) + { + struct list_head shared; + afr_lock_t *lock = NULL; +@@ -414,13 +416,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked) + afr_lock_fail_shared(local, &shared); + local->transaction.do_eager_unlock = _gf_true; + out: +- if (locked) { +- local->internal_lock.lock_cbk = afr_transaction_done; +- afr_unlock(local->transaction.frame, local->transaction.frame->this); +- } else { +- afr_transaction_done(local->transaction.frame, +- local->transaction.frame->this); +- } ++ local->internal_lock.lock_cbk = afr_transaction_done; ++ afr_unlock(local->transaction.frame, local->transaction.frame->this); + } + + call_frame_t * +@@ -619,7 +616,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this) + failure_count = AFR_COUNT(local->transaction.failed_subvols, + priv->child_count); + if (failure_count == priv->child_count) { +- afr_handle_lock_acquire_failure(local, _gf_true); ++ afr_handle_lock_acquire_failure(local); + return 0; + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; +@@ -2092,7 +2089,7 @@ err: + local->op_ret = -1; + local->op_errno = op_errno; + +- afr_handle_lock_acquire_failure(local, _gf_true); ++ afr_handle_lock_acquire_failure(local); + + if (xdata_req) + dict_unref(xdata_req); +@@ -2361,7 +2358,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this) + } else { + lock = &local->inode_ctx->lock[local->transaction.type]; + if (local->internal_lock.lock_op_ret < 0) { +- afr_handle_lock_acquire_failure(local, _gf_false); ++ afr_handle_lock_acquire_failure(local); + } else { + lock->event_generation = local->event_generation; + afr_changelog_pre_op(frame, this); +diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h +index 2cc3797..e731cfa 100644 +--- a/xlators/cluster/afr/src/afr.h ++++ b/xlators/cluster/afr/src/afr.h +@@ -1091,8 +1091,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd); + #define AFR_FRAME_INIT(frame, op_errno) \ + ({ \ + frame->local = mem_get0(THIS->local_pool); \ +- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \ +- afr_local_cleanup(frame->local, THIS); \ ++ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \ ++ afr_local_cleanup(frame->local, frame->this); \ + mem_put(frame->local); \ + frame->local = NULL; \ + }; \ +-- +1.8.3.1 + |