summary refs log tree commit diff
path: root/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch
diff options
context:
space:
mode:
Diffstat (limited to '0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch')
-rw-r--r-- 0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch 204
1 files changed, 204 insertions, 0 deletions
diff --git a/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch b/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch
new file mode 100644
index 0000000..0e17c44
--- /dev/null
+++ b/0095-cluster-afr-Remove-local-from-owners_list-on-failure.patch
@@ -0,0 +1,204 @@
+From cca418b78ec976aa69eacd56b0e6127ea7e3dd26 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K <pkarampu@redhat.com>
+Date: Thu, 4 Apr 2019 15:31:56 +0530
+Subject: [PATCH 095/124] cluster/afr: Remove local from owners_list on failure
+ of lock-acquisition
+
+ Backport of https://review.gluster.org/c/glusterfs/+/22515
+
+When eager-lock lock acquisition fails, for example because of network
+failures, the local is not removed from owners_list. This leads to an
+accumulation of waiting frames, and the application hangs because the waiting
+frames assume that another transaction is in the process of acquiring the
+lock, since owners_list is not empty. This patch handles that case as well and
+adds asserts to make these problems easier to find in the future.
+
+Change-Id: I3101393265e9827755725b1f2d94a93d8709e923
+fixes: bz#1688395
+Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/167859
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ tests/bugs/replicate/bug-1696599-io-hang.t | 47 ++++++++++++++++++++++++++++++
+ xlators/cluster/afr/src/afr-common.c | 8 ++---
+ xlators/cluster/afr/src/afr-lk-common.c | 1 -
+ xlators/cluster/afr/src/afr-transaction.c | 19 +++++-------
+ xlators/cluster/afr/src/afr.h | 4 +--
+ 5 files changed, 61 insertions(+), 18 deletions(-)
+ create mode 100755 tests/bugs/replicate/bug-1696599-io-hang.t
+
+diff --git a/tests/bugs/replicate/bug-1696599-io-hang.t b/tests/bugs/replicate/bug-1696599-io-hang.t
+new file mode 100755
+index 0000000..869cdb9
+--- /dev/null
++++ b/tests/bugs/replicate/bug-1696599-io-hang.t
+@@ -0,0 +1,47 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++. $(dirname $0)/../../fileio.rc
++
++#Tests that local structures in afr are removed from granted/blocked list of
++#locks when inodelk fails on all bricks
++
++cleanup;
++
++TEST glusterd
++TEST pidof glusterd
++
++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{1..3}
++TEST $CLI volume set $V0 performance.quick-read off
++TEST $CLI volume set $V0 performance.write-behind off
++TEST $CLI volume set $V0 performance.io-cache off
++TEST $CLI volume set $V0 performance.stat-prefetch off
++TEST $CLI volume set $V0 performance.client-io-threads off
++TEST $CLI volume set $V0 delay-gen locks
++TEST $CLI volume set $V0 delay-gen.delay-duration 5000000
++TEST $CLI volume set $V0 delay-gen.delay-percentage 100
++TEST $CLI volume set $V0 delay-gen.enable finodelk
++
++TEST $CLI volume start $V0
++EXPECT 'Started' volinfo_field $V0 'Status'
++
++TEST $GFS -s $H0 --volfile-id $V0 $M0
++TEST touch $M0/file
++#Trigger write and stop bricks so inodelks fail on all bricks leading to
++#lock failure condition
++echo abc >> $M0/file &
++
++TEST $CLI volume stop $V0
++TEST $CLI volume reset $V0 delay-gen
++wait
++TEST $CLI volume start $V0
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 0
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 1
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_meta $M0 $V0-replicate-0 2
++#Test that only one write succeeded; this verifies that delay-gen worked as
++#expected
++echo abc >> $M0/file
++EXPECT "abc" cat $M0/file
++
++cleanup;
+diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
+index 45b96e3..47a5d3a 100644
+--- a/xlators/cluster/afr/src/afr-common.c
++++ b/xlators/cluster/afr/src/afr-common.c
+@@ -5763,6 +5763,10 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+ afr_private_t *priv = NULL;
+
+ priv = this->private;
++ INIT_LIST_HEAD(&local->transaction.wait_list);
++ INIT_LIST_HEAD(&local->transaction.owner_list);
++ INIT_LIST_HEAD(&local->ta_waitq);
++ INIT_LIST_HEAD(&local->ta_onwireq);
+ ret = afr_internal_lock_init(&local->internal_lock, priv->child_count);
+ if (ret < 0)
+ goto out;
+@@ -5800,10 +5804,6 @@ afr_transaction_local_init(afr_local_t *local, xlator_t *this)
+ goto out;
+
+ ret = 0;
+- INIT_LIST_HEAD(&local->transaction.wait_list);
+- INIT_LIST_HEAD(&local->transaction.owner_list);
+- INIT_LIST_HEAD(&local->ta_waitq);
+- INIT_LIST_HEAD(&local->ta_onwireq);
+ out:
+ return ret;
+ }
+diff --git a/xlators/cluster/afr/src/afr-lk-common.c b/xlators/cluster/afr/src/afr-lk-common.c
+index 4091671..bc8eabe 100644
+--- a/xlators/cluster/afr/src/afr-lk-common.c
++++ b/xlators/cluster/afr/src/afr-lk-common.c
+@@ -397,7 +397,6 @@ afr_unlock_now(call_frame_t *frame, xlator_t *this)
+ int_lock->lk_call_count = call_count;
+
+ if (!call_count) {
+- GF_ASSERT(!local->transaction.do_eager_unlock);
+ gf_msg_trace(this->name, 0, "No internal locks unlocked");
+ int_lock->lock_cbk(frame, this);
+ goto out;
+diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
+index 229820b..15f3a7e 100644
+--- a/xlators/cluster/afr/src/afr-transaction.c
++++ b/xlators/cluster/afr/src/afr-transaction.c
+@@ -372,6 +372,8 @@ afr_transaction_done(call_frame_t *frame, xlator_t *this)
+ }
+ local->transaction.unwind(frame, this);
+
++ GF_ASSERT(list_empty(&local->transaction.owner_list));
++ GF_ASSERT(list_empty(&local->transaction.wait_list));
+ AFR_STACK_DESTROY(frame);
+
+ return 0;
+@@ -393,7 +395,7 @@ afr_lock_fail_shared(afr_local_t *local, struct list_head *list)
+ }
+
+ static void
+-afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
++afr_handle_lock_acquire_failure(afr_local_t *local)
+ {
+ struct list_head shared;
+ afr_lock_t *lock = NULL;
+@@ -414,13 +416,8 @@ afr_handle_lock_acquire_failure(afr_local_t *local, gf_boolean_t locked)
+ afr_lock_fail_shared(local, &shared);
+ local->transaction.do_eager_unlock = _gf_true;
+ out:
+- if (locked) {
+- local->internal_lock.lock_cbk = afr_transaction_done;
+- afr_unlock(local->transaction.frame, local->transaction.frame->this);
+- } else {
+- afr_transaction_done(local->transaction.frame,
+- local->transaction.frame->this);
+- }
++ local->internal_lock.lock_cbk = afr_transaction_done;
++ afr_unlock(local->transaction.frame, local->transaction.frame->this);
+ }
+
+ call_frame_t *
+@@ -619,7 +616,7 @@ afr_transaction_perform_fop(call_frame_t *frame, xlator_t *this)
+ failure_count = AFR_COUNT(local->transaction.failed_subvols,
+ priv->child_count);
+ if (failure_count == priv->child_count) {
+- afr_handle_lock_acquire_failure(local, _gf_true);
++ afr_handle_lock_acquire_failure(local);
+ return 0;
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+@@ -2092,7 +2089,7 @@ err:
+ local->op_ret = -1;
+ local->op_errno = op_errno;
+
+- afr_handle_lock_acquire_failure(local, _gf_true);
++ afr_handle_lock_acquire_failure(local);
+
+ if (xdata_req)
+ dict_unref(xdata_req);
+@@ -2361,7 +2358,7 @@ afr_internal_lock_finish(call_frame_t *frame, xlator_t *this)
+ } else {
+ lock = &local->inode_ctx->lock[local->transaction.type];
+ if (local->internal_lock.lock_op_ret < 0) {
+- afr_handle_lock_acquire_failure(local, _gf_false);
++ afr_handle_lock_acquire_failure(local);
+ } else {
+ lock->event_generation = local->event_generation;
+ afr_changelog_pre_op(frame, this);
+diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
+index 2cc3797..e731cfa 100644
+--- a/xlators/cluster/afr/src/afr.h
++++ b/xlators/cluster/afr/src/afr.h
+@@ -1091,8 +1091,8 @@ afr_cleanup_fd_ctx(xlator_t *this, fd_t *fd);
+ #define AFR_FRAME_INIT(frame, op_errno) \
+ ({ \
+ frame->local = mem_get0(THIS->local_pool); \
+- if (afr_local_init(frame->local, THIS->private, &op_errno)) { \
+- afr_local_cleanup(frame->local, THIS); \
++ if (afr_local_init(frame->local, frame->this->private, &op_errno)) { \
++ afr_local_cleanup(frame->local, frame->this); \
+ mem_put(frame->local); \
+ frame->local = NULL; \
+ }; \
+--
+1.8.3.1
+