Diffstat (limited to '0455-locks-prevent-deletion-of-locked-entries.patch')
-rw-r--r--  0455-locks-prevent-deletion-of-locked-entries.patch  1253
1 file changed, 1253 insertions, 0 deletions
diff --git a/0455-locks-prevent-deletion-of-locked-entries.patch b/0455-locks-prevent-deletion-of-locked-entries.patch
new file mode 100644
index 0000000..5960690
--- /dev/null
+++ b/0455-locks-prevent-deletion-of-locked-entries.patch
@@ -0,0 +1,1253 @@
+From 3f6ff474db3934f43d9963dfe4dda7d201211e75 Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez <xhernandez@redhat.com>
+Date: Fri, 12 Jun 2020 00:06:36 +0200
+Subject: [PATCH 455/456] locks: prevent deletion of locked entries
+
+To keep consistency inside transactions started by locking an entry or
+an inode, this change delays the removal of entries that are currently
+locked by one or more clients. Once all locks are released, the removal
+is processed.
+
+The detection of stale inodes in the EC locking code has also been
+improved.
+
+>Upstream patch - https://review.gluster.org/#/c/glusterfs/+/20025/
+>Fixes: #990
+
+Change-Id: Ic8ba23d9480f80c7f74e7a310bf8a15922320fd5
+BUG: 1812789
+Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/206442
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+---
+ xlators/cluster/ec/src/ec-locks.c | 69 ++++++--
+ xlators/features/locks/src/common.c | 316 ++++++++++++++++++++++++++++++++++-
+ xlators/features/locks/src/common.h | 43 +++++
+ xlators/features/locks/src/entrylk.c | 19 +--
+ xlators/features/locks/src/inodelk.c | 150 ++++++++++-------
+ xlators/features/locks/src/locks.h | 23 ++-
+ xlators/features/locks/src/posix.c | 183 ++++++++++++++++++--
+ 7 files changed, 689 insertions(+), 114 deletions(-)
+
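As context for the diff that follows: the commit message says removals of locked entries are deferred until every client lock on them has been released. Below is a minimal, self-contained sketch of that general idea in plain C. It is not taken from the patch and is not GlusterFS code; every name in it is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct entry {
    pthread_mutex_t mutex;
    int lock_count;       /* locks currently held by clients */
    bool remove_pending;  /* a removal arrived while the entry was locked */
} entry_t;

static void do_remove(entry_t *e)
{
    (void)e;
    printf("entry removed\n");
}

/* Called by unlink/rmdir: remove now, or remember to remove later. */
static void entry_remove(entry_t *e)
{
    bool now;

    pthread_mutex_lock(&e->mutex);
    now = (e->lock_count == 0);
    if (!now)
        e->remove_pending = true;   /* defer until the last unlock */
    pthread_mutex_unlock(&e->mutex);

    if (now)
        do_remove(e);
}

/* Called when a client releases its lock. */
static void entry_unlock(entry_t *e)
{
    bool removal;

    pthread_mutex_lock(&e->mutex);
    removal = (--e->lock_count == 0) && e->remove_pending;
    pthread_mutex_unlock(&e->mutex);

    if (removal)
        do_remove(e);               /* process the deferred removal */
}

int main(void)
{
    entry_t e = { PTHREAD_MUTEX_INITIALIZER, 1, false };

    entry_remove(&e);   /* locked: removal is deferred */
    entry_unlock(&e);   /* last lock released: removal happens now */
    return 0;
}

The real patch parks the whole fop as a call stub on a per-inode waiting list rather than using a flag, but the locking discipline is the same: decide under the mutex, act after dropping it.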
+diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
+index ffcac07..db86296 100644
+--- a/xlators/cluster/ec/src/ec-locks.c
++++ b/xlators/cluster/ec/src/ec-locks.c
+@@ -28,9 +28,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ ec_t *ec = fop->xl->private;
+ ec_cbk_data_t *ans = NULL;
+ ec_cbk_data_t *cbk = NULL;
+- uintptr_t locked = 0, notlocked = 0;
++ uintptr_t locked = 0;
++ int32_t good = 0;
++ int32_t eagain = 0;
++ int32_t estale = 0;
+ int32_t error = -1;
+
++ /* There are some errors that we'll handle in a special way while trying
++ * to acquire a lock.
++ *
++ * EAGAIN: If it's found during a parallel non-blocking lock request, we
++ * consider that there's contention on the inode, so we consider
++ * the acquisition a failure and try again with a sequential
++ * blocking lock request. This will ensure that we get a lock on
++ * as many bricks as possible (ignoring EAGAIN here would cause
++ * unnecessary triggers of self-healing).
++ *
++ * If it's found during a sequential blocking lock request, it's
++ * considered an error. Lock will only succeed if there are
++ * enough other bricks locked.
++ *
++ * ESTALE: This can appear during a parallel or sequential lock request
++ * if the inode has just been unlinked. We consider this error
++ * unrecoverable, but we don't consider it fatal either. So,
++ * if it happens during a parallel lock, we won't attempt a
++ * sequential one unless there are EAGAIN errors on other
++ * bricks (and they are enough to form a quorum), but if we
++ * reach quorum counting the ESTALE bricks, we consider the
++ * whole result of the operation to be ESTALE instead of EIO.
++ */
++
+ list_for_each_entry(ans, &fop->cbk_list, list)
+ {
+ if (ans->op_ret >= 0) {
+@@ -38,24 +65,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ error = EIO;
+ }
+ locked |= ans->mask;
++ good = ans->count;
+ cbk = ans;
+- } else {
+- if (ans->op_errno == EAGAIN) {
+- switch (fop->uint32) {
+- case EC_LOCK_MODE_NONE:
+- case EC_LOCK_MODE_ALL:
+- /* Goal is to treat non-blocking lock as failure
+- * even if there is a single EAGAIN*/
+- notlocked |= ans->mask;
+- break;
+- }
+- }
++ } else if (ans->op_errno == ESTALE) {
++ estale += ans->count;
++ } else if ((ans->op_errno == EAGAIN) &&
++ (fop->uint32 != EC_LOCK_MODE_INC)) {
++ eagain += ans->count;
+ }
+ }
+
+ if (error == -1) {
+- if (gf_bits_count(locked | notlocked) >= ec->fragments) {
+- if (notlocked == 0) {
++ /* If we reach quorum counting the successful and EAGAIN answers, we
++ * ignore any ESTALE answers for now. If there are EAGAIN answers,
++ * we retry with a sequential blocking lock request if needed.
++ * Otherwise we succeed. */
++ if ((good + eagain) >= ec->fragments) {
++ if (eagain == 0) {
+ if (fop->answer == NULL) {
+ fop->answer = cbk;
+ }
+@@ -68,21 +94,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ case EC_LOCK_MODE_NONE:
+ error = EAGAIN;
+ break;
+-
+ case EC_LOCK_MODE_ALL:
+ fop->uint32 = EC_LOCK_MODE_INC;
+ break;
+-
+ default:
++ /* This shouldn't happen because eagain cannot be > 0
++ * when fop->uint32 is EC_LOCK_MODE_INC. */
+ error = EIO;
+ break;
+ }
+ }
+ } else {
+- if (fop->answer && fop->answer->op_ret < 0)
++ /* We have been unable to find enough candidates that will be able
++ * to take the lock. If we have quorum on some answer, we return
++ * it. Otherwise we check if ESTALE answers allow us to reach
++ * quorum. If so, we return ESTALE. */
++ if (fop->answer && fop->answer->op_ret < 0) {
+ error = fop->answer->op_errno;
+- else
++ } else if ((good + eagain + estale) >= ec->fragments) {
++ error = ESTALE;
++ } else {
+ error = EIO;
++ }
+ }
+ }
+
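The block comments added to ec_lock_check() above describe how the answers are bucketed into successful, EAGAIN and ESTALE counts and weighed against the number of fragments needed for quorum. The following is a simplified, standalone model of that decision, not the actual function: the name is invented, and the real code additionally distinguishes the lock modes and any previously recorded answer.

#include <errno.h>
#include <stdio.h>

/* Returns 0 when the lock is considered acquired, -EAGAIN when a
 * sequential blocking retry should be attempted, and -ESTALE or -EIO when
 * the acquisition failed. */
static int lock_quorum_decision(int good, int eagain, int estale,
                                int fragments)
{
    if (good + eagain >= fragments) {
        if (eagain == 0)
            return 0;        /* enough bricks locked, nothing else to do */
        return -EAGAIN;      /* contention: retry with blocking locks */
    }

    if (good + eagain + estale >= fragments)
        return -ESTALE;      /* quorum reached only by counting stale bricks */

    return -EIO;             /* not even a quorum of usable answers */
}

int main(void)
{
    /* Example 4+2 volume: 4 fragments needed for quorum. */
    printf("%d\n", lock_quorum_decision(4, 0, 0, 4));  /* 0       */
    printf("%d\n", lock_quorum_decision(3, 1, 0, 4));  /* -EAGAIN */
    printf("%d\n", lock_quorum_decision(2, 0, 2, 4));  /* -ESTALE */
    printf("%d\n", lock_quorum_decision(1, 0, 1, 4));  /* -EIO    */
    return 0;
}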
+diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
+index 1406e70..0c52853 100644
+--- a/xlators/features/locks/src/common.c
++++ b/xlators/features/locks/src/common.c
+@@ -462,11 +462,16 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
+ INIT_LIST_HEAD(&pl_inode->blocked_calls);
+ INIT_LIST_HEAD(&pl_inode->metalk_list);
+ INIT_LIST_HEAD(&pl_inode->queued_locks);
++ INIT_LIST_HEAD(&pl_inode->waiting);
+ gf_uuid_copy(pl_inode->gfid, inode->gfid);
+
+ pl_inode->check_mlock_info = _gf_true;
+ pl_inode->mlock_enforced = _gf_false;
+
++ /* -2 means never looked up. -1 means something went wrong and link
++ * tracking is disabled. */
++ pl_inode->links = -2;
++
+ ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
+ if (ret) {
+ pthread_mutex_destroy(&pl_inode->mutex);
+@@ -1276,4 +1281,313 @@ pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+ }
+
+ return 0;
+-}
+\ No newline at end of file
++}
++
++gf_boolean_t
++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client)
++{
++ if (client && (client->opversion < GD_OP_VERSION_7_0)) {
++ return _gf_true;
++ }
++
++ if (is_lk_owner_null(owner)) {
++ return _gf_false;
++ }
++ return _gf_true;
++}
++
++static int32_t
++pl_inode_from_loc(loc_t *loc, inode_t **pinode)
++{
++ inode_t *inode = NULL;
++ int32_t error = 0;
++
++ if (loc->inode != NULL) {
++ inode = inode_ref(loc->inode);
++ goto done;
++ }
++
++ if (loc->parent == NULL) {
++ error = EINVAL;
++ goto done;
++ }
++
++ if (!gf_uuid_is_null(loc->gfid)) {
++ inode = inode_find(loc->parent->table, loc->gfid);
++ if (inode != NULL) {
++ goto done;
++ }
++ }
++
++ if (loc->name == NULL) {
++ error = EINVAL;
++ goto done;
++ }
++
++ inode = inode_grep(loc->parent->table, loc->parent, loc->name);
++ if (inode == NULL) {
++ /* We haven't found any inode. This means that the file doesn't exist
++ * or that even if it exists, we don't have any knowledge about it, so
++ * we don't have locks on it either, which is fine for our purposes. */
++ goto done;
++ }
++
++done:
++ *pinode = inode;
++
++ return error;
++}
++
++static gf_boolean_t
++pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode,
++ struct timespec *now, struct list_head *contend)
++{
++ pl_dom_list_t *dom;
++ pl_inode_lock_t *lock;
++ gf_boolean_t has_owners = _gf_false;
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ list_for_each_entry(lock, &dom->inodelk_list, list)
++ {
++ /* If the lock belongs to the same client, we assume it's related
++ * to the same operation, so we allow the removal to continue. */
++ if (lock->client == client) {
++ continue;
++ }
++ /* If the lock belongs to an internal process, we don't block the
++ * removal. */
++ if (lock->client_pid < 0) {
++ continue;
++ }
++ if (contend == NULL) {
++ return _gf_true;
++ }
++ has_owners = _gf_true;
++ inodelk_contention_notify_check(xl, lock, now, contend);
++ }
++ }
++
++ return has_owners;
++}
++
++int32_t
++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
++ pl_inode_t **ppl_inode, struct list_head *contend)
++{
++ struct timespec now;
++ inode_t *inode;
++ pl_inode_t *pl_inode;
++ int32_t error;
++
++ pl_inode = NULL;
++
++ error = pl_inode_from_loc(loc, &inode);
++ if ((error != 0) || (inode == NULL)) {
++ goto done;
++ }
++
++ pl_inode = pl_inode_get(xl, inode, NULL);
++ if (pl_inode == NULL) {
++ inode_unref(inode);
++ error = ENOMEM;
++ goto done;
++ }
++
++ /* pl_inode_from_loc() already increments the inode's ref count, so
++ * here we only assign our reference. */
++ pl_inode->inode = inode;
++
++ timespec_now(&now);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ if (pl_inode->removed) {
++ error = ESTALE;
++ goto unlock;
++ }
++
++ if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
++ error = -1;
++ /* We skip the unlock here because, when we return -1, the caller must
++ * create a stub and call pl_inode_remove_complete(), which assumes
++ * the lock is still held and will release it once everything else
++ * is prepared. */
++ goto done;
++ }
++
++ pl_inode->is_locked = _gf_true;
++ pl_inode->remove_running++;
++
++unlock:
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++done:
++ *ppl_inode = pl_inode;
++
++ return error;
++}
++
++int32_t
++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
++ struct list_head *contend)
++{
++ pl_inode_lock_t *lock;
++ int32_t error = -1;
++
++ if (stub != NULL) {
++ list_add_tail(&stub->list, &pl_inode->waiting);
++ pl_inode->is_locked = _gf_true;
++ } else {
++ error = ENOMEM;
++
++ while (!list_empty(contend)) {
++ lock = list_first_entry(contend, pl_inode_lock_t, list);
++ list_del_init(&lock->list);
++ __pl_inodelk_unref(lock);
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ if (error < 0) {
++ inodelk_contention_notify(xl, contend);
++ }
++
++ inode_unref(pl_inode->inode);
++
++ return error;
++}
++
++void
++pl_inode_remove_wake(struct list_head *list)
++{
++ call_stub_t *stub;
++
++ while (!list_empty(list)) {
++ stub = list_first_entry(list, call_stub_t, list);
++ list_del_init(&stub->list);
++
++ call_resume(stub);
++ }
++}
++
++void
++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
++{
++ struct list_head contend, granted;
++ struct timespec now;
++ pl_dom_list_t *dom;
++
++ if (pl_inode == NULL) {
++ return;
++ }
++
++ INIT_LIST_HEAD(&contend);
++ INIT_LIST_HEAD(&granted);
++ timespec_now(&now);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ if (error == 0) {
++ if (pl_inode->links >= 0) {
++ pl_inode->links--;
++ }
++ if (pl_inode->links == 0) {
++ pl_inode->removed = _gf_true;
++ }
++ }
++
++ pl_inode->remove_running--;
++
++ if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
++ pl_inode->is_locked = _gf_false;
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
++ &contend);
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ unwind_granted_inodes(xl, pl_inode, &granted);
++
++ inodelk_contention_notify(xl, &contend);
++
++ inode_unref(pl_inode->inode);
++}
++
++void
++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
++ struct list_head *list)
++{
++ call_stub_t *stub, *tmp;
++
++ if (!pl_inode->is_locked) {
++ return;
++ }
++
++ list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list)
++ {
++ if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL,
++ NULL)) {
++ list_move_tail(&stub->list, list);
++ }
++ }
++}
++
++/* This function determines if an inodelk attempt can proceed now or needs
++ * to wait.
++ *
++ * Possible return values:
++ * < 0: An error occurred. Currently only -ESTALE can be returned if the
++ * inode has been deleted previously by unlink/rmdir/rename
++ * = 0: The lock can be attempted.
++ * > 0: The lock needs to wait because a conflicting remove operation is
++ * ongoing.
++ */
++int32_t
++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
++{
++ pl_dom_list_t *dom;
++ pl_inode_lock_t *ilock;
++
++ /* If the inode has been deleted, we won't allow any lock. */
++ if (pl_inode->removed) {
++ return -ESTALE;
++ }
++
++ /* We only synchronize with locks made for regular operations coming from
++ * the user. Locks done for internal purposes are hard to control and could
++ * lead to long delays or deadlocks quite easily. */
++ if (lock->client_pid < 0) {
++ return 0;
++ }
++ if (!pl_inode->is_locked) {
++ return 0;
++ }
++ if (pl_inode->remove_running > 0) {
++ return 1;
++ }
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ list_for_each_entry(ilock, &dom->inodelk_list, list)
++ {
++ /* If a lock from the same client is already granted, we allow this
++ * one to continue. This is necessary to prevent deadlocks when
++ * multiple locks are taken for the same operation.
++ *
++ * On the other hand, it's unlikely that the same client sends
++ * completely unrelated locks for the same inode.
++ */
++ if (ilock->client == lock->client) {
++ return 0;
++ }
++ }
++ }
++
++ return 1;
++}
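pl_inode_remove_wake() and pl_inode_remove_unlocked() above follow a common pattern: candidate stubs are detached from the shared waiting list while the inode mutex is held, and they are only resumed after the mutex has been dropped. A small sketch of that pattern with invented types follows; the real code uses the GlusterFS list_head and call_stub_t machinery.

#include <pthread.h>
#include <stdio.h>

struct waiter {
    struct waiter *next;
    void (*resume)(struct waiter *w);
};

struct object {
    pthread_mutex_t mutex;
    struct waiter *waiting;   /* deferred operations queued on the object */
};

/* Detach every queued waiter while holding the mutex, then resume them
 * with the mutex dropped so the resumed code may take locks itself
 * without deadlocking. */
static void wake_waiters(struct object *o)
{
    struct waiter *list, *w;

    pthread_mutex_lock(&o->mutex);
    list = o->waiting;
    o->waiting = NULL;
    pthread_mutex_unlock(&o->mutex);

    while (list != NULL) {
        w = list;
        list = w->next;
        w->resume(w);         /* the equivalent of call_resume(stub) */
    }
}

static void say_resumed(struct waiter *w)
{
    (void)w;
    printf("resumed\n");
}

int main(void)
{
    struct waiter w = { NULL, say_resumed };
    struct object o = { PTHREAD_MUTEX_INITIALIZER, &w };

    wake_waiters(&o);
    return 0;
}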
+diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
+index ea86b96..6c81ac3 100644
+--- a/xlators/features/locks/src/common.h
++++ b/xlators/features/locks/src/common.h
+@@ -105,6 +105,15 @@ void
+ __pl_inodelk_unref(pl_inode_lock_t *lock);
+
+ void
++__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted, pl_dom_list_t *dom,
++ struct timespec *now, struct list_head *contend);
++
++void
++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted);
++
++void
+ grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+@@ -204,6 +213,16 @@ pl_metalock_is_active(pl_inode_t *pl_inode);
+ void
+ __pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
++void
++inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock,
++ struct timespec *now,
++ struct list_head *contend);
++
++void
++entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock,
++ struct timespec *now,
++ struct list_head *contend);
++
+ gf_boolean_t
+ pl_does_monkey_want_stuck_lock();
+
+@@ -216,4 +235,28 @@ pl_clean_local(pl_local_t *local);
+ int
+ pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd);
+
++gf_boolean_t
++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client);
++
++int32_t
++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
++ pl_inode_t **ppl_inode, struct list_head *contend);
++
++int32_t
++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
++ struct list_head *contend);
++
++void
++pl_inode_remove_wake(struct list_head *list);
++
++void
++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error);
++
++void
++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
++ struct list_head *list);
++
++int32_t
++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock);
++
+ #endif /* __COMMON_H__ */
+diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
+index 93c649c..b97836f 100644
+--- a/xlators/features/locks/src/entrylk.c
++++ b/xlators/features/locks/src/entrylk.c
+@@ -197,9 +197,9 @@ out:
+ return revoke_lock;
+ }
+
+-static gf_boolean_t
+-__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+- struct timespec *now)
++void
++entrylk_contention_notify_check(xlator_t *this, pl_entry_lock_t *lock,
++ struct timespec *now, struct list_head *contend)
+ {
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+@@ -209,7 +209,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+- return _gf_false;
++ return;
+ }
+
+ elapsed = now->tv_sec;
+@@ -218,7 +218,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+- return _gf_false;
++ return;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+@@ -231,7 +231,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+
+ lock->contention_time = *now;
+
+- return _gf_true;
++ list_add_tail(&lock->contend, contend);
+ }
+
+ void
+@@ -325,9 +325,7 @@ __entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock,
+ break;
+ }
+ }
+- if (__entrylk_needs_contention_notify(this, tmp, now)) {
+- list_add_tail(&tmp->contend, contend);
+- }
++ entrylk_contention_notify_check(this, tmp, now, contend);
+ }
+ }
+
+@@ -690,10 +688,9 @@ __grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+ bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend);
+
+ if (bl_ret == 0) {
+- list_add(&bl->blocked_locks, granted);
++ list_add_tail(&bl->blocked_locks, granted);
+ }
+ }
+- return;
+ }
+
+ /* Grants locks if possible which are blocked on a lock */
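entrylk_contention_notify_check() above (and its inodelk counterpart later in the patch) replace a boolean helper with a function that queues the lock on a contend list at most once per notification interval; the notifications themselves are sent after the mutex is released. Below is a rough sketch of just the throttling decision, with invented names and plain time_t instead of struct timespec; the real function appends to the contend list itself rather than returning a flag.

#include <stdbool.h>
#include <stddef.h>
#include <time.h>

struct lk {
    struct lk *next_contend;   /* non-NULL while queued for notification */
    time_t last_notified;      /* when the last notification was queued */
};

/* Returns true (and records the time) when a contention notification
 * should be queued for this lock now; false if one is already pending or
 * was queued less than 'delay' seconds ago. */
static bool contention_notify_check(struct lk *lock, time_t now,
                                    time_t delay)
{
    if (lock->next_contend != NULL)
        return false;              /* already queued, nothing to do */

    if (now - lock->last_notified < delay)
        return false;              /* throttled: too soon since the last one */

    lock->last_notified = now;
    return true;                   /* caller appends it to the contend list */
}

int main(void)
{
    struct lk l = { NULL, 0 };

    /* With a 5 second delay, only the first check at t=100 queues one. */
    return (contention_notify_check(&l, 100, 5) &&
            !contention_notify_check(&l, 102, 5)) ? 0 : 1;
}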
+diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
+index 24dee49..1a07243 100644
+--- a/xlators/features/locks/src/inodelk.c
++++ b/xlators/features/locks/src/inodelk.c
+@@ -231,9 +231,9 @@ out:
+ return revoke_lock;
+ }
+
+-static gf_boolean_t
+-__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+- struct timespec *now)
++void
++inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock,
++ struct timespec *now, struct list_head *contend)
+ {
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+@@ -243,7 +243,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+- return _gf_false;
++ return;
+ }
+
+ elapsed = now->tv_sec;
+@@ -252,7 +252,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+- return _gf_false;
++ return;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+@@ -265,7 +265,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+
+ lock->contention_time = *now;
+
+- return _gf_true;
++ list_add_tail(&lock->contend, contend);
+ }
+
+ void
+@@ -353,9 +353,7 @@ __inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ break;
+ }
+ }
+- if (__inodelk_needs_contention_notify(this, l, now)) {
+- list_add_tail(&l->contend, contend);
+- }
++ inodelk_contention_notify_check(this, l, now, contend);
+ }
+ }
+
+@@ -435,12 +433,17 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+ struct list_head *contend)
+ {
+ pl_inode_lock_t *conf = NULL;
+- int ret = -EINVAL;
++ int ret;
+
+- conf = __inodelk_grantable(this, dom, lock, now, contend);
+- if (conf) {
+- ret = __lock_blocked_add(this, dom, lock, can_block);
+- goto out;
++ ret = pl_inode_remove_inodelk(pl_inode, lock);
++ if (ret < 0) {
++ return ret;
++ }
++ if (ret == 0) {
++ conf = __inodelk_grantable(this, dom, lock, now, contend);
++ }
++ if ((ret > 0) || (conf != NULL)) {
++ return __lock_blocked_add(this, dom, lock, can_block);
+ }
+
+ /* To prevent blocked locks starvation, check if there are any blocked
+@@ -462,17 +465,13 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+ "starvation");
+ }
+
+- ret = __lock_blocked_add(this, dom, lock, can_block);
+- goto out;
++ return __lock_blocked_add(this, dom, lock, can_block);
+ }
+ __pl_inodelk_ref(lock);
+ gettimeofday(&lock->granted_time, NULL);
+ list_add(&lock->list, &dom->inodelk_list);
+
+- ret = 0;
+-
+-out:
+- return ret;
++ return 0;
+ }
+
+ /* Return true if the two inodelks have exactly same lock boundaries */
+@@ -529,12 +528,11 @@ out:
+ return conf;
+ }
+
+-static void
++void
+ __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ struct list_head *granted, pl_dom_list_t *dom,
+ struct timespec *now, struct list_head *contend)
+ {
+- int bl_ret = 0;
+ pl_inode_lock_t *bl = NULL;
+ pl_inode_lock_t *tmp = NULL;
+
+@@ -547,52 +545,48 @@ __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ {
+ list_del_init(&bl->blocked_locks);
+
+- bl_ret = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend);
++ bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend);
+
+- if (bl_ret == 0) {
+- list_add(&bl->blocked_locks, granted);
++ if (bl->status != -EAGAIN) {
++ list_add_tail(&bl->blocked_locks, granted);
+ }
+ }
+- return;
+ }
+
+-/* Grant all inodelks blocked on a lock */
+ void
+-grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+- pl_dom_list_t *dom, struct timespec *now,
+- struct list_head *contend)
++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted)
+ {
+- struct list_head granted;
+ pl_inode_lock_t *lock;
+ pl_inode_lock_t *tmp;
++ int32_t op_ret;
++ int32_t op_errno;
+
+- INIT_LIST_HEAD(&granted);
+-
+- pthread_mutex_lock(&pl_inode->mutex);
+- {
+- __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now,
+- contend);
+- }
+- pthread_mutex_unlock(&pl_inode->mutex);
+-
+- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks)
++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
+ {
+- gf_log(this->name, GF_LOG_TRACE,
+- "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted",
+- lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+- lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+- lock->user_flock.l_len);
+-
++ if (lock->status == 0) {
++ op_ret = 0;
++ op_errno = 0;
++ gf_log(this->name, GF_LOG_TRACE,
++ "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
++ " => Granted",
++ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
++ lock->client_pid, lkowner_utoa(&lock->owner),
++ lock->user_flock.l_start, lock->user_flock.l_len);
++ } else {
++ op_ret = -1;
++ op_errno = -lock->status;
++ }
+ pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+- 0, 0, lock->volume);
++ op_ret, op_errno, lock->volume);
+
+- STACK_UNWIND_STRICT(inodelk, lock->frame, 0, 0, NULL);
++ STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL);
+ lock->frame = NULL;
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+ {
+- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks)
++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
+ {
+ list_del_init(&lock->blocked_locks);
+ __pl_inodelk_unref(lock);
+@@ -601,6 +595,26 @@ grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ pthread_mutex_unlock(&pl_inode->mutex);
+ }
+
++/* Grant all inodelks blocked on a lock */
++void
++grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
++ pl_dom_list_t *dom, struct timespec *now,
++ struct list_head *contend)
++{
++ struct list_head granted;
++
++ INIT_LIST_HEAD(&granted);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++ {
++ __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now,
++ contend);
++ }
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ unwind_granted_inodes(this, pl_inode, &granted);
++}
++
+ static void
+ pl_inodelk_log_cleanup(pl_inode_lock_t *lock)
+ {
+@@ -662,7 +676,7 @@ pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
+ * and blocked lists, then this means that a parallel
+ * unlock on another inodelk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+- * __grant_blocked_node_locks() (although using the
++ * __grant_blocked_inode_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked inodelks from other clients, now that L1 is
+@@ -747,6 +761,7 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ gf_boolean_t need_inode_unref = _gf_false;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
++ struct list_head wake;
+ struct timespec now = {};
+ short fl_type;
+
+@@ -798,6 +813,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ timespec_now(&now);
+ }
+
++ INIT_LIST_HEAD(&wake);
++
+ if (ctx)
+ pthread_mutex_lock(&ctx->lock);
+ pthread_mutex_lock(&pl_inode->mutex);
+@@ -820,18 +837,17 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+ lock->client_pid, lkowner_utoa(&lock->owner),
+ lock->user_flock.l_start, lock->user_flock.l_len);
+- if (can_block)
++ if (can_block) {
+ unref = _gf_false;
+- /* For all but the case where a non-blocking
+- * lock attempt fails, the extra ref taken at
+- * the start of this function must be negated.
+- */
+- else
+- need_inode_unref = _gf_true;
++ }
+ }
+-
+- if (ctx && (!ret || can_block))
++ /* For all but the case where a non-blocking lock attempt fails
++ * with -EAGAIN, the extra ref taken at the start of this function
++ * must be negated. */
++ need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block);
++ if (ctx && !need_inode_unref) {
+ list_add_tail(&lock->client_list, &ctx->inodelk_lockers);
++ }
+ } else {
+ /* Irrespective of whether unlock succeeds or not,
+ * the extra inode ref that was done at the start of
+@@ -849,6 +865,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ list_del_init(&retlock->client_list);
+ __pl_inodelk_unref(retlock);
+
++ pl_inode_remove_unlocked(this, pl_inode, &wake);
++
+ ret = 0;
+ }
+ out:
+@@ -859,6 +877,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ if (ctx)
+ pthread_mutex_unlock(&ctx->lock);
+
++ pl_inode_remove_wake(&wake);
++
+ /* The following (extra) unref corresponds to the ref that
+ * was done at the time the lock was granted.
+ */
+@@ -1033,10 +1053,14 @@ pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ inode);
+
+ if (ret < 0) {
+- if ((can_block) && (F_UNLCK != lock_type)) {
+- goto out;
++ if (ret == -EAGAIN) {
++ if (can_block && (F_UNLCK != lock_type)) {
++ goto out;
++ }
++ gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
++ } else {
++ gf_log(this->name, GF_LOG_TRACE, "returning %d", ret);
+ }
+- gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
+ op_errno = -ret;
+ goto unwind;
+ }
+diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
+index aa267de..6666feb 100644
+--- a/xlators/features/locks/src/locks.h
++++ b/xlators/features/locks/src/locks.h
+@@ -102,6 +102,9 @@ struct __pl_inode_lock {
+
+ struct list_head client_list; /* list of all locks from a client */
+ short fl_type;
++
++ int32_t status; /* Error code when we try to grant a lock in blocked
++ state */
+ };
+ typedef struct __pl_inode_lock pl_inode_lock_t;
+
+@@ -164,13 +167,14 @@ struct __pl_inode {
+ struct list_head rw_list; /* list of waiting r/w requests */
+ struct list_head reservelk_list; /* list of reservelks */
+ struct list_head blocked_reservelks; /* list of blocked reservelks */
+- struct list_head
+- blocked_calls; /* List of blocked lock calls while a reserve is held*/
+- struct list_head metalk_list; /* Meta lock list */
+- /* This is to store the incoming lock
+- requests while meta lock is enabled */
+- struct list_head queued_locks;
+- int mandatory; /* if mandatory locking is enabled */
++ struct list_head blocked_calls; /* List of blocked lock calls while a
++ reserve is held*/
++ struct list_head metalk_list; /* Meta lock list */
++ struct list_head queued_locks; /* This is to store the incoming lock
++ requests while meta lock is enabled */
++ struct list_head waiting; /* List of pending fops waiting to unlink/rmdir
++ the inode. */
++ int mandatory; /* if mandatory locking is enabled */
+
+ inode_t *refkeeper; /* hold refs on an inode while locks are
+ held to prevent pruning */
+@@ -197,6 +201,11 @@ struct __pl_inode {
+ */
+ int fop_wind_count;
+ pthread_cond_t check_fop_wind_count;
++
++ int32_t links; /* Number of hard links the inode has. */
++ uint32_t remove_running; /* Number of remove operations running. */
++ gf_boolean_t is_locked; /* Regular locks will be blocked. */
++ gf_boolean_t removed; /* The inode has been deleted. */
+ };
+ typedef struct __pl_inode pl_inode_t;
+
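The new fields added to struct __pl_inode above drive the deferred-removal logic: links tracks hard links (-2 until the first lookup, -1 when tracking had to be disabled), remove_running counts in-flight removals, and is_locked/removed gate new inodelks. The following is a rough sketch of the link-count bookkeeping only; the exact corner cases (for example the directory adjustment when the count could not be read) differ in the real code, and all names here are invented.

#include <stdbool.h>

#define LINKS_UNKNOWN  (-2)    /* never looked up yet */
#define LINKS_DISABLED (-1)    /* tracking disabled after an error */

struct tracked_inode {
    int links;
    bool is_dir;
    bool removed;
};

/* Seed the counter from the first successful lookup only. Directories
 * report at least 2 links, so one is subtracted to make them behave like
 * regular files for the "last link removed" test. */
static void track_lookup(struct tracked_inode *in, int link_count)
{
    if (in->links != LINKS_UNKNOWN)
        return;                     /* only seed once to avoid races */

    in->links = link_count;         /* may be LINKS_DISABLED on error */
    if (in->is_dir && in->links >= 0)
        in->links--;
}

/* On a successful unlink/rmdir/rename, drop one link; the inode is only
 * treated as gone when the tracked count reaches zero. */
static void track_remove_ok(struct tracked_inode *in)
{
    if (in->links >= 0)
        in->links--;
    if (in->links == 0)
        in->removed = true;
}

int main(void)
{
    struct tracked_inode in = { LINKS_UNKNOWN, false, false };

    track_lookup(&in, 1);     /* regular file with a single link */
    track_remove_ok(&in);     /* links drops to 0: inode is removed */
    return in.removed ? 0 : 1;
}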
+diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
+index 7887b82..5ae0125 100644
+--- a/xlators/features/locks/src/posix.c
++++ b/xlators/features/locks/src/posix.c
+@@ -147,6 +147,29 @@ fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **);
+ } \
+ } while (0)
+
++#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk, \
++ _args...) \
++ ({ \
++ struct list_head contend; \
++ pl_inode_t *__pl_inode; \
++ call_stub_t *__stub; \
++ int32_t __error; \
++ INIT_LIST_HEAD(&contend); \
++ __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? _loc2 : _loc1, \
++ &__pl_inode, &contend); \
++ if (__error < 0) { \
++ __stub = fop_##_fop##_stub(_frame, _cont, ##_args); \
++ __error = pl_inode_remove_complete(_xl, __pl_inode, __stub, \
++ &contend); \
++ } else if (__error == 0) { \
++ PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1, \
++ _loc2); \
++ STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl), \
++ FIRST_CHILD(_xl)->fops->_fop, ##_args); \
++ } \
++ __error; \
++ })
++
+ gf_boolean_t
+ pl_has_xdata_requests(dict_t *xdata)
+ {
+@@ -2969,11 +2992,85 @@ out:
+ return ret;
+ }
+
++static int32_t
++pl_request_link_count(dict_t **pxdata)
++{
++ dict_t *xdata;
++
++ xdata = *pxdata;
++ if (xdata == NULL) {
++ xdata = dict_new();
++ if (xdata == NULL) {
++ return ENOMEM;
++ }
++ } else {
++ dict_ref(xdata);
++ }
++
++ if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) {
++ dict_unref(xdata);
++ return ENOMEM;
++ }
++
++ *pxdata = xdata;
++
++ return 0;
++}
++
++static int32_t
++pl_check_link_count(dict_t *xdata)
++{
++ int32_t count;
++
++ /* In case we are unable to read the link count from xdata, we take a
++ * conservative approach and return -2, which will prevent the inode from
++ * being considered deleted. In fact it will cause link tracking for this
++ * inode to be disabled completely to avoid races. */
++
++ if (xdata == NULL) {
++ return -2;
++ }
++
++ if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) {
++ return -2;
++ }
++
++ return count;
++}
++
+ int32_t
+ pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+ {
++ pl_inode_t *pl_inode;
++
++ if (op_ret >= 0) {
++ pl_inode = pl_inode_get(this, inode, NULL);
++ if (pl_inode == NULL) {
++ PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL,
++ NULL);
++ return 0;
++ }
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ /* We only update the link count if we previously didn't know it.
++ * Always updating it can lead to races, since lookup is not executed
++ * atomically most of the time. */
++ if (pl_inode->links == -2) {
++ pl_inode->links = pl_check_link_count(xdata);
++ if (buf->ia_type == IA_IFDIR) {
++ /* Directories have at least 2 links. To avoid special handling
++ * for directories, we simply decrement the value here to make
++ * them equivalent to regular files. */
++ pl_inode->links--;
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++ }
++
+ PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
+@@ -2982,9 +3079,17 @@ pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t
+ pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->lookup, loc, xdata);
++ int32_t error;
++
++ error = pl_request_link_count(&xdata);
++ if (error == 0) {
++ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
++ STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->lookup, loc, xdata);
++ dict_unref(xdata);
++ } else {
++ STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL);
++ }
+ return 0;
+ }
+
+@@ -3792,6 +3897,10 @@ unlock:
+ gf_proc_dump_write("posixlk-count", "%d", count);
+ __dump_posixlks(pl_inode);
+ }
++
++ gf_proc_dump_write("links", "%d", pl_inode->links);
++ gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running);
++ gf_proc_dump_write("removed", "%u", pl_inode->removed);
+ }
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+@@ -4137,8 +4246,11 @@ pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent,
+ postoldparent, prenewparent, postnewparent, xdata);
++
+ return 0;
+ }
+
+@@ -4146,10 +4258,15 @@ int32_t
+ pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename,
++ pl_rename_cbk, oldloc, newloc, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL,
++ NULL, NULL);
++ }
+
+- STACK_WIND(frame, pl_rename_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ return 0;
+ }
+
+@@ -4273,8 +4390,11 @@ pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
++
+ return 0;
+ }
+
+@@ -4282,9 +4402,14 @@ int32_t
+ pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_unlink_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink,
++ pl_unlink_cbk, loc, xflag, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL);
++ }
++
+ return 0;
+ }
+
+@@ -4351,8 +4476,11 @@ pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
++
+ return 0;
+ }
+
+@@ -4360,9 +4488,14 @@ int
+ pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_rmdir_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir,
++ pl_rmdir_cbk, loc, xflags, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL);
++ }
++
+ return 0;
+ }
+
+@@ -4392,6 +4525,19 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+ {
++ pl_inode_t *pl_inode = (pl_inode_t *)cookie;
++
++ if (op_ret >= 0) {
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ /* TODO: can pl_inode->links == 0 happen here? */
++ if (pl_inode->links >= 0) {
++ pl_inode->links++;
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++ }
++
+ PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+@@ -4401,9 +4547,18 @@ int
+ pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+ {
++ pl_inode_t *pl_inode;
++
++ pl_inode = pl_inode_get(this, oldloc->inode, NULL);
++ if (pl_inode == NULL) {
++ STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
++ NULL);
++ return 0;
++ }
++
+ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
+- STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
++ STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+ }
+
+--
+1.8.3.1
+
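Finally, the PL_INODE_REMOVE macro added to posix.c wires the pieces together for rename, unlink and rmdir: pl_inode_remove_prepare() decides whether the fop can be forwarded, must be parked as a stub, or has to fail immediately. A compact sketch of that three-way dispatch with invented names follows; the real macro builds call stubs and winds the fop to the child xlator.

#include <errno.h>
#include <stdio.h>

enum remove_disposition {
    REMOVE_DEFERRED,    /* other clients hold locks: park the fop and wait */
    REMOVE_FORWARDED,   /* no conflicting locks: send the fop downstream */
    REMOVE_FAILED       /* e.g. ESTALE: fail the fop immediately */
};

/* prepare_result mimics pl_inode_remove_prepare(): a negative value means
 * "defer", zero means "go ahead", and a positive value is an errno to
 * return to the client. */
static enum remove_disposition dispatch_remove(int prepare_result)
{
    if (prepare_result < 0) {
        /* create a stub for the fop and park it on the waiting list;
         * it is resumed later via pl_inode_remove_wake() */
        return REMOVE_DEFERRED;
    }
    if (prepare_result == 0) {
        /* wind the fop to the next xlator; its callback will invoke
         * pl_inode_remove_cbk() to update counters and wake waiters */
        return REMOVE_FORWARDED;
    }
    /* positive errno: unwind the fop with that error right away */
    return REMOVE_FAILED;
}

int main(void)
{
    printf("%d %d %d\n", dispatch_remove(-1), dispatch_remove(0),
           dispatch_remove(ESTALE));
    return 0;
}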