Diffstat (limited to '0455-locks-prevent-deletion-of-locked-entries.patch')
-rw-r--r--  0455-locks-prevent-deletion-of-locked-entries.patch  1253
1 file changed, 1253 insertions, 0 deletions
diff --git a/0455-locks-prevent-deletion-of-locked-entries.patch b/0455-locks-prevent-deletion-of-locked-entries.patch
new file mode 100644
index 0000000..5960690
--- /dev/null
+++ b/0455-locks-prevent-deletion-of-locked-entries.patch
@@ -0,0 +1,1253 @@
+From 3f6ff474db3934f43d9963dfe4dda7d201211e75 Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez <xhernandez@redhat.com>
+Date: Fri, 12 Jun 2020 00:06:36 +0200
+Subject: [PATCH 455/456] locks: prevent deletion of locked entries
+
+To keep consistency inside transactions started by locking an entry or
+an inode, this change delays the removal of entries that are currently
+locked by one or more clients. Once all locks are released, the removal
+is processed.
+
+The detection of stale inodes in the EC locking code has also been
+improved.
+
+>Upstream patch - https://review.gluster.org/#/c/glusterfs/+/20025/
+>Fixes: #990
+
+Change-Id: Ic8ba23d9480f80c7f74e7a310bf8a15922320fd5
+BUG: 1812789
+Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/206442
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+---
+ xlators/cluster/ec/src/ec-locks.c | 69 ++++++--
+ xlators/features/locks/src/common.c | 316 ++++++++++++++++++++++++++++++++++-
+ xlators/features/locks/src/common.h | 43 +++++
+ xlators/features/locks/src/entrylk.c | 19 +--
+ xlators/features/locks/src/inodelk.c | 150 ++++++++++-------
+ xlators/features/locks/src/locks.h | 23 ++-
+ xlators/features/locks/src/posix.c | 183 ++++++++++++++++++--
+ 7 files changed, 689 insertions(+), 114 deletions(-)
+
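As context for the diff that follows: the commit message says removals of locked entries are deferred until every client lock on them has been released. Below is a minimal, self-contained sketch of that general idea in plain C. It is not taken from the patch and is not GlusterFS code; every name in it is invented for illustration.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct entry {
    pthread_mutex_t mutex;
    int lock_count;       /* locks currently held by clients */
    bool remove_pending;  /* a removal arrived while the entry was locked */
} entry_t;

static void do_remove(entry_t *e)
{
    (void)e;
    printf("entry removed\n");
}

/* Called by unlink/rmdir: remove now, or remember to remove later. */
static void entry_remove(entry_t *e)
{
    bool now;

    pthread_mutex_lock(&e->mutex);
    now = (e->lock_count == 0);
    if (!now)
        e->remove_pending = true;   /* defer until the last unlock */
    pthread_mutex_unlock(&e->mutex);

    if (now)
        do_remove(e);
}

/* Called when a client releases its lock. */
static void entry_unlock(entry_t *e)
{
    bool removal;

    pthread_mutex_lock(&e->mutex);
    removal = (--e->lock_count == 0) && e->remove_pending;
    pthread_mutex_unlock(&e->mutex);

    if (removal)
        do_remove(e);               /* process the deferred removal */
}

int main(void)
{
    entry_t e = { PTHREAD_MUTEX_INITIALIZER, 1, false };

    entry_remove(&e);   /* locked: removal is deferred */
    entry_unlock(&e);   /* last lock released: removal happens now */
    return 0;
}

The real patch parks the whole fop as a call stub on a per-inode waiting list rather than using a flag, but the locking discipline is the same: decide under the mutex, act after dropping it.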
+diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c
+index ffcac07..db86296 100644
+--- a/xlators/cluster/ec/src/ec-locks.c
++++ b/xlators/cluster/ec/src/ec-locks.c
+@@ -28,9 +28,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ ec_t *ec = fop->xl->private;
+ ec_cbk_data_t *ans = NULL;
+ ec_cbk_data_t *cbk = NULL;
+- uintptr_t locked = 0, notlocked = 0;
++ uintptr_t locked = 0;
++ int32_t good = 0;
++ int32_t eagain = 0;
++ int32_t estale = 0;
+ int32_t error = -1;
+
++ /* There are some errors that we'll handle in a special way while trying
++ * to acquire a lock.
++ *
++ * EAGAIN: If it's found during a parallel non-blocking lock request, we
++ * consider that there's contention on the inode, so we consider
++ * the acquisition a failure and try again with a sequential
++ * blocking lock request. This will ensure that we get a lock on
++ * as many bricks as possible (ignoring EAGAIN here would cause
++ * unnecessary triggers of self-healing).
++ *
++ * If it's found during a sequential blocking lock request, it's
++ * considered an error. Lock will only succeed if there are
++ * enough other bricks locked.
++ *
++ * ESTALE: This can appear during a parallel or sequential lock request
++ * if the inode has just been unlinked. We consider this error
++ * unrecoverable, but we don't consider it fatal either. So,
++ * if it happens during a parallel lock, we won't attempt a
++ * sequential one unless there are EAGAIN errors on other
++ * bricks (and they are enough to form a quorum), but if we
++ * reach quorum counting the ESTALE bricks, we consider the
++ * whole result of the operation to be ESTALE instead of EIO.
++ */
++
+ list_for_each_entry(ans, &fop->cbk_list, list)
+ {
+ if (ans->op_ret >= 0) {
+@@ -38,24 +65,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ error = EIO;
+ }
+ locked |= ans->mask;
++ good = ans->count;
+ cbk = ans;
+- } else {
+- if (ans->op_errno == EAGAIN) {
+- switch (fop->uint32) {
+- case EC_LOCK_MODE_NONE:
+- case EC_LOCK_MODE_ALL:
+- /* Goal is to treat non-blocking lock as failure
+- * even if there is a single EAGAIN*/
+- notlocked |= ans->mask;
+- break;
+- }
+- }
++ } else if (ans->op_errno == ESTALE) {
++ estale += ans->count;
++ } else if ((ans->op_errno == EAGAIN) &&
++ (fop->uint32 != EC_LOCK_MODE_INC)) {
++ eagain += ans->count;
+ }
+ }
+
+ if (error == -1) {
+- if (gf_bits_count(locked | notlocked) >= ec->fragments) {
+- if (notlocked == 0) {
++ /* If we reach quorum counting the successful and EAGAIN answers, we
++ * ignore any ESTALE answers for now. If there are EAGAIN answers,
++ * we retry with a sequential blocking lock request if needed.
++ * Otherwise we succeed. */
++ if ((good + eagain) >= ec->fragments) {
++ if (eagain == 0) {
+ if (fop->answer == NULL) {
+ fop->answer = cbk;
+ }
+@@ -68,21 +94,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask)
+ case EC_LOCK_MODE_NONE:
+ error = EAGAIN;
+ break;
+-
+ case EC_LOCK_MODE_ALL:
+ fop->uint32 = EC_LOCK_MODE_INC;
+ break;
+-
+ default:
++ /* This shouldn't happen because eagain cannot be > 0
++ * when fop->uint32 is EC_LOCK_MODE_INC. */
+ error = EIO;
+ break;
+ }
+ }
+ } else {
+- if (fop->answer && fop->answer->op_ret < 0)
++ /* We have been unable to find enough candidates that will be able
++ * to take the lock. If we have quorum on some answer, we return
++ * it. Otherwise we check if ESTALE answers allow us to reach
++ * quorum. If so, we return ESTALE. */
++ if (fop->answer && fop->answer->op_ret < 0) {
+ error = fop->answer->op_errno;
+- else
++ } else if ((good + eagain + estale) >= ec->fragments) {
++ error = ESTALE;
++ } else {
+ error = EIO;
++ }
+ }
+ }
+
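The block comments added to ec_lock_check() above describe how the answers are bucketed into successful, EAGAIN and ESTALE counts and weighed against the number of fragments needed for quorum. The following is a simplified, standalone model of that decision, not the actual function: the name is invented, and the real code additionally distinguishes the lock modes and any previously recorded answer.

#include <errno.h>
#include <stdio.h>

/* Returns 0 when the lock is considered acquired, -EAGAIN when a
 * sequential blocking retry should be attempted, and -ESTALE or -EIO when
 * the acquisition failed. */
static int lock_quorum_decision(int good, int eagain, int estale,
                                int fragments)
{
    if (good + eagain >= fragments) {
        if (eagain == 0)
            return 0;        /* enough bricks locked, nothing else to do */
        return -EAGAIN;      /* contention: retry with blocking locks */
    }

    if (good + eagain + estale >= fragments)
        return -ESTALE;      /* quorum reached only by counting stale bricks */

    return -EIO;             /* not even a quorum of usable answers */
}

int main(void)
{
    /* Example 4+2 volume: 4 fragments needed for quorum. */
    printf("%d\n", lock_quorum_decision(4, 0, 0, 4));  /* 0       */
    printf("%d\n", lock_quorum_decision(3, 1, 0, 4));  /* -EAGAIN */
    printf("%d\n", lock_quorum_decision(2, 0, 2, 4));  /* -ESTALE */
    printf("%d\n", lock_quorum_decision(1, 0, 1, 4));  /* -EIO    */
    return 0;
}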
+diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c
+index 1406e70..0c52853 100644
+--- a/xlators/features/locks/src/common.c
++++ b/xlators/features/locks/src/common.c
+@@ -462,11 +462,16 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local)
+ INIT_LIST_HEAD(&pl_inode->blocked_calls);
+ INIT_LIST_HEAD(&pl_inode->metalk_list);
+ INIT_LIST_HEAD(&pl_inode->queued_locks);
++ INIT_LIST_HEAD(&pl_inode->waiting);
+ gf_uuid_copy(pl_inode->gfid, inode->gfid);
+
+ pl_inode->check_mlock_info = _gf_true;
+ pl_inode->mlock_enforced = _gf_false;
+
++ /* -2 means never looked up. -1 means something went wrong and link
++ * tracking is disabled. */
++ pl_inode->links = -2;
++
+ ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode));
+ if (ret) {
+ pthread_mutex_destroy(&pl_inode->mutex);
+@@ -1276,4 +1281,313 @@ pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd)
+ }
+
+ return 0;
+-}
+\ No newline at end of file
++}
++
++gf_boolean_t
++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client)
++{
++ if (client && (client->opversion < GD_OP_VERSION_7_0)) {
++ return _gf_true;
++ }
++
++ if (is_lk_owner_null(owner)) {
++ return _gf_false;
++ }
++ return _gf_true;
++}
++
++static int32_t
++pl_inode_from_loc(loc_t *loc, inode_t **pinode)
++{
++ inode_t *inode = NULL;
++ int32_t error = 0;
++
++ if (loc->inode != NULL) {
++ inode = inode_ref(loc->inode);
++ goto done;
++ }
++
++ if (loc->parent == NULL) {
++ error = EINVAL;
++ goto done;
++ }
++
++ if (!gf_uuid_is_null(loc->gfid)) {
++ inode = inode_find(loc->parent->table, loc->gfid);
++ if (inode != NULL) {
++ goto done;
++ }
++ }
++
++ if (loc->name == NULL) {
++ error = EINVAL;
++ goto done;
++ }
++
++ inode = inode_grep(loc->parent->table, loc->parent, loc->name);
++ if (inode == NULL) {
++ /* We haven't found any inode. This means that the file doesn't exist
++ * or that even if it exists, we don't have any knowledge about it, so
++ * we don't have locks on it either, which is fine for our purposes. */
++ goto done;
++ }
++
++done:
++ *pinode = inode;
++
++ return error;
++}
++
++static gf_boolean_t
++pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode,
++ struct timespec *now, struct list_head *contend)
++{
++ pl_dom_list_t *dom;
++ pl_inode_lock_t *lock;
++ gf_boolean_t has_owners = _gf_false;
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ list_for_each_entry(lock, &dom->inodelk_list, list)
++ {
++ /* If the lock belongs to the same client, we assume it's related
++ * to the same operation, so we allow the removal to continue. */
++ if (lock->client == client) {
++ continue;
++ }
++ /* If the lock belongs to an internal process, we don't block the
++ * removal. */
++ if (lock->client_pid < 0) {
++ continue;
++ }
++ if (contend == NULL) {
++ return _gf_true;
++ }
++ has_owners = _gf_true;
++ inodelk_contention_notify_check(xl, lock, now, contend);
++ }
++ }
++
++ return has_owners;
++}
++
++int32_t
++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
++ pl_inode_t **ppl_inode, struct list_head *contend)
++{
++ struct timespec now;
++ inode_t *inode;
++ pl_inode_t *pl_inode;
++ int32_t error;
++
++ pl_inode = NULL;
++
++ error = pl_inode_from_loc(loc, &inode);
++ if ((error != 0) || (inode == NULL)) {
++ goto done;
++ }
++
++ pl_inode = pl_inode_get(xl, inode, NULL);
++ if (pl_inode == NULL) {
++ inode_unref(inode);
++ error = ENOMEM;
++ goto done;
++ }
++
++ /* pl_inode_from_loc() already increments the inode's ref count, so
++ * here we only assign our reference. */
++ pl_inode->inode = inode;
++
++ timespec_now(&now);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ if (pl_inode->removed) {
++ error = ESTALE;
++ goto unlock;
++ }
++
++ if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) {
++ error = -1;
++ /* We skip the unlock here because, when we return -1, the caller must
++ * create a stub and call pl_inode_remove_complete(), which assumes
++ * the lock is still held and will release it once everything else
++ * is prepared. */
++ goto done;
++ }
++
++ pl_inode->is_locked = _gf_true;
++ pl_inode->remove_running++;
++
++unlock:
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++done:
++ *ppl_inode = pl_inode;
++
++ return error;
++}
++
++int32_t
++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
++ struct list_head *contend)
++{
++ pl_inode_lock_t *lock;
++ int32_t error = -1;
++
++ if (stub != NULL) {
++ list_add_tail(&stub->list, &pl_inode->waiting);
++ pl_inode->is_locked = _gf_true;
++ } else {
++ error = ENOMEM;
++
++ while (!list_empty(contend)) {
++ lock = list_first_entry(contend, pl_inode_lock_t, list);
++ list_del_init(&lock->list);
++ __pl_inodelk_unref(lock);
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ if (error < 0) {
++ inodelk_contention_notify(xl, contend);
++ }
++
++ inode_unref(pl_inode->inode);
++
++ return error;
++}
++
++void
++pl_inode_remove_wake(struct list_head *list)
++{
++ call_stub_t *stub;
++
++ while (!list_empty(list)) {
++ stub = list_first_entry(list, call_stub_t, list);
++ list_del_init(&stub->list);
++
++ call_resume(stub);
++ }
++}
++
++void
++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error)
++{
++ struct list_head contend, granted;
++ struct timespec now;
++ pl_dom_list_t *dom;
++
++ if (pl_inode == NULL) {
++ return;
++ }
++
++ INIT_LIST_HEAD(&contend);
++ INIT_LIST_HEAD(&granted);
++ timespec_now(&now);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ if (error == 0) {
++ if (pl_inode->links >= 0) {
++ pl_inode->links--;
++ }
++ if (pl_inode->links == 0) {
++ pl_inode->removed = _gf_true;
++ }
++ }
++
++ pl_inode->remove_running--;
++
++ if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) {
++ pl_inode->is_locked = _gf_false;
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now,
++ &contend);
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ unwind_granted_inodes(xl, pl_inode, &granted);
++
++ inodelk_contention_notify(xl, &contend);
++
++ inode_unref(pl_inode->inode);
++}
++
++void
++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
++ struct list_head *list)
++{
++ call_stub_t *stub, *tmp;
++
++ if (!pl_inode->is_locked) {
++ return;
++ }
++
++ list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list)
++ {
++ if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL,
++ NULL)) {
++ list_move_tail(&stub->list, list);
++ }
++ }
++}
++
++/* This function determines if an inodelk attempt can proceed now or needs
++ * to wait.
++ *
++ * Possible return values:
++ * < 0: An error occurred. Currently only -ESTALE can be returned if the
++ * inode has been deleted previously by unlink/rmdir/rename
++ * = 0: The lock can be attempted.
++ * > 0: The lock needs to wait because a conflicting remove operation is
++ * ongoing.
++ */
++int32_t
++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock)
++{
++ pl_dom_list_t *dom;
++ pl_inode_lock_t *ilock;
++
++ /* If the inode has been deleted, we won't allow any lock. */
++ if (pl_inode->removed) {
++ return -ESTALE;
++ }
++
++ /* We only synchronize with locks made for regular operations coming from
++ * the user. Locks done for internal purposes are hard to control and could
++ * lead to long delays or deadlocks quite easily. */
++ if (lock->client_pid < 0) {
++ return 0;
++ }
++ if (!pl_inode->is_locked) {
++ return 0;
++ }
++ if (pl_inode->remove_running > 0) {
++ return 1;
++ }
++
++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list)
++ {
++ list_for_each_entry(ilock, &dom->inodelk_list, list)
++ {
++ /* If a lock from the same client is already granted, we allow this
++ * one to continue. This is necessary to prevent deadlocks when
++ * multiple locks are taken for the same operation.
++ *
++ * On the other hand, it's unlikely that the same client sends
++ * completely unrelated locks for the same inode.
++ */
++ if (ilock->client == lock->client) {
++ return 0;
++ }
++ }
++ }
++
++ return 1;
++}
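pl_inode_remove_wake() and pl_inode_remove_unlocked() above follow a common pattern: candidate stubs are detached from the shared waiting list while the inode mutex is held, and they are only resumed after the mutex has been dropped. A small sketch of that pattern with invented types follows; the real code uses the GlusterFS list_head and call_stub_t machinery.

#include <pthread.h>
#include <stdio.h>

struct waiter {
    struct waiter *next;
    void (*resume)(struct waiter *w);
};

struct object {
    pthread_mutex_t mutex;
    struct waiter *waiting;   /* deferred operations queued on the object */
};

/* Detach every queued waiter while holding the mutex, then resume them
 * with the mutex dropped so the resumed code may take locks itself
 * without deadlocking. */
static void wake_waiters(struct object *o)
{
    struct waiter *list, *w;

    pthread_mutex_lock(&o->mutex);
    list = o->waiting;
    o->waiting = NULL;
    pthread_mutex_unlock(&o->mutex);

    while (list != NULL) {
        w = list;
        list = w->next;
        w->resume(w);         /* the equivalent of call_resume(stub) */
    }
}

static void say_resumed(struct waiter *w)
{
    (void)w;
    printf("resumed\n");
}

int main(void)
{
    struct waiter w = { NULL, say_resumed };
    struct object o = { PTHREAD_MUTEX_INITIALIZER, &w };

    wake_waiters(&o);
    return 0;
}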
+diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h
+index ea86b96..6c81ac3 100644
+--- a/xlators/features/locks/src/common.h
++++ b/xlators/features/locks/src/common.h
+@@ -105,6 +105,15 @@ void
+ __pl_inodelk_unref(pl_inode_lock_t *lock);
+
+ void
++__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted, pl_dom_list_t *dom,
++ struct timespec *now, struct list_head *contend);
++
++void
++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted);
++
++void
+ grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+ pl_dom_list_t *dom, struct timespec *now,
+ struct list_head *contend);
+@@ -204,6 +213,16 @@ pl_metalock_is_active(pl_inode_t *pl_inode);
+ void
+ __pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock);
+
++void
++inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock,
++ struct timespec *now,
++ struct list_head *contend);
++
++void
++entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock,
++ struct timespec *now,
++ struct list_head *contend);
++
+ gf_boolean_t
+ pl_does_monkey_want_stuck_lock();
+
+@@ -216,4 +235,28 @@ pl_clean_local(pl_local_t *local);
+ int
+ pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd);
+
++gf_boolean_t
++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client);
++
++int32_t
++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc,
++ pl_inode_t **ppl_inode, struct list_head *contend);
++
++int32_t
++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub,
++ struct list_head *contend);
++
++void
++pl_inode_remove_wake(struct list_head *list);
++
++void
++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error);
++
++void
++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode,
++ struct list_head *list);
++
++int32_t
++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock);
++
+ #endif /* __COMMON_H__ */
+diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c
+index 93c649c..b97836f 100644
+--- a/xlators/features/locks/src/entrylk.c
++++ b/xlators/features/locks/src/entrylk.c
+@@ -197,9 +197,9 @@ out:
+ return revoke_lock;
+ }
+
+-static gf_boolean_t
+-__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+- struct timespec *now)
++void
++entrylk_contention_notify_check(xlator_t *this, pl_entry_lock_t *lock,
++ struct timespec *now, struct list_head *contend)
+ {
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+@@ -209,7 +209,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+- return _gf_false;
++ return;
+ }
+
+ elapsed = now->tv_sec;
+@@ -218,7 +218,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+- return _gf_false;
++ return;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+@@ -231,7 +231,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock,
+
+ lock->contention_time = *now;
+
+- return _gf_true;
++ list_add_tail(&lock->contend, contend);
+ }
+
+ void
+@@ -325,9 +325,7 @@ __entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock,
+ break;
+ }
+ }
+- if (__entrylk_needs_contention_notify(this, tmp, now)) {
+- list_add_tail(&tmp->contend, contend);
+- }
++ entrylk_contention_notify_check(this, tmp, now, contend);
+ }
+ }
+
+@@ -690,10 +688,9 @@ __grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode,
+ bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend);
+
+ if (bl_ret == 0) {
+- list_add(&bl->blocked_locks, granted);
++ list_add_tail(&bl->blocked_locks, granted);
+ }
+ }
+- return;
+ }
+
+ /* Grants locks if possible which are blocked on a lock */
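entrylk_contention_notify_check() above (and its inodelk counterpart later in the patch) replace a boolean helper with a function that queues the lock on a contend list at most once per notification interval; the notifications themselves are sent after the mutex is released. Below is a rough sketch of just the throttling decision, with invented names and plain time_t instead of struct timespec; the real function appends to the contend list itself rather than returning a flag.

#include <stdbool.h>
#include <stddef.h>
#include <time.h>

struct lk {
    struct lk *next_contend;   /* non-NULL while queued for notification */
    time_t last_notified;      /* when the last notification was queued */
};

/* Returns true (and records the time) when a contention notification
 * should be queued for this lock now; false if one is already pending or
 * was queued less than 'delay' seconds ago. */
static bool contention_notify_check(struct lk *lock, time_t now,
                                    time_t delay)
{
    if (lock->next_contend != NULL)
        return false;              /* already queued, nothing to do */

    if (now - lock->last_notified < delay)
        return false;              /* throttled: too soon since the last one */

    lock->last_notified = now;
    return true;                   /* caller appends it to the contend list */
}

int main(void)
{
    struct lk l = { NULL, 0 };

    /* With a 5 second delay, only the first check at t=100 queues one. */
    return (contention_notify_check(&l, 100, 5) &&
            !contention_notify_check(&l, 102, 5)) ? 0 : 1;
}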
+diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c
+index 24dee49..1a07243 100644
+--- a/xlators/features/locks/src/inodelk.c
++++ b/xlators/features/locks/src/inodelk.c
+@@ -231,9 +231,9 @@ out:
+ return revoke_lock;
+ }
+
+-static gf_boolean_t
+-__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+- struct timespec *now)
++void
++inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock,
++ struct timespec *now, struct list_head *contend)
+ {
+ posix_locks_private_t *priv;
+ int64_t elapsed;
+@@ -243,7 +243,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ /* If this lock is in a list, it means that we are about to send a
+ * notification for it, so no need to do anything else. */
+ if (!list_empty(&lock->contend)) {
+- return _gf_false;
++ return;
+ }
+
+ elapsed = now->tv_sec;
+@@ -252,7 +252,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+ elapsed--;
+ }
+ if (elapsed < priv->notify_contention_delay) {
+- return _gf_false;
++ return;
+ }
+
+ /* All contention notifications will be sent outside of the locked
+@@ -265,7 +265,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock,
+
+ lock->contention_time = *now;
+
+- return _gf_true;
++ list_add_tail(&lock->contend, contend);
+ }
+
+ void
+@@ -353,9 +353,7 @@ __inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock,
+ break;
+ }
+ }
+- if (__inodelk_needs_contention_notify(this, l, now)) {
+- list_add_tail(&l->contend, contend);
+- }
++ inodelk_contention_notify_check(this, l, now, contend);
+ }
+ }
+
+@@ -435,12 +433,17 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+ struct list_head *contend)
+ {
+ pl_inode_lock_t *conf = NULL;
+- int ret = -EINVAL;
++ int ret;
+
+- conf = __inodelk_grantable(this, dom, lock, now, contend);
+- if (conf) {
+- ret = __lock_blocked_add(this, dom, lock, can_block);
+- goto out;
++ ret = pl_inode_remove_inodelk(pl_inode, lock);
++ if (ret < 0) {
++ return ret;
++ }
++ if (ret == 0) {
++ conf = __inodelk_grantable(this, dom, lock, now, contend);
++ }
++ if ((ret > 0) || (conf != NULL)) {
++ return __lock_blocked_add(this, dom, lock, can_block);
+ }
+
+ /* To prevent blocked locks starvation, check if there are any blocked
+@@ -462,17 +465,13 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock,
+ "starvation");
+ }
+
+- ret = __lock_blocked_add(this, dom, lock, can_block);
+- goto out;
++ return __lock_blocked_add(this, dom, lock, can_block);
+ }
+ __pl_inodelk_ref(lock);
+ gettimeofday(&lock->granted_time, NULL);
+ list_add(&lock->list, &dom->inodelk_list);
+
+- ret = 0;
+-
+-out:
+- return ret;
++ return 0;
+ }
+
+ /* Return true if the two inodelks have exactly same lock boundaries */
+@@ -529,12 +528,11 @@ out:
+ return conf;
+ }
+
+-static void
++void
+ __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ struct list_head *granted, pl_dom_list_t *dom,
+ struct timespec *now, struct list_head *contend)
+ {
+- int bl_ret = 0;
+ pl_inode_lock_t *bl = NULL;
+ pl_inode_lock_t *tmp = NULL;
+
+@@ -547,52 +545,48 @@ __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ {
+ list_del_init(&bl->blocked_locks);
+
+- bl_ret = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend);
++ bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend);
+
+- if (bl_ret == 0) {
+- list_add(&bl->blocked_locks, granted);
++ if (bl->status != -EAGAIN) {
++ list_add_tail(&bl->blocked_locks, granted);
+ }
+ }
+- return;
+ }
+
+-/* Grant all inodelks blocked on a lock */
+ void
+-grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+- pl_dom_list_t *dom, struct timespec *now,
+- struct list_head *contend)
++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode,
++ struct list_head *granted)
+ {
+- struct list_head granted;
+ pl_inode_lock_t *lock;
+ pl_inode_lock_t *tmp;
++ int32_t op_ret;
++ int32_t op_errno;
+
+- INIT_LIST_HEAD(&granted);
+-
+- pthread_mutex_lock(&pl_inode->mutex);
+- {
+- __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now,
+- contend);
+- }
+- pthread_mutex_unlock(&pl_inode->mutex);
+-
+- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks)
++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
+ {
+- gf_log(this->name, GF_LOG_TRACE,
+- "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted",
+- lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid,
+- lkowner_utoa(&lock->owner), lock->user_flock.l_start,
+- lock->user_flock.l_len);
+-
++ if (lock->status == 0) {
++ op_ret = 0;
++ op_errno = 0;
++ gf_log(this->name, GF_LOG_TRACE,
++ "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64
++ " => Granted",
++ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
++ lock->client_pid, lkowner_utoa(&lock->owner),
++ lock->user_flock.l_start, lock->user_flock.l_len);
++ } else {
++ op_ret = -1;
++ op_errno = -lock->status;
++ }
+ pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock,
+- 0, 0, lock->volume);
++ op_ret, op_errno, lock->volume);
+
+- STACK_UNWIND_STRICT(inodelk, lock->frame, 0, 0, NULL);
++ STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL);
+ lock->frame = NULL;
+ }
+
+ pthread_mutex_lock(&pl_inode->mutex);
+ {
+- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks)
++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks)
+ {
+ list_del_init(&lock->blocked_locks);
+ __pl_inodelk_unref(lock);
+@@ -601,6 +595,26 @@ grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
+ pthread_mutex_unlock(&pl_inode->mutex);
+ }
+
++/* Grant all inodelks blocked on a lock */
++void
++grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode,
++ pl_dom_list_t *dom, struct timespec *now,
++ struct list_head *contend)
++{
++ struct list_head granted;
++
++ INIT_LIST_HEAD(&granted);
++
++ pthread_mutex_lock(&pl_inode->mutex);
++ {
++ __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now,
++ contend);
++ }
++ pthread_mutex_unlock(&pl_inode->mutex);
++
++ unwind_granted_inodes(this, pl_inode, &granted);
++}
++
+ static void
+ pl_inodelk_log_cleanup(pl_inode_lock_t *lock)
+ {
+@@ -662,7 +676,7 @@ pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx)
+ * and blocked lists, then this means that a parallel
+ * unlock on another inodelk (L2 say) may have 'granted'
+ * L1 and added it to 'granted' list in
+- * __grant_blocked_node_locks() (although using the
++ * __grant_blocked_inode_locks() (although using the
+ * 'blocked_locks' member). In that case, the cleanup
+ * codepath must try and grant other overlapping
+ * blocked inodelks from other clients, now that L1 is
+@@ -747,6 +761,7 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ gf_boolean_t need_inode_unref = _gf_false;
+ struct list_head *pcontend = NULL;
+ struct list_head contend;
++ struct list_head wake;
+ struct timespec now = {};
+ short fl_type;
+
+@@ -798,6 +813,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ timespec_now(&now);
+ }
+
++ INIT_LIST_HEAD(&wake);
++
+ if (ctx)
+ pthread_mutex_lock(&ctx->lock);
+ pthread_mutex_lock(&pl_inode->mutex);
+@@ -820,18 +837,17 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ lock->fl_type == F_UNLCK ? "Unlock" : "Lock",
+ lock->client_pid, lkowner_utoa(&lock->owner),
+ lock->user_flock.l_start, lock->user_flock.l_len);
+- if (can_block)
++ if (can_block) {
+ unref = _gf_false;
+- /* For all but the case where a non-blocking
+- * lock attempt fails, the extra ref taken at
+- * the start of this function must be negated.
+- */
+- else
+- need_inode_unref = _gf_true;
++ }
+ }
+-
+- if (ctx && (!ret || can_block))
++ /* For all but the case where a non-blocking lock attempt fails
++ * with -EAGAIN, the extra ref taken at the start of this function
++ * must be negated. */
++ need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block);
++ if (ctx && !need_inode_unref) {
+ list_add_tail(&lock->client_list, &ctx->inodelk_lockers);
++ }
+ } else {
+ /* Irrespective of whether unlock succeeds or not,
+ * the extra inode ref that was done at the start of
+@@ -849,6 +865,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ list_del_init(&retlock->client_list);
+ __pl_inodelk_unref(retlock);
+
++ pl_inode_remove_unlocked(this, pl_inode, &wake);
++
+ ret = 0;
+ }
+ out:
+@@ -859,6 +877,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode,
+ if (ctx)
+ pthread_mutex_unlock(&ctx->lock);
+
++ pl_inode_remove_wake(&wake);
++
+ /* The following (extra) unref corresponds to the ref that
+ * was done at the time the lock was granted.
+ */
+@@ -1033,10 +1053,14 @@ pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume,
+ inode);
+
+ if (ret < 0) {
+- if ((can_block) && (F_UNLCK != lock_type)) {
+- goto out;
++ if (ret == -EAGAIN) {
++ if (can_block && (F_UNLCK != lock_type)) {
++ goto out;
++ }
++ gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
++ } else {
++ gf_log(this->name, GF_LOG_TRACE, "returning %d", ret);
+ }
+- gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN");
+ op_errno = -ret;
+ goto unwind;
+ }
+diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h
+index aa267de..6666feb 100644
+--- a/xlators/features/locks/src/locks.h
++++ b/xlators/features/locks/src/locks.h
+@@ -102,6 +102,9 @@ struct __pl_inode_lock {
+
+ struct list_head client_list; /* list of all locks from a client */
+ short fl_type;
++
++ int32_t status; /* Error code when we try to grant a lock in blocked
++ state */
+ };
+ typedef struct __pl_inode_lock pl_inode_lock_t;
+
+@@ -164,13 +167,14 @@ struct __pl_inode {
+ struct list_head rw_list; /* list of waiting r/w requests */
+ struct list_head reservelk_list; /* list of reservelks */
+ struct list_head blocked_reservelks; /* list of blocked reservelks */
+- struct list_head
+- blocked_calls; /* List of blocked lock calls while a reserve is held*/
+- struct list_head metalk_list; /* Meta lock list */
+- /* This is to store the incoming lock
+- requests while meta lock is enabled */
+- struct list_head queued_locks;
+- int mandatory; /* if mandatory locking is enabled */
++ struct list_head blocked_calls; /* List of blocked lock calls while a
++ reserve is held*/
++ struct list_head metalk_list; /* Meta lock list */
++ struct list_head queued_locks; /* This is to store the incoming lock
++ requests while meta lock is enabled */
++ struct list_head waiting; /* List of pending fops waiting to unlink/rmdir
++ the inode. */
++ int mandatory; /* if mandatory locking is enabled */
+
+ inode_t *refkeeper; /* hold refs on an inode while locks are
+ held to prevent pruning */
+@@ -197,6 +201,11 @@ struct __pl_inode {
+ */
+ int fop_wind_count;
+ pthread_cond_t check_fop_wind_count;
++
++ int32_t links; /* Number of hard links the inode has. */
++ uint32_t remove_running; /* Number of remove operations running. */
++ gf_boolean_t is_locked; /* Regular locks will be blocked. */
++ gf_boolean_t removed; /* The inode has been deleted. */
+ };
+ typedef struct __pl_inode pl_inode_t;
+
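The new fields added to struct __pl_inode above drive the deferred-removal logic: links tracks hard links (-2 until the first lookup, -1 when tracking had to be disabled), remove_running counts in-flight removals, and is_locked/removed gate new inodelks. The following is a rough sketch of the link-count bookkeeping only; the exact corner cases (for example the directory adjustment when the count could not be read) differ in the real code, and all names here are invented.

#include <stdbool.h>

#define LINKS_UNKNOWN  (-2)    /* never looked up yet */
#define LINKS_DISABLED (-1)    /* tracking disabled after an error */

struct tracked_inode {
    int links;
    bool is_dir;
    bool removed;
};

/* Seed the counter from the first successful lookup only. Directories
 * report at least 2 links, so one is subtracted to make them behave like
 * regular files for the "last link removed" test. */
static void track_lookup(struct tracked_inode *in, int link_count)
{
    if (in->links != LINKS_UNKNOWN)
        return;                     /* only seed once to avoid races */

    in->links = link_count;         /* may be LINKS_DISABLED on error */
    if (in->is_dir && in->links >= 0)
        in->links--;
}

/* On a successful unlink/rmdir/rename, drop one link; the inode is only
 * treated as gone when the tracked count reaches zero. */
static void track_remove_ok(struct tracked_inode *in)
{
    if (in->links >= 0)
        in->links--;
    if (in->links == 0)
        in->removed = true;
}

int main(void)
{
    struct tracked_inode in = { LINKS_UNKNOWN, false, false };

    track_lookup(&in, 1);     /* regular file with a single link */
    track_remove_ok(&in);     /* links drops to 0: inode is removed */
    return in.removed ? 0 : 1;
}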
+diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c
+index 7887b82..5ae0125 100644
+--- a/xlators/features/locks/src/posix.c
++++ b/xlators/features/locks/src/posix.c
+@@ -147,6 +147,29 @@ fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **);
+ } \
+ } while (0)
+
++#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk, \
++ _args...) \
++ ({ \
++ struct list_head contend; \
++ pl_inode_t *__pl_inode; \
++ call_stub_t *__stub; \
++ int32_t __error; \
++ INIT_LIST_HEAD(&contend); \
++ __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? _loc2 : _loc1, \
++ &__pl_inode, &contend); \
++ if (__error < 0) { \
++ __stub = fop_##_fop##_stub(_frame, _cont, ##_args); \
++ __error = pl_inode_remove_complete(_xl, __pl_inode, __stub, \
++ &contend); \
++ } else if (__error == 0) { \
++ PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1, \
++ _loc2); \
++ STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl), \
++ FIRST_CHILD(_xl)->fops->_fop, ##_args); \
++ } \
++ __error; \
++ })
++
+ gf_boolean_t
+ pl_has_xdata_requests(dict_t *xdata)
+ {
+@@ -2969,11 +2992,85 @@ out:
+ return ret;
+ }
+
++static int32_t
++pl_request_link_count(dict_t **pxdata)
++{
++ dict_t *xdata;
++
++ xdata = *pxdata;
++ if (xdata == NULL) {
++ xdata = dict_new();
++ if (xdata == NULL) {
++ return ENOMEM;
++ }
++ } else {
++ dict_ref(xdata);
++ }
++
++ if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) {
++ dict_unref(xdata);
++ return ENOMEM;
++ }
++
++ *pxdata = xdata;
++
++ return 0;
++}
++
++static int32_t
++pl_check_link_count(dict_t *xdata)
++{
++ int32_t count;
++
++ /* In case we are unable to read the link count from xdata, we take a
++ * conservative approach and return -2, which will prevent the inode from
++ * being considered deleted. In fact it will cause link tracking for this
++ * inode to be disabled completely to avoid races. */
++
++ if (xdata == NULL) {
++ return -2;
++ }
++
++ if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) {
++ return -2;
++ }
++
++ return count;
++}
++
+ int32_t
+ pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata,
+ struct iatt *postparent)
+ {
++ pl_inode_t *pl_inode;
++
++ if (op_ret >= 0) {
++ pl_inode = pl_inode_get(this, inode, NULL);
++ if (pl_inode == NULL) {
++ PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL,
++ NULL);
++ return 0;
++ }
++
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ /* We only update the link count if we previously didn't know it.
++ * Always updating it can lead to races, since lookup is not executed
++ * atomically most of the time. */
++ if (pl_inode->links == -2) {
++ pl_inode->links = pl_check_link_count(xdata);
++ if (buf->ia_type == IA_IFDIR) {
++ /* Directories have at least 2 links. To avoid special handling
++ * for directories, we simply decrement the value here to make
++ * them equivalent to regular files. */
++ pl_inode->links--;
++ }
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++ }
++
+ PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata,
+ postparent);
+ return 0;
+@@ -2982,9 +3079,17 @@ pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t
+ pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->lookup, loc, xdata);
++ int32_t error;
++
++ error = pl_request_link_count(&xdata);
++ if (error == 0) {
++ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
++ STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->lookup, loc, xdata);
++ dict_unref(xdata);
++ } else {
++ STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL);
++ }
+ return 0;
+ }
+
+@@ -3792,6 +3897,10 @@ unlock:
+ gf_proc_dump_write("posixlk-count", "%d", count);
+ __dump_posixlks(pl_inode);
+ }
++
++ gf_proc_dump_write("links", "%d", pl_inode->links);
++ gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running);
++ gf_proc_dump_write("removed", "%u", pl_inode->removed);
+ }
+ pthread_mutex_unlock(&pl_inode->mutex);
+
+@@ -4137,8 +4246,11 @@ pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ struct iatt *postoldparent, struct iatt *prenewparent,
+ struct iatt *postnewparent, dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent,
+ postoldparent, prenewparent, postnewparent, xdata);
++
+ return 0;
+ }
+
+@@ -4146,10 +4258,15 @@ int32_t
+ pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename,
++ pl_rename_cbk, oldloc, newloc, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL,
++ NULL, NULL);
++ }
+
+- STACK_WIND(frame, pl_rename_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata);
+ return 0;
+ }
+
+@@ -4273,8 +4390,11 @@ pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
++
+ return 0;
+ }
+
+@@ -4282,9 +4402,14 @@ int32_t
+ pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_unlink_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink,
++ pl_unlink_cbk, loc, xflag, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL);
++ }
++
+ return 0;
+ }
+
+@@ -4351,8 +4476,11 @@ pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, struct iatt *preparent, struct iatt *postparent,
+ dict_t *xdata)
+ {
++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0);
++
+ PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent,
+ postparent, xdata);
++
+ return 0;
+ }
+
+@@ -4360,9 +4488,14 @@ int
+ pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags,
+ dict_t *xdata)
+ {
+- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL);
+- STACK_WIND(frame, pl_rmdir_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata);
++ int32_t error;
++
++ error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir,
++ pl_rmdir_cbk, loc, xflags, xdata);
++ if (error > 0) {
++ STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL);
++ }
++
+ return 0;
+ }
+
+@@ -4392,6 +4525,19 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret,
+ int32_t op_errno, inode_t *inode, struct iatt *buf,
+ struct iatt *preparent, struct iatt *postparent, dict_t *xdata)
+ {
++ pl_inode_t *pl_inode = (pl_inode_t *)cookie;
++
++ if (op_ret >= 0) {
++ pthread_mutex_lock(&pl_inode->mutex);
++
++ /* TODO: can pl_inode->links == 0 happen here? */
++ if (pl_inode->links >= 0) {
++ pl_inode->links++;
++ }
++
++ pthread_mutex_unlock(&pl_inode->mutex);
++ }
++
+ PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf,
+ preparent, postparent, xdata);
+ return 0;
+@@ -4401,9 +4547,18 @@ int
+ pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc,
+ dict_t *xdata)
+ {
++ pl_inode_t *pl_inode;
++
++ pl_inode = pl_inode_get(this, oldloc->inode, NULL);
++ if (pl_inode == NULL) {
++ STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL,
++ NULL);
++ return 0;
++ }
++
+ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc);
+- STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this),
+- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
++ STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this),
++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata);
+ return 0;
+ }
+
+--
+1.8.3.1
+
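Finally, the PL_INODE_REMOVE macro added to posix.c wires the pieces together for rename, unlink and rmdir: pl_inode_remove_prepare() decides whether the fop can be forwarded, must be parked as a stub, or has to fail immediately. A compact sketch of that three-way dispatch with invented names follows; the real macro builds call stubs and winds the fop to the child xlator.

#include <errno.h>
#include <stdio.h>

enum remove_disposition {
    REMOVE_DEFERRED,    /* other clients hold locks: park the fop and wait */
    REMOVE_FORWARDED,   /* no conflicting locks: send the fop downstream */
    REMOVE_FAILED       /* e.g. ESTALE: fail the fop immediately */
};

/* prepare_result mimics pl_inode_remove_prepare(): a negative value means
 * "defer", zero means "go ahead", and a positive value is an errno to
 * return to the client. */
static enum remove_disposition dispatch_remove(int prepare_result)
{
    if (prepare_result < 0) {
        /* create a stub for the fop and park it on the waiting list;
         * it is resumed later via pl_inode_remove_wake() */
        return REMOVE_DEFERRED;
    }
    if (prepare_result == 0) {
        /* wind the fop to the next xlator; its callback will invoke
         * pl_inode_remove_cbk() to update counters and wake waiters */
        return REMOVE_FORWARDED;
    }
    /* positive errno: unwind the fop with that error right away */
    return REMOVE_FAILED;
}

int main(void)
{
    printf("%d %d %d\n", dispatch_remove(-1), dispatch_remove(0),
           dispatch_remove(ESTALE));
    return 0;
}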