diff options
Diffstat (limited to '0455-locks-prevent-deletion-of-locked-entries.patch')
-rw-r--r-- | 0455-locks-prevent-deletion-of-locked-entries.patch | 1253 |
1 files changed, 1253 insertions, 0 deletions
diff --git a/0455-locks-prevent-deletion-of-locked-entries.patch b/0455-locks-prevent-deletion-of-locked-entries.patch new file mode 100644 index 0000000..5960690 --- /dev/null +++ b/0455-locks-prevent-deletion-of-locked-entries.patch @@ -0,0 +1,1253 @@ +From 3f6ff474db3934f43d9963dfe4dda7d201211e75 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez <xhernandez@redhat.com> +Date: Fri, 12 Jun 2020 00:06:36 +0200 +Subject: [PATCH 455/456] locks: prevent deletion of locked entries + +To keep consistency inside transactions started by locking an entry or +an inode, this change delays the removal of entries that are currently +locked by one or more clients. Once all locks are released, the removal +is processed. + +It has also been improved the detection of stale inodes in the locking +code of EC. + +>Upstream patch - https://review.gluster.org/#/c/glusterfs/+/20025/ +>Fixes: #990 + +Change-Id: Ic8ba23d9480f80c7f74e7a310bf8a15922320fd5 +BUG: 1812789 +Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/206442 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +--- + xlators/cluster/ec/src/ec-locks.c | 69 ++++++-- + xlators/features/locks/src/common.c | 316 ++++++++++++++++++++++++++++++++++- + xlators/features/locks/src/common.h | 43 +++++ + xlators/features/locks/src/entrylk.c | 19 +-- + xlators/features/locks/src/inodelk.c | 150 ++++++++++------- + xlators/features/locks/src/locks.h | 23 ++- + xlators/features/locks/src/posix.c | 183 ++++++++++++++++++-- + 7 files changed, 689 insertions(+), 114 deletions(-) + +diff --git a/xlators/cluster/ec/src/ec-locks.c b/xlators/cluster/ec/src/ec-locks.c +index ffcac07..db86296 100644 +--- a/xlators/cluster/ec/src/ec-locks.c ++++ b/xlators/cluster/ec/src/ec-locks.c +@@ -28,9 +28,36 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) + ec_t *ec = fop->xl->private; + ec_cbk_data_t *ans = NULL; + ec_cbk_data_t *cbk = NULL; +- uintptr_t locked = 0, notlocked = 0; ++ uintptr_t locked = 0; ++ int32_t good = 0; ++ int32_t eagain = 0; ++ int32_t estale = 0; + int32_t error = -1; + ++ /* There are some errors that we'll handle in an special way while trying ++ * to acquire a lock. ++ * ++ * EAGAIN: If it's found during a parallel non-blocking lock request, we ++ * consider that there's contention on the inode, so we consider ++ * the acquisition a failure and try again with a sequential ++ * blocking lock request. This will ensure that we get a lock on ++ * as many bricks as possible (ignoring EAGAIN here would cause ++ * unnecessary triggers of self-healing). ++ * ++ * If it's found during a sequential blocking lock request, it's ++ * considered an error. Lock will only succeed if there are ++ * enough other bricks locked. ++ * ++ * ESTALE: This can appear during parallel or sequential lock request if ++ * the inode has just been unlinked. We consider this error is ++ * not recoverable, but we also don't consider it as fatal. So, ++ * if it happens during parallel lock, we won't attempt a ++ * sequential one unless there are EAGAIN errors on other ++ * bricks (and are enough to form a quorum), but if we reach ++ * quorum counting the ESTALE bricks, we consider the whole ++ * result of the operation is ESTALE instead of EIO. ++ */ ++ + list_for_each_entry(ans, &fop->cbk_list, list) + { + if (ans->op_ret >= 0) { +@@ -38,24 +65,23 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) + error = EIO; + } + locked |= ans->mask; ++ good = ans->count; + cbk = ans; +- } else { +- if (ans->op_errno == EAGAIN) { +- switch (fop->uint32) { +- case EC_LOCK_MODE_NONE: +- case EC_LOCK_MODE_ALL: +- /* Goal is to treat non-blocking lock as failure +- * even if there is a single EAGAIN*/ +- notlocked |= ans->mask; +- break; +- } +- } ++ } else if (ans->op_errno == ESTALE) { ++ estale += ans->count; ++ } else if ((ans->op_errno == EAGAIN) && ++ (fop->uint32 != EC_LOCK_MODE_INC)) { ++ eagain += ans->count; + } + } + + if (error == -1) { +- if (gf_bits_count(locked | notlocked) >= ec->fragments) { +- if (notlocked == 0) { ++ /* If we have enough quorum with succeeded and EAGAIN answers, we ++ * ignore for now any ESTALE answer. If there are EAGAIN answers, ++ * we retry with a sequential blocking lock request if needed. ++ * Otherwise we succeed. */ ++ if ((good + eagain) >= ec->fragments) { ++ if (eagain == 0) { + if (fop->answer == NULL) { + fop->answer = cbk; + } +@@ -68,21 +94,28 @@ ec_lock_check(ec_fop_data_t *fop, uintptr_t *mask) + case EC_LOCK_MODE_NONE: + error = EAGAIN; + break; +- + case EC_LOCK_MODE_ALL: + fop->uint32 = EC_LOCK_MODE_INC; + break; +- + default: ++ /* This shouldn't happen because eagain cannot be > 0 ++ * when fop->uint32 is EC_LOCK_MODE_INC. */ + error = EIO; + break; + } + } + } else { +- if (fop->answer && fop->answer->op_ret < 0) ++ /* We have been unable to find enough candidates that will be able ++ * to take the lock. If we have quorum on some answer, we return ++ * it. Otherwise we check if ESTALE answers allow us to reach ++ * quorum. If so, we return ESTALE. */ ++ if (fop->answer && fop->answer->op_ret < 0) { + error = fop->answer->op_errno; +- else ++ } else if ((good + eagain + estale) >= ec->fragments) { ++ error = ESTALE; ++ } else { + error = EIO; ++ } + } + } + +diff --git a/xlators/features/locks/src/common.c b/xlators/features/locks/src/common.c +index 1406e70..0c52853 100644 +--- a/xlators/features/locks/src/common.c ++++ b/xlators/features/locks/src/common.c +@@ -462,11 +462,16 @@ pl_inode_get(xlator_t *this, inode_t *inode, pl_local_t *local) + INIT_LIST_HEAD(&pl_inode->blocked_calls); + INIT_LIST_HEAD(&pl_inode->metalk_list); + INIT_LIST_HEAD(&pl_inode->queued_locks); ++ INIT_LIST_HEAD(&pl_inode->waiting); + gf_uuid_copy(pl_inode->gfid, inode->gfid); + + pl_inode->check_mlock_info = _gf_true; + pl_inode->mlock_enforced = _gf_false; + ++ /* -2 means never looked up. -1 means something went wrong and link ++ * tracking is disabled. */ ++ pl_inode->links = -2; ++ + ret = __inode_ctx_put(inode, this, (uint64_t)(long)(pl_inode)); + if (ret) { + pthread_mutex_destroy(&pl_inode->mutex); +@@ -1276,4 +1281,313 @@ pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd) + } + + return 0; +-} +\ No newline at end of file ++} ++ ++gf_boolean_t ++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client) ++{ ++ if (client && (client->opversion < GD_OP_VERSION_7_0)) { ++ return _gf_true; ++ } ++ ++ if (is_lk_owner_null(owner)) { ++ return _gf_false; ++ } ++ return _gf_true; ++} ++ ++static int32_t ++pl_inode_from_loc(loc_t *loc, inode_t **pinode) ++{ ++ inode_t *inode = NULL; ++ int32_t error = 0; ++ ++ if (loc->inode != NULL) { ++ inode = inode_ref(loc->inode); ++ goto done; ++ } ++ ++ if (loc->parent == NULL) { ++ error = EINVAL; ++ goto done; ++ } ++ ++ if (!gf_uuid_is_null(loc->gfid)) { ++ inode = inode_find(loc->parent->table, loc->gfid); ++ if (inode != NULL) { ++ goto done; ++ } ++ } ++ ++ if (loc->name == NULL) { ++ error = EINVAL; ++ goto done; ++ } ++ ++ inode = inode_grep(loc->parent->table, loc->parent, loc->name); ++ if (inode == NULL) { ++ /* We haven't found any inode. This means that the file doesn't exist ++ * or that even if it exists, we don't have any knowledge about it, so ++ * we don't have locks on it either, which is fine for our purposes. */ ++ goto done; ++ } ++ ++done: ++ *pinode = inode; ++ ++ return error; ++} ++ ++static gf_boolean_t ++pl_inode_has_owners(xlator_t *xl, client_t *client, pl_inode_t *pl_inode, ++ struct timespec *now, struct list_head *contend) ++{ ++ pl_dom_list_t *dom; ++ pl_inode_lock_t *lock; ++ gf_boolean_t has_owners = _gf_false; ++ ++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list) ++ { ++ list_for_each_entry(lock, &dom->inodelk_list, list) ++ { ++ /* If the lock belongs to the same client, we assume it's related ++ * to the same operation, so we allow the removal to continue. */ ++ if (lock->client == client) { ++ continue; ++ } ++ /* If the lock belongs to an internal process, we don't block the ++ * removal. */ ++ if (lock->client_pid < 0) { ++ continue; ++ } ++ if (contend == NULL) { ++ return _gf_true; ++ } ++ has_owners = _gf_true; ++ inodelk_contention_notify_check(xl, lock, now, contend); ++ } ++ } ++ ++ return has_owners; ++} ++ ++int32_t ++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, ++ pl_inode_t **ppl_inode, struct list_head *contend) ++{ ++ struct timespec now; ++ inode_t *inode; ++ pl_inode_t *pl_inode; ++ int32_t error; ++ ++ pl_inode = NULL; ++ ++ error = pl_inode_from_loc(loc, &inode); ++ if ((error != 0) || (inode == NULL)) { ++ goto done; ++ } ++ ++ pl_inode = pl_inode_get(xl, inode, NULL); ++ if (pl_inode == NULL) { ++ inode_unref(inode); ++ error = ENOMEM; ++ goto done; ++ } ++ ++ /* pl_inode_from_loc() already increments ref count for inode, so ++ * we only assign here our reference. */ ++ pl_inode->inode = inode; ++ ++ timespec_now(&now); ++ ++ pthread_mutex_lock(&pl_inode->mutex); ++ ++ if (pl_inode->removed) { ++ error = ESTALE; ++ goto unlock; ++ } ++ ++ if (pl_inode_has_owners(xl, frame->root->client, pl_inode, &now, contend)) { ++ error = -1; ++ /* We skip the unlock here because the caller must create a stub when ++ * we return -1 and do a call to pl_inode_remove_complete(), which ++ * assumes the lock is still acquired and will release it once ++ * everything else is prepared. */ ++ goto done; ++ } ++ ++ pl_inode->is_locked = _gf_true; ++ pl_inode->remove_running++; ++ ++unlock: ++ pthread_mutex_unlock(&pl_inode->mutex); ++ ++done: ++ *ppl_inode = pl_inode; ++ ++ return error; ++} ++ ++int32_t ++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, ++ struct list_head *contend) ++{ ++ pl_inode_lock_t *lock; ++ int32_t error = -1; ++ ++ if (stub != NULL) { ++ list_add_tail(&stub->list, &pl_inode->waiting); ++ pl_inode->is_locked = _gf_true; ++ } else { ++ error = ENOMEM; ++ ++ while (!list_empty(contend)) { ++ lock = list_first_entry(contend, pl_inode_lock_t, list); ++ list_del_init(&lock->list); ++ __pl_inodelk_unref(lock); ++ } ++ } ++ ++ pthread_mutex_unlock(&pl_inode->mutex); ++ ++ if (error < 0) { ++ inodelk_contention_notify(xl, contend); ++ } ++ ++ inode_unref(pl_inode->inode); ++ ++ return error; ++} ++ ++void ++pl_inode_remove_wake(struct list_head *list) ++{ ++ call_stub_t *stub; ++ ++ while (!list_empty(list)) { ++ stub = list_first_entry(list, call_stub_t, list); ++ list_del_init(&stub->list); ++ ++ call_resume(stub); ++ } ++} ++ ++void ++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error) ++{ ++ struct list_head contend, granted; ++ struct timespec now; ++ pl_dom_list_t *dom; ++ ++ if (pl_inode == NULL) { ++ return; ++ } ++ ++ INIT_LIST_HEAD(&contend); ++ INIT_LIST_HEAD(&granted); ++ timespec_now(&now); ++ ++ pthread_mutex_lock(&pl_inode->mutex); ++ ++ if (error == 0) { ++ if (pl_inode->links >= 0) { ++ pl_inode->links--; ++ } ++ if (pl_inode->links == 0) { ++ pl_inode->removed = _gf_true; ++ } ++ } ++ ++ pl_inode->remove_running--; ++ ++ if ((pl_inode->remove_running == 0) && list_empty(&pl_inode->waiting)) { ++ pl_inode->is_locked = _gf_false; ++ ++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list) ++ { ++ __grant_blocked_inode_locks(xl, pl_inode, &granted, dom, &now, ++ &contend); ++ } ++ } ++ ++ pthread_mutex_unlock(&pl_inode->mutex); ++ ++ unwind_granted_inodes(xl, pl_inode, &granted); ++ ++ inodelk_contention_notify(xl, &contend); ++ ++ inode_unref(pl_inode->inode); ++} ++ ++void ++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, ++ struct list_head *list) ++{ ++ call_stub_t *stub, *tmp; ++ ++ if (!pl_inode->is_locked) { ++ return; ++ } ++ ++ list_for_each_entry_safe(stub, tmp, &pl_inode->waiting, list) ++ { ++ if (!pl_inode_has_owners(xl, stub->frame->root->client, pl_inode, NULL, ++ NULL)) { ++ list_move_tail(&stub->list, list); ++ } ++ } ++} ++ ++/* This function determines if an inodelk attempt can be done now or it needs ++ * to wait. ++ * ++ * Possible return values: ++ * < 0: An error occurred. Currently only -ESTALE can be returned if the ++ * inode has been deleted previously by unlink/rmdir/rename ++ * = 0: The lock can be attempted. ++ * > 0: The lock needs to wait because a conflicting remove operation is ++ * ongoing. ++ */ ++int32_t ++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock) ++{ ++ pl_dom_list_t *dom; ++ pl_inode_lock_t *ilock; ++ ++ /* If the inode has been deleted, we won't allow any lock. */ ++ if (pl_inode->removed) { ++ return -ESTALE; ++ } ++ ++ /* We only synchronize with locks made for regular operations coming from ++ * the user. Locks done for internal purposes are hard to control and could ++ * lead to long delays or deadlocks quite easily. */ ++ if (lock->client_pid < 0) { ++ return 0; ++ } ++ if (!pl_inode->is_locked) { ++ return 0; ++ } ++ if (pl_inode->remove_running > 0) { ++ return 1; ++ } ++ ++ list_for_each_entry(dom, &pl_inode->dom_list, inode_list) ++ { ++ list_for_each_entry(ilock, &dom->inodelk_list, list) ++ { ++ /* If a lock from the same client is already granted, we allow this ++ * one to continue. This is necessary to prevent deadlocks when ++ * multiple locks are taken for the same operation. ++ * ++ * On the other side it's unlikely that the same client sends ++ * completely unrelated locks for the same inode. ++ */ ++ if (ilock->client == lock->client) { ++ return 0; ++ } ++ } ++ } ++ ++ return 1; ++} +diff --git a/xlators/features/locks/src/common.h b/xlators/features/locks/src/common.h +index ea86b96..6c81ac3 100644 +--- a/xlators/features/locks/src/common.h ++++ b/xlators/features/locks/src/common.h +@@ -105,6 +105,15 @@ void + __pl_inodelk_unref(pl_inode_lock_t *lock); + + void ++__grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, ++ struct list_head *granted, pl_dom_list_t *dom, ++ struct timespec *now, struct list_head *contend); ++ ++void ++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, ++ struct list_head *granted); ++ ++void + grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + pl_dom_list_t *dom, struct timespec *now, + struct list_head *contend); +@@ -204,6 +213,16 @@ pl_metalock_is_active(pl_inode_t *pl_inode); + void + __pl_queue_lock(pl_inode_t *pl_inode, posix_lock_t *reqlock); + ++void ++inodelk_contention_notify_check(xlator_t *xl, pl_inode_lock_t *lock, ++ struct timespec *now, ++ struct list_head *contend); ++ ++void ++entrylk_contention_notify_check(xlator_t *xl, pl_entry_lock_t *lock, ++ struct timespec *now, ++ struct list_head *contend); ++ + gf_boolean_t + pl_does_monkey_want_stuck_lock(); + +@@ -216,4 +235,28 @@ pl_clean_local(pl_local_t *local); + int + pl_local_init(call_frame_t *frame, xlator_t *this, loc_t *loc, fd_t *fd); + ++gf_boolean_t ++pl_is_lk_owner_valid(gf_lkowner_t *owner, client_t *client); ++ ++int32_t ++pl_inode_remove_prepare(xlator_t *xl, call_frame_t *frame, loc_t *loc, ++ pl_inode_t **ppl_inode, struct list_head *contend); ++ ++int32_t ++pl_inode_remove_complete(xlator_t *xl, pl_inode_t *pl_inode, call_stub_t *stub, ++ struct list_head *contend); ++ ++void ++pl_inode_remove_wake(struct list_head *list); ++ ++void ++pl_inode_remove_cbk(xlator_t *xl, pl_inode_t *pl_inode, int32_t error); ++ ++void ++pl_inode_remove_unlocked(xlator_t *xl, pl_inode_t *pl_inode, ++ struct list_head *list); ++ ++int32_t ++pl_inode_remove_inodelk(pl_inode_t *pl_inode, pl_inode_lock_t *lock); ++ + #endif /* __COMMON_H__ */ +diff --git a/xlators/features/locks/src/entrylk.c b/xlators/features/locks/src/entrylk.c +index 93c649c..b97836f 100644 +--- a/xlators/features/locks/src/entrylk.c ++++ b/xlators/features/locks/src/entrylk.c +@@ -197,9 +197,9 @@ out: + return revoke_lock; + } + +-static gf_boolean_t +-__entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, +- struct timespec *now) ++void ++entrylk_contention_notify_check(xlator_t *this, pl_entry_lock_t *lock, ++ struct timespec *now, struct list_head *contend) + { + posix_locks_private_t *priv; + int64_t elapsed; +@@ -209,7 +209,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { +- return _gf_false; ++ return; + } + + elapsed = now->tv_sec; +@@ -218,7 +218,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { +- return _gf_false; ++ return; + } + + /* All contention notifications will be sent outside of the locked +@@ -231,7 +231,7 @@ __entrylk_needs_contention_notify(xlator_t *this, pl_entry_lock_t *lock, + + lock->contention_time = *now; + +- return _gf_true; ++ list_add_tail(&lock->contend, contend); + } + + void +@@ -325,9 +325,7 @@ __entrylk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_entry_lock_t *lock, + break; + } + } +- if (__entrylk_needs_contention_notify(this, tmp, now)) { +- list_add_tail(&tmp->contend, contend); +- } ++ entrylk_contention_notify_check(this, tmp, now, contend); + } + } + +@@ -690,10 +688,9 @@ __grant_blocked_entry_locks(xlator_t *this, pl_inode_t *pl_inode, + bl_ret = __lock_entrylk(bl->this, pl_inode, bl, 0, dom, now, contend); + + if (bl_ret == 0) { +- list_add(&bl->blocked_locks, granted); ++ list_add_tail(&bl->blocked_locks, granted); + } + } +- return; + } + + /* Grants locks if possible which are blocked on a lock */ +diff --git a/xlators/features/locks/src/inodelk.c b/xlators/features/locks/src/inodelk.c +index 24dee49..1a07243 100644 +--- a/xlators/features/locks/src/inodelk.c ++++ b/xlators/features/locks/src/inodelk.c +@@ -231,9 +231,9 @@ out: + return revoke_lock; + } + +-static gf_boolean_t +-__inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, +- struct timespec *now) ++void ++inodelk_contention_notify_check(xlator_t *this, pl_inode_lock_t *lock, ++ struct timespec *now, struct list_head *contend) + { + posix_locks_private_t *priv; + int64_t elapsed; +@@ -243,7 +243,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, + /* If this lock is in a list, it means that we are about to send a + * notification for it, so no need to do anything else. */ + if (!list_empty(&lock->contend)) { +- return _gf_false; ++ return; + } + + elapsed = now->tv_sec; +@@ -252,7 +252,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, + elapsed--; + } + if (elapsed < priv->notify_contention_delay) { +- return _gf_false; ++ return; + } + + /* All contention notifications will be sent outside of the locked +@@ -265,7 +265,7 @@ __inodelk_needs_contention_notify(xlator_t *this, pl_inode_lock_t *lock, + + lock->contention_time = *now; + +- return _gf_true; ++ list_add_tail(&lock->contend, contend); + } + + void +@@ -353,9 +353,7 @@ __inodelk_grantable(xlator_t *this, pl_dom_list_t *dom, pl_inode_lock_t *lock, + break; + } + } +- if (__inodelk_needs_contention_notify(this, l, now)) { +- list_add_tail(&l->contend, contend); +- } ++ inodelk_contention_notify_check(this, l, now, contend); + } + } + +@@ -435,12 +433,17 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, + struct list_head *contend) + { + pl_inode_lock_t *conf = NULL; +- int ret = -EINVAL; ++ int ret; + +- conf = __inodelk_grantable(this, dom, lock, now, contend); +- if (conf) { +- ret = __lock_blocked_add(this, dom, lock, can_block); +- goto out; ++ ret = pl_inode_remove_inodelk(pl_inode, lock); ++ if (ret < 0) { ++ return ret; ++ } ++ if (ret == 0) { ++ conf = __inodelk_grantable(this, dom, lock, now, contend); ++ } ++ if ((ret > 0) || (conf != NULL)) { ++ return __lock_blocked_add(this, dom, lock, can_block); + } + + /* To prevent blocked locks starvation, check if there are any blocked +@@ -462,17 +465,13 @@ __lock_inodelk(xlator_t *this, pl_inode_t *pl_inode, pl_inode_lock_t *lock, + "starvation"); + } + +- ret = __lock_blocked_add(this, dom, lock, can_block); +- goto out; ++ return __lock_blocked_add(this, dom, lock, can_block); + } + __pl_inodelk_ref(lock); + gettimeofday(&lock->granted_time, NULL); + list_add(&lock->list, &dom->inodelk_list); + +- ret = 0; +- +-out: +- return ret; ++ return 0; + } + + /* Return true if the two inodelks have exactly same lock boundaries */ +@@ -529,12 +528,11 @@ out: + return conf; + } + +-static void ++void + __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + struct list_head *granted, pl_dom_list_t *dom, + struct timespec *now, struct list_head *contend) + { +- int bl_ret = 0; + pl_inode_lock_t *bl = NULL; + pl_inode_lock_t *tmp = NULL; + +@@ -547,52 +545,48 @@ __grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + { + list_del_init(&bl->blocked_locks); + +- bl_ret = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend); ++ bl->status = __lock_inodelk(this, pl_inode, bl, 1, dom, now, contend); + +- if (bl_ret == 0) { +- list_add(&bl->blocked_locks, granted); ++ if (bl->status != -EAGAIN) { ++ list_add_tail(&bl->blocked_locks, granted); + } + } +- return; + } + +-/* Grant all inodelks blocked on a lock */ + void +-grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, +- pl_dom_list_t *dom, struct timespec *now, +- struct list_head *contend) ++unwind_granted_inodes(xlator_t *this, pl_inode_t *pl_inode, ++ struct list_head *granted) + { +- struct list_head granted; + pl_inode_lock_t *lock; + pl_inode_lock_t *tmp; ++ int32_t op_ret; ++ int32_t op_errno; + +- INIT_LIST_HEAD(&granted); +- +- pthread_mutex_lock(&pl_inode->mutex); +- { +- __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now, +- contend); +- } +- pthread_mutex_unlock(&pl_inode->mutex); +- +- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks) ++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks) + { +- gf_log(this->name, GF_LOG_TRACE, +- "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 " => Granted", +- lock->fl_type == F_UNLCK ? "Unlock" : "Lock", lock->client_pid, +- lkowner_utoa(&lock->owner), lock->user_flock.l_start, +- lock->user_flock.l_len); +- ++ if (lock->status == 0) { ++ op_ret = 0; ++ op_errno = 0; ++ gf_log(this->name, GF_LOG_TRACE, ++ "%s (pid=%d) (lk-owner=%s) %" PRId64 " - %" PRId64 ++ " => Granted", ++ lock->fl_type == F_UNLCK ? "Unlock" : "Lock", ++ lock->client_pid, lkowner_utoa(&lock->owner), ++ lock->user_flock.l_start, lock->user_flock.l_len); ++ } else { ++ op_ret = -1; ++ op_errno = -lock->status; ++ } + pl_trace_out(this, lock->frame, NULL, NULL, F_SETLKW, &lock->user_flock, +- 0, 0, lock->volume); ++ op_ret, op_errno, lock->volume); + +- STACK_UNWIND_STRICT(inodelk, lock->frame, 0, 0, NULL); ++ STACK_UNWIND_STRICT(inodelk, lock->frame, op_ret, op_errno, NULL); + lock->frame = NULL; + } + + pthread_mutex_lock(&pl_inode->mutex); + { +- list_for_each_entry_safe(lock, tmp, &granted, blocked_locks) ++ list_for_each_entry_safe(lock, tmp, granted, blocked_locks) + { + list_del_init(&lock->blocked_locks); + __pl_inodelk_unref(lock); +@@ -601,6 +595,26 @@ grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, + pthread_mutex_unlock(&pl_inode->mutex); + } + ++/* Grant all inodelks blocked on a lock */ ++void ++grant_blocked_inode_locks(xlator_t *this, pl_inode_t *pl_inode, ++ pl_dom_list_t *dom, struct timespec *now, ++ struct list_head *contend) ++{ ++ struct list_head granted; ++ ++ INIT_LIST_HEAD(&granted); ++ ++ pthread_mutex_lock(&pl_inode->mutex); ++ { ++ __grant_blocked_inode_locks(this, pl_inode, &granted, dom, now, ++ contend); ++ } ++ pthread_mutex_unlock(&pl_inode->mutex); ++ ++ unwind_granted_inodes(this, pl_inode, &granted); ++} ++ + static void + pl_inodelk_log_cleanup(pl_inode_lock_t *lock) + { +@@ -662,7 +676,7 @@ pl_inodelk_client_cleanup(xlator_t *this, pl_ctx_t *ctx) + * and blocked lists, then this means that a parallel + * unlock on another inodelk (L2 say) may have 'granted' + * L1 and added it to 'granted' list in +- * __grant_blocked_node_locks() (although using the ++ * __grant_blocked_inode_locks() (although using the + * 'blocked_locks' member). In that case, the cleanup + * codepath must try and grant other overlapping + * blocked inodelks from other clients, now that L1 is +@@ -747,6 +761,7 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + gf_boolean_t need_inode_unref = _gf_false; + struct list_head *pcontend = NULL; + struct list_head contend; ++ struct list_head wake; + struct timespec now = {}; + short fl_type; + +@@ -798,6 +813,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + timespec_now(&now); + } + ++ INIT_LIST_HEAD(&wake); ++ + if (ctx) + pthread_mutex_lock(&ctx->lock); + pthread_mutex_lock(&pl_inode->mutex); +@@ -820,18 +837,17 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + lock->fl_type == F_UNLCK ? "Unlock" : "Lock", + lock->client_pid, lkowner_utoa(&lock->owner), + lock->user_flock.l_start, lock->user_flock.l_len); +- if (can_block) ++ if (can_block) { + unref = _gf_false; +- /* For all but the case where a non-blocking +- * lock attempt fails, the extra ref taken at +- * the start of this function must be negated. +- */ +- else +- need_inode_unref = _gf_true; ++ } + } +- +- if (ctx && (!ret || can_block)) ++ /* For all but the case where a non-blocking lock attempt fails ++ * with -EAGAIN, the extra ref taken at the start of this function ++ * must be negated. */ ++ need_inode_unref = (ret != 0) && ((ret != -EAGAIN) || !can_block); ++ if (ctx && !need_inode_unref) { + list_add_tail(&lock->client_list, &ctx->inodelk_lockers); ++ } + } else { + /* Irrespective of whether unlock succeeds or not, + * the extra inode ref that was done at the start of +@@ -849,6 +865,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + list_del_init(&retlock->client_list); + __pl_inodelk_unref(retlock); + ++ pl_inode_remove_unlocked(this, pl_inode, &wake); ++ + ret = 0; + } + out: +@@ -859,6 +877,8 @@ pl_inode_setlk(xlator_t *this, pl_ctx_t *ctx, pl_inode_t *pl_inode, + if (ctx) + pthread_mutex_unlock(&ctx->lock); + ++ pl_inode_remove_wake(&wake); ++ + /* The following (extra) unref corresponds to the ref that + * was done at the time the lock was granted. + */ +@@ -1033,10 +1053,14 @@ pl_common_inodelk(call_frame_t *frame, xlator_t *this, const char *volume, + inode); + + if (ret < 0) { +- if ((can_block) && (F_UNLCK != lock_type)) { +- goto out; ++ if (ret == -EAGAIN) { ++ if (can_block && (F_UNLCK != lock_type)) { ++ goto out; ++ } ++ gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN"); ++ } else { ++ gf_log(this->name, GF_LOG_TRACE, "returning %d", ret); + } +- gf_log(this->name, GF_LOG_TRACE, "returning EAGAIN"); + op_errno = -ret; + goto unwind; + } +diff --git a/xlators/features/locks/src/locks.h b/xlators/features/locks/src/locks.h +index aa267de..6666feb 100644 +--- a/xlators/features/locks/src/locks.h ++++ b/xlators/features/locks/src/locks.h +@@ -102,6 +102,9 @@ struct __pl_inode_lock { + + struct list_head client_list; /* list of all locks from a client */ + short fl_type; ++ ++ int32_t status; /* Error code when we try to grant a lock in blocked ++ state */ + }; + typedef struct __pl_inode_lock pl_inode_lock_t; + +@@ -164,13 +167,14 @@ struct __pl_inode { + struct list_head rw_list; /* list of waiting r/w requests */ + struct list_head reservelk_list; /* list of reservelks */ + struct list_head blocked_reservelks; /* list of blocked reservelks */ +- struct list_head +- blocked_calls; /* List of blocked lock calls while a reserve is held*/ +- struct list_head metalk_list; /* Meta lock list */ +- /* This is to store the incoming lock +- requests while meta lock is enabled */ +- struct list_head queued_locks; +- int mandatory; /* if mandatory locking is enabled */ ++ struct list_head blocked_calls; /* List of blocked lock calls while a ++ reserve is held*/ ++ struct list_head metalk_list; /* Meta lock list */ ++ struct list_head queued_locks; /* This is to store the incoming lock ++ requests while meta lock is enabled */ ++ struct list_head waiting; /* List of pending fops waiting to unlink/rmdir ++ the inode. */ ++ int mandatory; /* if mandatory locking is enabled */ + + inode_t *refkeeper; /* hold refs on an inode while locks are + held to prevent pruning */ +@@ -197,6 +201,11 @@ struct __pl_inode { + */ + int fop_wind_count; + pthread_cond_t check_fop_wind_count; ++ ++ int32_t links; /* Number of hard links the inode has. */ ++ uint32_t remove_running; /* Number of remove operations running. */ ++ gf_boolean_t is_locked; /* Regular locks will be blocked. */ ++ gf_boolean_t removed; /* The inode has been deleted. */ + }; + typedef struct __pl_inode pl_inode_t; + +diff --git a/xlators/features/locks/src/posix.c b/xlators/features/locks/src/posix.c +index 7887b82..5ae0125 100644 +--- a/xlators/features/locks/src/posix.c ++++ b/xlators/features/locks/src/posix.c +@@ -147,6 +147,29 @@ fetch_pathinfo(xlator_t *, inode_t *, int32_t *, char **); + } \ + } while (0) + ++#define PL_INODE_REMOVE(_fop, _frame, _xl, _loc1, _loc2, _cont, _cbk, \ ++ _args...) \ ++ ({ \ ++ struct list_head contend; \ ++ pl_inode_t *__pl_inode; \ ++ call_stub_t *__stub; \ ++ int32_t __error; \ ++ INIT_LIST_HEAD(&contend); \ ++ __error = pl_inode_remove_prepare(_xl, _frame, _loc2 ? _loc2 : _loc1, \ ++ &__pl_inode, &contend); \ ++ if (__error < 0) { \ ++ __stub = fop_##_fop##_stub(_frame, _cont, ##_args); \ ++ __error = pl_inode_remove_complete(_xl, __pl_inode, __stub, \ ++ &contend); \ ++ } else if (__error == 0) { \ ++ PL_LOCAL_GET_REQUESTS(_frame, _xl, xdata, ((fd_t *)NULL), _loc1, \ ++ _loc2); \ ++ STACK_WIND_COOKIE(_frame, _cbk, __pl_inode, FIRST_CHILD(_xl), \ ++ FIRST_CHILD(_xl)->fops->_fop, ##_args); \ ++ } \ ++ __error; \ ++ }) ++ + gf_boolean_t + pl_has_xdata_requests(dict_t *xdata) + { +@@ -2969,11 +2992,85 @@ out: + return ret; + } + ++static int32_t ++pl_request_link_count(dict_t **pxdata) ++{ ++ dict_t *xdata; ++ ++ xdata = *pxdata; ++ if (xdata == NULL) { ++ xdata = dict_new(); ++ if (xdata == NULL) { ++ return ENOMEM; ++ } ++ } else { ++ dict_ref(xdata); ++ } ++ ++ if (dict_set_uint32(xdata, GET_LINK_COUNT, 0) != 0) { ++ dict_unref(xdata); ++ return ENOMEM; ++ } ++ ++ *pxdata = xdata; ++ ++ return 0; ++} ++ ++static int32_t ++pl_check_link_count(dict_t *xdata) ++{ ++ int32_t count; ++ ++ /* In case we are unable to read the link count from xdata, we take a ++ * conservative approach and return -2, which will prevent the inode from ++ * being considered deleted. In fact it will cause link tracking for this ++ * inode to be disabled completely to avoid races. */ ++ ++ if (xdata == NULL) { ++ return -2; ++ } ++ ++ if (dict_get_int32(xdata, GET_LINK_COUNT, &count) != 0) { ++ return -2; ++ } ++ ++ return count; ++} ++ + int32_t + pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, dict_t *xdata, + struct iatt *postparent) + { ++ pl_inode_t *pl_inode; ++ ++ if (op_ret >= 0) { ++ pl_inode = pl_inode_get(this, inode, NULL); ++ if (pl_inode == NULL) { ++ PL_STACK_UNWIND(lookup, xdata, frame, -1, ENOMEM, NULL, NULL, NULL, ++ NULL); ++ return 0; ++ } ++ ++ pthread_mutex_lock(&pl_inode->mutex); ++ ++ /* We only update the link count if we previously didn't know it. ++ * Doing it always can lead to races since lookup is not executed ++ * atomically most of the times. */ ++ if (pl_inode->links == -2) { ++ pl_inode->links = pl_check_link_count(xdata); ++ if (buf->ia_type == IA_IFDIR) { ++ /* Directories have at least 2 links. To avoid special handling ++ * for directories, we simply decrement the value here to make ++ * them equivalent to regular files. */ ++ pl_inode->links--; ++ } ++ } ++ ++ pthread_mutex_unlock(&pl_inode->mutex); ++ } ++ + PL_STACK_UNWIND(lookup, xdata, frame, op_ret, op_errno, inode, buf, xdata, + postparent); + return 0; +@@ -2982,9 +3079,17 @@ pl_lookup_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t + pl_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xdata) + { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); +- STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->lookup, loc, xdata); ++ int32_t error; ++ ++ error = pl_request_link_count(&xdata); ++ if (error == 0) { ++ PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); ++ STACK_WIND(frame, pl_lookup_cbk, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->lookup, loc, xdata); ++ dict_unref(xdata); ++ } else { ++ STACK_UNWIND_STRICT(lookup, frame, -1, error, NULL, NULL, NULL, NULL); ++ } + return 0; + } + +@@ -3792,6 +3897,10 @@ unlock: + gf_proc_dump_write("posixlk-count", "%d", count); + __dump_posixlks(pl_inode); + } ++ ++ gf_proc_dump_write("links", "%d", pl_inode->links); ++ gf_proc_dump_write("removes_pending", "%u", pl_inode->remove_running); ++ gf_proc_dump_write("removed", "%u", pl_inode->removed); + } + pthread_mutex_unlock(&pl_inode->mutex); + +@@ -4137,8 +4246,11 @@ pl_rename_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + struct iatt *postoldparent, struct iatt *prenewparent, + struct iatt *postnewparent, dict_t *xdata) + { ++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0); ++ + PL_STACK_UNWIND(rename, xdata, frame, op_ret, op_errno, buf, preoldparent, + postoldparent, prenewparent, postnewparent, xdata); ++ + return 0; + } + +@@ -4146,10 +4258,15 @@ int32_t + pl_rename(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); ++ int32_t error; ++ ++ error = PL_INODE_REMOVE(rename, frame, this, oldloc, newloc, pl_rename, ++ pl_rename_cbk, oldloc, newloc, xdata); ++ if (error > 0) { ++ STACK_UNWIND_STRICT(rename, frame, -1, error, NULL, NULL, NULL, NULL, ++ NULL, NULL); ++ } + +- STACK_WIND(frame, pl_rename_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rename, oldloc, newloc, xdata); + return 0; + } + +@@ -4273,8 +4390,11 @@ pl_unlink_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) + { ++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0); ++ + PL_STACK_UNWIND(unlink, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); ++ + return 0; + } + +@@ -4282,9 +4402,14 @@ int32_t + pl_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflag, + dict_t *xdata) + { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); +- STACK_WIND(frame, pl_unlink_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->unlink, loc, xflag, xdata); ++ int32_t error; ++ ++ error = PL_INODE_REMOVE(unlink, frame, this, loc, NULL, pl_unlink, ++ pl_unlink_cbk, loc, xflag, xdata); ++ if (error > 0) { ++ STACK_UNWIND_STRICT(unlink, frame, -1, error, NULL, NULL, NULL); ++ } ++ + return 0; + } + +@@ -4351,8 +4476,11 @@ pl_rmdir_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, struct iatt *preparent, struct iatt *postparent, + dict_t *xdata) + { ++ pl_inode_remove_cbk(this, cookie, op_ret < 0 ? op_errno : 0); ++ + PL_STACK_UNWIND_FOR_CLIENT(rmdir, xdata, frame, op_ret, op_errno, preparent, + postparent, xdata); ++ + return 0; + } + +@@ -4360,9 +4488,14 @@ int + pl_rmdir(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) + { +- PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), loc, NULL); +- STACK_WIND(frame, pl_rmdir_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->rmdir, loc, xflags, xdata); ++ int32_t error; ++ ++ error = PL_INODE_REMOVE(rmdir, frame, this, loc, NULL, pl_rmdir, ++ pl_rmdir_cbk, loc, xflags, xdata); ++ if (error > 0) { ++ STACK_UNWIND_STRICT(rmdir, frame, -1, error, NULL, NULL, NULL); ++ } ++ + return 0; + } + +@@ -4392,6 +4525,19 @@ pl_link_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int32_t op_ret, + int32_t op_errno, inode_t *inode, struct iatt *buf, + struct iatt *preparent, struct iatt *postparent, dict_t *xdata) + { ++ pl_inode_t *pl_inode = (pl_inode_t *)cookie; ++ ++ if (op_ret >= 0) { ++ pthread_mutex_lock(&pl_inode->mutex); ++ ++ /* TODO: can happen pl_inode->links == 0 ? */ ++ if (pl_inode->links >= 0) { ++ pl_inode->links++; ++ } ++ ++ pthread_mutex_unlock(&pl_inode->mutex); ++ } ++ + PL_STACK_UNWIND_FOR_CLIENT(link, xdata, frame, op_ret, op_errno, inode, buf, + preparent, postparent, xdata); + return 0; +@@ -4401,9 +4547,18 @@ int + pl_link(call_frame_t *frame, xlator_t *this, loc_t *oldloc, loc_t *newloc, + dict_t *xdata) + { ++ pl_inode_t *pl_inode; ++ ++ pl_inode = pl_inode_get(this, oldloc->inode, NULL); ++ if (pl_inode == NULL) { ++ STACK_UNWIND_STRICT(link, frame, -1, ENOMEM, NULL, NULL, NULL, NULL, ++ NULL); ++ return 0; ++ } ++ + PL_LOCAL_GET_REQUESTS(frame, this, xdata, ((fd_t *)NULL), oldloc, newloc); +- STACK_WIND(frame, pl_link_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); ++ STACK_WIND_COOKIE(frame, pl_link_cbk, pl_inode, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->link, oldloc, newloc, xdata); + return 0; + } + +-- +1.8.3.1 + |