diff options
Diffstat (limited to '0523-open-behind-rewrite-of-internal-logic.patch')
-rw-r--r-- | 0523-open-behind-rewrite-of-internal-logic.patch | 2720 |
1 files changed, 2720 insertions, 0 deletions
diff --git a/0523-open-behind-rewrite-of-internal-logic.patch b/0523-open-behind-rewrite-of-internal-logic.patch new file mode 100644 index 0000000..621d5ae --- /dev/null +++ b/0523-open-behind-rewrite-of-internal-logic.patch @@ -0,0 +1,2720 @@ +From b924c8ca8a133fc9413c8ed1407e63f1658c7e79 Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez <xhernandez@redhat.com> +Date: Tue, 12 May 2020 23:54:54 +0200 +Subject: [PATCH 523/526] open-behind: rewrite of internal logic + +There was a critical flaw in the previous implementation of open-behind. + +When an open is done in the background, it's necessary to take a +reference on the fd_t object because once we "fake" the open answer, +the fd could be destroyed. However as long as there's a reference, +the release function won't be called. So, if the application closes +the file descriptor without having actually opened it, there will +always remain at least 1 reference, causing a leak. + +To avoid this problem, the previous implementation didn't take a +reference on the fd_t, so there were races where the fd could be +destroyed while it was still in use. + +To fix this, I've implemented a new xlator cbk that gets called from +fuse when the application closes a file descriptor. + +The whole logic of handling background opens has been simplified and +it's more efficient now. Only if the fop needs to be delayed until an +open completes, a stub is created. Otherwise no memory allocations are +needed. + +Correctly handling the close request while the open is still pending +has added a bit of complexity, but overall normal operation is simpler. 
+ +Upstream patch: +> Upstream-patch-link: https://review.gluster.org/#/c/glusterfs/+/24451 +> Change-Id: I6376a5491368e0e1c283cc452849032636261592 +> Fixes: #1225 +> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> + +BUG: 1830713 +Change-Id: I6376a5491368e0e1c283cc452849032636261592 +Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/224487 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + libglusterfs/src/fd.c | 26 + + libglusterfs/src/glusterfs/fd.h | 3 + + libglusterfs/src/glusterfs/xlator.h | 4 + + libglusterfs/src/libglusterfs.sym | 1 + + tests/basic/open-behind/open-behind.t | 183 +++ + tests/basic/open-behind/tester-fd.c | 99 ++ + tests/basic/open-behind/tester.c | 444 +++++++ + tests/basic/open-behind/tester.h | 145 +++ + tests/bugs/glusterfs/bug-873962-spb.t | 1 + + xlators/mount/fuse/src/fuse-bridge.c | 2 + + .../open-behind/src/open-behind-messages.h | 6 +- + xlators/performance/open-behind/src/open-behind.c | 1302 ++++++++------------ + 12 files changed, 1393 insertions(+), 823 deletions(-) + create mode 100644 tests/basic/open-behind/open-behind.t + create mode 100644 tests/basic/open-behind/tester-fd.c + create mode 100644 tests/basic/open-behind/tester.c + create mode 100644 tests/basic/open-behind/tester.h + +diff --git a/libglusterfs/src/fd.c b/libglusterfs/src/fd.c +index 314546a..e4ec401 100644 +--- a/libglusterfs/src/fd.c ++++ b/libglusterfs/src/fd.c +@@ -501,6 +501,32 @@ out: + } + + void ++fd_close(fd_t *fd) ++{ ++ xlator_t *xl, *old_THIS; ++ ++ old_THIS = THIS; ++ ++ for (xl = fd->inode->table->xl->graph->first; xl != NULL; xl = xl->next) { ++ if (!xl->call_cleanup) { ++ THIS = xl; ++ ++ if (IA_ISDIR(fd->inode->ia_type)) { ++ if (xl->cbks->fdclosedir != NULL) { ++ xl->cbks->fdclosedir(xl, fd); ++ } ++ } else { ++ if (xl->cbks->fdclose != NULL) { ++ xl->cbks->fdclose(xl, fd); ++ } ++ } ++ } ++ } ++ ++ 
THIS = old_THIS; ++} ++ ++void + fd_unref(fd_t *fd) + { + int32_t refcount = 0; +diff --git a/libglusterfs/src/glusterfs/fd.h b/libglusterfs/src/glusterfs/fd.h +index cdbe289..4d157c4 100644 +--- a/libglusterfs/src/glusterfs/fd.h ++++ b/libglusterfs/src/glusterfs/fd.h +@@ -107,6 +107,9 @@ fd_ref(fd_t *fd); + void + fd_unref(fd_t *fd); + ++void ++fd_close(fd_t *fd); ++ + fd_t * + fd_create(struct _inode *inode, pid_t pid); + +diff --git a/libglusterfs/src/glusterfs/xlator.h b/libglusterfs/src/glusterfs/xlator.h +index 8650ccc..273039a 100644 +--- a/libglusterfs/src/glusterfs/xlator.h ++++ b/libglusterfs/src/glusterfs/xlator.h +@@ -705,6 +705,8 @@ typedef size_t (*cbk_inodectx_size_t)(xlator_t *this, inode_t *inode); + + typedef size_t (*cbk_fdctx_size_t)(xlator_t *this, fd_t *fd); + ++typedef void (*cbk_fdclose_t)(xlator_t *this, fd_t *fd); ++ + struct xlator_cbks { + cbk_forget_t forget; + cbk_release_t release; +@@ -715,6 +717,8 @@ struct xlator_cbks { + cbk_ictxmerge_t ictxmerge; + cbk_inodectx_size_t ictxsize; + cbk_fdctx_size_t fdctxsize; ++ cbk_fdclose_t fdclose; ++ cbk_fdclose_t fdclosedir; + }; + + typedef int32_t (*dumpop_priv_t)(xlator_t *this); +diff --git a/libglusterfs/src/libglusterfs.sym b/libglusterfs/src/libglusterfs.sym +index bc770e2..0a0862e 100644 +--- a/libglusterfs/src/libglusterfs.sym ++++ b/libglusterfs/src/libglusterfs.sym +@@ -456,6 +456,7 @@ event_unregister_close + fd_anonymous + fd_anonymous_with_flags + fd_bind ++fd_close + fd_create + fd_create_uint64 + __fd_ctx_del +diff --git a/tests/basic/open-behind/open-behind.t b/tests/basic/open-behind/open-behind.t +new file mode 100644 +index 0000000..5e865d6 +--- /dev/null ++++ b/tests/basic/open-behind/open-behind.t +@@ -0,0 +1,183 @@ ++#!/bin/bash ++ ++WD="$(dirname "${0}")" ++ ++. ${WD}/../../include.rc ++. 
${WD}/../../volume.rc ++ ++function assign() { ++ local _assign_var="${1}" ++ local _assign_value="${2}" ++ ++ printf -v "${_assign_var}" "%s" "${_assign_value}" ++} ++ ++function pipe_create() { ++ local _pipe_create_var="${1}" ++ local _pipe_create_name ++ local _pipe_create_fd ++ ++ _pipe_create_name="$(mktemp -u)" ++ mkfifo "${_pipe_create_name}" ++ exec {_pipe_create_fd}<>"${_pipe_create_name}" ++ rm "${_pipe_create_name}" ++ ++ assign "${_pipe_create_var}" "${_pipe_create_fd}" ++} ++ ++function pipe_close() { ++ local _pipe_close_fd="${!1}" ++ ++ exec {_pipe_close_fd}>&- ++} ++ ++function tester_start() { ++ declare -ag tester ++ local tester_in ++ local tester_out ++ ++ pipe_create tester_in ++ pipe_create tester_out ++ ++ ${WD}/tester <&${tester_in} >&${tester_out} & ++ ++ tester=("$!" "${tester_in}" "${tester_out}") ++} ++ ++function tester_send() { ++ declare -ag tester ++ local tester_res ++ local tester_extra ++ ++ echo "${*}" >&${tester[1]} ++ ++ read -t 3 -u ${tester[2]} tester_res tester_extra ++ echo "${tester_res} ${tester_extra}" ++ if [[ "${tester_res}" == "OK" ]]; then ++ return 0 ++ fi ++ ++ return 1 ++} ++ ++function tester_stop() { ++ declare -ag tester ++ local tester_res ++ ++ tester_send "quit" ++ ++ tester_res=0 ++ if wait ${tester[0]}; then :; else # with "! wait", $? would always be 0 here ++ tester_res=$? 
++ fi ++ ++ unset tester ++ ++ return ${tester_res} ++} ++ ++function count_open() { ++ local file="$(realpath "${B0}/${V0}/${1}")" ++ local count="0" ++ local inode ++ local ref ++ ++ inode="$(stat -c %i "${file}")" ++ ++ for fd in /proc/${BRICK_PID}/fd/*; do ++ ref="$(readlink "${fd}")" ++ if [[ "${ref}" == "${B0}/${V0}/"* ]]; then ++ if [[ "$(stat -c %i "${ref}")" == "${inode}" ]]; then ++ count="$((${count} + 1))" ++ fi ++ fi ++ done ++ ++ echo "${count}" ++} ++ ++cleanup ++ ++TEST build_tester ${WD}/tester.c ${WD}/tester-fd.c ++ ++TEST glusterd ++TEST pidof glusterd ++TEST ${CLI} volume create ${V0} ${H0}:${B0}/${V0} ++TEST ${CLI} volume set ${V0} flush-behind off ++TEST ${CLI} volume set ${V0} write-behind off ++TEST ${CLI} volume set ${V0} quick-read off ++TEST ${CLI} volume set ${V0} stat-prefetch on ++TEST ${CLI} volume set ${V0} io-cache off ++TEST ${CLI} volume set ${V0} open-behind on ++TEST ${CLI} volume set ${V0} lazy-open off ++TEST ${CLI} volume set ${V0} read-after-open off ++TEST ${CLI} volume start ${V0} ++ ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++BRICK_PID="$(get_brick_pid ${V0} ${H0} ${B0}/${V0})" ++ ++TEST touch "${M0}/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_start ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT_WITHIN 5 "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${CLI} volume set ${V0} lazy-open on ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++sleep 2 ++EXPECT "0" count_open "/test" ++TEST tester_send fd write 0 "test" ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} 
${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++EXPECT "test" tester_send fd read 0 64 ++# Even though read-after-open is disabled, use-anonymous-fd is also disabled, ++# so reads need to open the file first. ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++TEST tester_send fd open 1 "${M0}/test" ++EXPECT "2" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "1" count_open "/test" ++TEST tester_send fd close 1 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++TEST ${CLI} volume set ${V0} read-after-open on ++TEST ${GFS} --volfile-id=/${V0} --volfile-server=${H0} ${M0}; ++ ++TEST tester_send fd open 0 "${M0}/test" ++EXPECT "0" count_open "/test" ++EXPECT "test" tester_send fd read 0 64 ++EXPECT "1" count_open "/test" ++TEST tester_send fd close 0 ++EXPECT_WITHIN 5 "0" count_open "/test" ++ ++EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0 ++ ++TEST tester_stop ++ ++cleanup +diff --git a/tests/basic/open-behind/tester-fd.c b/tests/basic/open-behind/tester-fd.c +new file mode 100644 +index 0000000..00f02bc +--- /dev/null ++++ b/tests/basic/open-behind/tester-fd.c +@@ -0,0 +1,99 @@ ++/* ++ Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. 
++*/ ++ ++#include "tester.h" ++ ++#include <stdlib.h> ++#include <unistd.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <fcntl.h> ++#include <string.h> ++#include <ctype.h> ++#include <errno.h> ++ ++static int32_t ++fd_open(context_t *ctx, command_t *cmd) ++{ ++ obj_t *obj; ++ int32_t fd; ++ ++ obj = cmd->args[0].obj.ref; ++ ++ fd = open(cmd->args[1].str.data, O_RDWR); ++ if (fd < 0) { ++ return error(errno, "open() failed"); ++ } ++ ++ obj->type = OBJ_TYPE_FD; ++ obj->fd = fd; ++ ++ out_ok("%d", fd); ++ ++ return 0; ++} ++ ++static int32_t ++fd_close(context_t *ctx, command_t *cmd) ++{ ++ obj_t *obj; ++ ++ obj = cmd->args[0].obj.ref; ++ obj->type = OBJ_TYPE_NONE; ++ ++ if (close(obj->fd) != 0) { ++ return error(errno, "close() failed"); ++ } ++ ++ out_ok(); ++ ++ return 0; ++} ++ ++static int32_t ++fd_write(context_t *ctx, command_t *cmd) ++{ ++ ssize_t len, ret; ++ ++ len = strlen(cmd->args[1].str.data); ++ ret = write(cmd->args[0].obj.ref->fd, cmd->args[1].str.data, len); ++ if (ret < 0) { ++ return error(errno, "write() failed"); ++ } ++ ++ out_ok("%zd", ret); ++ ++ return 0; ++} ++ ++static int32_t ++fd_read(context_t *ctx, command_t *cmd) ++{ ++ char data[cmd->args[1].num.value + 1]; ++ ssize_t ret; ++ ++ ret = read(cmd->args[0].obj.ref->fd, data, cmd->args[1].num.value); ++ if (ret < 0) { ++ return error(errno, "read() failed"); ++ } ++ ++ data[ret] = 0; ++ ++ out_ok("%zd %s", ret, data); ++ ++ return 0; ++} ++ ++command_t fd_commands[] = { ++ {"open", fd_open, CMD_ARGS(ARG_VAL(OBJ_TYPE_NONE), ARG_STR(1024))}, ++ {"close", fd_close, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD))}, ++ {"write", fd_write, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_STR(1024))}, ++ {"read", fd_read, CMD_ARGS(ARG_VAL(OBJ_TYPE_FD), ARG_NUM(0, 1024))}, ++ CMD_END}; +diff --git a/tests/basic/open-behind/tester.c b/tests/basic/open-behind/tester.c +new file mode 100644 +index 0000000..b2da71c +--- /dev/null ++++ b/tests/basic/open-behind/tester.c +@@ -0,0 +1,444 @@ ++/* ++ Copyright (c) 
2020 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#include "tester.h" ++ ++#include <stdlib.h> ++#include <unistd.h> ++#include <string.h> ++#include <ctype.h> ++#include <errno.h> ++ ++static void * ++mem_alloc(size_t size) ++{ ++ void *ptr; ++ ++ ptr = malloc(size); ++ if (ptr == NULL) { ++ error(ENOMEM, "Failed to allocate memory (%zu bytes)", size); ++ } ++ ++ return ptr; ++} ++ ++static void ++mem_free(void *ptr) ++{ ++ free(ptr); ++} ++ ++static bool ++buffer_create(context_t *ctx, size_t size) ++{ ++ ctx->buffer.base = mem_alloc(size); ++ if (ctx->buffer.base == NULL) { ++ return false; ++ } ++ ++ ctx->buffer.size = size; ++ ctx->buffer.len = 0; ++ ctx->buffer.pos = 0; ++ ++ return true; ++} ++ ++static void ++buffer_destroy(context_t *ctx) ++{ ++ mem_free(ctx->buffer.base); ++ ctx->buffer.size = 0; ++ ctx->buffer.len = 0; ++} ++ ++static int32_t ++buffer_get(context_t *ctx) ++{ ++ ssize_t len; ++ ++ if (ctx->buffer.pos >= ctx->buffer.len) { ++ len = read(0, ctx->buffer.base, ctx->buffer.size); ++ if (len < 0) { ++ return error(errno, "read() failed"); ++ } ++ if (len == 0) { ++ return 0; ++ } ++ ++ ctx->buffer.len = len; ++ ctx->buffer.pos = 0; ++ } ++ ++ return ctx->buffer.base[ctx->buffer.pos++]; ++} ++ ++static int32_t ++str_skip_spaces(context_t *ctx, int32_t current) ++{ ++ while ((current > 0) && (current != '\n') && isspace(current)) { ++ current = buffer_get(ctx); ++ } ++ ++ return current; ++} ++ ++static int32_t ++str_token(context_t *ctx, char *buffer, uint32_t size, int32_t current) ++{ ++ uint32_t len; ++ ++ current = str_skip_spaces(ctx, current); ++ ++ len = 0; ++ while ((size > 0) && (current > 0) && (current != '\n') && ++ !isspace(current)) 
{ ++ len++; ++ *buffer++ = current; ++ size--; ++ current = buffer_get(ctx); ++ } ++ ++ if (len == 0) { ++ return error(ENODATA, "Expecting a token"); ++ } ++ ++ if (size == 0) { ++ return error(ENOBUFS, "Token too long"); ++ } ++ ++ *buffer = 0; ++ ++ return current; ++} ++ ++static int32_t ++str_number(context_t *ctx, uint64_t min, uint64_t max, uint64_t *value, ++ int32_t current) ++{ ++ char text[32], *ptr; ++ uint64_t num; ++ ++ current = str_token(ctx, text, sizeof(text), current); ++ if (current > 0) { ++ num = strtoull(text, &ptr, 0); /* 64-bit parse even where long is 32 bits */ ++ if ((*ptr != 0) || (num < min) || (num > max)) { ++ return error(ERANGE, "Invalid number"); ++ } ++ *value = num; ++ } ++ ++ return current; ++} ++ ++static int32_t ++str_eol(context_t *ctx, int32_t current) ++{ ++ current = str_skip_spaces(ctx, current); ++ if (current != '\n') { ++ return error(EINVAL, "Expecting end of command"); ++ } ++ ++ return current; ++} ++ ++static void ++str_skip(context_t *ctx, int32_t current) ++{ ++ while ((current > 0) && (current != '\n')) { ++ current = buffer_get(ctx); ++ } ++} ++ ++static int32_t ++cmd_parse_obj(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ obj_t *obj; ++ uint64_t id; ++ ++ current = str_number(ctx, 0, ctx->obj_count - 1, &id, current); /* valid ids: 0..obj_count-1 */ ++ if (current <= 0) { ++ return current; ++ } ++ ++ obj = &ctx->objs[id]; ++ if (obj->type != arg->obj.type) { ++ if (obj->type != OBJ_TYPE_NONE) { ++ return error(EBUSY, "Object is in use"); ++ } ++ return error(ENOENT, "Object is not defined"); ++ } ++ ++ arg->obj.ref = obj; ++ ++ return current; ++} ++ ++static int32_t ++cmd_parse_num(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ return str_number(ctx, arg->num.min, arg->num.max, &arg->num.value, ++ current); ++} ++ ++static int32_t ++cmd_parse_str(context_t *ctx, arg_t *arg, int32_t current) ++{ ++ return str_token(ctx, arg->str.data, arg->str.size, current); ++} ++ ++static int32_t ++cmd_parse_args(context_t *ctx, command_t *cmd, int32_t current) ++{ ++ arg_t *arg; ++ ++ for 
(arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_OBJ: ++ current = cmd_parse_obj(ctx, arg, current); ++ break; ++ case ARG_TYPE_NUM: ++ current = cmd_parse_num(ctx, arg, current); ++ break; ++ case ARG_TYPE_STR: ++ current = cmd_parse_str(ctx, arg, current); ++ break; ++ default: ++ return error(EINVAL, "Unknown argument type"); ++ } ++ } ++ ++ if (current < 0) { ++ return current; ++ } ++ ++ current = str_eol(ctx, current); ++ if (current <= 0) { ++ return error(EINVAL, "Syntax error"); ++ } ++ ++ return cmd->handler(ctx, cmd); ++} ++ ++static int32_t ++cmd_parse(context_t *ctx, command_t *cmds) ++{ ++ char text[32]; ++ command_t *cmd; ++ int32_t current; ++ ++ cmd = cmds; ++ do { ++ current = str_token(ctx, text, sizeof(text), buffer_get(ctx)); ++ if (current <= 0) { ++ return current; ++ } ++ ++ while (cmd->name != NULL) { ++ if (strcmp(cmd->name, text) == 0) { ++ if (cmd->handler != NULL) { ++ return cmd_parse_args(ctx, cmd, current); ++ } ++ cmd = cmd->cmds; ++ break; ++ } ++ cmd++; ++ } ++ } while (cmd->name != NULL); ++ ++ str_skip(ctx, current); ++ ++ return error(ENOTSUP, "Unknown command"); ++} ++ ++static void ++cmd_fini(context_t *ctx, command_t *cmds) ++{ ++ command_t *cmd; ++ arg_t *arg; ++ ++ for (cmd = cmds; cmd->name != NULL; cmd++) { ++ if (cmd->handler == NULL) { ++ cmd_fini(ctx, cmd->cmds); ++ } else { ++ for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_STR: ++ mem_free(arg->str.data); ++ arg->str.data = NULL; ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ } ++} ++ ++static bool ++cmd_init(context_t *ctx, command_t *cmds) ++{ ++ command_t *cmd; ++ arg_t *arg; ++ ++ for (cmd = cmds; cmd->name != NULL; cmd++) { ++ if (cmd->handler == NULL) { ++ if (!cmd_init(ctx, cmd->cmds)) { ++ return false; ++ } ++ } else { ++ for (arg = cmd->args; arg->type != ARG_TYPE_NONE; arg++) { ++ switch (arg->type) { ++ case ARG_TYPE_STR: ++ arg->str.data = 
mem_alloc(arg->str.size); ++ if (arg->str.data == NULL) { ++ return false; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ } ++ } ++ ++ return true; ++} ++ ++static bool ++objs_create(context_t *ctx, uint32_t count) ++{ ++ uint32_t i; ++ ++ ctx->objs = mem_alloc(sizeof(obj_t) * count); ++ if (ctx->objs == NULL) { ++ return false; ++ } ++ ctx->obj_count = count; ++ ++ for (i = 0; i < count; i++) { ++ ctx->objs[i].type = OBJ_TYPE_NONE; ++ } ++ ++ return true; ++} ++ ++static int32_t ++objs_destroy(context_t *ctx) ++{ ++ uint32_t i; ++ int32_t err; ++ ++ err = 0; ++ for (i = 0; i < ctx->obj_count; i++) { ++ if (ctx->objs[i].type != OBJ_TYPE_NONE) { ++ err = error(ENOTEMPTY, "Objects not destroyed"); ++ break; ++ } ++ } ++ ++ mem_free(ctx->objs); ++ ctx->objs = NULL; ++ ctx->obj_count = 0; ++ ++ return err; ++} ++ ++static context_t * ++init(size_t size, uint32_t objs, command_t *cmds) ++{ ++ context_t *ctx; ++ ++ ctx = mem_alloc(sizeof(context_t)); ++ if (ctx == NULL) { ++ goto failed; ++ } ++ ++ if (!buffer_create(ctx, size)) { ++ goto failed_ctx; ++ } ++ ++ if (!objs_create(ctx, objs)) { ++ goto failed_buffer; ++ } ++ ++ if (!cmd_init(ctx, cmds)) { ++ goto failed_objs; ++ } ++ ++ ctx->active = true; ++ ++ return ctx; ++ ++failed_objs: ++ cmd_fini(ctx, cmds); ++ objs_destroy(ctx); ++failed_buffer: ++ buffer_destroy(ctx); ++failed_ctx: ++ mem_free(ctx); ++failed: ++ return NULL; ++} ++ ++static int32_t ++fini(context_t *ctx, command_t *cmds) ++{ ++ int32_t ret; ++ ++ cmd_fini(ctx, cmds); ++ buffer_destroy(ctx); ++ ++ ret = objs_destroy(ctx); ++ ++ ctx->active = false; ++ ++ return ret; ++} ++ ++static int32_t ++exec_quit(context_t *ctx, command_t *cmd) ++{ ++ ctx->active = false; ++ ++ return 0; ++} ++ ++static command_t commands[] = {{"fd", NULL, CMD_SUB(fd_commands)}, ++ {"quit", exec_quit, CMD_ARGS()}, ++ CMD_END}; ++ ++int32_t ++main(int32_t argc, char *argv[]) ++{ ++ context_t *ctx; ++ int32_t res; ++ ++ ctx = init(1024, 16, commands); ++ if (ctx == NULL) { ++ 
return 1; ++ } ++ ++ do { ++ res = cmd_parse(ctx, commands); ++ if (res < 0) { ++ out_err(-res); ++ } ++ } while (ctx->active); ++ ++ res = fini(ctx, commands); ++ if (res >= 0) { ++ out_ok(); ++ return 0; ++ } ++ ++ out_err(-res); ++ ++ return 1; ++} +diff --git a/tests/basic/open-behind/tester.h b/tests/basic/open-behind/tester.h +new file mode 100644 +index 0000000..64e940c +--- /dev/null ++++ b/tests/basic/open-behind/tester.h +@@ -0,0 +1,145 @@ ++/* ++ Copyright (c) 2020 Red Hat, Inc. <http://www.redhat.com> ++ This file is part of GlusterFS. ++ ++ This file is licensed to you under your choice of the GNU Lesser ++ General Public License, version 3 or any later version (LGPLv3 or ++ later), or the GNU General Public License, version 2 (GPLv2), in all ++ cases as published by the Free Software Foundation. ++*/ ++ ++#ifndef __TESTER_H__ ++#define __TESTER_H__ ++ ++#include <stdio.h> ++#include <inttypes.h> ++#include <stdbool.h> ++ ++enum _obj_type; ++typedef enum _obj_type obj_type_t; ++ ++enum _arg_type; ++typedef enum _arg_type arg_type_t; ++ ++struct _buffer; ++typedef struct _buffer buffer_t; ++ ++struct _obj; ++typedef struct _obj obj_t; ++ ++struct _context; ++typedef struct _context context_t; ++ ++struct _arg; ++typedef struct _arg arg_t; ++ ++struct _command; ++typedef struct _command command_t; ++ ++enum _obj_type { OBJ_TYPE_NONE, OBJ_TYPE_FD }; ++ ++enum _arg_type { ARG_TYPE_NONE, ARG_TYPE_OBJ, ARG_TYPE_NUM, ARG_TYPE_STR }; ++ ++struct _buffer { ++ char *base; ++ uint32_t size; ++ uint32_t len; ++ uint32_t pos; ++}; ++ ++struct _obj { ++ obj_type_t type; ++ union { ++ int32_t fd; ++ }; ++}; ++ ++struct _context { ++ obj_t *objs; ++ buffer_t buffer; ++ uint32_t obj_count; ++ bool active; ++}; ++ ++struct _arg { ++ arg_type_t type; ++ union { ++ struct { ++ obj_type_t type; ++ obj_t *ref; ++ } obj; ++ struct { ++ uint64_t value; ++ uint64_t min; ++ uint64_t max; ++ } num; ++ struct { ++ uint32_t size; ++ char *data; ++ } str; ++ }; ++}; ++ ++struct 
_command { ++ const char *name; ++ int32_t (*handler)(context_t *ctx, command_t *cmd); ++ union { ++ arg_t *args; ++ command_t *cmds; ++ }; ++}; ++ ++#define msg(_stream, _fmt, _args...) \ ++ do { \ ++ fprintf(_stream, _fmt "\n", ##_args); \ ++ fflush(_stream); \ ++ } while (0) ++ ++#define msg_out(_fmt, _args...) msg(stdout, _fmt, ##_args) ++#define msg_err(_err, _fmt, _args...) \ ++ ({ \ ++ int32_t __msg_err = (_err); \ ++ msg(stderr, "[%4u:%-15s] " _fmt, __LINE__, __FUNCTION__, __msg_err, \ ++ ##_args); \ ++ -__msg_err; \ ++ }) ++ ++#define error(_err, _fmt, _args...) msg_err(_err, "E(%4d) " _fmt, ##_args) ++#define warn(_err, _fmt, _args...) msg_err(_err, "W(%4d) " _fmt, ##_args) ++#define info(_err, _fmt, _args...) msg_err(_err, "I(%4d) " _fmt, ##_args) ++ ++#define out_ok(_args...) msg_out("OK " _args) ++#define out_err(_err) msg_out("ERR %d", _err) ++ ++#define ARG_END \ ++ { \ ++ ARG_TYPE_NONE \ ++ } ++ ++#define CMD_ARGS1(_x, _args...) \ ++ .args = (arg_t[]) { _args } ++#define CMD_ARGS(_args...) 
CMD_ARGS1(, ##_args, ARG_END) ++ ++#define CMD_SUB(_cmds) .cmds = _cmds ++ ++#define CMD_END \ ++ { \ ++ NULL, NULL, CMD_SUB(NULL) \ ++ } ++ ++#define ARG_VAL(_type) \ ++ { \ ++ ARG_TYPE_OBJ, .obj = {.type = _type } \ ++ } ++#define ARG_NUM(_min, _max) \ ++ { \ ++ ARG_TYPE_NUM, .num = {.min = _min, .max = _max } \ ++ } ++#define ARG_STR(_size) \ ++ { \ ++ ARG_TYPE_STR, .str = {.size = _size } \ ++ } ++ ++extern command_t fd_commands[]; ++ ++#endif /* __TESTER_H__ */ +\ No newline at end of file +diff --git a/tests/bugs/glusterfs/bug-873962-spb.t b/tests/bugs/glusterfs/bug-873962-spb.t +index db84a22..db71cc0 100644 +--- a/tests/bugs/glusterfs/bug-873962-spb.t ++++ b/tests/bugs/glusterfs/bug-873962-spb.t +@@ -14,6 +14,7 @@ TEST $CLI volume set $V0 performance.io-cache off + TEST $CLI volume set $V0 performance.write-behind off + TEST $CLI volume set $V0 performance.stat-prefetch off + TEST $CLI volume set $V0 performance.read-ahead off ++TEST $CLI volume set $V0 performance.open-behind off + TEST $CLI volume set $V0 cluster.background-self-heal-count 0 + TEST $CLI volume start $V0 + TEST glusterfs --entry-timeout=0 --attribute-timeout=0 -s $H0 --volfile-id=$V0 $M0 --direct-io-mode=enable +diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c +index 919eea3..76b5809 100644 +--- a/xlators/mount/fuse/src/fuse-bridge.c ++++ b/xlators/mount/fuse/src/fuse-bridge.c +@@ -3398,6 +3398,8 @@ fuse_release(xlator_t *this, fuse_in_header_t *finh, void *msg, + gf_log("glusterfs-fuse", GF_LOG_TRACE, + "finh->unique: %" PRIu64 ": RELEASE %p", finh->unique, state->fd); + ++ fd_close(state->fd); ++ + fuse_fd_ctx_destroy(this, state->fd); + fd_unref(fd); + +diff --git a/xlators/performance/open-behind/src/open-behind-messages.h b/xlators/performance/open-behind/src/open-behind-messages.h +index f250824..0e78917 100644 +--- a/xlators/performance/open-behind/src/open-behind-messages.h ++++ b/xlators/performance/open-behind/src/open-behind-messages.h +@@ 
-23,6 +23,10 @@ + */ + + GLFS_MSGID(OPEN_BEHIND, OPEN_BEHIND_MSG_XLATOR_CHILD_MISCONFIGURED, +- OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY); ++ OPEN_BEHIND_MSG_VOL_MISCONFIGURED, OPEN_BEHIND_MSG_NO_MEMORY, ++ OPEN_BEHIND_MSG_FAILED, OPEN_BEHIND_MSG_BAD_STATE); ++ ++#define OPEN_BEHIND_MSG_FAILED_STR "Failed to submit fop" ++#define OPEN_BEHIND_MSG_BAD_STATE_STR "Unexpected state" + + #endif /* _OPEN_BEHIND_MESSAGES_H_ */ +diff --git a/xlators/performance/open-behind/src/open-behind.c b/xlators/performance/open-behind/src/open-behind.c +index cbe89ec..e43fe73 100644 +--- a/xlators/performance/open-behind/src/open-behind.c ++++ b/xlators/performance/open-behind/src/open-behind.c +@@ -16,6 +16,18 @@ + #include "open-behind-messages.h" + #include <glusterfs/glusterfs-acl.h> + ++/* Note: The initial design of open-behind was made to cover the simple case ++ * of open, read, close for small files. This pattern combined with ++ * quick-read can do the whole operation without a single request to the ++ * bricks (except the initial lookup). ++ * ++ * The way to do this has been improved, but the logic remains the same. ++ * Basically, this means that any operation sent to the fd or the inode ++ * that is not a read, causes the open request to be sent to the ++ * bricks, and all future operations will be executed synchronously, ++ * including opens (it's reset once all fd's are closed). ++ */ ++ + typedef struct ob_conf { + gf_boolean_t use_anonymous_fd; /* use anonymous FDs wherever safe + e.g - fstat() readv() +@@ -32,1096 +44,754 @@ typedef struct ob_conf { + */ + } ob_conf_t; + +-typedef struct ob_inode { +- inode_t *inode; +- struct list_head resume_fops; +- struct list_head ob_fds; +- int count; +- int op_ret; +- int op_errno; +- gf_boolean_t open_in_progress; +- int unlinked; +-} ob_inode_t; ++/* A negative state represents an errno value negated. In this case the ++ * current operation cannot be processed. 
*/ +typedef enum _ob_state { ++ /* There are no opens on the inode or the first open is already ++ * completed. The current operation can be sent directly. */ ++ OB_STATE_READY = 0, + +-typedef struct ob_fd { +- call_frame_t *open_frame; +- loc_t loc; +- dict_t *xdata; +- int flags; +- int op_errno; +- ob_inode_t *ob_inode; +- fd_t *fd; +- gf_boolean_t opened; +- gf_boolean_t ob_inode_fops_waiting; +- struct list_head list; +- struct list_head ob_fds_on_inode; +-} ob_fd_t; ++ /* There's an open pending and it has been triggered. The current ++ * operation should be "stubbified" and processed with ++ * ob_stub_dispatch(). */ ++ OB_STATE_OPEN_TRIGGERED, + +-ob_inode_t * +-ob_inode_alloc(inode_t *inode) +-{ +- ob_inode_t *ob_inode = NULL; ++ /* There's an open pending but it has not been triggered. The current ++ * operation can be processed directly but using an anonymous fd. */ ++ OB_STATE_OPEN_PENDING, + +- ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); +- if (ob_inode == NULL) +- goto out; ++ /* The current operation is the first open on the inode. */ ++ OB_STATE_FIRST_OPEN ++} ob_state_t; + +- ob_inode->inode = inode; +- INIT_LIST_HEAD(&ob_inode->resume_fops); +- INIT_LIST_HEAD(&ob_inode->ob_fds); +-out: +- return ob_inode; +-} +- +-void +-ob_inode_free(ob_inode_t *ob_inode) +-{ +- if (ob_inode == NULL) +- goto out; ++typedef struct ob_inode { ++ /* List of stubs pending on the first open. Once the first open is ++ * complete, all these stubs will be resubmitted, and dependencies ++ * will be checked again. */ ++ struct list_head resume_fops; + +- list_del_init(&ob_inode->resume_fops); +- list_del_init(&ob_inode->ob_fds); ++ /* The inode this object references. */ ++ inode_t *inode; + +- GF_FREE(ob_inode); +-out: +- return; +-} ++ /* The fd from the first open sent to this inode. It will be set ++ * from the moment the open is processed until the open is fully ++ * executed or closed before actually opened. It's NULL in all ++ * other cases. 
*/ ++ fd_t *first_fd; ++ ++ /* The stub from the first open operation. When open fop starts ++ * being processed, it's assigned the OB_OPEN_PREPARING value ++ * until the actual stub is created. This is necessary to avoid ++ * creating the stub inside a locked region. Once the stub is ++ * successfully created, it's assigned here. This value is set ++ * to NULL once the stub is resumed. */ ++ call_stub_t *first_open; ++ ++ /* The total number of currently open fd's on this inode. */ ++ int32_t open_count; ++ ++ /* This flag is set as soon as we know that the open will be ++ * sent to the bricks, even before the stub is ready. */ ++ bool triggered; ++} ob_inode_t; + +-ob_inode_t * +-ob_inode_get(xlator_t *this, inode_t *inode) ++/* Dummy pointer used temporarily while the actual open stub is being created */ ++#define OB_OPEN_PREPARING ((call_stub_t *)-1) ++ ++#define OB_POST_COMMON(_fop, _xl, _frame, _fd, _args...) \ ++ case OB_STATE_FIRST_OPEN: \ ++ gf_smsg((_xl)->name, GF_LOG_ERROR, EINVAL, OPEN_BEHIND_MSG_BAD_STATE, \ ++ "fop=%s", #_fop, "state=%d", __ob_state, NULL); \ ++ default_##_fop##_failure_cbk(_frame, EINVAL); \ ++ break; \ ++ case OB_STATE_READY: \ ++ default_##_fop(_frame, _xl, ##_args); \ ++ break; \ ++ case OB_STATE_OPEN_TRIGGERED: { \ ++ call_stub_t *__ob_stub = fop_##_fop##_stub(_frame, ob_##_fop, \ ++ ##_args); \ ++ if (__ob_stub != NULL) { \ ++ ob_stub_dispatch(_xl, __ob_inode, _fd, __ob_stub); \ ++ break; \ ++ } \ ++ __ob_state = -ENOMEM; \ ++ } \ ++ default: \ ++ gf_smsg((_xl)->name, GF_LOG_ERROR, -__ob_state, \ ++ OPEN_BEHIND_MSG_FAILED, "fop=%s", #_fop, NULL); \ ++ default_##_fop##_failure_cbk(_frame, -__ob_state) ++ ++#define OB_POST_FD(_fop, _xl, _frame, _fd, _trigger, _args...) 
\ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_fd( \ ++ _xl, _fd, 0, true, _trigger, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ if (!(_trigger)) { \ ++ fd_t *__ob_fd = fd_anonymous_with_flags((_fd)->inode, \ ++ (_fd)->flags); \ ++ if (__ob_fd != NULL) { \ ++ default_##_fop(_frame, _xl, ##_args); \ ++ fd_unref(__ob_fd); \ ++ break; \ ++ } \ ++ __ob_state = -ENOMEM; \ ++ } \ ++ OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++#define OB_POST_FLUSH(_xl, _frame, _fd, _args...) \ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_fd( \ ++ _xl, _fd, 0, true, false, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ default_flush_cbk(_frame, NULL, _xl, 0, 0, NULL); \ ++ break; \ ++ OB_POST_COMMON(flush, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++#define OB_POST_INODE(_fop, _xl, _frame, _inode, _trigger, _args...) 
\ ++ do { \ ++ ob_inode_t *__ob_inode; \ ++ fd_t *__first_fd; \ ++ ob_state_t __ob_state = ob_open_and_resume_inode( \ ++ _xl, _inode, NULL, 0, true, _trigger, &__ob_inode, &__first_fd); \ ++ switch (__ob_state) { \ ++ case OB_STATE_OPEN_PENDING: \ ++ OB_POST_COMMON(_fop, _xl, _frame, __first_fd, ##_args); \ ++ } \ ++ } while (0) ++ ++static ob_inode_t * ++ob_inode_get_locked(xlator_t *this, inode_t *inode) + { + ob_inode_t *ob_inode = NULL; + uint64_t value = 0; +- int ret = 0; + +- if (!inode) +- goto out; ++ if ((__inode_ctx_get(inode, this, &value) == 0) && (value != 0)) { ++ return (ob_inode_t *)(uintptr_t)value; ++ } + +- LOCK(&inode->lock); +- { +- __inode_ctx_get(inode, this, &value); +- if (value == 0) { +- ob_inode = ob_inode_alloc(inode); +- if (ob_inode == NULL) +- goto unlock; +- +- value = (uint64_t)(uintptr_t)ob_inode; +- ret = __inode_ctx_set(inode, this, &value); +- if (ret < 0) { +- ob_inode_free(ob_inode); +- ob_inode = NULL; +- } +- } else { +- ob_inode = (ob_inode_t *)(uintptr_t)value; ++ ob_inode = GF_CALLOC(1, sizeof(*ob_inode), gf_ob_mt_inode_t); ++ if (ob_inode != NULL) { ++ ob_inode->inode = inode; ++ INIT_LIST_HEAD(&ob_inode->resume_fops); ++ ++ value = (uint64_t)(uintptr_t)ob_inode; ++ if (__inode_ctx_set(inode, this, &value) < 0) { ++ GF_FREE(ob_inode); ++ ob_inode = NULL; + } + } +-unlock: +- UNLOCK(&inode->lock); + +-out: + return ob_inode; + } + +-ob_fd_t * +-__ob_fd_ctx_get(xlator_t *this, fd_t *fd) ++static ob_state_t ++ob_open_and_resume_inode(xlator_t *xl, inode_t *inode, fd_t *fd, ++ int32_t open_count, bool synchronous, bool trigger, ++ ob_inode_t **pob_inode, fd_t **pfd) + { +- uint64_t value = 0; +- int ret = -1; +- ob_fd_t *ob_fd = NULL; ++ ob_conf_t *conf; ++ ob_inode_t *ob_inode; ++ call_stub_t *open_stub; + +- ret = __fd_ctx_get(fd, this, &value); +- if (ret) +- return NULL; ++ if (inode == NULL) { ++ return OB_STATE_READY; ++ } + +- ob_fd = (void *)((long)value); ++ conf = xl->private; + +- return ob_fd; +-} ++ *pfd = 
NULL; + +-ob_fd_t * +-ob_fd_ctx_get(xlator_t *this, fd_t *fd) +-{ +- ob_fd_t *ob_fd = NULL; +- +- LOCK(&fd->lock); ++ LOCK(&inode->lock); + { +- ob_fd = __ob_fd_ctx_get(this, fd); +- } +- UNLOCK(&fd->lock); +- +- return ob_fd; +-} ++ ob_inode = ob_inode_get_locked(xl, inode); ++ if (ob_inode == NULL) { ++ UNLOCK(&inode->lock); + +-int +-__ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +-{ +- uint64_t value = 0; +- int ret = -1; ++ return -ENOMEM; ++ } ++ *pob_inode = ob_inode; ++ ++ ob_inode->open_count += open_count; ++ ++ /* If first_fd is not NULL, it means that there's a previous open not ++ * yet completed. */ ++ if (ob_inode->first_fd != NULL) { ++ *pfd = ob_inode->first_fd; ++ /* If the current request doesn't trigger the open and it hasn't ++ * been triggered yet, we can continue without issuing the open ++ * only if the current request belongs to the same fd as the ++ * first one. */ ++ if (!trigger && !ob_inode->triggered && ++ (ob_inode->first_fd == fd)) { ++ UNLOCK(&inode->lock); ++ ++ return OB_STATE_OPEN_PENDING; ++ } + +- value = (long)((void *)ob_fd); ++ /* We need to issue the open. It could have already been triggered ++ * before. In this case open_stub will be NULL. Or the initial open ++ * may not be completely ready yet. In this case open_stub will be ++ * OB_OPEN_PREPARING. */ ++ open_stub = ob_inode->first_open; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = true; + +- ret = __fd_ctx_set(fd, this, value); ++ UNLOCK(&inode->lock); + +- return ret; +-} ++ if ((open_stub != NULL) && (open_stub != OB_OPEN_PREPARING)) { ++ call_resume(open_stub); ++ } + +-int +-ob_fd_ctx_set(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) +-{ +- int ret = -1; ++ return OB_STATE_OPEN_TRIGGERED; ++ } + +- LOCK(&fd->lock); +- { +- ret = __ob_fd_ctx_set(this, fd, ob_fd); +- } +- UNLOCK(&fd->lock); ++ /* There's no pending open. Only opens can be non synchronous, so all ++ * regular fops will be processed directly. 
For non synchronous opens, ++ * we'll still process them normally (i.e. synchronous) if there are ++ * more file descriptors open. */ ++ if (synchronous || (ob_inode->open_count > open_count)) { ++ UNLOCK(&inode->lock); + +- return ret; +-} ++ return OB_STATE_READY; ++ } + +-ob_fd_t * +-ob_fd_new(void) +-{ +- ob_fd_t *ob_fd = NULL; ++ *pfd = fd; + +- ob_fd = GF_CALLOC(1, sizeof(*ob_fd), gf_ob_mt_fd_t); ++ /* This is the first open. We keep a reference on the fd and set ++ * first_open stub to OB_OPEN_PREPARING until the actual stub can ++ * be assigned (we don't create the stub here to avoid doing memory ++ * allocations inside the mutex). */ ++ ob_inode->first_fd = __fd_ref(fd); ++ ob_inode->first_open = OB_OPEN_PREPARING; + +- INIT_LIST_HEAD(&ob_fd->list); +- INIT_LIST_HEAD(&ob_fd->ob_fds_on_inode); ++ /* If lazy_open is not set, we'll need to immediately send the open, ++ * so we set triggered right now. */ ++ ob_inode->triggered = !conf->lazy_open; ++ } ++ UNLOCK(&inode->lock); + +- return ob_fd; ++ return OB_STATE_FIRST_OPEN; + } + +-void +-ob_fd_free(ob_fd_t *ob_fd) ++static ob_state_t ++ob_open_and_resume_fd(xlator_t *xl, fd_t *fd, int32_t open_count, ++ bool synchronous, bool trigger, ob_inode_t **pob_inode, ++ fd_t **pfd) + { +- LOCK(&ob_fd->fd->inode->lock); +- { +- list_del_init(&ob_fd->ob_fds_on_inode); +- } +- UNLOCK(&ob_fd->fd->inode->lock); +- +- loc_wipe(&ob_fd->loc); +- +- if (ob_fd->xdata) +- dict_unref(ob_fd->xdata); ++ uint64_t err; + +- if (ob_fd->open_frame) { +- /* If we sill have a frame it means that background open has never +- * been triggered. We need to release the pending reference. 
*/ +- fd_unref(ob_fd->fd); +- +- STACK_DESTROY(ob_fd->open_frame->root); ++ if ((fd_ctx_get(fd, xl, &err) == 0) && (err != 0)) { ++ return (ob_state_t)-err; + } + +- GF_FREE(ob_fd); ++ return ob_open_and_resume_inode(xl, fd->inode, fd, open_count, synchronous, ++ trigger, pob_inode, pfd); + } + +-int +-ob_wake_cbk(call_frame_t *frame, void *cookie, xlator_t *this, int op_ret, +- int op_errno, fd_t *fd_ret, dict_t *xdata) ++static ob_state_t ++ob_open_behind(xlator_t *xl, fd_t *fd, int32_t flags, ob_inode_t **pob_inode, ++ fd_t **pfd) + { +- fd_t *fd = NULL; +- int count = 0; +- int ob_inode_op_ret = 0; +- int ob_inode_op_errno = 0; +- ob_fd_t *ob_fd = NULL; +- call_stub_t *stub = NULL, *tmp = NULL; +- ob_inode_t *ob_inode = NULL; +- gf_boolean_t ob_inode_fops_waiting = _gf_false; +- struct list_head fops_waiting_on_fd, fops_waiting_on_inode; ++ bool synchronous; + +- fd = frame->local; +- frame->local = NULL; +- +- INIT_LIST_HEAD(&fops_waiting_on_fd); +- INIT_LIST_HEAD(&fops_waiting_on_inode); ++ /* TODO: If O_CREAT, O_APPEND, O_WRONLY or O_DIRECT are specified, shouldn't ++ * we also execute this open synchronously ? 
*/ ++ synchronous = (flags & O_TRUNC) != 0; + +- ob_inode = ob_inode_get(this, fd->inode); ++ return ob_open_and_resume_fd(xl, fd, 1, synchronous, true, pob_inode, pfd); ++} + +- LOCK(&fd->lock); ++static int32_t ++ob_stub_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, ++ call_stub_t *stub) ++{ ++ LOCK(&ob_inode->inode->lock); + { +- ob_fd = __ob_fd_ctx_get(this, fd); +- ob_fd->opened = _gf_true; +- +- ob_inode_fops_waiting = ob_fd->ob_inode_fops_waiting; +- +- list_splice_init(&ob_fd->list, &fops_waiting_on_fd); +- +- if (op_ret < 0) { +- /* mark fd BAD for ever */ +- ob_fd->op_errno = op_errno; +- ob_fd = NULL; /*shouldn't be freed*/ +- } else { +- __fd_ctx_del(fd, this, NULL); +- } +- } +- UNLOCK(&fd->lock); +- +- if (ob_inode_fops_waiting) { +- LOCK(&fd->inode->lock); +- { +- count = --ob_inode->count; +- if (op_ret < 0) { +- /* TODO: when to reset the error? */ +- ob_inode->op_ret = -1; +- ob_inode->op_errno = op_errno; +- } +- +- if (count == 0) { +- ob_inode->open_in_progress = _gf_false; +- ob_inode_op_ret = ob_inode->op_ret; +- ob_inode_op_errno = ob_inode->op_errno; +- list_splice_init(&ob_inode->resume_fops, +- &fops_waiting_on_inode); +- } ++ /* We only queue a stub if the open has not been completed or ++ * cancelled. */ ++ if (ob_inode->first_fd == fd) { ++ list_add_tail(&stub->list, &ob_inode->resume_fops); ++ stub = NULL; + } +- UNLOCK(&fd->inode->lock); +- } +- +- if (ob_fd) +- ob_fd_free(ob_fd); +- +- list_for_each_entry_safe(stub, tmp, &fops_waiting_on_fd, list) +- { +- list_del_init(&stub->list); +- +- if (op_ret < 0) +- call_unwind_error(stub, -1, op_errno); +- else +- call_resume(stub); + } ++ UNLOCK(&ob_inode->inode->lock); + +- list_for_each_entry_safe(stub, tmp, &fops_waiting_on_inode, list) +- { +- list_del_init(&stub->list); +- +- if (ob_inode_op_ret < 0) +- call_unwind_error(stub, -1, ob_inode_op_errno); +- else +- call_resume(stub); ++ if (stub != NULL) { ++ call_resume(stub); + } + +- /* The background open is completed. 
We can release the 'fd' reference. */ +- fd_unref(fd); +- +- STACK_DESTROY(frame->root); +- + return 0; + } + +-int +-ob_fd_wake(xlator_t *this, fd_t *fd, ob_fd_t *ob_fd) ++static int32_t ++ob_open_dispatch(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, ++ call_stub_t *stub) + { +- call_frame_t *frame = NULL; +- +- if (ob_fd == NULL) { +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) +- goto unlock; ++ bool closed; + +- frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; +- } +- unlock: +- UNLOCK(&fd->lock); +- } else { +- LOCK(&fd->lock); +- { +- frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; ++ LOCK(&ob_inode->inode->lock); ++ { ++ closed = ob_inode->first_fd != fd; ++ if (!closed) { ++ if (ob_inode->triggered) { ++ ob_inode->first_open = NULL; ++ } else { ++ ob_inode->first_open = stub; ++ stub = NULL; ++ } + } +- UNLOCK(&fd->lock); + } ++ UNLOCK(&ob_inode->inode->lock); + +- if (frame) { +- /* We don't need to take a reference here. We already have a reference +- * while the open is pending. 
*/ +- frame->local = fd; +- +- STACK_WIND(frame, ob_wake_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, &ob_fd->loc, ob_fd->flags, fd, +- ob_fd->xdata); ++ if (stub != NULL) { ++ if (closed) { ++ call_stub_destroy(stub); ++ fd_unref(fd); ++ } else { ++ call_resume(stub); ++ } + } + + return 0; + } + +-void +-ob_inode_wake(xlator_t *this, struct list_head *ob_fds) ++static void ++ob_resume_pending(struct list_head *list) + { +- ob_fd_t *ob_fd = NULL, *tmp = NULL; ++ call_stub_t *stub; + +- if (!list_empty(ob_fds)) { +- list_for_each_entry_safe(ob_fd, tmp, ob_fds, ob_fds_on_inode) +- { +- ob_fd_wake(this, ob_fd->fd, ob_fd); +- ob_fd_free(ob_fd); +- } +- } +-} ++ while (!list_empty(list)) { ++ stub = list_first_entry(list, call_stub_t, list); ++ list_del_init(&stub->list); + +-/* called holding inode->lock and fd->lock */ +-void +-ob_fd_copy(ob_fd_t *src, ob_fd_t *dst) +-{ +- if (!src || !dst) +- goto out; +- +- dst->fd = src->fd; +- dst->loc.inode = inode_ref(src->loc.inode); +- gf_uuid_copy(dst->loc.gfid, src->loc.gfid); +- dst->flags = src->flags; +- dst->xdata = dict_ref(src->xdata); +- dst->ob_inode = src->ob_inode; +-out: +- return; ++ call_resume(stub); ++ } + } + +-int +-open_all_pending_fds_and_resume(xlator_t *this, inode_t *inode, +- call_stub_t *stub) ++static void ++ob_open_completed(xlator_t *xl, ob_inode_t *ob_inode, fd_t *fd, int32_t op_ret, ++ int32_t op_errno) + { +- ob_inode_t *ob_inode = NULL; +- ob_fd_t *ob_fd = NULL, *tmp = NULL; +- gf_boolean_t was_open_in_progress = _gf_false; +- gf_boolean_t wait_for_open = _gf_false; +- struct list_head ob_fds; ++ struct list_head list; + +- ob_inode = ob_inode_get(this, inode); +- if (ob_inode == NULL) +- goto out; ++ INIT_LIST_HEAD(&list); + +- INIT_LIST_HEAD(&ob_fds); ++ if (op_ret < 0) { ++ fd_ctx_set(fd, xl, op_errno <= 0 ? 
EIO : op_errno); ++ } + +- LOCK(&inode->lock); ++ LOCK(&ob_inode->inode->lock); + { +- was_open_in_progress = ob_inode->open_in_progress; +- ob_inode->unlinked = 1; +- +- if (was_open_in_progress) { +- list_add_tail(&stub->list, &ob_inode->resume_fops); +- goto inode_unlock; +- } +- +- list_for_each_entry(ob_fd, &ob_inode->ob_fds, ob_fds_on_inode) +- { +- LOCK(&ob_fd->fd->lock); +- { +- if (ob_fd->opened) +- goto fd_unlock; +- +- ob_inode->count++; +- ob_fd->ob_inode_fops_waiting = _gf_true; +- +- if (ob_fd->open_frame == NULL) { +- /* open in progress no need of wake */ +- } else { +- tmp = ob_fd_new(); +- tmp->open_frame = ob_fd->open_frame; +- ob_fd->open_frame = NULL; +- +- ob_fd_copy(ob_fd, tmp); +- list_add_tail(&tmp->ob_fds_on_inode, &ob_fds); +- } +- } +- fd_unlock: +- UNLOCK(&ob_fd->fd->lock); +- } +- +- if (ob_inode->count) { +- wait_for_open = ob_inode->open_in_progress = _gf_true; +- list_add_tail(&stub->list, &ob_inode->resume_fops); ++ /* Only update the fields if the file has not been closed before ++ * getting here. 
*/ ++ if (ob_inode->first_fd == fd) { ++ list_splice_init(&ob_inode->resume_fops, &list); ++ ob_inode->first_fd = NULL; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = false; + } + } +-inode_unlock: +- UNLOCK(&inode->lock); ++ UNLOCK(&ob_inode->inode->lock); + +-out: +- if (!was_open_in_progress) { +- if (!wait_for_open) { +- call_resume(stub); +- } else { +- ob_inode_wake(this, &ob_fds); +- } +- } ++ ob_resume_pending(&list); + +- return 0; ++ fd_unref(fd); + } + +-int +-open_and_resume(xlator_t *this, fd_t *fd, call_stub_t *stub) ++static int32_t ++ob_open_cbk(call_frame_t *frame, void *cookie, xlator_t *xl, int32_t op_ret, ++ int32_t op_errno, fd_t *fd, dict_t *xdata) + { +- ob_fd_t *ob_fd = NULL; +- int op_errno = 0; +- +- if (!fd) +- goto nofd; +- +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) +- goto unlock; ++ ob_inode_t *ob_inode; + +- if (ob_fd->op_errno) { +- op_errno = ob_fd->op_errno; +- goto unlock; +- } ++ ob_inode = frame->local; ++ frame->local = NULL; + +- list_add_tail(&stub->list, &ob_fd->list); +- } +-unlock: +- UNLOCK(&fd->lock); ++ ob_open_completed(xl, ob_inode, cookie, op_ret, op_errno); + +-nofd: +- if (op_errno) +- call_unwind_error(stub, -1, op_errno); +- else if (ob_fd) +- ob_fd_wake(this, fd, NULL); +- else +- call_resume(stub); ++ STACK_DESTROY(frame->root); + + return 0; + } + +-int +-ob_open_behind(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, ++static int32_t ++ob_open_resume(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, + fd_t *fd, dict_t *xdata) + { +- ob_fd_t *ob_fd = NULL; +- int ret = -1; +- ob_conf_t *conf = NULL; +- ob_inode_t *ob_inode = NULL; +- gf_boolean_t open_in_progress = _gf_false; +- int unlinked = 0; +- +- conf = this->private; +- +- if (flags & O_TRUNC) { +- STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- return 0; +- } +- +- ob_inode = ob_inode_get(this, fd->inode); +- +- ob_fd = 
ob_fd_new(); +- if (!ob_fd) +- goto enomem; +- +- ob_fd->ob_inode = ob_inode; +- +- ob_fd->fd = fd; +- +- ob_fd->open_frame = copy_frame(frame); +- if (!ob_fd->open_frame) +- goto enomem; +- ret = loc_copy(&ob_fd->loc, loc); +- if (ret) +- goto enomem; +- +- ob_fd->flags = flags; +- if (xdata) +- ob_fd->xdata = dict_ref(xdata); +- +- LOCK(&fd->inode->lock); +- { +- open_in_progress = ob_inode->open_in_progress; +- unlinked = ob_inode->unlinked; +- if (!open_in_progress && !unlinked) { +- ret = ob_fd_ctx_set(this, fd, ob_fd); +- if (ret) { +- UNLOCK(&fd->inode->lock); +- goto enomem; +- } +- +- list_add(&ob_fd->ob_fds_on_inode, &ob_inode->ob_fds); +- } +- } +- UNLOCK(&fd->inode->lock); +- +- /* We take a reference while the background open is pending or being +- * processed. If we finally wind the request in the foreground, then +- * ob_fd_free() will take care of this additional reference. */ +- fd_ref(fd); +- +- if (!open_in_progress && !unlinked) { +- STACK_UNWIND_STRICT(open, frame, 0, 0, fd, xdata); +- +- if (!conf->lazy_open) +- ob_fd_wake(this, fd, NULL); +- } else { +- ob_fd_free(ob_fd); +- STACK_WIND(frame, default_open_cbk, FIRST_CHILD(this), +- FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); +- } ++ STACK_WIND_COOKIE(frame, ob_open_cbk, fd, FIRST_CHILD(this), ++ FIRST_CHILD(this)->fops->open, loc, flags, fd, xdata); + + return 0; +-enomem: +- if (ob_fd) { +- if (ob_fd->open_frame) +- STACK_DESTROY(ob_fd->open_frame->root); +- +- loc_wipe(&ob_fd->loc); +- if (ob_fd->xdata) +- dict_unref(ob_fd->xdata); +- +- GF_FREE(ob_fd); +- } +- +- return -1; + } + +-int ++static int32_t + ob_open(call_frame_t *frame, xlator_t *this, loc_t *loc, int flags, fd_t *fd, + dict_t *xdata) + { +- fd_t *old_fd = NULL; +- int ret = -1; +- int op_errno = ENOMEM; +- call_stub_t *stub = NULL; +- +- old_fd = fd_lookup(fd->inode, 0); +- if (old_fd) { +- /* open-behind only when this is the first FD */ +- stub = fop_open_stub(frame, default_open_resume, loc, flags, fd, xdata); 
+- if (!stub) { +- fd_unref(old_fd); +- goto err; +- } +- +- open_and_resume(this, old_fd, stub); ++ ob_inode_t *ob_inode; ++ call_frame_t *open_frame; ++ call_stub_t *stub; ++ fd_t *first_fd; ++ ob_state_t state; ++ ++ state = ob_open_behind(this, fd, flags, &ob_inode, &first_fd); ++ if (state == OB_STATE_READY) { ++ /* There's no pending open, but there are other file descriptors opened ++ * or the current flags require a synchronous open. */ ++ return default_open(frame, this, loc, flags, fd, xdata); ++ } + +- fd_unref(old_fd); ++ if (state == OB_STATE_OPEN_TRIGGERED) { ++ /* The first open is in progress (either because it was already issued ++ * or because this request triggered it). We try to create a new stub ++ * to retry the operation once the initial open completes. */ ++ stub = fop_open_stub(frame, ob_open, loc, flags, fd, xdata); ++ if (stub != NULL) { ++ return ob_stub_dispatch(this, ob_inode, first_fd, stub); ++ } + +- return 0; ++ state = -ENOMEM; + } + +- ret = ob_open_behind(frame, this, loc, flags, fd, xdata); +- if (ret) { +- goto err; +- } ++ if (state == OB_STATE_FIRST_OPEN) { ++ /* We try to create a stub for the new open. A new frame needs to be ++ * used because the current one may be destroyed soon after sending ++ * the open's reply. */ ++ open_frame = copy_frame(frame); ++ if (open_frame != NULL) { ++ stub = fop_open_stub(open_frame, ob_open_resume, loc, flags, fd, ++ xdata); ++ if (stub != NULL) { ++ open_frame->local = ob_inode; + +- return 0; +-err: +- gf_msg(this->name, GF_LOG_ERROR, op_errno, OPEN_BEHIND_MSG_NO_MEMORY, "%s", +- loc->path); ++ /* TODO: Previous version passed xdata back to the caller, but ++ * probably this doesn't make sense since it won't contain ++ * any requested data. I think it would be better to pass ++ * NULL for xdata. 
*/ ++ default_open_cbk(frame, NULL, this, 0, 0, fd, xdata); + +- STACK_UNWIND_STRICT(open, frame, -1, op_errno, 0, 0); ++ return ob_open_dispatch(this, ob_inode, first_fd, stub); ++ } + +- return 0; +-} ++ STACK_DESTROY(open_frame->root); ++ } + +-fd_t * +-ob_get_wind_fd(xlator_t *this, fd_t *fd, uint32_t *flag) +-{ +- fd_t *wind_fd = NULL; +- ob_fd_t *ob_fd = NULL; +- ob_conf_t *conf = NULL; ++ /* In case of error, simulate a regular completion but with an error ++ * code. */ ++ ob_open_completed(this, ob_inode, first_fd, -1, ENOMEM); + +- conf = this->private; ++ state = -ENOMEM; ++ } + +- ob_fd = ob_fd_ctx_get(this, fd); ++ /* In case of failure we need to decrement the number of open files because ++ * ob_fdclose() won't be called. */ + +- if (ob_fd && ob_fd->open_frame && conf->use_anonymous_fd) { +- wind_fd = fd_anonymous(fd->inode); +- if ((ob_fd->flags & O_DIRECT) && (flag)) +- *flag = *flag | O_DIRECT; +- } else { +- wind_fd = fd_ref(fd); ++ LOCK(&fd->inode->lock); ++ { ++ ob_inode->open_count--; + } ++ UNLOCK(&fd->inode->lock); + +- return wind_fd; ++ gf_smsg(this->name, GF_LOG_ERROR, -state, OPEN_BEHIND_MSG_FAILED, "fop=%s", ++ "open", "path=%s", loc->path, NULL); ++ ++ return default_open_failure_cbk(frame, -state); + } + +-int ++static int32_t + ob_readv(call_frame_t *frame, xlator_t *this, fd_t *fd, size_t size, + off_t offset, uint32_t flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- ob_conf_t *conf = NULL; ++ ob_conf_t *conf = this->private; ++ bool trigger = conf->read_after_open || !conf->use_anonymous_fd; + +- conf = this->private; +- +- if (!conf->read_after_open) +- wind_fd = ob_get_wind_fd(this, fd, &flags); +- else +- wind_fd = fd_ref(fd); +- +- stub = fop_readv_stub(frame, default_readv_resume, wind_fd, size, offset, +- flags, xdata); +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(readv, frame, -1, ENOMEM, 0, 0, 0, 
0, 0); ++ OB_POST_FD(readv, this, frame, fd, trigger, fd, size, offset, flags, xdata); + + return 0; + } + +-int ++static int32_t + ob_writev(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iovec *iov, + int count, off_t offset, uint32_t flags, struct iobref *iobref, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_writev_stub(frame, default_writev_resume, fd, iov, count, offset, +- flags, iobref, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(writev, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(writev, this, frame, fd, true, fd, iov, count, offset, flags, ++ iobref, xdata); + + return 0; + } + +-int ++static int32_t + ob_fstat(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- +- wind_fd = ob_get_wind_fd(this, fd, NULL); +- +- stub = fop_fstat_stub(frame, default_fstat_resume, wind_fd, xdata); ++ ob_conf_t *conf = this->private; ++ bool trigger = !conf->use_anonymous_fd; + +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fstat, this, frame, fd, trigger, fd, xdata); + + return 0; + } + +-int ++static int32_t + ob_seek(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + gf_seek_what_t what, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- fd_t *wind_fd = NULL; +- +- wind_fd = ob_get_wind_fd(this, fd, NULL); ++ ob_conf_t *conf = this->private; ++ bool trigger = !conf->use_anonymous_fd; + +- stub = fop_seek_stub(frame, default_seek_resume, wind_fd, offset, what, +- xdata); +- +- fd_unref(wind_fd); +- +- if (!stub) +- goto err; +- +- open_and_resume(this, wind_fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fstat, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(seek, this, frame, fd, trigger, fd, offset, what, xdata); + + return 0; + } + +-int ++static 
int32_t + ob_flush(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- ob_fd_t *ob_fd = NULL; +- gf_boolean_t unwind = _gf_false; +- +- LOCK(&fd->lock); +- { +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (ob_fd && ob_fd->open_frame) +- /* if open() was never wound to backend, +- no need to wind flush() either. +- */ +- unwind = _gf_true; +- } +- UNLOCK(&fd->lock); +- +- if (unwind) +- goto unwind; +- +- stub = fop_flush_stub(frame, default_flush_resume, fd, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(flush, frame, -1, ENOMEM, 0); +- +- return 0; +- +-unwind: +- STACK_UNWIND_STRICT(flush, frame, 0, 0, 0); ++ OB_POST_FLUSH(this, frame, fd, fd, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int flag, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsync_stub(frame, default_fsync_resume, fd, flag, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsync, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(fsync, this, frame, fd, true, fd, flag, xdata); + + return 0; + } + +-int ++static int32_t + ob_lk(call_frame_t *frame, xlator_t *this, fd_t *fd, int cmd, + struct gf_flock *flock, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_lk_stub(frame, default_lk_resume, fd, cmd, flock, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(lk, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(lk, this, frame, fd, true, fd, cmd, flock, xdata); + + return 0; + } + +-int ++static int32_t + ob_ftruncate(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_ftruncate_stub(frame, default_ftruncate_resume, fd, offset, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, 
fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(ftruncate, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(ftruncate, this, frame, fd, true, fd, offset, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, dict_t *xattr, + int flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsetxattr_stub(frame, default_fsetxattr_resume, fd, xattr, flags, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsetxattr, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fsetxattr, this, frame, fd, true, fd, xattr, flags, xdata); + + return 0; + } + +-int ++static int32_t + ob_fgetxattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fgetxattr_stub(frame, default_fgetxattr_resume, fd, name, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fgetxattr, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fgetxattr, this, frame, fd, true, fd, name, xdata); + + return 0; + } + +-int ++static int32_t + ob_fremovexattr(call_frame_t *frame, xlator_t *this, fd_t *fd, const char *name, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fremovexattr_stub(frame, default_fremovexattr_resume, fd, name, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fremovexattr, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fremovexattr, this, frame, fd, true, fd, name, xdata); + + return 0; + } + +-int ++static int32_t + ob_finodelk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + int cmd, struct gf_flock *flock, dict_t *xdata) + { +- call_stub_t *stub = fop_finodelk_stub(frame, default_finodelk_resume, +- volume, fd, cmd, flock, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(finodelk, frame, 
-1, ENOMEM, 0); ++ OB_POST_FD(finodelk, this, frame, fd, true, volume, fd, cmd, flock, xdata); + + return 0; + } + +-int ++static int32_t + ob_fentrylk(call_frame_t *frame, xlator_t *this, const char *volume, fd_t *fd, + const char *basename, entrylk_cmd cmd, entrylk_type type, + dict_t *xdata) + { +- call_stub_t *stub = fop_fentrylk_stub( +- frame, default_fentrylk_resume, volume, fd, basename, cmd, type, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(fentrylk, frame, -1, ENOMEM, 0); ++ OB_POST_FD(fentrylk, this, frame, fd, true, volume, fd, basename, cmd, type, ++ xdata); + + return 0; + } + +-int ++static int32_t + ob_fxattrop(call_frame_t *frame, xlator_t *this, fd_t *fd, + gf_xattrop_flags_t optype, dict_t *xattr, dict_t *xdata) + { +- call_stub_t *stub = fop_fxattrop_stub(frame, default_fxattrop_resume, fd, +- optype, xattr, xdata); +- if (stub) +- open_and_resume(this, fd, stub); +- else +- STACK_UNWIND_STRICT(fxattrop, frame, -1, ENOMEM, 0, 0); ++ OB_POST_FD(fxattrop, this, frame, fd, true, fd, optype, xattr, xdata); + + return 0; + } + +-int ++static int32_t + ob_fsetattr(call_frame_t *frame, xlator_t *this, fd_t *fd, struct iatt *iatt, + int valid, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_fsetattr_stub(frame, default_fsetattr_resume, fd, iatt, valid, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(fsetattr, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_FD(fsetattr, this, frame, fd, true, fd, iatt, valid, xdata); + + return 0; + } + +-int ++static int32_t + ob_fallocate(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t mode, + off_t offset, size_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_fallocate_stub(frame, default_fallocate_resume, fd, mode, offset, +- len, xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); ++ OB_POST_FD(fallocate, this, frame, fd, true, fd, mode, offset, 
len, xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(fallocate, frame, -1, ENOMEM, NULL, NULL, NULL); +- return 0; + } + +-int ++static int32_t + ob_discard(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + size_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_discard_stub(frame, default_discard_resume, fd, offset, len, +- xdata); +- if (!stub) +- goto err; +- +- open_and_resume(this, fd, stub); ++ OB_POST_FD(discard, this, frame, fd, true, fd, offset, len, xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(discard, frame, -1, ENOMEM, NULL, NULL, NULL); +- return 0; + } + +-int ++static int32_t + ob_zerofill(call_frame_t *frame, xlator_t *this, fd_t *fd, off_t offset, + off_t len, dict_t *xdata) + { +- call_stub_t *stub; +- +- stub = fop_zerofill_stub(frame, default_zerofill_resume, fd, offset, len, +- xdata); +- if (!stub) +- goto err; ++ OB_POST_FD(zerofill, this, frame, fd, true, fd, offset, len, xdata); + +- open_and_resume(this, fd, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(zerofill, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + +-int ++static int32_t + ob_unlink(call_frame_t *frame, xlator_t *this, loc_t *loc, int xflags, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_unlink_stub(frame, default_unlink_resume, loc, xflags, xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, loc->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(unlink, frame, -1, ENOMEM, 0, 0, 0); ++ OB_POST_INODE(unlink, this, frame, loc->inode, true, loc, xflags, xdata); + + return 0; + } + +-int ++static int32_t + ob_rename(call_frame_t *frame, xlator_t *this, loc_t *src, loc_t *dst, + dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_rename_stub(frame, default_rename_resume, src, dst, xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, dst->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(rename, frame, -1, ENOMEM, 0, 0, 0, 0, 
0, 0); ++ OB_POST_INODE(rename, this, frame, dst->inode, true, src, dst, xdata); + + return 0; + } + +-int32_t ++static int32_t + ob_setattr(call_frame_t *frame, xlator_t *this, loc_t *loc, struct iatt *stbuf, + int32_t valid, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- +- stub = fop_setattr_stub(frame, default_setattr_resume, loc, stbuf, valid, +- xdata); +- if (!stub) +- goto err; ++ OB_POST_INODE(setattr, this, frame, loc->inode, true, loc, stbuf, valid, ++ xdata); + +- open_all_pending_fds_and_resume(this, loc->inode, stub); +- +- return 0; +-err: +- STACK_UNWIND_STRICT(setattr, frame, -1, ENOMEM, NULL, NULL, NULL); + return 0; + } + +-int32_t ++static int32_t + ob_setxattr(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *dict, + int32_t flags, dict_t *xdata) + { +- call_stub_t *stub = NULL; +- gf_boolean_t access_xattr = _gf_false; +- + if (dict_get(dict, POSIX_ACL_DEFAULT_XATTR) || + dict_get(dict, POSIX_ACL_ACCESS_XATTR) || +- dict_get(dict, GF_SELINUX_XATTR_KEY)) +- access_xattr = _gf_true; +- +- if (!access_xattr) ++ dict_get(dict, GF_SELINUX_XATTR_KEY)) { + return default_setxattr(frame, this, loc, dict, flags, xdata); ++ } + +- stub = fop_setxattr_stub(frame, default_setxattr_resume, loc, dict, flags, +- xdata); +- if (!stub) +- goto err; +- +- open_all_pending_fds_and_resume(this, loc->inode, stub); ++ OB_POST_INODE(setxattr, this, frame, loc->inode, true, loc, dict, flags, ++ xdata); + + return 0; +-err: +- STACK_UNWIND_STRICT(setxattr, frame, -1, ENOMEM, NULL); +- return 0; + } + +-int +-ob_release(xlator_t *this, fd_t *fd) ++static void ++ob_fdclose(xlator_t *this, fd_t *fd) + { +- ob_fd_t *ob_fd = NULL; ++ struct list_head list; ++ ob_inode_t *ob_inode; ++ call_stub_t *stub; ++ ++ INIT_LIST_HEAD(&list); ++ stub = NULL; + +- ob_fd = ob_fd_ctx_get(this, fd); ++ LOCK(&fd->inode->lock); ++ { ++ ob_inode = ob_inode_get_locked(this, fd->inode); ++ if (ob_inode != NULL) { ++ ob_inode->open_count--; ++ ++ /* If this fd is the same as 
ob_inode->first_fd, it means that ++ * the initial open has not fully completed. We'll try to cancel ++ * it. */ ++ if (ob_inode->first_fd == fd) { ++ if (ob_inode->first_open == OB_OPEN_PREPARING) { ++ /* In this case ob_open_dispatch() has not been called yet. ++ * We clear first_fd and first_open to allow that function ++ * to know that the open is not really needed. This also ++ * allows other requests to work as expected if they ++ * arrive before the dispatch function is called. If there ++ * are pending fops, we can directly process them here. ++ * (note that there shouldn't be any fd related fops, but ++ * if there are, it's fine if they fail). */ ++ ob_inode->first_fd = NULL; ++ ob_inode->first_open = NULL; ++ ob_inode->triggered = false; ++ list_splice_init(&ob_inode->resume_fops, &list); ++ } else if (!ob_inode->triggered) { ++ /* If the open has already been dispatched, we can only ++ * cancel it if it has not been triggered. Otherwise we ++ * simply wait until it completes. While it's not triggered, ++ * first_open must be a valid stub and there can't be any ++ * pending fops. 
*/ ++ GF_ASSERT((ob_inode->first_open != NULL) && ++ list_empty(&ob_inode->resume_fops)); ++ ++ ob_inode->first_fd = NULL; ++ stub = ob_inode->first_open; ++ ob_inode->first_open = NULL; ++ } ++ } ++ } ++ } ++ UNLOCK(&fd->inode->lock); + +- ob_fd_free(ob_fd); ++ if (stub != NULL) { ++ call_stub_destroy(stub); ++ fd_unref(fd); ++ } + +- return 0; ++ ob_resume_pending(&list); + } + + int + ob_forget(xlator_t *this, inode_t *inode) + { +- ob_inode_t *ob_inode = NULL; ++ ob_inode_t *ob_inode; + uint64_t value = 0; + +- inode_ctx_del(inode, this, &value); +- +- if (value) { ++ if ((inode_ctx_del(inode, this, &value) == 0) && (value != 0)) { + ob_inode = (ob_inode_t *)(uintptr_t)value; +- ob_inode_free(ob_inode); ++ GF_FREE(ob_inode); + } + + return 0; +@@ -1153,20 +823,18 @@ ob_priv_dump(xlator_t *this) + int + ob_fdctx_dump(xlator_t *this, fd_t *fd) + { +- ob_fd_t *ob_fd = NULL; + char key_prefix[GF_DUMP_MAX_BUF_LEN] = { + 0, + }; +- int ret = 0; ++ uint64_t value = 0; ++ int ret = 0, error = 0; + + ret = TRY_LOCK(&fd->lock); + if (ret) + return 0; + +- ob_fd = __ob_fd_ctx_get(this, fd); +- if (!ob_fd) { +- UNLOCK(&fd->lock); +- return 0; ++ if ((__fd_ctx_get(fd, this, &value) == 0) && (value != 0)) { ++ error = (int32_t)value; + } + + gf_proc_dump_build_key(key_prefix, "xlator.performance.open-behind", +@@ -1175,17 +843,7 @@ ob_fdctx_dump(xlator_t *this, fd_t *fd) + + gf_proc_dump_write("fd", "%p", fd); + +- gf_proc_dump_write("open_frame", "%p", ob_fd->open_frame); +- +- if (ob_fd->open_frame) +- gf_proc_dump_write("open_frame.root.unique", "%" PRIu64, +- ob_fd->open_frame->root->unique); +- +- gf_proc_dump_write("loc.path", "%s", ob_fd->loc.path); +- +- gf_proc_dump_write("loc.ino", "%s", uuid_utoa(ob_fd->loc.gfid)); +- +- gf_proc_dump_write("flags", "%d", ob_fd->flags); ++ gf_proc_dump_write("error", "%d", error); + + UNLOCK(&fd->lock); + +@@ -1307,7 +965,7 @@ struct xlator_fops fops = { + }; + + struct xlator_cbks cbks = { +- .release = ob_release, ++ .fdclose = 
ob_fdclose, + .forget = ob_forget, + }; + +-- +1.8.3.1 + |