summaryrefslogtreecommitdiff
path: root/0439-cluster-afr-Delay-post-op-for-fsync.patch
diff options
context:
space:
mode:
Diffstat (limited to '0439-cluster-afr-Delay-post-op-for-fsync.patch')
-rw-r--r--0439-cluster-afr-Delay-post-op-for-fsync.patch438
1 files changed, 438 insertions, 0 deletions
diff --git a/0439-cluster-afr-Delay-post-op-for-fsync.patch b/0439-cluster-afr-Delay-post-op-for-fsync.patch
new file mode 100644
index 0000000..dc1593b
--- /dev/null
+++ b/0439-cluster-afr-Delay-post-op-for-fsync.patch
@@ -0,0 +1,438 @@
+From 3ed98fc9dcb39223032e343fd5b0ad17fa3cae14 Mon Sep 17 00:00:00 2001
+From: Pranith Kumar K <pkarampu@redhat.com>
+Date: Fri, 29 May 2020 14:24:53 +0530
+Subject: [PATCH 439/449] cluster/afr: Delay post-op for fsync
+
+Problem:
+AFR doesn't delay post-op for fsync fop. For fsync heavy workloads
+this leads to un-necessary fxattrop/finodelk for every fsync leading
+to bad performance.
+
+Fix:
+Have delayed post-op for fsync. Add special flag in xdata to indicate
+that afr shouldn't delay post-op in cases where either the
+process will terminate or graph-switch would happen. Otherwise it leads
+to un-necessary heals when the graph-switch/process-termination
+happens before delayed-post-op completes.
+
+> Upstream-patch: https://review.gluster.org/c/glusterfs/+/24473
+> Fixes: #1253
+
+BUG: 1838479
+Change-Id: I531940d13269a111c49e0510d49514dc169f4577
+Signed-off-by: Pranith Kumar K <pkarampu@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/202676
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ api/src/glfs-resolve.c | 14 ++-
+ tests/basic/afr/durability-off.t | 2 +
+ tests/basic/gfapi/gfapi-graph-switch-open-fd.t | 44 +++++++++
+ tests/basic/gfapi/gfapi-keep-writing.c | 129 +++++++++++++++++++++++++
+ xlators/cluster/afr/src/afr-inode-write.c | 11 ++-
+ xlators/cluster/afr/src/afr-transaction.c | 9 +-
+ xlators/cluster/afr/src/afr.h | 2 +-
+ xlators/cluster/dht/src/dht-rebalance.c | 15 ++-
+ xlators/mount/fuse/src/fuse-bridge.c | 23 ++++-
+ 9 files changed, 239 insertions(+), 10 deletions(-)
+ create mode 100644 tests/basic/gfapi/gfapi-graph-switch-open-fd.t
+ create mode 100644 tests/basic/gfapi/gfapi-keep-writing.c
+
+diff --git a/api/src/glfs-resolve.c b/api/src/glfs-resolve.c
+index a79f490..062b7dc 100644
+--- a/api/src/glfs-resolve.c
++++ b/api/src/glfs-resolve.c
+@@ -722,6 +722,7 @@ glfs_migrate_fd_safe(struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
+ 0,
+ };
+ char uuid1[64];
++ dict_t *xdata = NULL;
+
+ oldinode = oldfd->inode;
+ oldsubvol = oldinode->table->xl;
+@@ -730,7 +731,15 @@ glfs_migrate_fd_safe(struct glfs *fs, xlator_t *newsubvol, fd_t *oldfd)
+ return fd_ref(oldfd);
+
+ if (!oldsubvol->switched) {
+- ret = syncop_fsync(oldsubvol, oldfd, 0, NULL, NULL, NULL, NULL);
++ xdata = dict_new();
++ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
++ gf_msg(fs->volname, GF_LOG_WARNING, ENOMEM, API_MSG_FSYNC_FAILED,
++ "last-fsync set failed on %s graph %s (%d)",
++ uuid_utoa_r(oldfd->inode->gfid, uuid1),
++ graphid_str(oldsubvol), oldsubvol->graph->id);
++ }
++
++ ret = syncop_fsync(oldsubvol, oldfd, 0, NULL, NULL, xdata, NULL);
+ DECODE_SYNCOP_ERR(ret);
+ if (ret) {
+ gf_msg(fs->volname, GF_LOG_WARNING, errno, API_MSG_FSYNC_FAILED,
+@@ -809,6 +818,9 @@ out:
+ newfd = NULL;
+ }
+
++ if (xdata)
++ dict_unref(xdata);
++
+ return newfd;
+ }
+
+diff --git a/tests/basic/afr/durability-off.t b/tests/basic/afr/durability-off.t
+index 155ffa0..6e0f18b 100644
+--- a/tests/basic/afr/durability-off.t
++++ b/tests/basic/afr/durability-off.t
+@@ -26,6 +26,8 @@ TEST $CLI volume heal $V0
+ EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+ EXPECT "^0$" echo $($CLI volume profile $V0 info | grep -w FSYNC | wc -l)
+
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+ #Test that fsyncs happen when durability is on
+ TEST $CLI volume set $V0 cluster.ensure-durability on
+ TEST $CLI volume set $V0 performance.strict-write-ordering on
+diff --git a/tests/basic/gfapi/gfapi-graph-switch-open-fd.t b/tests/basic/gfapi/gfapi-graph-switch-open-fd.t
+new file mode 100644
+index 0000000..2e666be
+--- /dev/null
++++ b/tests/basic/gfapi/gfapi-graph-switch-open-fd.t
+@@ -0,0 +1,44 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++
++cleanup;
++
++TEST glusterd
++
++TEST $CLI volume create $V0 replica 3 ${H0}:$B0/brick{0..2};
++EXPECT 'Created' volinfo_field $V0 'Status';
++
++TEST $CLI volume start $V0;
++EXPECT 'Started' volinfo_field $V0 'Status';
++
++TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0;
++TEST touch $M0/sync
++logdir=`gluster --print-logdir`
++
++TEST build_tester $(dirname $0)/gfapi-keep-writing.c -lgfapi
++
++
++#Launch a program to keep doing writes on an fd
++./$(dirname $0)/gfapi-keep-writing ${H0} $V0 $logdir/gfapi-async-calls-test.log sync &
++p=$!
++sleep 1 #Let some writes go through
++#Check if graph switch will lead to any pending markers for ever
++TEST $CLI volume set $V0 performance.quick-read off
++TEST $CLI volume set $V0 performance.io-cache off
++TEST $CLI volume set $V0 performance.stat-prefetch off
++TEST $CLI volume set $V0 performance.read-ahead off
++
++
++TEST rm -f $M0/sync #Make sure the glfd is closed
++TEST wait #Wait for background process to die
++#Goal is to check if there is permanent FOOL changelog
++sleep 5
++EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick0/glfs_test.txt trusted.afr.dirty
++EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick1/glfs_test.txt trusted.afr.dirty
++EXPECT "0x000000000000000000000000" afr_get_changelog_xattr $B0/brick2/glfs_test.txt trusted.afr.dirty
++
++cleanup_tester $(dirname $0)/gfapi-async-calls-test
++
++cleanup;
+diff --git a/tests/basic/gfapi/gfapi-keep-writing.c b/tests/basic/gfapi/gfapi-keep-writing.c
+new file mode 100644
+index 0000000..91b59ce
+--- /dev/null
++++ b/tests/basic/gfapi/gfapi-keep-writing.c
+@@ -0,0 +1,129 @@
++#include <fcntl.h>
++#include <unistd.h>
++#include <time.h>
++#include <limits.h>
++#include <string.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <errno.h>
++#include <glusterfs/api/glfs.h>
++#include <glusterfs/api/glfs-handles.h>
++
++#define LOG_ERR(msg) \
++ do { \
++ fprintf(stderr, "%s : Error (%s)\n", msg, strerror(errno)); \
++ } while (0)
++
++glfs_t *
++init_glfs(const char *hostname, const char *volname, const char *logfile)
++{
++ int ret = -1;
++ glfs_t *fs = NULL;
++
++ fs = glfs_new(volname);
++ if (!fs) {
++ LOG_ERR("glfs_new failed");
++ return NULL;
++ }
++
++ ret = glfs_set_volfile_server(fs, "tcp", hostname, 24007);
++ if (ret < 0) {
++ LOG_ERR("glfs_set_volfile_server failed");
++ goto out;
++ }
++
++ ret = glfs_set_logging(fs, logfile, 7);
++ if (ret < 0) {
++ LOG_ERR("glfs_set_logging failed");
++ goto out;
++ }
++
++ ret = glfs_init(fs);
++ if (ret < 0) {
++ LOG_ERR("glfs_init failed");
++ goto out;
++ }
++
++ ret = 0;
++out:
++ if (ret) {
++ glfs_fini(fs);
++ fs = NULL;
++ }
++
++ return fs;
++}
++
++int
++glfs_test_function(const char *hostname, const char *volname,
++ const char *logfile, const char *syncfile)
++{
++ int ret = -1;
++ int flags = O_CREAT | O_RDWR;
++ glfs_t *fs = NULL;
++ glfs_fd_t *glfd = NULL;
++ const char *buff = "This is from my prog\n";
++ const char *filename = "glfs_test.txt";
++ struct stat buf = {0};
++
++ fs = init_glfs(hostname, volname, logfile);
++ if (fs == NULL) {
++ LOG_ERR("init_glfs failed");
++ return -1;
++ }
++
++ glfd = glfs_creat(fs, filename, flags, 0644);
++ if (glfd == NULL) {
++ LOG_ERR("glfs_creat failed");
++ goto out;
++ }
++
++ while (glfs_stat(fs, syncfile, &buf) == 0) {
++ ret = glfs_write(glfd, buff, strlen(buff), flags);
++ if (ret < 0) {
++ LOG_ERR("glfs_write failed");
++ goto out;
++ }
++ }
++
++ ret = glfs_close(glfd);
++ if (ret < 0) {
++ LOG_ERR("glfs_write failed");
++ goto out;
++ }
++
++out:
++ ret = glfs_fini(fs);
++ if (ret) {
++ LOG_ERR("glfs_fini failed");
++ }
++
++ return ret;
++}
++
++int
++main(int argc, char *argv[])
++{
++ int ret = 0;
++ char *hostname = NULL;
++ char *volname = NULL;
++ char *logfile = NULL;
++ char *syncfile = NULL;
++
++ if (argc != 5) {
++ fprintf(stderr, "Invalid argument\n");
++ exit(1);
++ }
++
++ hostname = argv[1];
++ volname = argv[2];
++ logfile = argv[3];
++ syncfile = argv[4];
++
++ ret = glfs_test_function(hostname, volname, logfile, syncfile);
++ if (ret) {
++ LOG_ERR("glfs_test_function failed");
++ }
++
++ return ret;
++}
+diff --git a/xlators/cluster/afr/src/afr-inode-write.c b/xlators/cluster/afr/src/afr-inode-write.c
+index 7fcc9d4..df82b6e 100644
+--- a/xlators/cluster/afr/src/afr-inode-write.c
++++ b/xlators/cluster/afr/src/afr-inode-write.c
+@@ -2492,6 +2492,7 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ call_frame_t *transaction_frame = NULL;
+ int ret = -1;
+ int32_t op_errno = ENOMEM;
++ int8_t last_fsync = 0;
+
+ transaction_frame = copy_frame(frame);
+ if (!transaction_frame)
+@@ -2501,10 +2502,16 @@ afr_fsync(call_frame_t *frame, xlator_t *this, fd_t *fd, int32_t datasync,
+ if (!local)
+ goto out;
+
+- if (xdata)
++ if (xdata) {
+ local->xdata_req = dict_copy_with_ref(xdata, NULL);
+- else
++ if (dict_get_int8(xdata, "last-fsync", &last_fsync) == 0) {
++ if (last_fsync) {
++ local->transaction.disable_delayed_post_op = _gf_true;
++ }
++ }
++ } else {
+ local->xdata_req = dict_new();
++ }
+
+ if (!local->xdata_req)
+ goto out;
+diff --git a/xlators/cluster/afr/src/afr-transaction.c b/xlators/cluster/afr/src/afr-transaction.c
+index 8e65ae2..ffd0ab8 100644
+--- a/xlators/cluster/afr/src/afr-transaction.c
++++ b/xlators/cluster/afr/src/afr-transaction.c
+@@ -2385,8 +2385,13 @@ afr_is_delayed_changelog_post_op_needed(call_frame_t *frame, xlator_t *this,
+ goto out;
+ }
+
+- if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP)) {
+- /*Only allow writes but shard does [f]xattrops on writes, so
++ if (local->transaction.disable_delayed_post_op) {
++ goto out;
++ }
++
++ if ((local->op != GF_FOP_WRITE) && (local->op != GF_FOP_FXATTROP) &&
++ (local->op != GF_FOP_FSYNC)) {
++ /*Only allow writes/fsyncs but shard does [f]xattrops on writes, so
+ * they are fine too*/
+ goto out;
+ }
+diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
+index 18f1a6a..ff96246 100644
+--- a/xlators/cluster/afr/src/afr.h
++++ b/xlators/cluster/afr/src/afr.h
+@@ -854,7 +854,7 @@ typedef struct _afr_local {
+
+ int (*unwind)(call_frame_t *frame, xlator_t *this);
+
+- /* post-op hook */
++ gf_boolean_t disable_delayed_post_op;
+ } transaction;
+
+ syncbarrier_t barrier;
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index d0c21b4..e9974cd 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -1550,6 +1550,7 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+ xlator_t *old_target = NULL;
+ xlator_t *hashed_subvol = NULL;
+ fd_t *linkto_fd = NULL;
++ dict_t *xdata = NULL;
+
+ if (from == to) {
+ gf_msg_debug(this->name, 0,
+@@ -1868,7 +1869,15 @@ dht_migrate_file(xlator_t *this, loc_t *loc, xlator_t *from, xlator_t *to,
+
+ /* TODO: Sync the locks */
+
+- ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, NULL, NULL);
++ xdata = dict_new();
++ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
++ gf_log(this->name, GF_LOG_ERROR,
++ "%s: failed to set last-fsync flag on "
++ "%s (%s)",
++ loc->path, to->name, strerror(ENOMEM));
++ }
++
++ ret = syncop_fsync(to, dst_fd, 0, NULL, NULL, xdata, NULL);
+ if (ret) {
+ gf_log(this->name, GF_LOG_WARNING, "%s: failed to fsync on %s (%s)",
+ loc->path, to->name, strerror(-ret));
+@@ -2342,11 +2351,15 @@ out:
+
+ if (dst_fd)
+ syncop_close(dst_fd);
++
+ if (src_fd)
+ syncop_close(src_fd);
+ if (linkto_fd)
+ syncop_close(linkto_fd);
+
++ if (xdata)
++ dict_unref(xdata);
++
+ loc_wipe(&tmp_loc);
+ loc_wipe(&parent_loc);
+
+diff --git a/xlators/mount/fuse/src/fuse-bridge.c b/xlators/mount/fuse/src/fuse-bridge.c
+index fdeec49..4264fad 100644
+--- a/xlators/mount/fuse/src/fuse-bridge.c
++++ b/xlators/mount/fuse/src/fuse-bridge.c
+@@ -5559,6 +5559,7 @@ fuse_migrate_fd(xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
+ char create_in_progress = 0;
+ fuse_fd_ctx_t *basefd_ctx = NULL;
+ fd_t *oldfd = NULL;
++ dict_t *xdata = NULL;
+
+ basefd_ctx = fuse_fd_ctx_get(this, basefd);
+ GF_VALIDATE_OR_GOTO("glusterfs-fuse", basefd_ctx, out);
+@@ -5595,10 +5596,23 @@ fuse_migrate_fd(xlator_t *this, fd_t *basefd, xlator_t *old_subvol,
+ }
+
+ if (oldfd->inode->table->xl == old_subvol) {
+- if (IA_ISDIR(oldfd->inode->ia_type))
++ if (IA_ISDIR(oldfd->inode->ia_type)) {
+ ret = syncop_fsyncdir(old_subvol, oldfd, 0, NULL, NULL);
+- else
+- ret = syncop_fsync(old_subvol, oldfd, 0, NULL, NULL, NULL, NULL);
++ } else {
++ xdata = dict_new();
++ if (!xdata || dict_set_int8(xdata, "last-fsync", 1)) {
++ gf_log("glusterfs-fuse", GF_LOG_WARNING,
++ "last-fsync set failed (%s) on fd (%p)"
++ "(basefd:%p basefd-inode.gfid:%s) "
++ "(old-subvolume:%s-%d new-subvolume:%s-%d)",
++ strerror(ENOMEM), oldfd, basefd,
++ uuid_utoa(basefd->inode->gfid), old_subvol->name,
++ old_subvol->graph->id, new_subvol->name,
++ new_subvol->graph->id);
++ }
++
++ ret = syncop_fsync(old_subvol, oldfd, 0, NULL, NULL, xdata, NULL);
++ }
+
+ if (ret < 0) {
+ gf_log("glusterfs-fuse", GF_LOG_WARNING,
+@@ -5653,6 +5667,9 @@ out:
+
+ fd_unref(oldfd);
+
++ if (xdata)
++ dict_unref(xdata);
++
+ return ret;
+ }
+
+--
+1.8.3.1
+