summaryrefslogtreecommitdiff
path: root/0163-geo-rep-Fix-sync-hang-with-tarssh.patch
diff options
context:
space:
mode:
Diffstat (limited to '0163-geo-rep-Fix-sync-hang-with-tarssh.patch')
-rw-r--r--0163-geo-rep-Fix-sync-hang-with-tarssh.patch255
1 files changed, 255 insertions, 0 deletions
diff --git a/0163-geo-rep-Fix-sync-hang-with-tarssh.patch b/0163-geo-rep-Fix-sync-hang-with-tarssh.patch
new file mode 100644
index 0000000..9b90128
--- /dev/null
+++ b/0163-geo-rep-Fix-sync-hang-with-tarssh.patch
@@ -0,0 +1,255 @@
+From 29ec87484b1ee3ad6417c37726db8aa9296f3a83 Mon Sep 17 00:00:00 2001
+From: Kotresh HR <khiremat@redhat.com>
+Date: Wed, 8 May 2019 11:26:06 +0530
+Subject: [PATCH 163/169] geo-rep: Fix sync hang with tarssh
+
+Problem:
+Geo-rep sync hangs when tarssh is used as sync
+engine at heavy workload.
+
+Analysis and Root cause:
+It's found out that the tar process was hung.
+When debugged further, it's found out that stderr
+buffer of tar process on master was full i.e., 64k.
+When the buffer was copied to a file from /proc/pid/fd/2,
+the hang is resolved.
+
+This can happen when files picked by tar process
+to sync doesn't exist on master anymore. If this count
+increases around 1k, the stderr buffer is filled up.
+
+Fix:
+The tar process is executed using Popen with stderr as PIPE.
+The final execution is something like below.
+
+tar | ssh <args> root@slave tar --overwrite -xf - -C <path>
+
+It was waiting on ssh process first using communicate() and then tar.
+Note that communicate() reads stdout and stderr. So when stderr of tar
+process is filled up, there is no one to read until untar via ssh is
+completed. This can't happen and leads to deadlock.
+Hence we should be waiting on both process parallely, so that stderr is
+read on both processes.
+
+Backport of:
+ > Patch: https://review.gluster.org/22684
+ > Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
+ > BUG: 1707728
+ > Signed-off-by: Kotresh HR <khiremat@redhat.com>
+
+Change-Id: I609c7cc5c07e210c504771115b4d551a2e891adf
+fixes: bz#1708116
+Signed-off-by: Kotresh HR <khiremat@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/172395
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
+---
+ geo-replication/syncdaemon/resource.py | 22 ++++--
+ tests/00-geo-rep/georep-stderr-hang.t | 128 +++++++++++++++++++++++++++++++++
+ tests/geo-rep.rc | 17 +++++
+ 3 files changed, 163 insertions(+), 4 deletions(-)
+ create mode 100644 tests/00-geo-rep/georep-stderr-hang.t
+
+diff --git a/geo-replication/syncdaemon/resource.py b/geo-replication/syncdaemon/resource.py
+index 522279b..b16db60 100644
+--- a/geo-replication/syncdaemon/resource.py
++++ b/geo-replication/syncdaemon/resource.py
+@@ -1540,15 +1540,29 @@ class SSH(object):
+
+ p0.stdin.close()
+ p0.stdout.close() # Allow p0 to receive a SIGPIPE if p1 exits.
+- # wait for tar to terminate, collecting any errors, further
+- # waiting for transfer to complete
+- _, stderr1 = p1.communicate()
+
+ # stdin and stdout of p0 is already closed, Reset to None and
+ # wait for child process to complete
+ p0.stdin = None
+ p0.stdout = None
+- p0.communicate()
++
++ def wait_for_tar(p0):
++ _, stderr = p0.communicate()
++ if log_err:
++ for errline in stderr.strip().split("\n")[:-1]:
++ if "No such file or directory" not in errline:
++ logging.error(lf("SYNC Error",
++ sync_engine="Tarssh",
++ error=errline))
++
++ t = syncdutils.Thread(target=wait_for_tar, args=(p0, ))
++ # wait for tar to terminate, collecting any errors, further
++ # waiting for transfer to complete
++ t.start()
++
++ # wait for ssh process
++ _, stderr1 = p1.communicate()
++ t.join()
+
+ if log_err:
+ for errline in stderr1.strip().split("\n")[:-1]:
+diff --git a/tests/00-geo-rep/georep-stderr-hang.t b/tests/00-geo-rep/georep-stderr-hang.t
+new file mode 100644
+index 0000000..496f0e6
+--- /dev/null
++++ b/tests/00-geo-rep/georep-stderr-hang.t
+@@ -0,0 +1,128 @@
++#!/bin/bash
++
++. $(dirname $0)/../include.rc
++. $(dirname $0)/../volume.rc
++. $(dirname $0)/../geo-rep.rc
++. $(dirname $0)/../env.rc
++
++SCRIPT_TIMEOUT=500
++
++AREQUAL_PATH=$(dirname $0)/../utils
++test "`uname -s`" != "Linux" && {
++ CFLAGS="$CFLAGS -lintl";
++}
++build_tester $AREQUAL_PATH/arequal-checksum.c $CFLAGS
++
++### Basic Tests with Distribute Replicate volumes
++
++##Cleanup and start glusterd
++cleanup;
++TEST glusterd;
++TEST pidof glusterd
++
++
++##Variables
++GEOREP_CLI="$CLI volume geo-replication"
++master=$GMV0
++SH0="127.0.0.1"
++slave=${SH0}::${GSV0}
++num_active=2
++num_passive=2
++master_mnt=$M0
++slave_mnt=$M1
++
++############################################################
++#SETUP VOLUMES AND GEO-REPLICATION
++############################################################
++
++##create_and_start_master_volume
++TEST $CLI volume create $GMV0 $H0:$B0/${GMV0}1;
++TEST $CLI volume start $GMV0
++
++##create_and_start_slave_volume
++TEST $CLI volume create $GSV0 $H0:$B0/${GSV0}1;
++TEST $CLI volume start $GSV0
++TEST $CLI volume set $GSV0 performance.stat-prefetch off
++TEST $CLI volume set $GSV0 performance.quick-read off
++TEST $CLI volume set $GSV0 performance.readdir-ahead off
++TEST $CLI volume set $GSV0 performance.read-ahead off
++
++##Mount master
++TEST glusterfs -s $H0 --volfile-id $GMV0 $M0
++
++##Mount slave
++TEST glusterfs -s $H0 --volfile-id $GSV0 $M1
++
++############################################################
++#BASIC GEO-REPLICATION TESTS
++############################################################
++
++TEST create_georep_session $master $slave
++EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Created"
++
++#Config gluster-command-dir
++TEST $GEOREP_CLI $master $slave config gluster-command-dir ${GLUSTER_CMD_DIR}
++
++#Config gluster-command-dir
++TEST $GEOREP_CLI $master $slave config slave-gluster-command-dir ${GLUSTER_CMD_DIR}
++
++#Set changelog roll-over time to 45 secs
++TEST $CLI volume set $GMV0 changelog.rollover-time 45
++
++#Wait for common secret pem file to be created
++EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_common_secret_file
++
++#Verify the keys are distributed
++EXPECT_WITHIN $GEO_REP_TIMEOUT 0 check_keys_distributed
++
++#Set sync-jobs to 1
++TEST $GEOREP_CLI $master $slave config sync-jobs 1
++
++#Start_georep
++TEST $GEOREP_CLI $master $slave start
++
++touch $M0
++EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
++EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Changelog Crawl"
++
++#Check History Crawl.
++TEST $GEOREP_CLI $master $slave stop
++TEST create_data_hang "rsync_hang"
++TEST create_data "history_rsync"
++TEST $GEOREP_CLI $master $slave start
++EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
++
++#Verify arequal for whole volume
++EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
++
++#Stop Geo-rep
++TEST $GEOREP_CLI $master $slave stop
++
++#Config tarssh as sync-engine
++TEST $GEOREP_CLI $master $slave config sync-method tarssh
++
++#Create tarssh hang data
++TEST create_data_hang "tarssh_hang"
++TEST create_data "history_tar"
++
++TEST $GEOREP_CLI $master $slave start
++EXPECT_WITHIN $GEO_REP_TIMEOUT 1 check_status_num_rows "Active"
++
++#Verify arequal for whole volume
++EXPECT_WITHIN $GEO_REP_TIMEOUT "x0" arequal_checksum ${master_mnt} ${slave_mnt}
++
++#Stop Geo-rep
++TEST $GEOREP_CLI $master $slave stop
++
++#Delete Geo-rep
++TEST $GEOREP_CLI $master $slave delete
++
++#Cleanup are-equal binary
++TEST rm $AREQUAL_PATH/arequal-checksum
++
++#Cleanup authorized keys
++sed -i '/^command=.*SSH_ORIGINAL_COMMAND#.*/d' ~/.ssh/authorized_keys
++sed -i '/^command=.*gsyncd.*/d' ~/.ssh/authorized_keys
++
++cleanup;
++#G_TESTDEF_TEST_STATUS_NETBSD7=BAD_TEST,BUG=000000
+diff --git a/tests/geo-rep.rc b/tests/geo-rep.rc
+index 2035b9f..e4f014e 100644
+--- a/tests/geo-rep.rc
++++ b/tests/geo-rep.rc
+@@ -101,6 +101,23 @@ function create_data()
+ chown 1000:1000 ${master_mnt}/${prefix}_chown_f1_ಸಂತಸ
+ }
+
++function create_data_hang()
++{
++ prefix=$1
++ mkdir ${master_mnt}/${prefix}
++ cd ${master_mnt}/${prefix}
++ # ~1k files is required with 1 sync-job and hang happens if
++ # stderr buffer of tar/ssh executed with Popen is full (i.e., 64k).
++ # 64k is hit when ~800 files were not found while syncing data
++ # from master. So around 1k files is required to hit the condition.
++ for i in {1..1000}
++ do
++ echo "test data" > file$i
++ mv -f file$i file
++ done
++ cd -
++}
++
+ function chown_file_ok()
+ {
+ local file_owner=$(stat --format "%u:%g" "$1")
+--
+1.8.3.1
+