Diffstat (limited to '0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch')
-rw-r--r-- | 0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch | 287 |
1 files changed, 287 insertions, 0 deletions
diff --git a/0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch b/0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch
new file mode 100644
index 0000000..1ff767d
--- /dev/null
+++ b/0178-Cluster-afr-Don-t-treat-all-bricks-having-metadata-p.patch
@@ -0,0 +1,287 @@
+From 307074330db6e9f14941dfbabbe6f299cf841533 Mon Sep 17 00:00:00 2001
+From: karthik-us <ksubrahm@redhat.com>
+Date: Mon, 10 Jun 2019 23:58:16 +0530
+Subject: [PATCH 178/178] Cluster/afr: Don't treat all bricks having metadata
+ pending as split-brain
+
+Backport of: https://review.gluster.org/#/c/glusterfs/+/22831/
+
+Problem:
+We currently don't have a roll-back/undoing of post-ops if quorum is not met.
+Though the FOP is still unwound with failure, the xattrs remain on the disk.
+Due to these partial post-ops and partial heals (healing only when 2 bricks
+are up), we can end up in metadata split-brain purely from the afr xattrs
+point of view i.e each brick is blamed by atleast one of the others for
+metadata. These scenarios are hit when there is frequent connect/disconnect
+of the client/shd to the bricks.
+
+Fix:
+Pick a source based on the xattr values. If 2 bricks blame one, the blamed
+one must be treated as sink. If there is no majority, all are sources. Once
+we pick a source, self-heal will then do the heal instead of erroring out
+due to split-brain.
+This patch also adds restriction of all the bricks to be up to perform
+metadata heal to avoid any metadata loss.
+
+Removed the test case tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+as it was doing metadata heal even when only 2 of 3 bricks were up.
+
+Change-Id: I02064ecb7d68d498f75a353af64f75249a633508
+fixes: bz#1715438
+Signed-off-by: karthik-us <ksubrahm@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/172935
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ .../bug-1468279-source-not-blaming-sinks.t         |  64 ----------
+ .../bug-1717819-metadata-split-brain-detection.t   | 130 +++++++++++++++++++++
+ xlators/cluster/afr/src/afr-self-heal-common.c     |   4 +-
+ xlators/cluster/afr/src/afr-self-heal-metadata.c   |   2 +-
+ 4 files changed, 133 insertions(+), 67 deletions(-)
+ delete mode 100644 tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+ create mode 100644 tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
+
+diff --git a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t b/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
+deleted file mode 100644
+index 054a4ad..0000000
+--- a/tests/bugs/replicate/bug-1468279-source-not-blaming-sinks.t
++++ /dev/null
+@@ -1,64 +0,0 @@
+-#!/bin/bash
+-. $(dirname $0)/../../include.rc
+-. $(dirname $0)/../../volume.rc
+-cleanup;
+-
+-TEST glusterd
+-TEST pidof glusterd
+-TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+-TEST $CLI volume start $V0
+-TEST $CLI volume set $V0 cluster.self-heal-daemon off
+-TEST $CLI volume set $V0 cluster.metadata-self-heal off
+-TEST $GFS --volfile-id=$V0 --volfile-server=$H0 --attribute-timeout=0 --entry-timeout=0 $M0;
+-TEST touch $M0/file
+-
+-# Kill B1, create a pending metadata heal.
+-TEST kill_brick $V0 $H0 $B0/${V0}0
+-TEST setfattr -n user.xattr -v value1 $M0/file
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
+-
+-# Kill B2, heal from B3 to B1.
+-TEST $CLI volume start $V0 force
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 0
+-TEST kill_brick $V0 $H0 $B0/${V0}1
+-TEST $CLI volume set $V0 cluster.self-heal-daemon on
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+-$CLI volume heal $V0
+-EXPECT_WITHIN $HEAL_TIMEOUT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-0 "metadata"
+-TEST $CLI volume set $V0 cluster.self-heal-daemon off
+-
+-# Create another pending metadata heal.
+-TEST setfattr -n user.xattr -v value2 $M0/file
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
+-
+-# Kill B1, heal from B3 to B2
+-TEST $CLI volume start $V0 force
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+-TEST kill_brick $V0 $H0 $B0/${V0}0
+-TEST $CLI volume set $V0 cluster.self-heal-daemon on
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+-$CLI volume heal $V0
+-EXPECT_WITHIN $HEAL_TIMEOUT "00000000" afr_get_specific_changelog_xattr $B0/${V0}2/file trusted.afr.$V0-client-1 "metadata"
+-TEST $CLI volume set $V0 cluster.self-heal-daemon off
+-
+-# ALL bricks up again.
+-TEST $CLI volume start $V0 force
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 1
+-EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" afr_child_up_status $V0 1
+-# B1 and B2 blame each other, B3 doesn't blame anyone.
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}0/file
+-EXPECT "0000000000000010000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}1/file
+-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-0 $B0/${V0}2/file
+-EXPECT "0000000000000000000000" get_hex_xattr trusted.afr.$V0-client-1 $B0/${V0}2/file
+-TEST $CLI volume set $V0 cluster.self-heal-daemon on
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 0
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 1
+-EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status_in_shd $V0 2
+-TEST $CLI volume heal $V0
+-EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
+-
+-cleanup;
+diff --git a/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
+new file mode 100644
+index 0000000..94b8bf3
+--- /dev/null
++++ b/tests/bugs/replicate/bug-1717819-metadata-split-brain-detection.t
+@@ -0,0 +1,130 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++. $(dirname $0)/../../afr.rc
++
++cleanup;
++
++## Start and create a volume
++TEST glusterd;
++TEST pidof glusterd;
++TEST $CLI volume info;
++
++TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2};
++TEST $CLI volume start $V0;
++EXPECT 'Started' volinfo_field $V0 'Status';
++TEST $CLI volume heal $V0 disable
++TEST $GFS --volfile-id=/$V0 --volfile-server=$H0 $M0
++
++###############################################################################
++# Case of 2 bricks blaming the third and the third blaming the other two.
++
++TEST mkdir $M0/dir
++
++# B0 and B2 must blame B1
++TEST kill_brick $V0 $H0 $B0/$V0"1"
++TEST setfattr -n user.metadata -v 1 $M0/dir
++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir trusted.afr.$V0-client-1 metadata
++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir trusted.afr.$V0-client-1 metadata
++CLIENT_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $M0/dir)
++
++# B1 must blame B0 and B2
++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir
++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir
++
++# Launch heal
++TEST $CLI volume start $V0 force
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
++TEST $CLI volume heal $V0 enable
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
++TEST $CLI volume heal $V0
++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
++
++B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir)
++B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir)
++B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir)
++
++TEST [ "$CLIENT_XATTR" == "$B0_XATTR" ]
++TEST [ "$CLIENT_XATTR" == "$B1_XATTR" ]
++TEST [ "$CLIENT_XATTR" == "$B2_XATTR" ]
++TEST setfattr -x user.metadata $M0/dir
++
++###############################################################################
++# Case of each brick blaming the next one in a cyclic manner
++
++TEST $CLI volume heal $V0 disable
++TEST `echo "hello" >> $M0/dir/file`
++# Mark cyclic xattrs and modify metadata directly on the bricks.
++setfattr -n trusted.afr.$V0-client-1 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"2"/dir/file
++
++setfattr -n user.metadata -v 1 $B0/$V0"0"/dir/file
++setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
++setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
++
++# Add entry to xattrop dir to trigger index heal.
++xattrop_dir0=$(afr_get_index_path $B0/$V0"0")
++base_entry_b0=`ls $xattrop_dir0`
++gfid_str=$(gf_gfid_xattr_to_str $(gf_get_gfid_xattr $B0/$V0"0"/dir/file))
++ln $xattrop_dir0/$base_entry_b0 $xattrop_dir0/$gfid_str
++EXPECT_WITHIN $HEAL_TIMEOUT "^1$" get_pending_heal_count $V0
++
++# Launch heal
++TEST $CLI volume heal $V0 enable
++TEST $CLI volume heal $V0
++EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
++
++B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
++B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
++B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
++
++TEST [ "$B0_XATTR" == "$B1_XATTR" ]
++TEST [ "$B0_XATTR" == "$B2_XATTR" ]
++TEST rm -f $M0/dir/file
++
++###############################################################################
++# Case of 2 bricks having quorum blaming and the other having only one blaming.
++
++TEST $CLI volume heal $V0 disable
++TEST `echo "hello" >> $M0/dir/file`
++# B0 and B2 must blame B1
++TEST kill_brick $V0 $H0 $B0/$V0"1"
++TEST setfattr -n user.metadata -v 1 $M0/dir/file
++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}0/dir/file trusted.afr.$V0-client-1 metadata
++EXPECT "00000001" afr_get_specific_changelog_xattr $B0/${V0}2/dir/file trusted.afr.$V0-client-1 metadata
++
++# B1 must blame B0 and B2
++setfattr -n trusted.afr.$V0-client-0 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"1"/dir/file
++
++# B0 must blame B2
++setfattr -n trusted.afr.$V0-client-2 -v 0x000000000000000100000000 $B0/$V0"0"/dir/file
++
++# Modify the metadata directly on the bricks B1 & B2.
++setfattr -n user.metadata -v 2 $B0/$V0"1"/dir/file
++setfattr -n user.metadata -v 3 $B0/$V0"2"/dir/file
++
++# Launch heal
++TEST $CLI volume start $V0 force
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^1$" brick_up_status $V0 $H0 $B0/${V0}1
++TEST $CLI volume heal $V0 enable
++EXPECT_WITHIN $PROCESS_UP_TIMEOUT "^Y$" glustershd_up_status
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 0
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 1
++EXPECT_WITHIN $CHILD_UP_TIMEOUT "^1$" afr_child_up_status_in_shd $V0 2
++
++B0_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}0/dir/file)
++B1_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}1/dir/file)
++B2_XATTR=$(getfattr -n 'user.metadata' --absolute-names --only-values $B0/${V0}2/dir/file)
++
++TEST [ "$B0_XATTR" == "$B1_XATTR" ]
++TEST [ "$B0_XATTR" == "$B2_XATTR" ]
++
++###############################################################################
++
++cleanup
+diff --git a/xlators/cluster/afr/src/afr-self-heal-common.c b/xlators/cluster/afr/src/afr-self-heal-common.c
+index 595bed4..5157e7d 100644
+--- a/xlators/cluster/afr/src/afr-self-heal-common.c
++++ b/xlators/cluster/afr/src/afr-self-heal-common.c
+@@ -1590,7 +1590,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+         }
+     }
+ 
+-    if (type == AFR_DATA_TRANSACTION) {
++    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION) {
+         min_participants = priv->child_count;
+     } else {
+         min_participants = AFR_SH_MIN_PARTICIPANTS;
+@@ -1656,7 +1656,7 @@ afr_selfheal_find_direction(call_frame_t *frame, xlator_t *this,
+         }
+     }
+ 
+-    if (type == AFR_DATA_TRANSACTION)
++    if (type == AFR_DATA_TRANSACTION || type == AFR_METADATA_TRANSACTION)
+         afr_selfheal_post_op_failure_accounting(priv, accused, sources,
+                                                 locked_on);
+ 
+diff --git a/xlators/cluster/afr/src/afr-self-heal-metadata.c b/xlators/cluster/afr/src/afr-self-heal-metadata.c
+index ba43341..ecfa791 100644
+--- a/xlators/cluster/afr/src/afr-self-heal-metadata.c
++++ b/xlators/cluster/afr/src/afr-self-heal-metadata.c
+@@ -398,7 +398,7 @@ afr_selfheal_metadata(call_frame_t *frame, xlator_t *this, inode_t *inode)
+     ret = afr_selfheal_inodelk(frame, this, inode, this->name, LLONG_MAX - 1, 0,
+                                data_lock);
+     {
+-        if (ret < AFR_SH_MIN_PARTICIPANTS) {
++        if (ret < priv->child_count) {
+             ret = -ENOTCONN;
+             goto unlock;
+         }
+-- 
+1.8.3.1
+
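For readers skimming the patch, here is a minimal, hypothetical C sketch of the source/sink selection rule described in the commit message's Fix section. It is not the actual AFR code; the function name pick_metadata_sources, the blame-matrix layout, and the fixed replica count are invented for illustration. The idea: count how many bricks blame each brick via its trusted.afr.<vol>-client-* metadata counters, treat any brick blamed by a majority of the replicas as a sink, and fall back to keeping every brick a source when no majority exists, rather than declaring split-brain. The companion hunks above (min_participants = priv->child_count and ret < priv->child_count) additionally require all bricks to be up before a metadata heal proceeds.

/*
 * Illustrative sketch only -- not the actual AFR implementation.
 * blames[i][j] != 0 means brick i carries a non-zero metadata pending
 * counter against brick j (i.e. brick i blames brick j).
 */
#include <stdbool.h>
#include <stdio.h>

#define CHILDREN 3 /* replica 3 volume assumed for the example */

static void
pick_metadata_sources(int blames[CHILDREN][CHILDREN], bool sources[CHILDREN],
                      bool sinks[CHILDREN])
{
    int accused_by[CHILDREN] = {0};
    bool majority_found = false;
    int i, j;

    /* Count how many bricks blame each brick. */
    for (i = 0; i < CHILDREN; i++)
        for (j = 0; j < CHILDREN; j++)
            if (i != j && blames[i][j])
                accused_by[j]++;

    for (j = 0; j < CHILDREN; j++)
        if (accused_by[j] > CHILDREN / 2)
            majority_found = true;

    for (j = 0; j < CHILDREN; j++) {
        /* A brick blamed by a majority becomes a sink; if no brick has a
         * majority of blames, every brick stays a source. */
        sinks[j] = majority_found && (accused_by[j] > CHILDREN / 2);
        sources[j] = !sinks[j];
    }
}

int
main(void)
{
    /* First test case above: B0 and B2 blame B1, B1 blames B0 and B2. */
    int blames[CHILDREN][CHILDREN] = {{0, 1, 0}, {1, 0, 1}, {0, 1, 0}};
    bool sources[CHILDREN], sinks[CHILDREN];
    int i;

    pick_metadata_sources(blames, sources, sinks);
    for (i = 0; i < CHILDREN; i++)
        printf("brick %d: %s\n", i, sinks[i] ? "sink" : "source");
    return 0;
}

Compiled with a plain cc invocation, the example reports brick 1 as the only sink for the first test scenario, matching the expectation that heal proceeds from B0/B2 to B1 instead of erroring out as split-brain; with the cyclic blame pattern from the second scenario it reports all bricks as sources.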