diff options
Diffstat (limited to '0362-write-behind-fix-data-corruption.patch')
-rw-r--r-- | 0362-write-behind-fix-data-corruption.patch | 454 |
1 files changed, 454 insertions, 0 deletions
diff --git a/0362-write-behind-fix-data-corruption.patch b/0362-write-behind-fix-data-corruption.patch new file mode 100644 index 0000000..aeb7242 --- /dev/null +++ b/0362-write-behind-fix-data-corruption.patch @@ -0,0 +1,454 @@ +From 48f6929590157d9a1697e11c02441207afdc1bed Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez <xhernandez@redhat.com> +Date: Fri, 27 Mar 2020 23:56:15 +0100 +Subject: [PATCH 362/362] write-behind: fix data corruption + +There was a bug in write-behind that allowed a previous completed write +to overwrite the overlapping region of data from a future write. + +Suppose we want to send three writes (W1, W2 and W3). W1 and W2 are +sequential, and W3 writes at the same offset of W2: + + W2.offset = W3.offset = W1.offset + W1.size + +Both W1 and W2 are sent in parallel. W3 is only sent after W2 completes. +So W3 should *always* overwrite the overlapping part of W2. + +Suppose write-behind processes the requests from 2 concurrent threads: + + Thread 1 Thread 2 + + <received W1> + <received W2> + wb_enqueue_tempted(W1) + /* W1 is assigned gen X */ + wb_enqueue_tempted(W2) + /* W2 is assigned gen X */ + + wb_process_queue() + __wb_preprocess_winds() + /* W1 and W2 are sequential and all + * other requisites are met to merge + * both requests. */ + __wb_collapse_small_writes(W1, W2) + __wb_fulfill_request(W2) + + __wb_pick_unwinds() -> W2 + /* In this case, since the request is + * already fulfilled, wb_inode->gen + * is not updated. */ + + wb_do_unwinds() + STACK_UNWIND(W2) + + /* The application has received the + * result of W2, so it can send W3. */ + <received W3> + + wb_enqueue_tempted(W3) + /* W3 is assigned gen X */ + + wb_process_queue() + /* Here we have W1 (which contains + * the conflicting W2) and W3 with + * same gen, so they are interpreted + * as concurrent writes that do not + * conflict. */ + __wb_pick_winds() -> W3 + + wb_do_winds() + STACK_WIND(W3) + + wb_process_queue() + /* Eventually W1 will be + * ready to be sent */ + __wb_pick_winds() -> W1 + __wb_pick_unwinds() -> W1 + /* Here wb_inode->gen is + * incremented. */ + + wb_do_unwinds() + STACK_UNWIND(W1) + + wb_do_winds() + STACK_WIND(W1) + +So, as we can see, W3 is sent before W1, which shouldn't happen. + +The problem is that wb_inode->gen is only incremented for requests that +have not been fulfilled but, after a merge, the request is marked as +fulfilled even though it has not been sent to the brick. This allows +that future requests are assigned to the same generation, which could +be internally reordered. + +Solution: + +Increment wb_inode->gen before any unwind, even if it's for a fulfilled +request. + +Special thanks to Stefan Ring for writing a reproducer that has been +crucial to identify the issue. + +Upstream patch: +> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24263 +> Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45 +> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> +> Fixes: #884 + +Change-Id: Id4ab0f294a09aca9a863ecaeef8856474662ab45 +Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> +BUG: 1819059 +Reviewed-on: https://code.engineering.redhat.com/gerrit/196250 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + tests/bugs/write-behind/issue-884.c | 267 +++++++++++++++++++++ + tests/bugs/write-behind/issue-884.t | 40 +++ + .../performance/write-behind/src/write-behind.c | 4 +- + 3 files changed, 309 insertions(+), 2 deletions(-) + create mode 100644 tests/bugs/write-behind/issue-884.c + create mode 100755 tests/bugs/write-behind/issue-884.t + +diff --git a/tests/bugs/write-behind/issue-884.c b/tests/bugs/write-behind/issue-884.c +new file mode 100644 +index 0000000..e9c33b3 +--- /dev/null ++++ b/tests/bugs/write-behind/issue-884.c +@@ -0,0 +1,267 @@ ++ ++#define _GNU_SOURCE ++ ++#include <stdlib.h> ++#include <stdio.h> ++#include <string.h> ++#include <time.h> ++#include <assert.h> ++#include <errno.h> ++#include <sys/types.h> ++#include <sys/stat.h> ++#include <pthread.h> ++ ++#include <glusterfs/api/glfs.h> ++ ++/* Based on a reproducer by Stefan Ring. It seems to be quite sensible to any ++ * timing modification, so the code has been maintained as is, only with minor ++ * changes. */ ++ ++struct glfs *glfs; ++ ++pthread_mutex_t the_mutex = PTHREAD_MUTEX_INITIALIZER; ++pthread_cond_t the_cond = PTHREAD_COND_INITIALIZER; ++ ++typedef struct _my_aiocb { ++ int64_t size; ++ volatile int64_t seq; ++ int which; ++} my_aiocb; ++ ++typedef struct _worker_data { ++ my_aiocb cb; ++ struct iovec iov; ++ int64_t offset; ++} worker_data; ++ ++typedef struct { ++ worker_data wdata[2]; ++ ++ volatile unsigned busy; ++} all_data_t; ++ ++all_data_t all_data; ++ ++static void ++completion_fnc(struct glfs_fd *fd, ssize_t ret, struct glfs_stat *pre, ++ struct glfs_stat *post, void *arg) ++{ ++ void *the_thread; ++ my_aiocb *cb = (my_aiocb *)arg; ++ long seq = cb->seq; ++ ++ assert(ret == cb->size); ++ ++ pthread_mutex_lock(&the_mutex); ++ pthread_cond_broadcast(&the_cond); ++ ++ all_data.busy &= ~(1 << cb->which); ++ cb->seq = -1; ++ ++ the_thread = (void *)pthread_self(); ++ printf("worker %d is done from thread %p, seq %ld!\n", cb->which, ++ the_thread, seq); ++ ++ pthread_mutex_unlock(&the_mutex); ++} ++ ++static void ++init_wdata(worker_data *data, int which) ++{ ++ data->cb.which = which; ++ data->cb.seq = -1; ++ ++ data->iov.iov_base = malloc(1024 * 1024); ++ memset(data->iov.iov_base, 6, ++ 1024 * 1024); /* tail part never overwritten */ ++} ++ ++static void ++init() ++{ ++ all_data.busy = 0; ++ ++ init_wdata(&all_data.wdata[0], 0); ++ init_wdata(&all_data.wdata[1], 1); ++} ++ ++static void ++do_write(struct glfs_fd *fd, int content, int size, int64_t seq, ++ worker_data *wdata, const char *name) ++{ ++ int ret; ++ ++ wdata->cb.size = size; ++ wdata->cb.seq = seq; ++ ++ if (content >= 0) ++ memset(wdata->iov.iov_base, content, size); ++ wdata->iov.iov_len = size; ++ ++ pthread_mutex_lock(&the_mutex); ++ printf("(%d) dispatching write \"%s\", offset %lx, len %x, seq %ld\n", ++ wdata->cb.which, name, (long)wdata->offset, size, (long)seq); ++ pthread_mutex_unlock(&the_mutex); ++ ret = glfs_pwritev_async(fd, &wdata->iov, 1, wdata->offset, 0, ++ completion_fnc, &wdata->cb); ++ assert(ret >= 0); ++} ++ ++#define IDLE 0 // both workers must be idle ++#define ANY 1 // use any worker, other one may be busy ++ ++int ++get_worker(int waitfor, int64_t excl_seq) ++{ ++ int which; ++ ++ pthread_mutex_lock(&the_mutex); ++ ++ while (waitfor == IDLE && (all_data.busy & 3) != 0 || ++ waitfor == ANY && ++ ((all_data.busy & 3) == 3 || ++ excl_seq >= 0 && (all_data.wdata[0].cb.seq == excl_seq || ++ all_data.wdata[1].cb.seq == excl_seq))) ++ pthread_cond_wait(&the_cond, &the_mutex); ++ ++ if (!(all_data.busy & 1)) ++ which = 0; ++ else ++ which = 1; ++ ++ all_data.busy |= (1 << which); ++ ++ pthread_mutex_unlock(&the_mutex); ++ ++ return which; ++} ++ ++static int ++doit(struct glfs_fd *fd) ++{ ++ int ret; ++ int64_t seq = 0; ++ int64_t offset = 0; // position in file, in blocks ++ int64_t base = 0x1000; // where to place the data, in blocks ++ ++ int async_mode = ANY; ++ ++ init(); ++ ++ for (;;) { ++ int which; ++ worker_data *wdata; ++ ++ // for growing to the first offset ++ for (;;) { ++ int gap = base + 0x42 - offset; ++ if (!gap) ++ break; ++ if (gap > 80) ++ gap = 80; ++ ++ which = get_worker(IDLE, -1); ++ wdata = &all_data.wdata[which]; ++ ++ wdata->offset = offset << 9; ++ do_write(fd, 0, gap << 9, seq++, wdata, "gap-filling"); ++ ++ offset += gap; ++ } ++ ++ // 8700 ++ which = get_worker(IDLE, -1); ++ wdata = &all_data.wdata[which]; ++ ++ wdata->offset = (base + 0x42) << 9; ++ do_write(fd, 1, 62 << 9, seq++, wdata, "!8700"); ++ ++ // 8701 ++ which = get_worker(IDLE, -1); ++ wdata = &all_data.wdata[which]; ++ ++ wdata->offset = (base + 0x42) << 9; ++ do_write(fd, 2, 55 << 9, seq++, wdata, "!8701"); ++ ++ // 8702 ++ which = get_worker(async_mode, -1); ++ wdata = &all_data.wdata[which]; ++ ++ wdata->offset = (base + 0x79) << 9; ++ do_write(fd, 3, 54 << 9, seq++, wdata, "!8702"); ++ ++ // 8703 ++ which = get_worker(async_mode, -1); ++ wdata = &all_data.wdata[which]; ++ ++ wdata->offset = (base + 0xaf) << 9; ++ do_write(fd, 4, 81 << 9, seq++, wdata, "!8703"); ++ ++ // 8704 ++ // this writes both 5s and 6s ++ // the range of 5s is the one that overwrites 8703 ++ ++ which = get_worker(async_mode, seq - 1); ++ wdata = &all_data.wdata[which]; ++ ++ memset(wdata->iov.iov_base, 5, 81 << 9); ++ wdata->offset = (base + 0xaf) << 9; ++ do_write(fd, -1, 1623 << 9, seq++, wdata, "!8704"); ++ ++ offset = base + 0x706; ++ base += 0x1000; ++ if (base >= 0x100000) ++ break; ++ } ++ ++ printf("done!\n"); ++ fflush(stdout); ++ ++ pthread_mutex_lock(&the_mutex); ++ ++ while ((all_data.busy & 3) != 0) ++ pthread_cond_wait(&the_cond, &the_mutex); ++ ++ pthread_mutex_unlock(&the_mutex); ++ ++ ret = glfs_close(fd); ++ assert(ret >= 0); ++ /* ++ ret = glfs_fini(glfs); ++ assert(ret >= 0); ++ */ ++ return 0; ++} ++ ++int ++main(int argc, char *argv[]) ++{ ++ int ret; ++ int open_flags = O_RDWR | O_DIRECT | O_TRUNC; ++ struct glfs_fd *fd; ++ ++ glfs = glfs_new(argv[1]); ++ if (!glfs) { ++ printf("glfs_new!\n"); ++ goto out; ++ } ++ ret = glfs_set_volfile_server(glfs, "tcp", "localhost", 24007); ++ if (ret < 0) { ++ printf("set_volfile!\n"); ++ goto out; ++ } ++ ret = glfs_init(glfs); ++ if (ret) { ++ printf("init!\n"); ++ goto out; ++ } ++ fd = glfs_open(glfs, argv[2], open_flags); ++ if (!fd) { ++ printf("open!\n"); ++ goto out; ++ } ++ srand(time(NULL)); ++ return doit(fd); ++out: ++ return 1; ++} +diff --git a/tests/bugs/write-behind/issue-884.t b/tests/bugs/write-behind/issue-884.t +new file mode 100755 +index 0000000..2bcf7d1 +--- /dev/null ++++ b/tests/bugs/write-behind/issue-884.t +@@ -0,0 +1,40 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++ ++# This test tries to detect a race condition in write-behind. It's based on a ++# reproducer written by Stefan Ring that is able to hit it sometimes. On my ++# system, it happened around 10% of the runs. This means that if this bug ++# appears again, this test will fail once every 10 runs. Most probably this ++# failure will be hidden by the automatic test retry of the testing framework. ++# ++# Please, if this test fails, it needs to be analyzed in detail. ++ ++function run() { ++ "${@}" >/dev/null ++} ++ ++cleanup ++ ++TEST glusterd ++TEST pidof glusterd ++ ++TEST $CLI volume create $V0 $H0:$B0/$V0 ++# This makes it easier to hit the issue ++TEST $CLI volume set $V0 client-log-level TRACE ++TEST $CLI volume start $V0 ++ ++TEST $GFS --volfile-server=$H0 --volfile-id=$V0 $M0 ++ ++build_tester $(dirname $0)/issue-884.c -lgfapi ++ ++TEST touch $M0/testfile ++ ++# This program generates a file of 535694336 bytes with a fixed pattern ++TEST run $(dirname $0)/issue-884 $V0 testfile ++ ++# This is the md5sum of the expected pattern without corruption ++EXPECT "ad105f9349345a70fc697632cbb5eec8" echo "$(md5sum $B0/$V0/testfile | awk '{ print $1; }')" ++ ++cleanup +diff --git a/xlators/performance/write-behind/src/write-behind.c b/xlators/performance/write-behind/src/write-behind.c +index 70e281a..90a0bcf 100644 +--- a/xlators/performance/write-behind/src/write-behind.c ++++ b/xlators/performance/write-behind/src/write-behind.c +@@ -1284,14 +1284,14 @@ __wb_pick_unwinds(wb_inode_t *wb_inode, list_head_t *lies) + + wb_inode->window_current += req->orig_size; + ++ wb_inode->gen++; ++ + if (!req->ordering.fulfilled) { + /* burden increased */ + list_add_tail(&req->lie, &wb_inode->liability); + + req->ordering.lied = 1; + +- wb_inode->gen++; +- + uuid_utoa_r(req->gfid, gfid); + gf_msg_debug(wb_inode->this->name, 0, + "(unique=%" PRIu64 +-- +1.8.3.1 + |