diff options
Diffstat (limited to '0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch')
-rw-r--r-- | 0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch b/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch new file mode 100644 index 0000000..26ec3e7 --- /dev/null +++ b/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch @@ -0,0 +1,116 @@ +From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001 +From: Xavi Hernandez <xhernandez@redhat.com> +Date: Wed, 17 Jul 2019 14:50:22 +0200 +Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on + sparse files + +EC doesn't allow concurrent writes on overlapping areas, they are +serialized. However non-overlapping writes are serviced in parallel. +When a write is not aligned, EC first needs to read the entire chunk +from disk, apply the modified fragment and write it again. + +The problem appears on sparse files because a write to an offset +implicitly creates data on offsets below it (so, in some way, they +are overlapping). For example, if a file is empty and we read 10 bytes +from offset 10, read() will return 0 bytes. Now, if we write one byte +at offset 1M and retry the same read, the system call will return 10 +bytes (all containing 0's). + +So if we have two writes, the first one at offset 10 and the second one +at offset 1M, EC will send both in parallel because they do not overlap. +However, the first one will try to read missing data from the first chunk +(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write. +This read will happen in parallel with the write to 1M. What could happen +is that half of the bricks process the write before the read, and the +half do the read before the write. Some bricks will return 10 bytes of +data while the otherw will return 0 bytes (because the file on the brick +has not been expanded yet). + +When EC tries to recombine the answers from the bricks, it can't, because +it needs more than half consistent answers to recover the data. So this +read fails with EIO error. This error is propagated to the parent write, +which is aborted and EIO is returned to the application. + +The issue happened because EC assumed that a write to a given offset +implies that offsets below it exist. + +This fix prevents the read of the chunk from bricks if the current size +of the file is smaller than the read chunk offset. This size is +correctly tracked, so this fixes the issue. + +Also modifying ec-stripe.t file for Test #13 within it. +In this patch, if a file size is less than the offset we are writing, we +fill zeros in head and tail and do not consider it strip cache miss. +That actually make sense as we know what data that part holds and there is +no need of reading it from bricks. + +Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066 +Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6 +fixes: bz#1731448 +Signed-off-by: Xavi Hernandez <xhernandez@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/177975 +Tested-by: RHGS Build Bot <nigelb@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + tests/basic/ec/ec-stripe.t | 2 +- + xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++--------- + 2 files changed, 18 insertions(+), 10 deletions(-) + +diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t +index 1e940eb..98b9229 100644 +--- a/tests/basic/ec/ec-stripe.t ++++ b/tests/basic/ec/ec-stripe.t +@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file + TEST truncate -s 0 $M0/test_file + TEST dd if=$B0/misc_file of=$B0/test_file bs=1022 count=5 oflag=seek_bytes,sync seek=400 conv=notrunc + TEST dd if=$B0/misc_file of=$M0/test_file bs=1022 count=5 oflag=seek_bytes,sync seek=400 conv=notrunc +-check_statedump_md5sum 4 5 ++check_statedump_md5sum 4 4 + clean_file_unmount + + ### 14 - Truncate to invalidate all but one the stripe in cache #### +diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c +index ea55140..a45e6d6 100644 +--- a/xlators/cluster/ec/src/ec-inode-write.c ++++ b/xlators/cluster/ec/src/ec-inode-write.c +@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop) + if (err != 0) { + goto failed_fd; + } ++ tail = fop->size - fop->user_size - fop->head; + if (fop->head > 0) { +- found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); +- if (!found_stripe) { +- if (ec_make_internal_fop_xdata(&xdata)) { +- err = -ENOMEM; +- goto failed_xdata; ++ if (current > fop->offset) { ++ found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD); ++ if (!found_stripe) { ++ if (ec_make_internal_fop_xdata(&xdata)) { ++ err = -ENOMEM; ++ goto failed_xdata; ++ } ++ ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, ++ ec_writev_merge_head, NULL, fd, ec->stripe_size, ++ fop->offset, 0, xdata); ++ } ++ } else { ++ memset(fop->vector[0].iov_base, 0, fop->head); ++ memset(fop->vector[0].iov_base + fop->size - tail, 0, tail); ++ if (ec->stripe_cache && (fop->size <= ec->stripe_size)) { ++ ec_add_stripe_in_cache(ec, fop); + } +- ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN, +- ec_writev_merge_head, NULL, fd, ec->stripe_size, +- fop->offset, 0, xdata); + } + } + +- tail = fop->size - fop->user_size - fop->head; + if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) { + /* Current locking scheme will make sure the 'current' below will + * never decrease while the fop is in progress, so the checks will +-- +1.8.3.1 + |