1 files changed, 116 insertions, 0 deletions
diff --git a/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch b/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch
new file mode 100644
index 0000000..26ec3e7
--- /dev/null
+++ b/0271-cluster-ec-fix-EIO-error-for-concurrent-writes-on-sp.patch
@@ -0,0 +1,116 @@
+From 52d71ad0e5c27808e7d8eea8a0920298837e408c Mon Sep 17 00:00:00 2001
+From: Xavi Hernandez <xhernandez@redhat.com>
+Date: Wed, 17 Jul 2019 14:50:22 +0200
+Subject: [PATCH 271/276] cluster/ec: fix EIO error for concurrent writes on
+ sparse files
+
+EC doesn't allow concurrent writes on overlapping areas, they are
+serialized. However non-overlapping writes are serviced in parallel.
+When a write is not aligned, EC first needs to read the entire chunk
+from disk, apply the modified fragment and write it again.
+
+The problem appears on sparse files because a write to an offset
+implicitly creates data on offsets below it (so, in some way, they
+are overlapping). For example, if a file is empty and we read 10 bytes
+from offset 10, read() will return 0 bytes. Now, if we write one byte
+at offset 1M and retry the same read, the system call will return 10
+bytes (all containing 0's).
+
+So if we have two writes, the first one at offset 10 and the second one
+at offset 1M, EC will send both in parallel because they do not overlap.
+However, the first one will try to read missing data from the first chunk
+(i.e. offsets 0 to 9) to recombine the entire chunk and do the final write.
+This read will happen in parallel with the write to 1M. What could happen
+is that half of the bricks process the write before the read, and the
+half do the read before the write. Some bricks will return 10 bytes of
+data while the otherw will return 0 bytes (because the file on the brick
+has not been expanded yet).
+
+When EC tries to recombine the answers from the bricks, it can't, because
+it needs more than half consistent answers to recover the data. So this
+read fails with EIO error. This error is propagated to the parent write,
+which is aborted and EIO is returned to the application.
+
+The issue happened because EC assumed that a write to a given offset
+implies that offsets below it exist.
+
+This fix prevents the read of the chunk from bricks if the current size
+of the file is smaller than the read chunk offset. This size is
+correctly tracked, so this fixes the issue.
+
+Also modifying ec-stripe.t file for Test #13 within it.
+In this patch, if a file size is less than the offset we are writing, we
+fill zeros in head and tail and do not consider it strip cache miss.
+That actually make sense as we know what data that part holds and there is
+no need of reading it from bricks.
+
+Upstream-patch: https://review.gluster.org/c/glusterfs/+/23066
+Change-Id: Ic342e8c35c555b8534109e9314c9a0710b6225d6
+fixes: bz#1731448
+Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/177975
+Tested-by: RHGS Build Bot <nigelb@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ tests/basic/ec/ec-stripe.t              |  2 +-
+ xlators/cluster/ec/src/ec-inode-write.c | 26 +++++++++++++++++---------
+ 2 files changed, 18 insertions(+), 10 deletions(-)
+
+diff --git a/tests/basic/ec/ec-stripe.t b/tests/basic/ec/ec-stripe.t
+index 1e940eb..98b9229 100644
+--- a/tests/basic/ec/ec-stripe.t
++++ b/tests/basic/ec/ec-stripe.t
+@@ -202,7 +202,7 @@ TEST truncate -s 0 $B0/test_file
+ TEST truncate -s 0 $M0/test_file
+ TEST dd if=$B0/misc_file of=$B0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
+ TEST dd if=$B0/misc_file of=$M0/test_file  bs=1022 count=5  oflag=seek_bytes,sync seek=400 conv=notrunc
+-check_statedump_md5sum 4 5
++check_statedump_md5sum 4 4
+ clean_file_unmount
+ 
+ ### 14 - Truncate to invalidate  all but one the stripe in cache  ####
+diff --git a/xlators/cluster/ec/src/ec-inode-write.c b/xlators/cluster/ec/src/ec-inode-write.c
+index ea55140..a45e6d6 100644
+--- a/xlators/cluster/ec/src/ec-inode-write.c
++++ b/xlators/cluster/ec/src/ec-inode-write.c
+@@ -2013,20 +2013,28 @@ ec_writev_start(ec_fop_data_t *fop)
+     if (err != 0) {
+         goto failed_fd;
+     }
++    tail = fop->size - fop->user_size - fop->head;
+     if (fop->head > 0) {
+-        found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
+-        if (!found_stripe) {
+-            if (ec_make_internal_fop_xdata(&xdata)) {
+-                err = -ENOMEM;
+-                goto failed_xdata;
++        if (current > fop->offset) {
++            found_stripe = ec_get_and_merge_stripe(ec, fop, EC_STRIPE_HEAD);
++            if (!found_stripe) {
++                if (ec_make_internal_fop_xdata(&xdata)) {
++                    err = -ENOMEM;
++                    goto failed_xdata;
++                }
++                ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
++                         ec_writev_merge_head, NULL, fd, ec->stripe_size,
++                         fop->offset, 0, xdata);
++            }
++        } else {
++            memset(fop->vector[0].iov_base, 0, fop->head);
++            memset(fop->vector[0].iov_base + fop->size - tail, 0, tail);
++            if (ec->stripe_cache && (fop->size <= ec->stripe_size)) {
++                ec_add_stripe_in_cache(ec, fop);
+             }
+-            ec_readv(fop->frame, fop->xl, -1, EC_MINIMUM_MIN,
+-                     ec_writev_merge_head, NULL, fd, ec->stripe_size,
+-                     fop->offset, 0, xdata);
+         }
+     }
+ 
+-    tail = fop->size - fop->user_size - fop->head;
+     if ((tail > 0) && ((fop->head == 0) || (fop->size > ec->stripe_size))) {
+         /* Current locking scheme will make sure the 'current' below will
+          * never decrease while the fop is in progress, so the checks will
+-- 
+1.8.3.1
+