1 files changed, 324 insertions, 0 deletions
diff --git a/0438-dht-sparse-files-rebalance-enhancements.patch b/0438-dht-sparse-files-rebalance-enhancements.patch
new file mode 100644
index 0000000..6e10ce6
--- /dev/null
+++ b/0438-dht-sparse-files-rebalance-enhancements.patch
@@ -0,0 +1,324 @@
+From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
+From: Barak Sason Rofman <bsasonro@redhat.com>
+Date: Wed, 6 May 2020 13:28:40 +0300
+Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
+
+Currently data migration in rebalance reads sparse file sequentially,
+disregarding which segments are holes and which are data. This can lead
+to extremely long migration time for large sparse file.
+Data migration mechanism needs to be enhanced so only data segments are
+read and migrated. This can be achieved using lseek to seek for holes
+and data in the file.
+This enhancement is a consequence of
+https://bugzilla.redhat.com/show_bug.cgi?id=1823703
+
+> fixes: #1222
+> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
+> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
+> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
+> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
+
+BUG: 1836099
+Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
+Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
+Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ tests/basic/distribute/spare_file_rebalance.t |  51 ++++++++
+ xlators/cluster/dht/src/dht-rebalance.c       | 172 ++++++++++++--------------
+ 2 files changed, 130 insertions(+), 93 deletions(-)
+ create mode 100644 tests/basic/distribute/spare_file_rebalance.t
+
+diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
+new file mode 100644
+index 0000000..061c02f
+--- /dev/null
++++ b/tests/basic/distribute/spare_file_rebalance.t
+@@ -0,0 +1,51 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++. $(dirname $0)/../../dht.rc
++
++# Initialize
++#------------------------------------------------------------
++cleanup;
++
++# Start glusterd
++TEST glusterd;
++TEST pidof glusterd;
++TEST $CLI volume info;
++
++# Create a volume
++TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
++
++# Verify volume creation
++EXPECT "$V0" volinfo_field $V0 'Volume Name';
++EXPECT 'Created' volinfo_field $V0 'Status';
++
++# Start volume and verify successful start
++TEST $CLI volume start $V0;
++EXPECT 'Started' volinfo_field $V0 'Status';
++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
++
++#------------------------------------------------------------
++
++# Test case - Create sparse files on MP and verify
++# file info after rebalance
++#------------------------------------------------------------
++
++# Create some sparse files and get their size
++TEST cd $M0;
++dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
++cp --sparse=always sparse_file sparse_file_3;
++
++# Add a 3rd brick
++TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
++
++# Trigger rebalance
++TEST $CLI volume rebalance $V0 start force;
++EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
++
++# Compare original and rebalanced files
++TEST cd $B0/${V0}2
++TEST cmp sparse_file $B0/${V0}3/sparse_file_3
++EXPECT_WITHIN 30 "";
++
++cleanup;
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index 88b6b54..d0c21b4 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -18,8 +18,8 @@
+ #include <glusterfs/events.h>
+ 
+ #define GF_DISK_SECTOR_SIZE 512
+-#define DHT_REBALANCE_PID 4242              /* Change it if required */
+-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
++#define DHT_REBALANCE_PID 4242        /* Change it if required */
++#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
+ #define MAX_MIGRATE_QUEUE_COUNT 500
+ #define MIN_MIGRATE_QUEUE_COUNT 200
+ #define MAX_REBAL_TYPE_SIZE 16
+@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
+     }
+ }
+ 
+-static int
+-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
+-                     int32_t size, off_t offset, struct iobref *iobref,
+-                     int *fop_errno)
+-{
+-    int i = 0;
+-    int ret = -1;
+-    int start_idx = 0;
+-    int tmp_offset = 0;
+-    int write_needed = 0;
+-    int buf_len = 0;
+-    int size_pending = 0;
+-    char *buf = NULL;
+-
+-    /* loop through each vector */
+-    for (i = 0; i < count; i++) {
+-        buf = vec[i].iov_base;
+-        buf_len = vec[i].iov_len;
+-
+-        for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
+-             start_idx += GF_DISK_SECTOR_SIZE) {
+-            if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
+-                write_needed = 1;
+-                continue;
+-            }
+-
+-            if (write_needed) {
+-                ret = syncop_write(
+-                    to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
+-                    (offset + tmp_offset), iobref, 0, NULL, NULL);
+-                /* 'path' will be logged in calling function */
+-                if (ret < 0) {
+-                    gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
+-                           strerror(-ret));
+-                    *fop_errno = -ret;
+-                    ret = -1;
+-                    goto out;
+-                }
+-
+-                write_needed = 0;
+-            }
+-            tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
+-        }
+-
+-        if ((start_idx < buf_len) || write_needed) {
+-            /* This means, last chunk is not yet written.. write it */
+-            ret = syncop_write(to, fd, (buf + tmp_offset),
+-                               (buf_len - tmp_offset), (offset + tmp_offset),
+-                               iobref, 0, NULL, NULL);
+-            if (ret < 0) {
+-                /* 'path' will be logged in calling function */
+-                gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
+-                       strerror(-ret));
+-                *fop_errno = -ret;
+-                ret = -1;
+-                goto out;
+-            }
+-        }
+-
+-        size_pending = (size - buf_len);
+-        if (!size_pending)
+-            break;
+-    }
+-
+-    ret = size;
+-out:
+-    return ret;
+-}
+-
+ /*
+    return values:
+    -1 : failure
+@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+     int ret = 0;
+     int count = 0;
+     off_t offset = 0;
++    off_t data_offset = 0;
++    off_t hole_offset = 0;
+     struct iovec *vector = NULL;
+     struct iobref *iobref = NULL;
+     uint64_t total = 0;
+     size_t read_size = 0;
++    size_t data_block_size = 0;
+     dict_t *xdata = NULL;
+     dht_conf_t *conf = NULL;
+ 
+     conf = this->private;
++
+     /* if file size is '0', no need to enter this loop */
+     while (total < ia_size) {
+-        read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+-                         ? DHT_REBALANCE_BLKSIZE
+-                         : (ia_size - total));
++        /* This is a regular file - read it sequentially */
++        if (!hole_exists) {
++            read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
++                             ? DHT_REBALANCE_BLKSIZE
++                             : (ia_size - total));
++        } else {
++            /* This is a sparse file - read only the data segments in the file
++             */
++
++            /* If the previous data block is fully copied, find the next data
++             * segment
++             * starting at the offset of the last read and written byte,  */
++            if (data_block_size <= 0) {
++                ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
++                                  &data_offset);
++                if (ret) {
++                    if (ret == -ENXIO)
++                        ret = 0; /* No more data segments */
++                    else
++                        *fop_errno = -ret; /* Error occurred */
++
++                    break;
++                }
++
++                /* If the position of the current data segment is greater than
++                 * the position of the next hole, find the next hole in order to
++                 * calculate the length of the new data segment */
++                if (data_offset > hole_offset) {
++                    /* Starting at the offset of the last data segment, find the
++                     * next hole */
++                    ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
++                                      NULL, &hole_offset);
++                    if (ret) {
++                        /* If an error occurred here it's a real error because
++                         * if the seek for a data segment was successful then
++                         * necessarily another hole must exist (EOF is a hole)
++                         */
++                        *fop_errno = -ret;
++                        break;
++                    }
++
++                    /* Calculate the total size of the current data block */
++                    data_block_size = hole_offset - data_offset;
++                }
++            } else {
++                /* There is still data in the current segment, move the
++                 * data_offset to the position of the last written byte */
++                data_offset = offset;
++            }
++
++            /* Calculate how much data needs to be read and written. If the data
++             * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
++             * write DHT_REBALANCE_BLKSIZE data length and the rest in the
++             * next iteration(s) */
++            read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
++                             ? DHT_REBALANCE_BLKSIZE
++                             : data_block_size);
++
++            /* Calculate the remaining size of the data block - maybe there's no
++             * need to seek for data in the next iteration */
++            data_block_size -= read_size;
++
++            /* Set offset to the offset of the data segment so read and write
++             * will have the correct position */
++            offset = data_offset;
++        }
+ 
+         ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
+                            &iobref, NULL, NULL, NULL);
++
+         if (!ret || (ret < 0)) {
+             *fop_errno = -ret;
+             break;
+         }
+ 
+-        if (hole_exists) {
+-            ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
+-                                       iobref, fop_errno);
+-        } else {
+-            if (!conf->force_migration && !dht_is_tier_xlator(this)) {
++        if (!conf->force_migration && !dht_is_tier_xlator(this)) {
++            if (!xdata) {
+                 xdata = dict_new();
+                 if (!xdata) {
+                     gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+                  * https://github.com/gluster/glusterfs/issues/308
+                  * for more details.
+                  */
+-                ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
++                ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
+                 if (ret) {
+                     gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
+                            "failed to set dict");
+@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+                     break;
+                 }
+             }
+-
+-            ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+-                                NULL, xdata, NULL);
+-            if (ret < 0) {
+-                *fop_errno = -ret;
+-            }
+-        }
+-
+-        if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
+-            (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
+-            gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
+-                   "Migrate file paused");
+-            ret = -1;
+         }
+ 
++        ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
++                            NULL, xdata, NULL);
+         if (ret < 0) {
++            *fop_errno = -ret;
+             break;
+         }
+ 
+-- 
+1.8.3.1
+