summary refs log tree commit diff
path: root/0438-dht-sparse-files-rebalance-enhancements.patch
diff options
context:
space:
mode:
Diffstat (limited to '0438-dht-sparse-files-rebalance-enhancements.patch')
-rw-r--r-- 0438-dht-sparse-files-rebalance-enhancements.patch 324
1 files changed, 324 insertions, 0 deletions
diff --git a/0438-dht-sparse-files-rebalance-enhancements.patch b/0438-dht-sparse-files-rebalance-enhancements.patch
new file mode 100644
index 0000000..6e10ce6
--- /dev/null
+++ b/0438-dht-sparse-files-rebalance-enhancements.patch
@@ -0,0 +1,324 @@
+From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001
+From: Barak Sason Rofman <bsasonro@redhat.com>
+Date: Wed, 6 May 2020 13:28:40 +0300
+Subject: [PATCH 438/449] dht - sparse files rebalance enhancements
+
+Currently, data migration in rebalance reads sparse files sequentially,
+disregarding which segments are holes and which are data. This can lead
+to extremely long migration times for large sparse files.
+The data migration mechanism needs to be enhanced so that only data
+segments are read and migrated. This can be achieved using lseek to seek
+for holes and data in the file.
+This enhancement is a consequence of
+https://bugzilla.redhat.com/show_bug.cgi?id=1823703
+
+> fixes: #1222
+> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
+> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com>
+> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40)
+> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/)
+
+BUG: 1836099
+Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3
+Signed-off-by: Mohit Agrawal <moagrawa@redhat.com>
+Reviewed-on: https://code.engineering.redhat.com/gerrit/202647
+Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com>
+Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
+---
+ tests/basic/distribute/spare_file_rebalance.t | 51 ++++++++
+ xlators/cluster/dht/src/dht-rebalance.c | 172 ++++++++++++--------------
+ 2 files changed, 130 insertions(+), 93 deletions(-)
+ create mode 100644 tests/basic/distribute/spare_file_rebalance.t
+
+diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t
+new file mode 100644
+index 0000000..061c02f
+--- /dev/null
++++ b/tests/basic/distribute/spare_file_rebalance.t
+@@ -0,0 +1,51 @@
++#!/bin/bash
++
++. $(dirname $0)/../../include.rc
++. $(dirname $0)/../../volume.rc
++. $(dirname $0)/../../dht.rc
++
++# Initialize
++#------------------------------------------------------------
++cleanup;
++
++# Start glusterd
++TEST glusterd;
++TEST pidof glusterd;
++TEST $CLI volume info;
++
++# Create a volume
++TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2};
++
++# Verify volume creation
++EXPECT "$V0" volinfo_field $V0 'Volume Name';
++EXPECT 'Created' volinfo_field $V0 'Status';
++
++# Start volume and verify successful start
++TEST $CLI volume start $V0;
++EXPECT 'Started' volinfo_field $V0 'Status';
++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0;
++
++#------------------------------------------------------------
++
++# Test case - Create sparse files on the mount point and verify
++# file info after rebalance
++#------------------------------------------------------------
++
++# Create some sparse files and get their size
++TEST cd $M0;
++dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M
++cp --sparse=always sparse_file sparse_file_3;
++
++# Add a 3rd brick
++TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3;
++
++# Trigger rebalance
++TEST $CLI volume rebalance $V0 start force;
++EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed;
++
++# Compare original and rebalanced files
++TEST cd $B0/${V0}2
++TEST cmp sparse_file $B0/${V0}3/sparse_file_3
++EXPECT_WITHIN 30 "";
++
++cleanup;
+diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c
+index 88b6b54..d0c21b4 100644
+--- a/xlators/cluster/dht/src/dht-rebalance.c
++++ b/xlators/cluster/dht/src/dht-rebalance.c
+@@ -18,8 +18,8 @@
+ #include <glusterfs/events.h>
+
+ #define GF_DISK_SECTOR_SIZE 512
+-#define DHT_REBALANCE_PID 4242 /* Change it if required */
+-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */
++#define DHT_REBALANCE_PID 4242 /* Change it if required */
++#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */
+ #define MAX_MIGRATE_QUEUE_COUNT 500
+ #define MIN_MIGRATE_QUEUE_COUNT 200
+ #define MAX_REBAL_TYPE_SIZE 16
+@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict)
+ }
+ }
+
+-static int
+-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count,
+- int32_t size, off_t offset, struct iobref *iobref,
+- int *fop_errno)
+-{
+- int i = 0;
+- int ret = -1;
+- int start_idx = 0;
+- int tmp_offset = 0;
+- int write_needed = 0;
+- int buf_len = 0;
+- int size_pending = 0;
+- char *buf = NULL;
+-
+- /* loop through each vector */
+- for (i = 0; i < count; i++) {
+- buf = vec[i].iov_base;
+- buf_len = vec[i].iov_len;
+-
+- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len;
+- start_idx += GF_DISK_SECTOR_SIZE) {
+- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) {
+- write_needed = 1;
+- continue;
+- }
+-
+- if (write_needed) {
+- ret = syncop_write(
+- to, fd, (buf + tmp_offset), (start_idx - tmp_offset),
+- (offset + tmp_offset), iobref, 0, NULL, NULL);
+- /* 'path' will be logged in calling function */
+- if (ret < 0) {
+- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
+- strerror(-ret));
+- *fop_errno = -ret;
+- ret = -1;
+- goto out;
+- }
+-
+- write_needed = 0;
+- }
+- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE;
+- }
+-
+- if ((start_idx < buf_len) || write_needed) {
+- /* This means, last chunk is not yet written.. write it */
+- ret = syncop_write(to, fd, (buf + tmp_offset),
+- (buf_len - tmp_offset), (offset + tmp_offset),
+- iobref, 0, NULL, NULL);
+- if (ret < 0) {
+- /* 'path' will be logged in calling function */
+- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)",
+- strerror(-ret));
+- *fop_errno = -ret;
+- ret = -1;
+- goto out;
+- }
+- }
+-
+- size_pending = (size - buf_len);
+- if (!size_pending)
+- break;
+- }
+-
+- ret = size;
+-out:
+- return ret;
+-}
+-
+ /*
+ return values:
+ -1 : failure
+@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+ int ret = 0;
+ int count = 0;
+ off_t offset = 0;
++ off_t data_offset = 0;
++ off_t hole_offset = 0;
+ struct iovec *vector = NULL;
+ struct iobref *iobref = NULL;
+ uint64_t total = 0;
+ size_t read_size = 0;
++ size_t data_block_size = 0;
+ dict_t *xdata = NULL;
+ dht_conf_t *conf = NULL;
+
+ conf = this->private;
++
+ /* if file size is '0', no need to enter this loop */
+ while (total < ia_size) {
+- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
+- ? DHT_REBALANCE_BLKSIZE
+- : (ia_size - total));
++ /* This is a regular file - read it sequentially */
++ if (!hole_exists) {
++ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE)
++ ? DHT_REBALANCE_BLKSIZE
++ : (ia_size - total));
++ } else {
++ /* This is a sparse file - read only the data segments in the file
++ */
++
++            /* If the previous data block is fully copied, find the next data
++             * segment, starting at the offset of the last read and written
++             * byte. */
++ if (data_block_size <= 0) {
++ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL,
++ &data_offset);
++ if (ret) {
++ if (ret == -ENXIO)
++ ret = 0; /* No more data segments */
++ else
++ *fop_errno = -ret; /* Error occurred */
++
++ break;
++ }
++
++ /* If the position of the current data segment is greater than
++ * the position of the next hole, find the next hole in order to
++ * calculate the length of the new data segment */
++ if (data_offset > hole_offset) {
++ /* Starting at the offset of the last data segment, find the
++ * next hole */
++ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE,
++ NULL, &hole_offset);
++ if (ret) {
++ /* If an error occurred here it's a real error because
++ * if the seek for a data segment was successful then
++ * necessarily another hole must exist (EOF is a hole)
++ */
++ *fop_errno = -ret;
++ break;
++ }
++
++ /* Calculate the total size of the current data block */
++ data_block_size = hole_offset - data_offset;
++ }
++ } else {
++ /* There is still data in the current segment, move the
++ * data_offset to the position of the last written byte */
++ data_offset = offset;
++ }
++
++ /* Calculate how much data needs to be read and written. If the data
++ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and
++ * write DHT_REBALANCE_BLKSIZE data length and the rest in the
++ * next iteration(s) */
++ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE)
++ ? DHT_REBALANCE_BLKSIZE
++ : data_block_size);
++
++ /* Calculate the remaining size of the data block - maybe there's no
++ * need to seek for data in the next iteration */
++ data_block_size -= read_size;
++
++ /* Set offset to the offset of the data segment so read and write
++ * will have the correct position */
++ offset = data_offset;
++ }
+
+ ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count,
+ &iobref, NULL, NULL, NULL);
++
+ if (!ret || (ret < 0)) {
+ *fop_errno = -ret;
+ break;
+ }
+
+- if (hole_exists) {
+- ret = dht_write_with_holes(to, dst, vector, count, ret, offset,
+- iobref, fop_errno);
+- } else {
+- if (!conf->force_migration && !dht_is_tier_xlator(this)) {
++ if (!conf->force_migration && !dht_is_tier_xlator(this)) {
++ if (!xdata) {
+ xdata = dict_new();
+ if (!xdata) {
+ gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED,
+@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+ * https://github.com/gluster/glusterfs/issues/308
+ * for more details.
+ */
+- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1);
++ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1);
+ if (ret) {
+ gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM,
+ "failed to set dict");
+@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag,
+ break;
+ }
+ }
+-
+- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
+- NULL, xdata, NULL);
+- if (ret < 0) {
+- *fop_errno = -ret;
+- }
+- }
+-
+- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) &&
+- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) {
+- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED,
+- "Migrate file paused");
+- ret = -1;
+ }
+
++ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL,
++ NULL, xdata, NULL);
+ if (ret < 0) {
++ *fop_errno = -ret;
+ break;
+ }
+
+--
+1.8.3.1
+