diff options
Diffstat (limited to '0438-dht-sparse-files-rebalance-enhancements.patch')
-rw-r--r-- | 0438-dht-sparse-files-rebalance-enhancements.patch | 324 |
1 file changed, 324 insertions, 0 deletions
diff --git a/0438-dht-sparse-files-rebalance-enhancements.patch b/0438-dht-sparse-files-rebalance-enhancements.patch new file mode 100644 index 0000000..6e10ce6 --- /dev/null +++ b/0438-dht-sparse-files-rebalance-enhancements.patch @@ -0,0 +1,324 @@ +From 7b2f1bd4e5a57ea3abd5f14a7d81b120735faecd Mon Sep 17 00:00:00 2001 +From: Barak Sason Rofman <bsasonro@redhat.com> +Date: Wed, 6 May 2020 13:28:40 +0300 +Subject: [PATCH 438/449] dht - sparse files rebalance enhancements + +Currently data migration in rebalance reads sparse files sequentially, +disregarding which segments are holes and which are data. This can lead +to extremely long migration times for large sparse files. +The data migration mechanism needs to be enhanced so only data segments are +read and migrated. This can be achieved using lseek to seek for holes +and data in the file. +This enhancement is a consequence of +https://bugzilla.redhat.com/show_bug.cgi?id=1823703 + +> fixes: #1222 +> Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3 +> Signed-off-by: Barak Sason Rofman <bsasonro@redhat.com> +> (Cherry pick from commit 7b7559733ca0c25c63f9d56cb7f4650dbd694c40) +> (Reviewed on upstream link https://review.gluster.org/#/c/glusterfs/+/24409/) + +BUG: 1836099 +Change-Id: If5f448a0c532926464e1f34f504c5c94749b08c3 +Signed-off-by: Mohit Agrawal <moagrawa@redhat.com> +Reviewed-on: https://code.engineering.redhat.com/gerrit/202647 +Reviewed-by: Barak Sason Rofman <bsasonro@redhat.com> +Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com> +--- + tests/basic/distribute/spare_file_rebalance.t | 51 ++++++++ + xlators/cluster/dht/src/dht-rebalance.c | 172 ++++++++++++-------------- + 2 files changed, 130 insertions(+), 93 deletions(-) + create mode 100644 tests/basic/distribute/spare_file_rebalance.t + +diff --git a/tests/basic/distribute/spare_file_rebalance.t b/tests/basic/distribute/spare_file_rebalance.t +new file mode 100644 +index 0000000..061c02f +--- /dev/null ++++ 
b/tests/basic/distribute/spare_file_rebalance.t +@@ -0,0 +1,51 @@ ++#!/bin/bash ++ ++. $(dirname $0)/../../include.rc ++. $(dirname $0)/../../volume.rc ++. $(dirname $0)/../../dht.rc ++ ++# Initialize ++#------------------------------------------------------------ ++cleanup; ++ ++# Start glusterd ++TEST glusterd; ++TEST pidof glusterd; ++TEST $CLI volume info; ++ ++# Create a volume ++TEST $CLI volume create $V0 $H0:$B0/${V0}{1,2}; ++ ++# Verify volume creation ++EXPECT "$V0" volinfo_field $V0 'Volume Name'; ++EXPECT 'Created' volinfo_field $V0 'Status'; ++ ++# Start volume and verify successful start ++TEST $CLI volume start $V0; ++EXPECT 'Started' volinfo_field $V0 'Status'; ++TEST glusterfs --volfile-id=$V0 --volfile-server=$H0 --entry-timeout=0 $M0; ++ ++#------------------------------------------------------------ ++ ++# Test case - Create sparse files on MP and verify ++# file info after rebalance ++#------------------------------------------------------------ ++ ++# Create some sparse files and get their size ++TEST cd $M0; ++dd if=/dev/urandom of=sparse_file bs=10k count=1 seek=2M ++cp --sparse=always sparse_file sparse_file_3; ++ ++# Add a 3rd brick ++TEST $CLI volume add-brick $V0 $H0:$B0/${V0}3; ++ ++# Trigger rebalance ++TEST $CLI volume rebalance $V0 start force; ++EXPECT_WITHIN $REBALANCE_TIMEOUT "0" rebalance_completed; ++ ++# Compare original and rebalanced files ++TEST cd $B0/${V0}2 ++TEST cmp sparse_file $B0/${V0}3/sparse_file_3 ++EXPECT_WITHIN 30 ""; ++ ++cleanup; +diff --git a/xlators/cluster/dht/src/dht-rebalance.c b/xlators/cluster/dht/src/dht-rebalance.c +index 88b6b54..d0c21b4 100644 +--- a/xlators/cluster/dht/src/dht-rebalance.c ++++ b/xlators/cluster/dht/src/dht-rebalance.c +@@ -18,8 +18,8 @@ + #include <glusterfs/events.h> + + #define GF_DISK_SECTOR_SIZE 512 +-#define DHT_REBALANCE_PID 4242 /* Change it if required */ +-#define DHT_REBALANCE_BLKSIZE (1024 * 1024) /* 1 MB */ ++#define DHT_REBALANCE_PID 4242 /* Change it if required */ 
++#define DHT_REBALANCE_BLKSIZE 1048576 /* 1 MB */ + #define MAX_MIGRATE_QUEUE_COUNT 500 + #define MIN_MIGRATE_QUEUE_COUNT 200 + #define MAX_REBAL_TYPE_SIZE 16 +@@ -178,75 +178,6 @@ dht_strip_out_acls(dict_t *dict) + } + } + +-static int +-dht_write_with_holes(xlator_t *to, fd_t *fd, struct iovec *vec, int count, +- int32_t size, off_t offset, struct iobref *iobref, +- int *fop_errno) +-{ +- int i = 0; +- int ret = -1; +- int start_idx = 0; +- int tmp_offset = 0; +- int write_needed = 0; +- int buf_len = 0; +- int size_pending = 0; +- char *buf = NULL; +- +- /* loop through each vector */ +- for (i = 0; i < count; i++) { +- buf = vec[i].iov_base; +- buf_len = vec[i].iov_len; +- +- for (start_idx = 0; (start_idx + GF_DISK_SECTOR_SIZE) <= buf_len; +- start_idx += GF_DISK_SECTOR_SIZE) { +- if (mem_0filled(buf + start_idx, GF_DISK_SECTOR_SIZE) != 0) { +- write_needed = 1; +- continue; +- } +- +- if (write_needed) { +- ret = syncop_write( +- to, fd, (buf + tmp_offset), (start_idx - tmp_offset), +- (offset + tmp_offset), iobref, 0, NULL, NULL); +- /* 'path' will be logged in calling function */ +- if (ret < 0) { +- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)", +- strerror(-ret)); +- *fop_errno = -ret; +- ret = -1; +- goto out; +- } +- +- write_needed = 0; +- } +- tmp_offset = start_idx + GF_DISK_SECTOR_SIZE; +- } +- +- if ((start_idx < buf_len) || write_needed) { +- /* This means, last chunk is not yet written.. 
write it */ +- ret = syncop_write(to, fd, (buf + tmp_offset), +- (buf_len - tmp_offset), (offset + tmp_offset), +- iobref, 0, NULL, NULL); +- if (ret < 0) { +- /* 'path' will be logged in calling function */ +- gf_log(THIS->name, GF_LOG_WARNING, "failed to write (%s)", +- strerror(-ret)); +- *fop_errno = -ret; +- ret = -1; +- goto out; +- } +- } +- +- size_pending = (size - buf_len); +- if (!size_pending) +- break; +- } +- +- ret = size; +-out: +- return ret; +-} +- + /* + return values: + -1 : failure +@@ -1101,32 +1032,97 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + int ret = 0; + int count = 0; + off_t offset = 0; ++ off_t data_offset = 0; ++ off_t hole_offset = 0; + struct iovec *vector = NULL; + struct iobref *iobref = NULL; + uint64_t total = 0; + size_t read_size = 0; ++ size_t data_block_size = 0; + dict_t *xdata = NULL; + dht_conf_t *conf = NULL; + + conf = this->private; ++ + /* if file size is '0', no need to enter this loop */ + while (total < ia_size) { +- read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) +- ? DHT_REBALANCE_BLKSIZE +- : (ia_size - total)); ++ /* This is a regular file - read it sequentially */ ++ if (!hole_exists) { ++ read_size = (((ia_size - total) > DHT_REBALANCE_BLKSIZE) ++ ? 
DHT_REBALANCE_BLKSIZE ++ : (ia_size - total)); ++ } else { ++ /* This is a sparse file - read only the data segments in the file ++ */ ++ ++ /* If the previous data block is fully copied, find the next data ++ * segment ++ * starting at the offset of the last read and written byte, */ ++ if (data_block_size <= 0) { ++ ret = syncop_seek(from, src, offset, GF_SEEK_DATA, NULL, ++ &data_offset); ++ if (ret) { ++ if (ret == -ENXIO) ++ ret = 0; /* No more data segments */ ++ else ++ *fop_errno = -ret; /* Error occurred */ ++ ++ break; ++ } ++ ++ /* If the position of the current data segment is greater than ++ * the position of the next hole, find the next hole in order to ++ * calculate the length of the new data segment */ ++ if (data_offset > hole_offset) { ++ /* Starting at the offset of the last data segment, find the ++ * next hole */ ++ ret = syncop_seek(from, src, data_offset, GF_SEEK_HOLE, ++ NULL, &hole_offset); ++ if (ret) { ++ /* If an error occurred here it's a real error because ++ * if the seek for a data segment was successful then ++ * necessarily another hole must exist (EOF is a hole) ++ */ ++ *fop_errno = -ret; ++ break; ++ } ++ ++ /* Calculate the total size of the current data block */ ++ data_block_size = hole_offset - data_offset; ++ } ++ } else { ++ /* There is still data in the current segment, move the ++ * data_offset to the position of the last written byte */ ++ data_offset = offset; ++ } ++ ++ /* Calculate how much data needs to be read and written. If the data ++ * segment's length is bigger than DHT_REBALANCE_BLKSIZE, read and ++ * write DHT_REBALANCE_BLKSIZE data length and the rest in the ++ * next iteration(s) */ ++ read_size = ((data_block_size > DHT_REBALANCE_BLKSIZE) ++ ? 
DHT_REBALANCE_BLKSIZE ++ : data_block_size); ++ ++ /* Calculate the remaining size of the data block - maybe there's no ++ * need to seek for data in the next iteration */ ++ data_block_size -= read_size; ++ ++ /* Set offset to the offset of the data segment so read and write ++ * will have the correct position */ ++ offset = data_offset; ++ } + + ret = syncop_readv(from, src, read_size, offset, 0, &vector, &count, + &iobref, NULL, NULL, NULL); ++ + if (!ret || (ret < 0)) { + *fop_errno = -ret; + break; + } + +- if (hole_exists) { +- ret = dht_write_with_holes(to, dst, vector, count, ret, offset, +- iobref, fop_errno); +- } else { +- if (!conf->force_migration && !dht_is_tier_xlator(this)) { ++ if (!conf->force_migration && !dht_is_tier_xlator(this)) { ++ if (!xdata) { + xdata = dict_new(); + if (!xdata) { + gf_msg("dht", GF_LOG_ERROR, 0, DHT_MSG_MIGRATE_FILE_FAILED, +@@ -1146,7 +1142,7 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + * https://github.com/gluster/glusterfs/issues/308 + * for more details. + */ +- ret = dict_set_int32(xdata, GF_AVOID_OVERWRITE, 1); ++ ret = dict_set_int32_sizen(xdata, GF_AVOID_OVERWRITE, 1); + if (ret) { + gf_msg("dht", GF_LOG_ERROR, 0, ENOMEM, + "failed to set dict"); +@@ -1155,22 +1151,12 @@ __dht_rebalance_migrate_data(xlator_t *this, gf_defrag_info_t *defrag, + break; + } + } +- +- ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, +- NULL, xdata, NULL); +- if (ret < 0) { +- *fop_errno = -ret; +- } +- } +- +- if ((defrag && defrag->cmd == GF_DEFRAG_CMD_START_TIER) && +- (gf_defrag_get_pause_state(&defrag->tier_conf) != TIER_RUNNING)) { +- gf_msg("tier", GF_LOG_INFO, 0, DHT_MSG_TIER_PAUSED, +- "Migrate file paused"); +- ret = -1; + } + ++ ret = syncop_writev(to, dst, vector, count, offset, iobref, 0, NULL, ++ NULL, xdata, NULL); + if (ret < 0) { ++ *fop_errno = -ret; + break; + } + +-- +1.8.3.1 + |