0176-features-shard-Fix-crash-during-background-shard-del.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289

From 40ac42501d6bbff7206e753e8e988beefe74f5f4 Mon Sep 17 00:00:00 2001
From: Krutika Dhananjay <kdhananj@redhat.com>
Date: Fri, 5 Apr 2019 10:30:23 +0530
Subject: [PATCH 176/178] features/shard: Fix crash during background shard
 deletion in a specific case

Consider the following case -
1. A file gets FALLOCATE'd such that > "shard-lru-limit" number of
   shards are created.
2. And then it is deleted after that.

The unique thing about FALLOCATE is that unlike WRITE, all of the
participant shards are resolved and created and fallocated in a single
batch. This means, in this case, after the first "shard-lru-limit"
number of shards are resolved and added to lru list, as part of
resolution of the remaining shards, some of the existing shards in lru
list will need to be evicted. So these evicted shards will be
inode_unlink()d as part of eviction. Now once the fop gets to the actual
FALLOCATE stage, the lru'd-out shards get added to fsync list.

2 things to note at this point:
i. the lru'd out shards are only part of fsync list, so each holds 1 ref
   on base shard
ii. and the more recently used shards are part of both fsync and lru list.
    So each of these shards holds 2 refs on base inode  - one for being
    part of fsync list, and the other for being part of lru list.

FALLOCATE completes successfully and then this very file is deleted, and
background shard deletion launched. Here's where the ref counts get mismatched.
First as part of inode_resolve()s during the deletion, the lru'd-out inodes
return NULL, because they are inode_unlink()'d by now. So these inodes need to
be freshly looked up. But as part of linking them in lookup_cbk (precisely in
shard_link_block_inode()), inode_link() returns the lru'd-out inode object.
And its inode ctx is still valid and ctx->base_inode valid from the last
time it was added to list.

But shard_common_lookup_shards_cbk() passes NULL in the place of base_pointer
to __shard_update_shards_inode_list(). This means, as part of adding the lru'd out
inode back to lru list, base inode is not ref'd since its NULL.

Whereas post unlinking this shard, during shard_unlink_block_inode(),
ctx->base_inode is accessible and is unref'd because the shard was found to be part
of LRU list, although the matching ref didn't occur. This at some point leads to
base_inode refcount becoming 0 and it getting destroyed and released back while some
of its associated shards are continuing to be unlinked in parallel and the client crashes
whenever it is accessed next.

Fix is to pass base shard correctly, if available, in shard_link_block_inode().

Also, the patch fixes the ret value check in tests/bugs/shard/shard-fallocate.c

>Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
>Updates: bz#1696136
>Signed-off-by: Krutika Dhananjay <kdhananj@redhat.com>

Upstream Patch: https://review.gluster.org/#/c/glusterfs/+/22507/

BUG: 1694595
Change-Id: Ibd0bc4c6952367608e10701473cbad3947d7559f
Signed-off-by: Sunil Kumar Acharya <sheggodu@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/172856
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Atin Mukherjee <amukherj@redhat.com>
---
 tests/bugs/shard/bug-1696136.c     | 121 +++++++++++++++++++++++++++++++++++++
 tests/bugs/shard/bug-1696136.t     |  33 ++++++++++
 tests/bugs/shard/shard-fallocate.c |   2 +-
 xlators/features/shard/src/shard.c |  12 +++-
 4 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 tests/bugs/shard/bug-1696136.c
 create mode 100644 tests/bugs/shard/bug-1696136.t

diff --git a/tests/bugs/shard/bug-1696136.c b/tests/bugs/shard/bug-1696136.c
new file mode 100644
index 0000000..b9e8d13
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136.c
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <glusterfs/api/glfs.h>
+#include <glusterfs/api/glfs-handles.h>
+
+enum fallocate_flag {
+    TEST_FALLOCATE_NONE,
+    TEST_FALLOCATE_KEEP_SIZE,
+    TEST_FALLOCATE_ZERO_RANGE,
+    TEST_FALLOCATE_PUNCH_HOLE,
+    TEST_FALLOCATE_MAX,
+};
+
+int
+get_fallocate_flag(int opcode)
+{
+    int ret = 0;
+
+    switch (opcode) {
+        case TEST_FALLOCATE_NONE:
+            ret = 0;
+            break;
+        case TEST_FALLOCATE_KEEP_SIZE:
+            ret = FALLOC_FL_KEEP_SIZE;
+            break;
+        case TEST_FALLOCATE_ZERO_RANGE:
+            ret = FALLOC_FL_ZERO_RANGE;
+            break;
+        case TEST_FALLOCATE_PUNCH_HOLE:
+            ret = FALLOC_FL_PUNCH_HOLE;
+            break;
+        default:
+            ret = -1;
+            break;
+    }
+    return ret;
+}
+
+int
+main(int argc, char *argv[])
+{
+    int ret = 1;
+    int opcode = -1;
+    off_t offset = 0;
+    size_t len = 0;
+    glfs_t *fs = NULL;
+    glfs_fd_t *fd = NULL;
+
+    if (argc != 8) {
+        fprintf(stderr,
+                "Syntax: %s <host> <volname> <opcode> <offset> <len> "
+                "<file-path> <log-file>\n",
+                argv[0]);
+        return 1;
+    }
+
+    fs = glfs_new(argv[2]);
+    if (!fs) {
+        fprintf(stderr, "glfs_new: returned NULL\n");
+        return 1;
+    }
+
+    ret = glfs_set_volfile_server(fs, "tcp", argv[1], 24007);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_set_volfile_server: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_set_logging(fs, argv[7], 7);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_set_logging: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_init(fs);
+    if (ret != 0) {
+        fprintf(stderr, "glfs_init: returned %d\n", ret);
+        goto out;
+    }
+
+    opcode = atoi(argv[3]);
+    opcode = get_fallocate_flag(opcode);
+    if (opcode < 0) {
+        fprintf(stderr, "get_fallocate_flag: invalid flag \n");
+        goto out;
+    }
+
+    offset = atoi(argv[4]);
+    len = atoi(argv[5]);
+
+    fd = glfs_open(fs, argv[6], O_RDWR);
+    if (fd == NULL) {
+        fprintf(stderr, "glfs_open: returned NULL\n");
+        goto out;
+    }
+
+    ret = glfs_fallocate(fd, opcode, offset, len);
+    if (ret < 0) {
+        fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
+        goto out;
+    }
+
+    ret = glfs_unlink(fs, argv[6]);
+    if (ret < 0) {
+        fprintf(stderr, "glfs_unlink: returned %d\n", ret);
+        goto out;
+    }
+    /* Sleep for 3s to give enough time for background deletion to complete
+     * during which if the bug exists, the process will crash.
+     */
+    sleep(3);
+    ret = 0;
+
+out:
+    if (fd)
+        glfs_close(fd);
+    glfs_fini(fs);
+    return ret;
+}
diff --git a/tests/bugs/shard/bug-1696136.t b/tests/bugs/shard/bug-1696136.t
new file mode 100644
index 0000000..b6dc858
--- /dev/null
+++ b/tests/bugs/shard/bug-1696136.t
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../fallocate.rc
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+TEST $CLI volume create $V0 replica 3 $H0:$B0/${V0}{0,1,2}
+TEST $CLI volume set $V0 features.shard on
+TEST $CLI volume set $V0 features.shard-block-size 4MB
+TEST $CLI volume set $V0 features.shard-lru-limit 120
+TEST $CLI volume set $V0 performance.write-behind off
+TEST $CLI volume start $V0
+
+TEST $GFS --volfile-id=$V0 --volfile-server=$H0 $M0
+
+TEST build_tester $(dirname $0)/bug-1696136.c -lgfapi -Wall -O2
+
+# Create a file
+TEST touch $M0/file1
+
+# Fallocate a 500M file. This will make sure number of participant shards are > lru-limit
+TEST $(dirname $0)/bug-1696136 $H0 $V0 "0" "0" "536870912" /file1 `gluster --print-logdir`/glfs-$V0.log
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $CLI volume stop $V0
+TEST $CLI volume delete $V0
+rm -f $(dirname $0)/bug-1696136
+
+cleanup
diff --git a/tests/bugs/shard/shard-fallocate.c b/tests/bugs/shard/shard-fallocate.c
index 3a784d3..45b9ce0 100644
--- a/tests/bugs/shard/shard-fallocate.c
+++ b/tests/bugs/shard/shard-fallocate.c
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
     }
 
     ret = glfs_fallocate(fd, opcode, offset, len);
-    if (ret <= 0) {
+    if (ret < 0) {
         fprintf(stderr, "glfs_fallocate: returned %d\n", ret);
         goto out;
     }
diff --git a/xlators/features/shard/src/shard.c b/xlators/features/shard/src/shard.c
index fa3564a..3c4bcdc 100644
--- a/xlators/features/shard/src/shard.c
+++ b/xlators/features/shard/src/shard.c
@@ -2213,13 +2213,19 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
     xlator_t *this = NULL;
     inode_t *fsync_inode = NULL;
     shard_priv_t *priv = NULL;
+    inode_t *base_inode = NULL;
 
     this = THIS;
     priv = this->private;
-    if (local->loc.inode)
+    if (local->loc.inode) {
         gf_uuid_copy(gfid, local->loc.inode->gfid);
-    else
+        base_inode = local->loc.inode;
+    } else if (local->resolver_base_inode) {
+        gf_uuid_copy(gfid, local->resolver_base_inode->gfid);
+        base_inode = local->resolver_base_inode;
+    } else {
         gf_uuid_copy(gfid, local->base_gfid);
+    }
 
     shard_make_block_bname(block_num, gfid, block_bname, sizeof(block_bname));
 
@@ -2232,7 +2238,7 @@ shard_link_block_inode(shard_local_t *local, int block_num, inode_t *inode,
     LOCK(&priv->lock);
     {
         fsync_inode = __shard_update_shards_inode_list(
-            linked_inode, this, local->loc.inode, block_num, gfid);
+            linked_inode, this, base_inode, block_num, gfid);
     }
     UNLOCK(&priv->lock);
     if (fsync_inode)
-- 
1.8.3.1