summaryrefslogtreecommitdiff
path: root/0514-afr-event-gen-changes.patch
blob: 9f9562e307f2da00ab8dce77e33377ac7fb18f99 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
From 4c47d6dd7c5ddcaa2a1e159427c0f6713fd33907 Mon Sep 17 00:00:00 2001
From: karthik-us <ksubrahm@redhat.com>
Date: Wed, 23 Dec 2020 14:57:51 +0530
Subject: [PATCH 514/517] afr: event gen changes

The general idea of the changes is to prevent resetting event generation
to zero in the inode ctx, since event gen is something that should
follow 'causal order'.

Change #1:
For a read txn, in inode refresh cbk, if event_generation is
found zero, we are failing the read fop. This is not needed
because change in event gen is only a marker for the next inode refresh to
happen and should not be taken into account by the current read txn.

Change #2:
The event gen being zero above can happen if there is a racing lookup,
which resets even get (in afr_lookup_done) if there are non zero afr
xattrs. The resetting is done only to trigger an inode refresh and a
possible client side heal on the next lookup. That can be acheived by
setting the need_refresh flag in the inode ctx. So replaced all
occurences of resetting even gen to zero with a call to
afr_inode_need_refresh_set().

Change #3:
In both lookup and discover path, we are doing an inode refresh which is
not required since all 3 essentially do the same thing- update the inode
ctx with the good/bad copies from the brick replies. Inode refresh also
triggers background heals, but I think it is okay to do it when we call
refresh during the read and write txns and not in the lookup path.

The .ts which relied on inode refresh in lookup path to trigger heals are
now changed to do read txn so that inode refresh and the heal happens.

Upstream patch details:
> Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec
> Fixes: #1179
> Signed-off-by: Ravishankar N <ravishankar@redhat.com>
> Reported-by: Erik Jacobson <erik.jacobson at hpe.com>
Upstream patch: https://review.gluster.org/#/c/glusterfs/+/24316/

BUG: 1640148
Change-Id: Iebf39a9be6ffd7ffd6e4046c96b0fa78ade6c5ec
Signed-off-by: karthik-us <ksubrahm@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/222074
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Ravishankar Narayanankutty <ravishankar@redhat.com>
---
 ...fid-mismatch-resolution-with-fav-child-policy.t |  8 +-
 xlators/cluster/afr/src/afr-common.c               | 92 +++++-----------------
 xlators/cluster/afr/src/afr-dir-write.c            |  6 +-
 xlators/cluster/afr/src/afr.h                      |  5 +-
 4 files changed, 29 insertions(+), 82 deletions(-)

diff --git a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t
index f4aa351..12af0c8 100644
--- a/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t
+++ b/tests/basic/afr/gfid-mismatch-resolution-with-fav-child-policy.t
@@ -168,8 +168,8 @@ TEST [ "$gfid_1" != "$gfid_2" ]
 #We know that second brick has the bigger size file
 BIGGER_FILE_MD5=$(md5sum $B0/${V0}1/f3 | cut -d\  -f1)
 
-TEST ls $M0/f3
-TEST cat $M0/f3
+TEST ls $M0 #Trigger entry heal via readdir inode refresh
+TEST cat $M0/f3 #Trigger data heal via readv inode refresh
 EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
 
 #gfid split-brain should be resolved
@@ -215,8 +215,8 @@ TEST $CLI volume start $V0 force
 EXPECT_WITHIN $PROCESS_UP_TIMEOUT "1" brick_up_status $V0 $H0 $B0/${V0}2
 EXPECT_WITHIN $CHILD_UP_TIMEOUT "1" afr_child_up_status $V0 2
 
-TEST ls $M0/f4
-TEST cat $M0/f4
+TEST ls $M0 #Trigger entry heal via readdir inode refresh
+TEST cat $M0/f4  #Trigger data heal via readv inode refresh
 EXPECT_WITHIN $HEAL_TIMEOUT "^0$" get_pending_heal_count $V0
 
 #gfid split-brain should be resolved
diff --git a/xlators/cluster/afr/src/afr-common.c b/xlators/cluster/afr/src/afr-common.c
index fca2cd5..90b4f14 100644
--- a/xlators/cluster/afr/src/afr-common.c
+++ b/xlators/cluster/afr/src/afr-common.c
@@ -284,7 +284,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
                 metadatamap |= (1 << index);
             }
             if (metadatamap_old != metadatamap) {
-                event = 0;
+                __afr_inode_need_refresh_set(inode, this);
             }
             break;
 
@@ -297,7 +297,7 @@ __afr_set_in_flight_sb_status(xlator_t *this, afr_local_t *local,
                 datamap |= (1 << index);
             }
             if (datamap_old != datamap)
-                event = 0;
+                __afr_inode_need_refresh_set(inode, this);
             break;
 
         default:
@@ -461,34 +461,6 @@ out:
 }
 
 int
-__afr_inode_event_gen_reset_small(inode_t *inode, xlator_t *this)
-{
-    int ret = -1;
-    uint16_t datamap = 0;
-    uint16_t metadatamap = 0;
-    uint32_t event = 0;
-    uint64_t val = 0;
-    afr_inode_ctx_t *ctx = NULL;
-
-    ret = __afr_inode_ctx_get(this, inode, &ctx);
-    if (ret)
-        return ret;
-
-    val = ctx->read_subvol;
-
-    metadatamap = (val & 0x000000000000ffff) >> 0;
-    datamap = (val & 0x00000000ffff0000) >> 16;
-    event = 0;
-
-    val = ((uint64_t)metadatamap) | (((uint64_t)datamap) << 16) |
-          (((uint64_t)event) << 32);
-
-    ctx->read_subvol = val;
-
-    return ret;
-}
-
-int
 __afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
                             unsigned char *metadata, int *event_p)
 {
@@ -559,22 +531,6 @@ out:
 }
 
 int
-__afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
-{
-    afr_private_t *priv = NULL;
-    int ret = -1;
-
-    priv = this->private;
-
-    if (priv->child_count <= 16)
-        ret = __afr_inode_event_gen_reset_small(inode, this);
-    else
-        ret = -1;
-
-    return ret;
-}
-
-int
 afr_inode_read_subvol_get(inode_t *inode, xlator_t *this, unsigned char *data,
                           unsigned char *metadata, int *event_p)
 {
@@ -723,30 +679,22 @@ out:
     return need_refresh;
 }
 
-static int
-afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
+int
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
 {
     int ret = -1;
     afr_inode_ctx_t *ctx = NULL;
 
-    GF_VALIDATE_OR_GOTO(this->name, inode, out);
-
-    LOCK(&inode->lock);
-    {
-        ret = __afr_inode_ctx_get(this, inode, &ctx);
-        if (ret)
-            goto unlock;
-
+    ret = __afr_inode_ctx_get(this, inode, &ctx);
+    if (ret == 0) {
         ctx->need_refresh = _gf_true;
     }
-unlock:
-    UNLOCK(&inode->lock);
-out:
+
     return ret;
 }
 
 int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this)
 {
     int ret = -1;
 
@@ -754,7 +702,7 @@ afr_inode_event_gen_reset(inode_t *inode, xlator_t *this)
 
     LOCK(&inode->lock);
     {
-        ret = __afr_inode_event_gen_reset(inode, this);
+        ret = __afr_inode_need_refresh_set(inode, this);
     }
     UNLOCK(&inode->lock);
 out:
@@ -1191,7 +1139,7 @@ afr_txn_refresh_done(call_frame_t *frame, xlator_t *this, int err)
     ret = afr_inode_get_readable(frame, inode, this, local->readable,
                                  &event_generation, local->transaction.type);
 
-    if (ret == -EIO || (local->is_read_txn && !event_generation)) {
+    if (ret == -EIO) {
         /* No readable subvolume even after refresh ==> splitbrain.*/
         if (!priv->fav_child_policy) {
             err = EIO;
@@ -2413,7 +2361,7 @@ afr_lookup_done(call_frame_t *frame, xlator_t *this)
         if (read_subvol == -1)
             goto cant_interpret;
         if (ret) {
-            afr_inode_event_gen_reset(local->inode, this);
+            afr_inode_need_refresh_set(local->inode, this);
             dict_del_sizen(local->replies[read_subvol].xdata, GF_CONTENT_KEY);
         }
     } else {
@@ -2971,6 +2919,7 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
     afr_private_t *priv = NULL;
     afr_local_t *local = NULL;
     int read_subvol = -1;
+    int ret = 0;
     unsigned char *data_readable = NULL;
     unsigned char *success_replies = NULL;
 
@@ -2992,7 +2941,10 @@ afr_discover_unwind(call_frame_t *frame, xlator_t *this)
     if (!afr_has_quorum(success_replies, this, frame))
         goto unwind;
 
-    afr_replies_interpret(frame, this, local->inode, NULL);
+    ret = afr_replies_interpret(frame, this, local->inode, NULL);
+    if (ret) {
+        afr_inode_need_refresh_set(local->inode, this);
+    }
 
     read_subvol = afr_read_subvol_decide(local->inode, this, NULL,
                                          data_readable);
@@ -3248,11 +3200,7 @@ afr_discover(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
     afr_read_subvol_get(loc->inode, this, NULL, NULL, &event,
                         AFR_DATA_TRANSACTION, NULL);
 
-    if (afr_is_inode_refresh_reqd(loc->inode, this, event,
-                                  local->event_generation))
-        afr_inode_refresh(frame, this, loc->inode, NULL, afr_discover_do);
-    else
-        afr_discover_do(frame, this, 0);
+    afr_discover_do(frame, this, 0);
 
     return 0;
 out:
@@ -3393,11 +3341,7 @@ afr_lookup(call_frame_t *frame, xlator_t *this, loc_t *loc, dict_t *xattr_req)
     afr_read_subvol_get(loc->parent, this, NULL, NULL, &event,
                         AFR_DATA_TRANSACTION, NULL);
 
-    if (afr_is_inode_refresh_reqd(loc->inode, this, event,
-                                  local->event_generation))
-        afr_inode_refresh(frame, this, loc->parent, NULL, afr_lookup_do);
-    else
-        afr_lookup_do(frame, this, 0);
+    afr_lookup_do(frame, this, 0);
 
     return 0;
 out:
diff --git a/xlators/cluster/afr/src/afr-dir-write.c b/xlators/cluster/afr/src/afr-dir-write.c
index 416c19d..d419bfc 100644
--- a/xlators/cluster/afr/src/afr-dir-write.c
+++ b/xlators/cluster/afr/src/afr-dir-write.c
@@ -123,11 +123,11 @@ __afr_dir_write_finalize(call_frame_t *frame, xlator_t *this)
             continue;
         if (local->replies[i].op_ret < 0) {
             if (local->inode)
-                afr_inode_event_gen_reset(local->inode, this);
+                afr_inode_need_refresh_set(local->inode, this);
             if (local->parent)
-                afr_inode_event_gen_reset(local->parent, this);
+                afr_inode_need_refresh_set(local->parent, this);
             if (local->parent2)
-                afr_inode_event_gen_reset(local->parent2, this);
+                afr_inode_need_refresh_set(local->parent2, this);
             continue;
         }
 
diff --git a/xlators/cluster/afr/src/afr.h b/xlators/cluster/afr/src/afr.h
index ed5096e..3a2b26d 100644
--- a/xlators/cluster/afr/src/afr.h
+++ b/xlators/cluster/afr/src/afr.h
@@ -948,7 +948,10 @@ afr_inode_read_subvol_set(inode_t *inode, xlator_t *this,
                           int event_generation);
 
 int
-afr_inode_event_gen_reset(inode_t *inode, xlator_t *this);
+__afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
+
+int
+afr_inode_need_refresh_set(inode_t *inode, xlator_t *this);
 
 int
 afr_read_subvol_select_by_policy(inode_t *inode, xlator_t *this,
-- 
1.8.3.1