From 75a9d946d252ce70460144615ca17dbdf2e80fab Mon Sep 17 00:00:00 2001
From: Xavi Hernandez <xhernandez@redhat.com>
Date: Fri, 7 Feb 2020 10:19:57 +0100
Subject: [PATCH 354/355] core: fix memory pool management races

Objects allocated from a per-thread memory pool keep a reference to it
so that the object can be returned to the pool once it is no longer
used. An object holding this reference can have a long life cycle that
may survive a glfs_fini() call.
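
For illustration, a minimal sketch of this back-reference pattern
(simplified field names, not the exact glusterfs definitions): every
pooled object carries a header recording the pool_list it came from,
which is what mem_put() later uses to return it:

    struct per_thread_pool_list;

    /* Simplified object header: 'pool_list' is the reference that can
     * outlive glfs_fini() for as long as the object itself is alive. */
    typedef struct pooled_obj_hdr {
        struct pooled_obj_hdr *next;            /* hot/cold list linkage */
        struct per_thread_pool_list *pool_list; /* owning per-thread pools */
        unsigned int power_of_two;              /* size class */
    } pooled_obj_hdr_t;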

This means that it's unsafe to destroy memory pools from glfs_fini().

Another side effect of destroying memory pools from glfs_fini() is that
the TLS variable pointing to one of those pools cannot be reset for all
threads that are still alive. This means that any attempt to allocate
memory from those threads will access already-freed memory, which is
very dangerous.
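
The cleanup mechanism in this patch relies on a standard pthread TLS
guarantee: a destructor registered with pthread_key_create() only runs
at thread exit if the key's current value is non-NULL. A standalone
sketch of that contract (cleanup(), init_free_key() and mark_thread()
are illustrative names; the patch uses glusterfs_cleanup() and
gf_thread_needs_cleanup()):

    #include <pthread.h>

    static pthread_key_t free_key;

    /* Invoked at thread exit, but only when the value stored under
     * free_key is non-NULL. */
    static void
    cleanup(void *value)
    {
        /* release per-thread resources here */
    }

    /* Called once at process start. */
    static void
    init_free_key(void)
    {
        (void)pthread_key_create(&free_key, cleanup);
    }

    /* Called whenever a thread allocates something needing cleanup. */
    static void
    mark_thread(void)
    {
        /* The value is never read; it only has to be non-NULL so that
         * cleanup() runs when the thread terminates. */
        (void)pthread_setspecific(free_key, (void *)1);
    }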

To fix these issues, mem_pools_fini() no longer destroys the pool
lists. The pools are destroyed only at process termination.
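
The process-termination cleanup is done with a GCC/Clang destructor
function (mem_pools_postfini() in the diff below). A minimal sketch of
the mechanism, assuming no thread uses the library once it runs:

    /* Runs automatically at process exit (or dlclose()); by then no
     * thread may use the library anymore, so freeing every pool_list
     * left on the global lists is safe. */
    static __attribute__((destructor)) void
    mem_pools_postfini(void)
    {
        /* drain pool_threads via mem_pool_thread_destructor(), then
         * free the entries remaining on pool_free_threads */
    }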

Upstream patch:
> Upstream patch link: https://review.gluster.org/c/glusterfs/+/24099
> Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
> Fixes: bz#1801684
> Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>

Change-Id: Ib189a5510ab6bdac78983c6c65a022e9634b0965
BUG: 1800703
Signed-off-by: Xavi Hernandez <xhernandez@redhat.com>
Reviewed-on: https://code.engineering.redhat.com/gerrit/192262
Tested-by: RHGS Build Bot <nigelb@redhat.com>
Reviewed-by: Sunil Kumar Heggodu Gopala Acharya <sheggodu@redhat.com>
---
 libglusterfs/src/globals.c            |  13 ++-
 libglusterfs/src/glusterfs/globals.h  |   3 +
 libglusterfs/src/glusterfs/mem-pool.h |  28 ++---
 libglusterfs/src/mem-pool.c           | 201 ++++++++++++++++++----------------
 libglusterfs/src/syncop.c             |   7 ++
 5 files changed, 146 insertions(+), 106 deletions(-)

diff --git a/libglusterfs/src/globals.c b/libglusterfs/src/globals.c
index 02098e6..e433ee8 100644
--- a/libglusterfs/src/globals.c
+++ b/libglusterfs/src/globals.c
@@ -319,7 +319,18 @@ glusterfs_cleanup(void *ptr)
         GF_FREE(thread_syncopctx.groups);
     }
 
-    mem_pool_thread_destructor();
+    mem_pool_thread_destructor(NULL);
+}
+
+void
+gf_thread_needs_cleanup(void)
+{
+    /* The value stored in free_key TLS is not really used for anything, but
+     * the pthread implementation doesn't call the TLS destructor unless
+     * it's != NULL. This function must be called whenever something is
+     * allocated for this thread so that glusterfs_cleanup() will be called
+     * and resources can be released. */
+    (void)pthread_setspecific(free_key, (void *)1);
 }
 
 static void
diff --git a/libglusterfs/src/glusterfs/globals.h b/libglusterfs/src/glusterfs/globals.h
index e218285..31717ed 100644
--- a/libglusterfs/src/glusterfs/globals.h
+++ b/libglusterfs/src/glusterfs/globals.h
@@ -181,6 +181,9 @@ glusterfs_leaseid_exist(void);
 int
 glusterfs_globals_init(glusterfs_ctx_t *ctx);
 
+void
+gf_thread_needs_cleanup(void);
+
 struct tvec_base *
 glusterfs_ctx_tw_get(glusterfs_ctx_t *ctx);
 void
diff --git a/libglusterfs/src/glusterfs/mem-pool.h b/libglusterfs/src/glusterfs/mem-pool.h
index be0a26d..97bf76c 100644
--- a/libglusterfs/src/glusterfs/mem-pool.h
+++ b/libglusterfs/src/glusterfs/mem-pool.h
@@ -245,24 +245,26 @@ typedef struct per_thread_pool {
 } per_thread_pool_t;
 
 typedef struct per_thread_pool_list {
-    /*
-     * These first two members are protected by the global pool lock.  When
-     * a thread first tries to use any pool, we create one of these.  We
-     * link it into the global list using thr_list so the pool-sweeper
-     * thread can find it, and use pthread_setspecific so this thread can
-     * find it.  When the per-thread destructor runs, we "poison" the pool
-     * list to prevent further allocations.  This also signals to the
-     * pool-sweeper thread that the list should be detached and freed after
-     * the next time it's swept.
-     */
+    /* thr_list is used to place the TLS pool_list into the active global list
+     * (pool_threads) or the inactive global list (pool_free_threads). It's
+     * protected by the global pool_lock. */
     struct list_head thr_list;
-    unsigned int poison;
+
+    /* This lock is used to update poison and the hot/cold lists of members
+     * of 'pools' array. */
+    pthread_spinlock_t lock;
+
+    /* This field is used to mark a pool_list as not being owned by any thread.
+     * This means that the sweeper thread won't be cleaning objects stored in
+     * its pools. mem_put() uses it to decide if the object being released is
+     * placed into its original pool_list or directly destroyed. */
+    bool poison;
+
     /*
      * There's really more than one pool, but the actual number is hidden
      * in the implementation code so we just make it a single-element array
      * here.
      */
-    pthread_spinlock_t lock;
     per_thread_pool_t pools[1];
 } per_thread_pool_list_t;
 
@@ -307,7 +309,7 @@ void
 mem_pool_destroy(struct mem_pool *pool);
 
 void
-mem_pool_thread_destructor(void);
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list);
 
 void
 gf_mem_acct_enable_set(void *ctx);
diff --git a/libglusterfs/src/mem-pool.c b/libglusterfs/src/mem-pool.c
index d88041d..2b41c01 100644
--- a/libglusterfs/src/mem-pool.c
+++ b/libglusterfs/src/mem-pool.c
@@ -367,7 +367,6 @@ static __thread per_thread_pool_list_t *thread_pool_list = NULL;
 #define POOL_SWEEP_SECS 30
 
 typedef struct {
-    struct list_head death_row;
     pooled_obj_hdr_t *cold_lists[N_COLD_LISTS];
     unsigned int n_cold_lists;
 } sweep_state_t;
@@ -384,36 +383,33 @@ static pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
 static unsigned int init_count = 0;
 static pthread_t sweeper_tid;
 
-gf_boolean_t
+static bool
 collect_garbage(sweep_state_t *state, per_thread_pool_list_t *pool_list)
 {
     unsigned int i;
     per_thread_pool_t *pt_pool;
-    gf_boolean_t poisoned;
 
     (void)pthread_spin_lock(&pool_list->lock);
 
-    poisoned = pool_list->poison != 0;
-    if (!poisoned) {
-        for (i = 0; i < NPOOLS; ++i) {
-            pt_pool = &pool_list->pools[i];
-            if (pt_pool->cold_list) {
-                if (state->n_cold_lists >= N_COLD_LISTS) {
-                    break;
-                }
-                state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
+    for (i = 0; i < NPOOLS; ++i) {
+        pt_pool = &pool_list->pools[i];
+        if (pt_pool->cold_list) {
+            if (state->n_cold_lists >= N_COLD_LISTS) {
+                (void)pthread_spin_unlock(&pool_list->lock);
+                return true;
             }
-            pt_pool->cold_list = pt_pool->hot_list;
-            pt_pool->hot_list = NULL;
+            state->cold_lists[state->n_cold_lists++] = pt_pool->cold_list;
         }
+        pt_pool->cold_list = pt_pool->hot_list;
+        pt_pool->hot_list = NULL;
     }
 
     (void)pthread_spin_unlock(&pool_list->lock);
 
-    return poisoned;
+    return false;
 }
 
-void
+static void
 free_obj_list(pooled_obj_hdr_t *victim)
 {
     pooled_obj_hdr_t *next;
@@ -425,82 +421,96 @@ free_obj_list(pooled_obj_hdr_t *victim)
     }
 }
 
-void *
+static void *
 pool_sweeper(void *arg)
 {
     sweep_state_t state;
     per_thread_pool_list_t *pool_list;
-    per_thread_pool_list_t *next_pl;
-    per_thread_pool_t *pt_pool;
-    unsigned int i;
-    gf_boolean_t poisoned;
+    uint32_t i;
+    bool pending;
 
     /*
      * This is all a bit inelegant, but the point is to avoid doing
      * expensive things (like freeing thousands of objects) while holding a
-     * global lock.  Thus, we split each iteration into three passes, with
+     * global lock.  Thus, we split each iteration into two passes, with
      * only the first and fastest holding the lock.
      */
 
+    pending = true;
+
     for (;;) {
-        sleep(POOL_SWEEP_SECS);
+        /* If we know there's pending work to do (or it's the first run),
+         * collect garbage more often. */
+        sleep(pending ? POOL_SWEEP_SECS / 5 : POOL_SWEEP_SECS);
+
         (void)pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL);
-        INIT_LIST_HEAD(&state.death_row);
         state.n_cold_lists = 0;
+        pending = false;
 
         /* First pass: collect stuff that needs our attention. */
         (void)pthread_mutex_lock(&pool_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &pool_threads, thr_list)
+        list_for_each_entry(pool_list, &pool_threads, thr_list)
         {
-            (void)pthread_mutex_unlock(&pool_lock);
-            poisoned = collect_garbage(&state, pool_list);
-            (void)pthread_mutex_lock(&pool_lock);
-
-            if (poisoned) {
-                list_move(&pool_list->thr_list, &state.death_row);
+            if (collect_garbage(&state, pool_list)) {
+                pending = true;
             }
         }
         (void)pthread_mutex_unlock(&pool_lock);
 
-        /* Second pass: free dead pools. */
-        (void)pthread_mutex_lock(&pool_free_lock);
-        list_for_each_entry_safe(pool_list, next_pl, &state.death_row, thr_list)
-        {
-            for (i = 0; i < NPOOLS; ++i) {
-                pt_pool = &pool_list->pools[i];
-                free_obj_list(pt_pool->cold_list);
-                free_obj_list(pt_pool->hot_list);
-                pt_pool->hot_list = pt_pool->cold_list = NULL;
-            }
-            list_del(&pool_list->thr_list);
-            list_add(&pool_list->thr_list, &pool_free_threads);
-        }
-        (void)pthread_mutex_unlock(&pool_free_lock);
-
-        /* Third pass: free cold objects from live pools. */
+        /* Second pass: free cold objects from live pools. */
         for (i = 0; i < state.n_cold_lists; ++i) {
             free_obj_list(state.cold_lists[i]);
         }
         (void)pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
     }
+
+    return NULL;
 }
 
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
-    per_thread_pool_list_t *pool_list = thread_pool_list;
-
-    /* The pool-sweeper thread will take it from here.
-     *
-     * We can change 'poison' here without taking locks because the change
-     * itself doesn't interact with other parts of the code and a simple write
-     * is already atomic from the point of view of the processor.
-     *
-     * This change can modify what mem_put() does, but both possibilities are
-     * fine until the sweeper thread kicks in. The real synchronization must be
-     * between mem_put() and the sweeper thread. */
+    per_thread_pool_t *pt_pool;
+    uint32_t i;
+
+    if (pool_list == NULL) {
+        pool_list = thread_pool_list;
+    }
+
+    /* The current thread is terminating. None of the allocated objects will
+     * be used again. We can directly destroy them here instead of delaying
+     * it until the next sweeper loop. */
     if (pool_list != NULL) {
-        pool_list->poison = 1;
+        /* Remove pool_list from the global list so that the sweeper
+         * cannot touch it. */
+        pthread_mutex_lock(&pool_lock);
+        list_del(&pool_list->thr_list);
+        pthread_mutex_unlock(&pool_lock);
+
+        /* We need to protect hot/cold changes from potential mem_put() calls
+         * that reference this pool_list. Once poison is set to true, we are
+         * sure that no one else will touch the hot/cold lists. The only
+         * possible race is a concurrent mem_put() adding a new item to the
+         * hot list; we protect against that by taking pool_list->lock.
+         * After that we don't need the lock to destroy the hot/cold lists. */
+        pthread_spin_lock(&pool_list->lock);
+        pool_list->poison = true;
+        pthread_spin_unlock(&pool_list->lock);
+
+        for (i = 0; i < NPOOLS; i++) {
+            pt_pool = &pool_list->pools[i];
+
+            free_obj_list(pt_pool->hot_list);
+            pt_pool->hot_list = NULL;
+
+            free_obj_list(pt_pool->cold_list);
+            pt_pool->cold_list = NULL;
+        }
+
+        pthread_mutex_lock(&pool_free_lock);
+        list_add(&pool_list->thr_list, &pool_free_threads);
+        pthread_mutex_unlock(&pool_free_lock);
+
         thread_pool_list = NULL;
     }
 }
@@ -528,6 +538,30 @@ mem_pools_preinit(void)
     init_done = GF_MEMPOOL_INIT_EARLY;
 }
 
+static __attribute__((destructor)) void
+mem_pools_postfini(void)
+{
+    per_thread_pool_list_t *pool_list, *next;
+
+    /* This is part of a process shutdown (or dlclose()), which means that
+     * most probably all threads should already be stopped. However, this is
+     * not the case for gluster, and there are even legitimate situations in
+     * which some threads could still be alive. What is sure is that none of
+     * those threads should be using anything from this library, so destroying
+     * everything here should be fine and safe. */
+
+    list_for_each_entry_safe(pool_list, next, &pool_threads, thr_list)
+    {
+        mem_pool_thread_destructor(pool_list);
+    }
+
+    list_for_each_entry_safe(pool_list, next, &pool_free_threads, thr_list)
+    {
+        list_del(&pool_list->thr_list);
+        FREE(pool_list);
+    }
+}
+
 /* Call mem_pools_init() once threading has been configured completely. This
  * prevent the pool_sweeper thread from getting killed once the main() thread
  * exits during deamonizing. */
@@ -560,10 +594,6 @@ mem_pools_fini(void)
              */
             break;
         case 1: {
-            per_thread_pool_list_t *pool_list;
-            per_thread_pool_list_t *next_pl;
-            unsigned int i;
-
             /* if mem_pools_init() was not called, sweeper_tid will be invalid
              * and the functions will error out. That is not critical. In all
              * other cases, the sweeper_tid will be valid and the thread gets
@@ -571,32 +601,11 @@ mem_pools_fini(void)
             (void)pthread_cancel(sweeper_tid);
             (void)pthread_join(sweeper_tid, NULL);
 
-            /* At this point all threads should have already terminated, so
-             * it should be safe to destroy all pending per_thread_pool_list_t
-             * structures that are stored for each thread. */
-            mem_pool_thread_destructor();
-
-            /* free all objects from all pools */
-            list_for_each_entry_safe(pool_list, next_pl, &pool_threads,
-                                     thr_list)
-            {
-                for (i = 0; i < NPOOLS; ++i) {
-                    free_obj_list(pool_list->pools[i].hot_list);
-                    free_obj_list(pool_list->pools[i].cold_list);
-                    pool_list->pools[i].hot_list = NULL;
-                    pool_list->pools[i].cold_list = NULL;
-                }
-
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
-
-            list_for_each_entry_safe(pool_list, next_pl, &pool_free_threads,
-                                     thr_list)
-            {
-                list_del(&pool_list->thr_list);
-                FREE(pool_list);
-            }
+            /* There could be threads still running in some cases, so we can't
+             * destroy pool_lists in use. We can also not destroy unused
+             * pool_lists because some allocated objects may still be pointing
+             * to them. */
+            mem_pool_thread_destructor(NULL);
 
             init_done = GF_MEMPOOL_INIT_DESTROY;
             /* Fall through. */
@@ -617,7 +626,7 @@ mem_pools_fini(void)
 {
 }
 void
-mem_pool_thread_destructor(void)
+mem_pool_thread_destructor(per_thread_pool_list_t *pool_list)
 {
 }
 
@@ -738,13 +747,21 @@ mem_get_pool_list(void)
         }
     }
 
+    /* There's no need to take pool_list->lock, because this is already an
+     * atomic operation and we don't need to synchronize it with any change
+     * in hot/cold lists. */
+    pool_list->poison = false;
+
     (void)pthread_mutex_lock(&pool_lock);
-    pool_list->poison = 0;
     list_add(&pool_list->thr_list, &pool_threads);
     (void)pthread_mutex_unlock(&pool_lock);
 
     thread_pool_list = pool_list;
 
+    /* Ensure that all memory objects associated with the new pool_list are
+     * destroyed when the thread terminates. */
+    gf_thread_needs_cleanup();
+
     return pool_list;
 }
 
diff --git a/libglusterfs/src/syncop.c b/libglusterfs/src/syncop.c
index 2eb7b49..0de53c6 100644
--- a/libglusterfs/src/syncop.c
+++ b/libglusterfs/src/syncop.c
@@ -97,6 +97,13 @@ syncopctx_setfsgroups(int count, const void *groups)
 
     /* set/reset the ngrps, this is where reset of groups is handled */
     opctx->ngrps = count;
+
+    if ((opctx->valid & SYNCOPCTX_GROUPS) == 0) {
+        /* This is the first time we are storing groups into the TLS structure
+         * so we mark the current thread so that it will be properly cleaned
+         * up when the thread terminates. */
+        gf_thread_needs_cleanup();
+    }
     opctx->valid |= SYNCOPCTX_GROUPS;
 
 out:
-- 
1.8.3.1