summaryrefslogtreecommitdiff
path: root/lib/ovs-rcu.c
blob: 946aa04d18e4ff1aae01fe04ea0cd51411ca4723 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
/*
 * Copyright (c) 2014, 2017 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include <errno.h>
#include "ovs-rcu.h"
#include "fatal-signal.h"
#include "guarded-list.h"
#include "latch.h"
#include "openvswitch/list.h"
#include "ovs-thread.h"
#include "openvswitch/poll-loop.h"
#include "seq.h"
#include "timeval.h"
#include "util.h"
#include "openvswitch/vlog.h"

VLOG_DEFINE_THIS_MODULE(ovs_rcu);

#define MIN_CBS 16

struct ovsrcu_cb {
    void (*function)(void *aux);
    void *aux;
};

struct ovsrcu_cbset {
    struct ovs_list list_node;
    struct ovsrcu_cb *cbs;
    size_t n_allocated;
    int n_cbs;
};

struct ovsrcu_perthread {
    struct ovs_list list_node;  /* In global list. */

    uint64_t seqno;
    struct ovsrcu_cbset *cbset;
    char name[16];              /* This thread's name. */
};

static struct seq *global_seqno;

static pthread_key_t perthread_key;
static struct ovs_list ovsrcu_threads;
static struct ovs_mutex ovsrcu_threads_mutex;

static struct guarded_list flushed_cbsets;
static struct seq *flushed_cbsets_seq;

static struct latch postpone_exit;
static struct ovs_barrier postpone_barrier;

static void ovsrcu_init_module(void);
static void ovsrcu_flush_cbset__(struct ovsrcu_perthread *, bool);
static void ovsrcu_flush_cbset(struct ovsrcu_perthread *);
static void ovsrcu_unregister__(struct ovsrcu_perthread *);
static bool ovsrcu_call_postponed(void);
static void *ovsrcu_postpone_thread(void *arg OVS_UNUSED);

static struct ovsrcu_perthread *
ovsrcu_perthread_get(void)
{
    struct ovsrcu_perthread *perthread;

    ovsrcu_init_module();

    perthread = pthread_getspecific(perthread_key);
    if (!perthread) {
        const char *name = get_subprogram_name();

        perthread = xmalloc(sizeof *perthread);
        perthread->seqno = seq_read(global_seqno);
        perthread->cbset = NULL;
        ovs_strlcpy(perthread->name, name[0] ? name : "main",
                    sizeof perthread->name);

        ovs_mutex_lock(&ovsrcu_threads_mutex);
        ovs_list_push_back(&ovsrcu_threads, &perthread->list_node);
        ovs_mutex_unlock(&ovsrcu_threads_mutex);

        pthread_setspecific(perthread_key, perthread);
    }
    return perthread;
}

/* Indicates the end of a quiescent state.  See "Details" near the top of
 * ovs-rcu.h.
 *
 * Quiescent states don't stack or nest, so this always ends a quiescent state
 * even if ovsrcu_quiesce_start() was called multiple times in a row. */
void
ovsrcu_quiesce_end(void)
{
    ovsrcu_perthread_get();
}

static void
ovsrcu_quiesced(void)
{
    if (single_threaded()) {
        ovsrcu_call_postponed();
    } else {
        static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
        if (ovsthread_once_start(&once)) {
            latch_init(&postpone_exit);
            ovs_barrier_init(&postpone_barrier, 2);
            ovs_thread_create("urcu", ovsrcu_postpone_thread, NULL);
            ovsthread_once_done(&once);
        }
    }
}

/* Indicates the beginning of a quiescent state.  See "Details" near the top of
 * ovs-rcu.h. */
void
ovsrcu_quiesce_start(void)
{
    struct ovsrcu_perthread *perthread;

    ovsrcu_init_module();
    perthread = pthread_getspecific(perthread_key);
    if (perthread) {
        pthread_setspecific(perthread_key, NULL);
        ovsrcu_unregister__(perthread);
    }

    ovsrcu_quiesced();
}

/* Indicates a momentary quiescent state.  See "Details" near the top of
 * ovs-rcu.h.
 *
 * Provides a full memory barrier via seq_change().
 */
void
ovsrcu_quiesce(void)
{
    struct ovsrcu_perthread *perthread;

    perthread = ovsrcu_perthread_get();
    perthread->seqno = seq_read(global_seqno);
    if (perthread->cbset) {
        ovsrcu_flush_cbset(perthread);
    }
    seq_change(global_seqno);

    ovsrcu_quiesced();
}

int
ovsrcu_try_quiesce(void)
{
    struct ovsrcu_perthread *perthread;
    int ret = EBUSY;

    ovs_assert(!single_threaded());
    perthread = ovsrcu_perthread_get();
    if (!seq_try_lock()) {
        perthread->seqno = seq_read_protected(global_seqno);
        if (perthread->cbset) {
            ovsrcu_flush_cbset__(perthread, true);
        }
        seq_change_protected(global_seqno);
        seq_unlock();
        ovsrcu_quiesced();
        ret = 0;
    }
    return ret;
}

bool
ovsrcu_is_quiescent(void)
{
    ovsrcu_init_module();
    return pthread_getspecific(perthread_key) == NULL;
}

void
ovsrcu_synchronize(void)
{
    unsigned int warning_threshold = 1000;
    uint64_t target_seqno;
    long long int start;

    if (single_threaded()) {
        return;
    }

    target_seqno = seq_read(global_seqno);
    ovsrcu_quiesce_start();
    start = time_msec();

    for (;;) {
        uint64_t cur_seqno = seq_read(global_seqno);
        struct ovsrcu_perthread *perthread;
        char stalled_thread[16];
        unsigned int elapsed;
        bool done = true;

        ovs_mutex_lock(&ovsrcu_threads_mutex);
        LIST_FOR_EACH (perthread, list_node, &ovsrcu_threads) {
            if (perthread->seqno <= target_seqno) {
                ovs_strlcpy_arrays(stalled_thread, perthread->name);
                done = false;
                break;
            }
        }
        ovs_mutex_unlock(&ovsrcu_threads_mutex);

        if (done) {
            break;
        }

        elapsed = time_msec() - start;
        if (elapsed >= warning_threshold) {
            VLOG_WARN("blocked %u ms waiting for %s to quiesce",
                      elapsed, stalled_thread);
            warning_threshold *= 2;
        }
        poll_timer_wait_until(start + warning_threshold);

        seq_wait(global_seqno, cur_seqno);
        poll_block();
    }
    ovsrcu_quiesce_end();
}

/* Waits until as many postponed callbacks as possible have executed.
 *
 * As a side effect, stops the background thread that calls the callbacks and
 * prevents it from being restarted.  This means that this function should only
 * be called soon before a process exits, as a mechanism for releasing memory
 * to make memory leaks easier to detect, since any further postponed callbacks
 * won't actually get called.
 *
 * This function can only wait for callbacks registered by the current thread
 * and the background thread that calls the callbacks.  Thus, it will be most
 * effective if other threads have already exited. */
void
ovsrcu_exit(void)
{
    /* Stop the postpone thread and wait for it to exit.  Otherwise, there's no
     * way to wait for that thread to finish calling callbacks itself. */
    if (!single_threaded()) {
        ovsrcu_quiesced();      /* Ensure that the postpone thread exists. */
        latch_set(&postpone_exit);
        ovs_barrier_block(&postpone_barrier);
    }

    /* Repeatedly:
     *
     *    - Wait for a grace period.  One important side effect is to push the
     *      running thread's cbset into 'flushed_cbsets' so that the next call
     *      has something to call.
     *
     *    - Call all the callbacks in 'flushed_cbsets'.  If there aren't any,
     *      we're done, otherwise the callbacks themselves might have requested
     *      more deferred callbacks so we go around again.
     *
     * We limit the number of iterations just in case some bug causes an
     * infinite loop.  This function is just for making memory leaks easier to
     * spot so there's no point in breaking things on that basis. */
    for (int i = 0; i < 8; i++) {
        ovsrcu_synchronize();
        if (!ovsrcu_call_postponed()) {
            break;
        }
    }
}

/* Registers 'function' to be called, passing 'aux' as argument, after the
 * next grace period.
 *
 * The call is guaranteed to happen after the next time all participating
 * threads have quiesced at least once, but there is no quarantee that all
 * registered functions are called as early as possible, or that the functions
 * registered by different threads would be called in the order the
 * registrations took place.  In particular, even if two threads provably
 * register a function each in a specific order, the functions may still be
 * called in the opposite order, depending on the timing of when the threads
 * call ovsrcu_quiesce(), how many functions they postpone, and when the
 * ovs-rcu thread happens to grab the functions to be called.
 *
 * All functions registered by a single thread are guaranteed to execute in the
 * registering order, however.
 *
 * This function is more conveniently called through the ovsrcu_postpone()
 * macro, which provides a type-safe way to allow 'function''s parameter to be
 * any pointer type. */
void
ovsrcu_postpone__(void (*function)(void *aux), void *aux)
{
    struct ovsrcu_perthread *perthread = ovsrcu_perthread_get();
    struct ovsrcu_cbset *cbset;
    struct ovsrcu_cb *cb;

    cbset = perthread->cbset;
    if (!cbset) {
        cbset = perthread->cbset = xmalloc(sizeof *perthread->cbset);
        cbset->cbs = xmalloc(MIN_CBS * sizeof *cbset->cbs);
        cbset->n_allocated = MIN_CBS;
        cbset->n_cbs = 0;
    }

    if (cbset->n_cbs == cbset->n_allocated) {
        cbset->cbs = x2nrealloc(cbset->cbs, &cbset->n_allocated,
                                sizeof *cbset->cbs);
    }

    cb = &cbset->cbs[cbset->n_cbs++];
    cb->function = function;
    cb->aux = aux;
}

static bool
ovsrcu_call_postponed(void)
{
    struct ovsrcu_cbset *cbset;
    struct ovs_list cbsets;

    guarded_list_pop_all(&flushed_cbsets, &cbsets);
    if (ovs_list_is_empty(&cbsets)) {
        return false;
    }

    ovsrcu_synchronize();

    LIST_FOR_EACH_POP (cbset, list_node, &cbsets) {
        struct ovsrcu_cb *cb;

        for (cb = cbset->cbs; cb < &cbset->cbs[cbset->n_cbs]; cb++) {
            cb->function(cb->aux);
        }
        free(cbset->cbs);
        free(cbset);
    }

    return true;
}

static void *
ovsrcu_postpone_thread(void *arg OVS_UNUSED)
{
    pthread_detach(pthread_self());

    while (!latch_is_set(&postpone_exit)) {
        uint64_t seqno = seq_read(flushed_cbsets_seq);
        if (!ovsrcu_call_postponed()) {
            seq_wait(flushed_cbsets_seq, seqno);
            latch_wait(&postpone_exit);
            poll_block();
        }
    }

    ovs_barrier_block(&postpone_barrier);
    return NULL;
}

static void
ovsrcu_flush_cbset__(struct ovsrcu_perthread *perthread, bool protected)
{
    struct ovsrcu_cbset *cbset = perthread->cbset;

    if (cbset) {
        guarded_list_push_back(&flushed_cbsets, &cbset->list_node, SIZE_MAX);
        perthread->cbset = NULL;

        if (protected) {
            seq_change_protected(flushed_cbsets_seq);
        } else {
            seq_change(flushed_cbsets_seq);
        }
    }
}

static void
ovsrcu_flush_cbset(struct ovsrcu_perthread *perthread)
{
    ovsrcu_flush_cbset__(perthread, false);
}

static void
ovsrcu_unregister__(struct ovsrcu_perthread *perthread)
{
    if (perthread->cbset) {
        ovsrcu_flush_cbset(perthread);
    }

    ovs_mutex_lock(&ovsrcu_threads_mutex);
    ovs_list_remove(&perthread->list_node);
    ovs_mutex_unlock(&ovsrcu_threads_mutex);

    free(perthread);

    seq_change(global_seqno);
}

static void
ovsrcu_thread_exit_cb(void *perthread)
{
    ovsrcu_unregister__(perthread);
}

/* Cancels the callback to ovsrcu_thread_exit_cb().
 *
 * Cancelling the call to the destructor during the main thread exit
 * is needed while using pthreads-win32 library in Windows. It has been
 * observed that in pthreads-win32, a call to the destructor during
 * main thread exit causes undefined behavior. */
static void
ovsrcu_cancel_thread_exit_cb(void *aux OVS_UNUSED)
{
    pthread_setspecific(perthread_key, NULL);
}

static void
ovsrcu_init_module(void)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
    if (ovsthread_once_start(&once)) {
        global_seqno = seq_create();
        xpthread_key_create(&perthread_key, ovsrcu_thread_exit_cb);
        fatal_signal_add_hook(ovsrcu_cancel_thread_exit_cb, NULL, NULL, true);
        ovs_list_init(&ovsrcu_threads);
        ovs_mutex_init(&ovsrcu_threads_mutex);

        guarded_list_init(&flushed_cbsets);
        flushed_cbsets_seq = seq_create();

        ovsthread_once_done(&once);
    }
}

static void
ovsrcu_barrier_func(void *seq_)
{
    struct seq *seq = (struct seq *) seq_;
    seq_change(seq);
}

/* Similar to the kernel rcu_barrier, ovsrcu_barrier waits for all outstanding
 * RCU callbacks to complete. However, unlike the kernel rcu_barrier, which
 * might return immediately if there are no outstanding RCU callbacks,
 * this API will at least wait for a grace period.
 *
 * Another issue the caller might need to know is that the barrier is just
 * for "one-shot", i.e. if inside some RCU callbacks, another RCU callback is
 * registered, this API only guarantees the first round of RCU callbacks have
 * been executed after it returns.
 */
void
ovsrcu_barrier(void)
{
    struct seq *seq = seq_create();
    /* First let all threads flush their cbsets. */
    ovsrcu_synchronize();

    /* Then register a new cbset, ensure this cbset
     * is at the tail of the global list. */
    uint64_t seqno = seq_read(seq);
    ovsrcu_postpone__(ovsrcu_barrier_func, (void *) seq);

    do {
        seq_wait(seq, seqno);
        poll_block();
    } while (seqno == seq_read(seq));

    seq_destroy(seq);
}