summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/session/session_compact.c
blob: 1da742f95ac697adac2633a4d472f9f66e3b10ab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Compaction is the place where the underlying block manager becomes visible
 * in the higher engine btree and API layers.  As there is currently only one
 * block manager, this code is written with it in mind: other block managers
 * may need changes to support compaction, and a smart block manager might need
 * far less support from the engine.
 *
 * First, the default block manager cannot entirely own compaction because it
 * has no way to find a block after it moves other than a request from the
 * btree layer with the new address.  In other words, if internal page X points
 * to leaf page Y, and page Y moves, the address of page Y has to be updated in
 * page X.  Generally, this is solved by building a translation layer in the
 * block manager so internal pages don't require updates to relocate blocks:
 * however, the translation table must be durable, has its own garbage
 * collection issues and might be slower, all of which have their own problems.
 *
 * Second, the btree layer cannot entirely own compaction because page
 * addresses are opaque, it cannot know where a page is in the file from the
 * address cookie.
 *
 * For these reasons, compaction is a cooperative process between the btree
 * layer and the block manager.  The btree layer walks files, and asks the
 * block manager if rewriting a particular block would reduce the file
 * footprint: if writing the page will help, the page is marked dirty so it
 * will eventually be written.  As pages are written, the original page
 * potentially becomes available for reuse and if enough pages at the end of
 * the file are available for reuse, the file can be truncated, and compaction
 * succeeds.
 *
 * However, writing a page is not by itself sufficient to make a page available
 * for reuse.  The original version of the page is still referenced by at least
 * the most recent checkpoint in the file.  To make a page available for reuse,
 * we have to checkpoint the file so we can discard the checkpoint referencing
 * the original version of the block; once no checkpoint references a block, it
 * becomes available for reuse.
 *
 * Compaction is not necessarily possible in WiredTiger, even in a file with
 * lots of available space.  If a block at the end of the file is referenced by
 * a named checkpoint, there is nothing we can do to compact the file, no
 * matter how many times we rewrite the block, the named checkpoint can't be
 * discarded and so the reference count on the original block will never go to
 * zero. What's worse, because the block manager doesn't reference count
 * blocks, it can't easily know this is the case, and so we'll waste a lot of
 * effort trying to compact files that can't be compacted.
 *
 * Finally, compaction checkpoints are database-wide, otherwise we can corrupt
 * file relationships, for example, an index checkpointed by compaction could
 * be out of sync with the primary after a crash.
 *
 * Now, to the actual process.  First, we checkpoint the database: there are
 * potentially many dirty blocks in the cache, and we want to write them out
 * and then discard previous checkpoints so we have as many blocks as possible
 * on the file's "available for reuse" list when we start compaction.
 *
 * Then, we compact the high-level object.
 *
 * Compacting the object is done 10% at a time, that is, we try and move blocks
 * from the last 10% of the file into the beginning of the file (the 10% is
 * hard coded in the block manager).  The reason for this is because we are
 * walking the file in logical order, not block offset order, and we can fail
 * to compact a file if we write the wrong blocks first.
 *
 * For example, imagine a file with 10 blocks in the first 10% of a file, 1,000
 * blocks in the 3rd quartile of the file, and 10 blocks in the last 10% of the
 * file.  If we were to rewrite blocks from more than the last 10% of the file,
 * and found the 1,000 blocks in the 3rd quartile of the file first, we'd copy
 * 10 of them without ever rewriting the blocks from the end of the file which
 * would allow us to compact the file.  So, we compact the last 10% of the
 * file, and if that works, we compact the last 10% of the file again, and so
 * on.  Note the block manager uses a first-fit block selection algorithm
 * during compaction to maximize block movement.
 *
 * After each 10% compaction, we checkpoint two more times (seriously, twice).
 * The second and third checkpoints are because the block manager checkpoints
 * in two steps: blocks made available for reuse during a checkpoint are put on
 * a special checkpoint-available list and only moved to the real available
 * list after the metadata has been updated with the new checkpoint's
 * information.  (Otherwise it is possible to allocate a rewritten block, crash
 * before the metadata is updated, and see corruption.)  For this reason,
 * blocks allocated to write the checkpoint itself cannot be taken from the
 * blocks made available by the checkpoint.
 *
 * To say it another way, the second checkpoint puts the blocks from the end of
 * the file that were made available by compaction onto the checkpoint-available
 * list, but then potentially writes the checkpoint itself at the end of the
 * file, which would prevent any file truncation.  When the metadata is updated
 * for the second checkpoint, the blocks freed by compaction become available
 * for the third checkpoint, so the third checkpoint's blocks are written
 * towards the beginning of the file, and then the file can be truncated.
 */

/*
 * __compact_start --
 *     Start object compaction.
 */
static int
__compact_start(WT_SESSION_IMPL *session)
{
    /* Hand the current btree's block manager the start-of-compaction notification. */
    WT_BM *blkmgr;

    blkmgr = S2BT(session)->bm;
    return (blkmgr->compact_start(blkmgr, session));
}

/*
 * __compact_end --
 *     End object compaction.
 */
static int
__compact_end(WT_SESSION_IMPL *session)
{
    /* Hand the current btree's block manager the end-of-compaction notification. */
    WT_BM *blkmgr;

    blkmgr = S2BT(session)->bm;
    return (blkmgr->compact_end(blkmgr, session));
}

/*
 * __compact_uri_analyze --
 *     Extract information relevant to deciding what work compact needs to do from a URI that is
 *     part of a table schema. Called via the schema_worker function.
 */
static int
__compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
{
    /* Tiered storage does not support compaction. */
    if (WT_PREFIX_MATCH(uri, "tiered:"))
        return (ENOTSUP);

    /*
     * Count the schema URI objects to be compacted, by type. LSM trees are counted separately and
     * their subtree is skipped: descending into one would produce false positives on the "file:"
     * URIs of the individual chunks.
     */
    if (WT_PREFIX_MATCH(uri, "lsm:")) {
        session->compact->lsm_count++;
        *skipp = true;
    } else if (WT_PREFIX_MATCH(uri, "file:"))
        session->compact->file_count++;

    return (0);
}

/*
 * __compact_handle_append --
 *     Gather a file handle to be compacted. Called via the schema_worker function.
 */
static int
__compact_handle_append(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_DECL_RET;

    WT_UNUSED(cfg);

    WT_ASSERT_SPINLOCK_OWNED(session, &S2C(session)->schema_lock);

    /* Pin the data handle so it remains open for the duration of compaction. */
    WT_RET(__wt_session_get_dhandle(session, session->dhandle->name, NULL, NULL, 0));

    /* Mark compaction active on the handle; on failure, drop the reference we just took. */
    ret = __compact_start(session);
    if (ret != 0) {
        WT_TRET(__wt_session_release_dhandle(session));
        return (ret);
    }

    /* Grow the handle array if necessary and record the handle. */
    WT_RET(__wt_realloc_def(
      session, &session->op_handle_allocated, session->op_handle_next + 1, &session->op_handle));
    session->op_handle[session->op_handle_next++] = session->dhandle;

    return (0);
}

/*
 * __wt_session_compact_check_timeout --
 *     Check if the timeout has been exceeded; returns ETIMEDOUT if so, 0 otherwise.
 */
int
__wt_session_compact_check_timeout(WT_SESSION_IMPL *session)
{
    struct timespec now;
    uint64_t elapsed;

    /* A configured timeout of zero means compaction may run indefinitely. */
    if (session->compact->max_time == 0)
        return (0);

    __wt_epoch(session, &now);
    elapsed = WT_TIMEDIFF_SEC(now, session->compact->begin);
    if (elapsed < session->compact->max_time)
        return (0);

    /* Out of time: count the timeout and report it before giving up. */
    WT_STAT_CONN_INCR(session, session_table_compact_timeout);
    __wt_verbose_info(session, WT_VERB_COMPACT,
      "Compact has timed out! The operation has been running for %" PRIu64
      " second(s). Configured timeout is %" PRIu64 " second(s).",
      elapsed, session->compact->max_time);
    return (ETIMEDOUT);
}

/*
 * __compact_checkpoint --
 *     Perform a forced checkpoint on behalf of compaction, honoring the compaction timeout.
 */
static int
__compact_checkpoint(WT_SESSION_IMPL *session)
{
    /*
     * The checkpoint must be forced: compaction depends on work done in the underlying block
     * manager, so skipping the checkpoint is never acceptable.
     */
    const char *ckpt_cfg[] = {WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL};

    /* Checkpoints are expensive; confirm there is still time left before starting one. */
    WT_RET(__wt_session_compact_check_timeout(session));

    return (__wt_txn_checkpoint(session, ckpt_cfg, true));
}

/*
 * __compact_worker --
 *     Function to alternate between checkpoints and compaction calls. Drives compaction over the
 *     handles gathered in session->op_handle, looping until no handle makes progress, the loop
 *     limit is reached, or an error (including timeout) occurs.
 */
static int
__compact_worker(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;
    u_int i, loop;
    bool another_pass;

    /*
     * Reset the handles' compaction skip flag (we don't bother setting or resetting it when we
     * finish compaction, it's simpler to do it once, here).
     */
    for (i = 0; i < session->op_handle_next; ++i)
        session->op_handle[i]->compact_skip = false;

    /*
     * Perform an initial checkpoint (see this file's leading comment for details).
     */
    WT_ERR(__compact_checkpoint(session));

    /*
     * We compact 10% of a file on each pass (but the overall size of the file is decreasing each
     * time, so we're not compacting 10% of the original file each time). Try 100 times (which is
     * clearly more than we need); quit if we make no progress.
     */
    for (loop = 0; loop < 100; ++loop) {
        /* Step through the list of files being compacted. */
        for (another_pass = false, i = 0; i < session->op_handle_next; ++i) {
            /* Skip objects where there's no more work. */
            if (session->op_handle[i]->compact_skip)
                continue;

            __wt_timing_stress(session, WT_TIMING_STRESS_COMPACT_SLOW, NULL);
            __wt_verbose_debug2(
              session, WT_VERB_COMPACT, "%s: compact pass %u", session->op_handle[i]->name, loop);

            /* Run the btree-level compaction pass with this handle set on the session. */
            session->compact_state = WT_COMPACT_RUNNING;
            WT_WITH_DHANDLE(session, session->op_handle[i], ret = __wt_compact(session));
            WT_ERR_ERROR_OK(ret, EBUSY, true);
            /*
             * If successful and we did work, schedule another pass. If successful and we did no
             * work, skip this file in the future.
             */
            if (ret == 0) {
                if (session->compact_state == WT_COMPACT_SUCCESS)
                    another_pass = true;
                else
                    session->op_handle[i]->compact_skip = true;
                continue;
            }

            /*
             * If compaction failed because checkpoint was running, continue with the next handle.
             * We might continue to race with checkpoint on each handle, but that's OK, we'll step
             * through all the handles, and then we'll block until a checkpoint completes.
             *
             * Just quit if eviction is the problem.
             */
            if (ret == EBUSY) {
                if (__wt_cache_stuck(session)) {
                    WT_STAT_CONN_INCR(session, session_table_compact_fail_cache_pressure);
                    WT_ERR_MSG(session, EBUSY, "compaction halted by eviction pressure");
                }
                /* Clear the error and retry this handle on the next pass. */
                ret = 0;
                another_pass = true;

                __wt_verbose_info(session, WT_VERB_COMPACT, "%s",
                  "Data handle compaction failed with EBUSY but the cache is not stuck. "
                  "Will give it another go.");
            }
        }
        if (!another_pass)
            break;

        /*
         * Perform two checkpoints (see this file's leading comment for details).
         */
        WT_ERR(__compact_checkpoint(session));
        WT_ERR(__compact_checkpoint(session));
    }

err:
    /* Always leave the session's compaction state machine idle, success or failure. */
    session->compact_state = WT_COMPACT_NONE;

    return (ret);
}

/*
 * __wt_session_compact --
 *     WT_SESSION.compact method. Validates the URI and configuration, gathers the file handles
 *     underlying the object, then alternates checkpoints and compaction passes via
 *     __compact_worker. Custom data sources are dispatched to their own compact callback.
 */
int
__wt_session_compact(WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_COMPACT_STATE compact; /* Per-call compaction state, lives on the stack. */
    WT_CONFIG_ITEM cval;
    WT_DATA_SOURCE *dsrc;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    u_int i;
    bool ignore_cache_size_set;

    ignore_cache_size_set = false;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, compact, config, cfg);

    WT_STAT_CONN_SET(session, session_table_compact_running, 1);

    __wt_verbose_debug1(session, WT_VERB_COMPACT, "Compacting %s", uri);

    /*
     * The compaction thread should not block when the cache is full: it is holding locks blocking
     * checkpoints and once the cache is full, it can spend a long time doing eviction.
     */
    if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
        /* Remember we set the flag so only this call clears it on the way out. */
        ignore_cache_size_set = true;
        F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
    }

    /* In-memory ignores compaction operations. */
    if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) {
        __wt_verbose_warning(
          session, WT_VERB_COMPACT, "%s", "Compact does not work for in-memory databases.");
        goto err;
    }

    /*
     * Non-LSM object compaction requires checkpoints, which are impossible in transactional
     * contexts. Disallow in all contexts (there's no reason for LSM to allow this, possible or
     * not), and check now so the error message isn't confusing.
     */
    WT_ERR(__wt_txn_context_check(session, false));

    /* Disallow objects in the WiredTiger name space. */
    WT_ERR(__wt_str_name_check(session, uri));

    /*
     * URIs that aren't one of the built-in object types are either a custom data source (which
     * handles its own compaction) or invalid.
     */
    if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") &&
      !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "lsm:") &&
      !WT_PREFIX_MATCH(uri, "table:") && !WT_PREFIX_MATCH(uri, "tiered:")) {
        if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
            ret = dsrc->compact == NULL ?
              __wt_object_unsupported(session, uri) :
              dsrc->compact(dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg);
        else
            ret = __wt_bad_object_type(session, uri);
        goto err;
    }

    /* Setup the session handle's compaction state structure. */
    memset(&compact, 0, sizeof(WT_COMPACT_STATE));
    session->compact = &compact;

    /* Compaction can be time-limited. */
    WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
    session->compact->max_time = (uint64_t)cval.val;
    __wt_epoch(session, &session->compact->begin);

    /*
     * Find the types of data sources being compacted. This could involve opening indexes for a
     * table, so acquire the table lock in write mode.
     */
    WT_WITH_SCHEMA_LOCK(session,
      WT_WITH_TABLE_WRITE_LOCK(session,
        ret = __wt_schema_worker(
          session, uri, __compact_handle_append, __compact_uri_analyze, cfg, 0)));
    WT_ERR(ret);

    /* LSM trees compact themselves; gathered file handles go through the checkpoint loop. */
    if (session->compact->lsm_count != 0)
        WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_lsm_compact, cfg, 0));
    if (session->compact->file_count != 0)
        WT_ERR(__compact_worker(session));

err:
    /* The compaction state is stack-allocated; clear the session's pointer before returning. */
    session->compact = NULL;

    /* End compaction and release every handle gathered by __compact_handle_append. */
    for (i = 0; i < session->op_handle_next; ++i) {
        WT_WITH_DHANDLE(session, session->op_handle[i], WT_TRET(__compact_end(session)));
        WT_WITH_DHANDLE(
          session, session->op_handle[i], WT_TRET(__wt_session_release_dhandle(session)));
    }

    __wt_free(session, session->op_handle);
    session->op_handle_allocated = session->op_handle_next = 0;

    /*
     * Release common session resources (for example, checkpoint may acquire significant
     * reconciliation structures/memory).
     */
    WT_TRET(__wt_session_release_resources(session));

    if (ignore_cache_size_set)
        F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);

    /* Record success or failure, and mark compaction as no longer running. */
    if (ret != 0)
        WT_STAT_CONN_INCR(session, session_table_compact_fail);
    else
        WT_STAT_CONN_INCR(session, session_table_compact_success);
    WT_STAT_CONN_SET(session, session_table_compact_running, 0);

    /* Map prepare-conflict to rollback. */
    if (ret == WT_PREPARE_CONFLICT)
        ret = WT_ROLLBACK;

    API_END_RET_NOTFOUND_MAP(session, ret);
}

/*
 * __wt_session_compact_readonly --
 *     WT_SESSION.compact method; readonly version.
 */
int
__wt_session_compact_readonly(WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    WT_UNUSED(uri);
    WT_UNUSED(config);

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL_NOCONF(session, compact);

    /* Compaction rewrites blocks, which a readonly connection can never do. */
    ret = __wt_session_notsup(session);
    WT_STAT_CONN_INCR(session, session_table_compact_fail);
err:
    API_END_RET(session, ret);
}