src/third_party/wiredtiger/src/block/block_ckpt.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033

/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __ckpt_update(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_CKPT *, WT_BLOCK_CKPT *);

/*
 * __block_extlist_setup --
 *     Initialize the extent lists in a checkpoint structure.
 */
static int
__block_extlist_setup(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    WT_RET(__wt_block_extlist_init(session, &ci->alloc, name, "alloc", false));
    WT_RET(__wt_block_extlist_init(session, &ci->avail, name, "avail", true));
    WT_RET(__wt_block_extlist_init(session, &ci->discard, name, "discard", false));
    WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, name, "ckpt_avail", true));

    return (0);
}

/*
 * __block_extlist_reset --
 *     Discard and reinitialize the extent lists in a checkpoint structure
 */
static int
__block_extlist_reset(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    __wt_block_ckpt_destroy(session, ci);
    WT_RET(__block_extlist_setup(session, ci, name));

    return (0);
}

/*
 * __wt_block_ckpt_init --
 *     Return the address cookie for the root page of a checkpoint. Also initialize extent lists if
 *     we are loading the live checkpoint in a writable file.
 */
int
__wt_block_ckpt_init(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
    WT_CLEAR(*ci);

    ci->version = WT_BM_CHECKPOINT_VERSION;
    ci->root_offset = WT_BLOCK_INVALID_OFFSET;

    return (__block_extlist_setup(session, ci, name));
}

/*
 * __wt_block_checkpoint_load --
 *     Return the address cookie for the root page of a checkpoint. Also initialize its extent lists
 *     if loading the live checkpoint from a writeable file.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block, const uint8_t *addr,
  size_t addr_size, uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
    WT_BLOCK_CKPT *ci, _ci;
    WT_DECL_RET;
    uint8_t *endp;
    bool live_open;

    /*
     * Sometimes we don't find a root page (we weren't given a checkpoint, or the checkpoint was
     * empty). In that case we return an empty root address, set that up now.
     */
    *root_addr_sizep = 0;

    ci = NULL;

    if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_1))
        __wt_ckpt_verbose(session, block, "load", NULL, addr, addr_size);

    /*
     * There's a single checkpoint in the file that can be written, all of the others are read-only.
     * We use the same initialization calls for readonly checkpoints, but the information doesn't
     * persist.
     */
    if (checkpoint) {
        ci = &_ci;
        WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
    } else {
        /*
         * We depend on the btree level for locking: things will go bad fast if we open the live
         * system in two handles, or salvage, truncate or verify the live/running file.
         */
        __wt_spin_lock(session, &block->live_lock);
        live_open = block->live_open;
        block->live_open = true;
        __wt_spin_unlock(session, &block->live_lock);
        WT_ERR_ASSERT(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE, live_open == false, EBUSY,
          "%s: attempt to re-open live file", block->name);

        ci = &block->live;
        WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
    }

    /*
     * If the checkpoint has an on-disk root page, load it. Otherwise, size the file past the
     * description information.
     */
    if (addr == NULL || addr_size == 0)
        ci->file_size = block->allocsize;
    else {
        /* Crack the checkpoint cookie. */
        WT_ERR(__wt_block_ckpt_unpack(session, block, addr, addr_size, ci));

        /* Verify sets up next. */
        if (block->verify) {
            /*
             * FIXME: We may need to change how we setup for verify when it supports tiered tables.
             * Until then, an attempt to verify a tiered table should return before getting here.
             */
            WT_ASSERT(session, block->objectid == 0 && ci->root_objectid == 0);
            WT_ERR(__wt_verify_ckpt_load(session, block, ci));
        }

        /* Read any root page. */
        if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
            /* A checkpoint shouldn't point to an object created after this one. */
            WT_ASSERT(session, block->objectid >= ci->root_objectid);

            endp = root_addr;
            WT_ERR(__wt_block_addr_pack(
              block, &endp, ci->root_objectid, ci->root_offset, ci->root_size, ci->root_checksum));
            *root_addr_sizep = WT_PTRDIFF(endp, root_addr);
        }

        if (!checkpoint) {
            /*
             * The checkpoint address may point to an earlier object. If so, the object backing this
             * block handle doesn't have valid data -- i.e., it must have been written after the
             * checkpoint we are opening. So we discard the incorrect extent lists and reinitialize
             * them to be empty.
             */
            if (block->objectid != ci->root_objectid)
                __block_extlist_reset(session, ci, "live");

            /*
             * Rolling a checkpoint forward requires the avail list, the blocks from which we can
             * allocate.
             */
            WT_ERR(__wt_block_extlist_read_avail(session, block, &ci->avail, ci->file_size));
        }
    }

    /*
     * If the object can be written, that means anything written after the checkpoint is no longer
     * interesting, truncate the file. Don't bother checking the avail list for a block at the end
     * of the file, that was done when the checkpoint was first written (re-writing the checkpoint
     * might possibly make it relevant here, but it's unlikely enough I don't bother).
     */
    if (!checkpoint)
        WT_ERR(__wt_block_truncate(session, block, ci->file_size));

    if (0) {
err:
        /*
         * Don't call checkpoint-unload: unload does real work including file truncation. If we fail
         * early enough that the checkpoint information isn't correct, bad things would happen. The
         * only allocated memory was in the service of verify, clean that up.
         */
        if (block->verify)
            WT_TRET(__wt_verify_ckpt_unload(session, block));
    }

    /* Checkpoints don't need the original information, discard it. */
    if (checkpoint)
        __wt_block_ckpt_destroy(session, ci);

    return (ret);
}

/*
 * __wt_block_checkpoint_unload --
 *     Unload a checkpoint.
 */
int
__wt_block_checkpoint_unload(WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
{
    WT_DECL_RET;

    /* Verify cleanup. */
    if (block->verify)
        WT_TRET(__wt_verify_ckpt_unload(session, block));

    /*
     * If it's the live system, truncate to discard any extended blocks and discard the active
     * extent lists. Hold the lock even though we're unloading the live checkpoint, there could be
     * readers active in other checkpoints.
     */
    if (!checkpoint) {
        WT_TRET(__wt_block_truncate(session, block, block->size));

        __wt_spin_lock(session, &block->live_lock);
        __wt_block_ckpt_destroy(session, &block->live);
        block->live_open = false;
        __wt_spin_unlock(session, &block->live_lock);
    }

    return (ret);
}

/*
 * __wt_block_ckpt_destroy --
 *     Clear a checkpoint structure. Free the extent lists, but leave the rest of the state intact
 *     in case the caller is re-using it.
 */
void
__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
{
    /*
     * We should hold the live lock here when running on the live checkpoint. But there is no easy
     * way to determine if the checkpoint is live so we cannot assert the locking here.
     */

    /* Discard the extent lists. */
    __wt_block_extlist_free(session, &ci->alloc);
    __wt_block_extlist_free(session, &ci->avail);
    __wt_block_extlist_free(session, &ci->discard);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_discard);
}

/*
 * __wt_block_checkpoint_start --
 *     Start a checkpoint.
 */
int
__wt_block_checkpoint_start(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
    WT_DECL_RET;

    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
    case WT_CKPT_PANIC_ON_FAILURE:
    case WT_CKPT_SALVAGE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint start: the checkpoint has already started or was "
          "configured for salvage",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_NONE:
        block->ckpt_state = WT_CKPT_INPROGRESS;
        break;
    }
    __wt_spin_unlock(session, &block->live_lock);
    return (ret);
}

/*
 * __wt_block_checkpoint --
 *     Create a new checkpoint.
 */
int
__wt_block_checkpoint(
  WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_checksum)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /* Switch to first-fit allocation. */
    __wt_block_configure_first_fit(block, true);

    /*
     * Write the root page: it's possible for there to be a checkpoint of
     * an empty tree, in which case, we store an illegal root offset.
     *
     * !!!
     * We happen to know that checkpoints are single-threaded above us in
     * the btree engine.  That's probably something we want to guarantee
     * for any WiredTiger block manager.
     */
    if (buf == NULL) {
        ci->root_offset = WT_BLOCK_INVALID_OFFSET;
        ci->root_objectid = ci->root_size = ci->root_checksum = 0;
    } else
        WT_ERR(__wt_block_write_off(session, block, buf, &ci->root_objectid, &ci->root_offset,
          &ci->root_size, &ci->root_checksum, data_checksum, true, false));

    /*
     * Checkpoints are potentially reading/writing/merging lots of blocks, pre-allocate structures
     * for this thread's use.
     */
    WT_ERR(__wt_block_ext_prealloc(session, 250));

    /* Process the checkpoint list, deleting and updating as required. */
    ret = __ckpt_process(session, block, ckptbase);

    /* Discard any excessive memory we've allocated. */
    WT_TRET(__wt_block_ext_discard(session, 250));

/* Restore the original allocation plan. */
err:
    __wt_block_configure_first_fit(block, false);

    return (ret);
}

/*
 * __ckpt_extlist_read --
 *     Read a checkpoint's extent lists.
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt, bool *localp)
{
    WT_BLOCK_CKPT *ci;

    /* Default to a local file. */
    *localp = true;

    /*
     * Allocate a checkpoint structure, crack the cookie and read the checkpoint's extent lists.
     *
     * Ignore the avail list: checkpoint avail lists are only useful if we are rolling forward from
     * the particular checkpoint and they represent our best understanding of what blocks can be
     * allocated. If we are not operating on the live checkpoint, subsequent checkpoints might have
     * allocated those blocks, and the avail list is useless. We don't discard it, because it is
     * useful as part of verification, but we don't re-write it either.
     */
    WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));

    ci = ckpt->bpriv;
    WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
    WT_RET(__wt_block_ckpt_unpack(session, block, ckpt->raw.data, ckpt->raw.size, ci));

    /* Extent lists from non-local objects aren't useful, we're going to skip them. */
    if (ci->root_objectid != block->objectid) {
        *localp = false;
        return (0);
    }

    WT_RET(__wt_block_extlist_read(session, block, &ci->alloc, ci->file_size));
    WT_RET(__wt_block_extlist_read(session, block, &ci->discard, ci->file_size));

    return (0);
}

/*
 * __ckpt_extlist_fblocks --
 *     If a checkpoint's extent list is going away, free its blocks.
 */
static int
__ckpt_extlist_fblocks(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
    if (el->offset == WT_BLOCK_INVALID_OFFSET)
        return (0);

    /*
     * Free blocks used to write checkpoint extents into the live system's checkpoint avail list
     * (they were never on any alloc list). Do not use the live system's avail list because that
     * list is used to decide if the file can be truncated, and we can't truncate any part of the
     * file that contains a previous checkpoint's extents.
     */
    return (__wt_block_insert_ext(session, block, &block->live.ckpt_avail, el->offset, el->size));
}

/*
 * __ckpt_verify --
 *     Diagnostic code, confirm we get what we expect in the checkpoint array.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
    WT_CKPT *ckpt;

    /*
     * Fast check that we're seeing what we expect to see: some number of checkpoints to add, delete
     * or ignore, terminated by a new checkpoint.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        switch (ckpt->flags) {
        case 0:
        case WT_CKPT_DELETE:
        case WT_CKPT_DELETE | WT_CKPT_FAKE:
        case WT_CKPT_FAKE:
            break;
        case WT_CKPT_ADD | WT_CKPT_BLOCK_MODS:
        case WT_CKPT_ADD:
            if (ckpt[1].name == NULL)
                break;
        /* FALLTHROUGH */
        default:
            return (__wt_illegal_value(session, ckpt->flags));
        }
    return (0);
}

/*
 * __ckpt_add_blkmod_entry --
 *     Add an offset/length entry to the bitstring based on granularity.
 */
static int
__ckpt_add_blkmod_entry(
  WT_SESSION_IMPL *session, WT_BLOCK_MODS *blk_mod, wt_off_t offset, wt_off_t len)
{
    uint64_t end_bit, start_bit;
    uint32_t end_buf_bytes, end_rdup_bits, end_rdup_bytes;

    WT_ASSERT(session, blk_mod->granularity != 0);
    /*
     * Figure out the starting and ending locations in the bitmap based on its granularity and our
     * offset and length. The bit locations are zero-based; be careful translating to sizes.
     */
    start_bit = (uint64_t)offset / blk_mod->granularity;
    end_bit = (uint64_t)(offset + len - 1) / blk_mod->granularity;
    WT_ASSERT(session, end_bit < UINT32_MAX);
    /* We want to grow the bitmap by 64 bits, or 8 bytes at a time. */
    end_rdup_bits = WT_MAX(__wt_rduppo2((uint32_t)end_bit + 1, 64), WT_BLOCK_MODS_LIST_MIN);
    end_rdup_bytes = __bitstr_size(end_rdup_bits);
    end_buf_bytes = __bitstr_size((uint32_t)blk_mod->nbits);
    /*
     * We are doing a lot of shifting. Make sure that the number of bytes we end up with is a
     * multiple of eight. We guarantee that in the rounding up call, but also make sure that the
     * constant stays a multiple of eight.
     */
    WT_ASSERT(session, end_rdup_bytes % 8 == 0);
    if (end_rdup_bytes > end_buf_bytes) {
        /* If we don't have enough, extend the buffer. */
        if (blk_mod->nbits == 0) {
            WT_RET(__wt_buf_initsize(session, &blk_mod->bitstring, end_rdup_bytes));
            memset(blk_mod->bitstring.mem, 0, end_rdup_bytes);
        } else {
            WT_RET(
              __wt_buf_set(session, &blk_mod->bitstring, blk_mod->bitstring.data, end_rdup_bytes));
            memset(
              (uint8_t *)blk_mod->bitstring.mem + end_buf_bytes, 0, end_rdup_bytes - end_buf_bytes);
        }
        blk_mod->nbits = end_rdup_bits;
    }
    /* Make sure we're not going to run past the end of the bitmap */
    WT_ASSERT(session, blk_mod->bitstring.size >= __bitstr_size((uint32_t)blk_mod->nbits));
    WT_ASSERT(session, end_bit < blk_mod->nbits);
    /* Set all the bits needed to record this offset/length pair. */
    __bit_nset(blk_mod->bitstring.mem, start_bit, end_bit);
    return (0);
}

/*
 * __ckpt_add_blk_mods_alloc --
 *     Add the checkpoint's allocated blocks to all valid incremental backup source identifiers.
 */
static int
__ckpt_add_blk_mods_alloc(
  WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci, WT_BLOCK *block)
{
    WT_BLOCK_MODS *blk_mod;
    WT_CKPT *ckpt;
    WT_EXT *ext;
    u_int i;

    if (&block->live == ci)
        WT_ASSERT_SPINLOCK_OWNED(session, &block->live_lock);

    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_ADD))
            break;
    }
    /* If this is not the live checkpoint or we don't care about incremental blocks, we're done. */
    if (ckpt == NULL || !F_ISSET(ckpt, WT_CKPT_BLOCK_MODS))
        return (0);
    for (i = 0; i < WT_BLKINCR_MAX; ++i) {
        blk_mod = &ckpt->backup_blocks[i];
        /* If there is no information at this entry, we're done. */
        if (!F_ISSET(blk_mod, WT_BLOCK_MODS_VALID))
            continue;

        if (block->created_during_backup)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, 0, block->allocsize));
        WT_EXT_FOREACH (ext, ci->alloc.off) {
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ext->off, ext->size));
        }
    }
    block->created_during_backup = false;
    return (0);
}

/*
 * __ckpt_add_blk_mods_ext --
 *     Add a set of extent blocks to all valid incremental backup source identifiers.
 */
static int
__ckpt_add_blk_mods_ext(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci)
{
    WT_BLOCK_MODS *blk_mod;
    WT_CKPT *ckpt;
    u_int i;

    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_ADD))
            break;
    }
    /* If this is not the live checkpoint or we don't care about incremental blocks, we're done. */
    if (ckpt == NULL || !F_ISSET(ckpt, WT_CKPT_BLOCK_MODS))
        return (0);
    for (i = 0; i < WT_BLKINCR_MAX; ++i) {
        blk_mod = &ckpt->backup_blocks[i];
        /* If there is no information at this entry, we're done. */
        if (!F_ISSET(blk_mod, WT_BLOCK_MODS_VALID))
            continue;

        if (ci->alloc.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->alloc.offset, ci->alloc.size));
        if (ci->discard.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->discard.offset, ci->discard.size));
        if (ci->avail.offset != WT_BLOCK_INVALID_OFFSET)
            WT_RET(__ckpt_add_blkmod_entry(session, blk_mod, ci->avail.offset, ci->avail.size));
    }
    return (0);
}

/*
 * __ckpt_process --
 *     Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
    WT_BLOCK_CKPT *a, *b, *ci;
    WT_CKPT *ckpt, *next_ckpt;
    WT_DECL_RET;
    uint64_t ckpt_size;
    bool deleting, fatal, local;

    ci = &block->live;
    fatal = false;

    if (EXTRA_DIAGNOSTICS_ENABLED(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE))
        WT_RET(__ckpt_verify(session, ckptbase));

    /*
     * Checkpoints are a two-step process: first, write a new checkpoint to disk (including all the
     * new extent lists for modified checkpoints and the live system). As part of this, create a
     * list of file blocks newly available for reallocation, based on checkpoints being deleted. We
     * then return the locations of the new checkpoint information to our caller. Our caller has to
     * write that information into some kind of stable storage, and once that's done, we can
     * actually allocate from that list of newly available file blocks. (We can't allocate from that
     * list immediately because the allocation might happen before our caller saves the new
     * checkpoint information, and if we crashed before the new checkpoint location was saved, we'd
     * have overwritten blocks still referenced by checkpoints in the system.) In summary, there is
     * a second step: after our caller saves the checkpoint information, we are called to add the
     * newly available blocks into the live system's available list.
     *
     * This function is the first step, the second step is in the resolve function.
     *
     * If we're called to checkpoint the same file twice (without the second resolution step), or
     * re-entered for any reason, it's an error in our caller, and our choices are all bad: leak
     * blocks or potentially crash with our caller not yet having saved previous checkpoint
     * information to stable storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
        block->ckpt_state = WT_CKPT_PANIC_ON_FAILURE;
        break;
    case WT_CKPT_NONE:
    case WT_CKPT_PANIC_ON_FAILURE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint attempt: the checkpoint was never started or has already "
          "completed",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_SALVAGE:
        /* Salvage doesn't use the standard checkpoint APIs. */
        break;
    }
    __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /*
     * Extents newly available as a result of deleting previous checkpoints are added to a list of
     * extents. The list should be empty, but as described above, there is no "free the checkpoint
     * information" call into the block manager; if there was an error in an upper level that
     * resulted in some previous checkpoint never being resolved, the list may not be empty. We
     * should have caught that with the "checkpoint in progress" test, but it doesn't cost us
     * anything to be cautious.
     *
     * We free the checkpoint's allocation and discard extent lists as part of the resolution step,
     * not because they're needed at that time, but because it's potentially a lot of work, and
     * waiting allows the btree layer to continue eviction sooner. As for the checkpoint-available
     * list, make sure they get cleaned out.
     */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    WT_RET(__wt_block_extlist_init(session, &ci->ckpt_avail, "live", "ckpt_avail", true));
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    /*
     * To delete a checkpoint, we need checkpoint information for it and the subsequent checkpoint
     * into which it gets rolled; read them from disk before we lock things down.
     */
    deleting = false;
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;

        /*
         * Read the checkpoint and next checkpoint extent lists if we haven't already read them (we
         * may have already read these extent blocks if there is more than one deleted checkpoint).
         *
         * We can only delete checkpoints in the current file. Checkpoints of tiered storage objects
         * are checkpoints for the logical object, including files that are no longer live. Skip any
         * checkpoints that aren't local to the live object.
         */
        if (ckpt->bpriv == NULL) {
            WT_ERR(__ckpt_extlist_read(session, block, ckpt, &local));
            if (!local)
                continue;
        }

        deleting = true;

        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * The "next" checkpoint may be the live tree which has no extent blocks to read.
         */
        if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) {
            WT_ERR(__ckpt_extlist_read(session, block, next_ckpt, &local));
            WT_ERR_ASSERT(session, WT_DIAGNOSTIC_CHECKPOINT_VALIDATE, local == true, WT_PANIC,
              "tiered storage checkpoint follows local checkpoint");
        }
    }

    /*
     * Failures are now fatal: we can't currently back out the merge of any deleted checkpoint
     * extent lists into the live system's extent lists, so continuing after error would leave the
     * live system's extent lists corrupted for any subsequent checkpoint (and potentially, should a
     * subsequent checkpoint succeed, for recovery).
     */
    fatal = true;

    /*
     * Hold a lock so the live extent lists and the file size can't change underneath us. I suspect
     * we'll tighten this if checkpoints take too much time away from real work: we read the
     * historic checkpoint information without a lock, but we could also merge and re-write the
     * deleted and merged checkpoint information without a lock, except for the final merge of
     * ranges into the live tree.
     */
    __wt_spin_lock(session, &block->live_lock);

    /*
     * We've allocated our last page, update the checkpoint size. We need to calculate the live
     * system's checkpoint size before merging checkpoint allocation and discard information from
     * the checkpoints we're deleting, those operations change the underlying byte counts.
     */
    ckpt_size = ci->ckpt_size;
    ckpt_size += ci->alloc.bytes;
    ckpt_size -= ci->discard.bytes;

    /*
     * Record the checkpoint's allocated blocks. Do so before skipping any processing and before
     * possibly merging in blocks from any previous checkpoint.
     */
    WT_ERR(__ckpt_add_blk_mods_alloc(session, ckptbase, ci, block));

    /* Skip the additional processing if we aren't deleting checkpoints. */
    if (!deleting)
        goto live_update;

    /*
     * Delete any no-longer-needed checkpoints: we do this first as it frees blocks to the live
     * lists, and the freed blocks will then be included when writing the live extent lists.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt) {
        if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
            continue;

        if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
            __wt_ckpt_verbose(session, block, "delete", ckpt->name, ckpt->raw.data, ckpt->raw.size);

        /*
         * Find the checkpoint into which we'll roll this checkpoint's blocks: it's the next real
         * checkpoint in the list, and it better have been read in (if it's not the add slot).
         */
        for (next_ckpt = ckpt + 1;; ++next_ckpt)
            if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
                break;

        /*
         * Set the from/to checkpoint structures, where the "to" value may be the live tree.
         */
        a = ckpt->bpriv;
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            b = &block->live;
        else
            b = next_ckpt->bpriv;

        /*
         * Free the root page: there's nothing special about this free, the root page is allocated
         * using normal rules, that is, it may have been taken from the avail list, and was entered
         * on the live system's alloc list at that time. We free it into the checkpoint's discard
         * list, however, not the live system's list because it appears on the checkpoint's alloc
         * list and so must be paired in the checkpoint.
         */
        if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
            WT_ERR(
              __wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size));

        /*
         * Free the blocks used to hold the "from" checkpoint's extent lists, including the avail
         * list.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

        /*
         * Roll the "from" alloc and discard extent lists into the "to" checkpoint's lists.
         */
        if (a->alloc.entries != 0)
            WT_ERR(__wt_block_extlist_merge(session, block, &a->alloc, &b->alloc));
        if (a->discard.entries != 0)
            WT_ERR(__wt_block_extlist_merge(session, block, &a->discard, &b->discard));

        /*
         * If the "to" checkpoint is also being deleted, we're done with it, it's merged into some
         * other checkpoint in the next loop. This means the extent lists may aggregate over a
         * number of checkpoints, but that's OK, they're disjoint sets of ranges.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
            continue;

        /*
         * Find blocks for re-use: wherever the "to" checkpoint's allocate and discard lists
         * overlap, move the range to the live system's checkpoint available list.
         */
        WT_ERR(__wt_block_extlist_overlap(session, block, b));

        /*
         * If we're updating the live system's information, we're done.
         */
        if (F_ISSET(next_ckpt, WT_CKPT_ADD))
            continue;

        /*
         * We have to write the "to" checkpoint's extent lists out in new blocks, and update its
         * cookie.
         *
         * Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
         * list, it's not changing.
         */
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
        WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

        F_SET(next_ckpt, WT_CKPT_UPDATE);
    }

    /* Update checkpoints marked for update. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_UPDATE))
            WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ckpt->bpriv));

live_update:
    /* Truncate the file if that's possible. */
    WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

    /* Update the final, added checkpoint based on the live system. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (F_ISSET(ckpt, WT_CKPT_ADD)) {
            /*
             * !!!
             * Our caller wants the final checkpoint size. Setting the size here violates layering,
             * but the alternative is a call for the btree layer to crack the checkpoint cookie into
             * its components, and that's a fair amount of work.
             */
            ckpt->size = ckpt_size;

            /*
             * Set the rolling checkpoint size for the live system. The current size includes the
             * current checkpoint's root page size (root pages are on the checkpoint's block
             * allocation list as root pages are allocated with the usual block allocation
             * functions). That's correct, but we don't want to include it in the size for the next
             * checkpoint.
             */
            ckpt_size -= ci->root_size;

            /*
             * Additionally, we had a bug for awhile where the live checkpoint size grew without
             * bound. We can't sanity check the value, that would require walking the tree as part
             * of the checkpoint. Bound any bug at the size of the file. It isn't practical to
             * assert that the value is within bounds since databases created with older versions of
             * WiredTiger (2.8.0) would likely see an error.
             */
            ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size);

            WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ci));
        }

    /*
     * Reset the live system's alloc and discard extent lists, leave the avail list alone. This
     * includes freeing a lot of extents, so do it outside of the system's lock by copying and
     * resetting the original, then doing the work later.
     */
    ci->ckpt_alloc = ci->alloc;
    WT_ERR(__wt_block_extlist_init(session, &ci->alloc, "live", "alloc", false));
    ci->ckpt_discard = ci->discard;
    WT_ERR(__wt_block_extlist_init(session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
    /*
     * The first checkpoint in the system should always have an empty discard list. If we've read
     * that checkpoint and/or created it, check.
     */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if (!F_ISSET(ckpt, WT_CKPT_DELETE))
            break;
    if ((a = ckpt->bpriv) == NULL)
        a = &block->live;
    if (a->discard.entries != 0)
        WT_ERR_MSG(
          session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard list");
#endif

err:
    if (ret != 0 && fatal) {
        ret = __wt_panic(session, ret, "%s: fatal checkpoint failure", block->name);
        __wt_blkcache_set_readonly(session);
    }

    __wt_spin_unlock_if_owned(session, &block->live_lock);

    /* Discard any checkpoint information we loaded. */
    WT_CKPT_FOREACH (ckptbase, ckpt)
        if ((ci = ckpt->bpriv) != NULL)
            __wt_block_ckpt_destroy(session, ci);

    return (ret);
}

/*
 * __ckpt_update --
 *     Update a checkpoint.
 */
static int
__ckpt_update(
  WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci)
{
    WT_DECL_ITEM(a);
    WT_DECL_RET;
    uint8_t *endp;
    bool is_live;

    is_live = F_ISSET(ckpt, WT_CKPT_ADD);
    if (is_live)
        WT_ASSERT_SPINLOCK_OWNED(session, &block->live_lock);

#ifdef HAVE_DIAGNOSTIC
    /* Check the extent list combinations for overlaps. */
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
    WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
    /*
     * Write the checkpoint's alloc and discard extent lists. Note these blocks never appear on the
     * system's allocation list, checkpoint extent blocks don't appear on any extent lists.
     */
    WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
    WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

    /*
     * If this is the final block, we append an incomplete copy of the checkpoint information to the
     * avail list for standalone retrieval.
     */
    if (is_live) {
        /*
         * Copy the INCOMPLETE checkpoint information into the checkpoint.
         */
        WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
        endp = ckpt->raw.mem;
        WT_RET(__wt_block_ckpt_pack(session, block, &endp, ci, true));
        ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

        /*
         * Convert the INCOMPLETE checkpoint array into its metadata representation. This must match
         * what is eventually written into the metadata file, in other words, everything must be
         * initialized before the block manager does the checkpoint.
         */
        WT_RET(__wt_scr_alloc(session, 8 * 1024, &a));
        ret = __wt_meta_ckptlist_to_meta(session, ckptbase, a);
        if (ret == 0)
            ret = __wt_strndup(session, a->data, a->size, &ckpt->block_checkpoint);
        __wt_scr_free(session, &a);
        WT_RET(ret);
    }

    /*
     * We only write an avail list for the live system, other checkpoint's avail lists are static
     * and never change.
     *
     * Write the avail list last so it reflects changes due to allocating blocks for the alloc and
     * discard lists. Second, when we write the live system's avail list, it's two lists: the
     * current avail list plus the list of blocks to be made available when the new checkpoint
     * completes. We can't merge that second list into the real list yet, it's not truly available
     * until the new checkpoint locations have been saved to the metadata.
     */
    if (is_live) {
        block->final_ckpt = ckpt;
        ret = __wt_block_extlist_write(session, block, &ci->avail, &ci->ckpt_avail);
        block->final_ckpt = NULL;
        WT_RET(ret);
    }

    /*
     * Record the blocks allocated to write the extent lists. We must record blocks in the live
     * system's extent lists, as those blocks are a necessary part of the checkpoint a hot backup
     * might recover. Update blocks in extent lists used to rewrite other checkpoints (for example,
     * an intermediate checkpoint rewritten because a checkpoint was rolled into it), even though
     * it's not necessary: those blocks aren't the last checkpoint in the file and so aren't
     * included in a recoverable checkpoint, they don't matter on a hot backup target until they're
     * allocated and used in the context of a live system. Regardless, they shouldn't materially
     * affect how much data we're writing, and it keeps things more consistent on the target to
     * update them. (Ignore the live system's ckpt_avail list here. The blocks on that list were
     * written into the final avail extent list which will be copied to the hot backup, and that's
     * all that matters.)
     */
    WT_RET(__ckpt_add_blk_mods_ext(session, ckptbase, ci));

    /*
     * Set the file size for the live system.
     *
     * !!!
     * We do NOT set the file size when re-writing checkpoints because we want to test the
     * checkpoint's blocks against a reasonable maximum file size during verification. This is bad:
     * imagine a checkpoint appearing early in the file, re-written, and then the checkpoint
     * requires blocks at the end of the file, blocks after the listed file size. If the application
     * opens that checkpoint for writing (discarding subsequent checkpoints), we would truncate the
     * file to the early chunk, discarding the re-written checkpoint information. The alternative,
     * updating the file size has its own problems, in that case we'd work correctly, but we'd lose
     * all of the blocks between the original checkpoint and the re-written checkpoint. Currently,
     * there's no API to roll-forward intermediate checkpoints, if there ever is, this will need to
     * be fixed.
     */
    if (is_live)
        ci->file_size = block->size;

    /* Copy the COMPLETE checkpoint information into the checkpoint. */
    WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BLOCK_CHECKPOINT_BUFFER));
    endp = ckpt->raw.mem;
    WT_RET(__wt_block_ckpt_pack(session, block, &endp, ci, false));
    ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

    if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
        __wt_ckpt_verbose(session, block, "create", ckpt->name, ckpt->raw.data, ckpt->raw.size);

    return (0);
}

/*
 * __wt_block_checkpoint_resolve --
 *     Resolve a checkpoint.
 */
int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block, bool failed)
{
    WT_BLOCK_CKPT *ci;
    WT_DECL_RET;

    ci = &block->live;

    /*
     * Resolve the checkpoint after our caller has written the checkpoint information to stable
     * storage.
     */
    __wt_spin_lock(session, &block->live_lock);
    switch (block->ckpt_state) {
    case WT_CKPT_INPROGRESS:
        /* Something went wrong, but it's recoverable at our level. */
        goto done;
    case WT_CKPT_NONE:
    case WT_CKPT_SALVAGE:
        ret = __wt_panic(session, EINVAL,
          "%s: an unexpected checkpoint resolution: the checkpoint was never started or completed, "
          "or configured for salvage",
          block->name);
        __wt_blkcache_set_readonly(session);
        break;
    case WT_CKPT_PANIC_ON_FAILURE:
        if (!failed)
            break;
        ret = __wt_panic(
          session, EINVAL, "%s: the checkpoint failed, the system must restart", block->name);
        __wt_blkcache_set_readonly(session);
        break;
    }
    WT_ERR(ret);

    if ((ret = __wt_block_extlist_merge(session, block, &ci->ckpt_avail, &ci->avail)) != 0) {
        ret = __wt_panic(
          session, ret, "%s: fatal checkpoint failure during extent list merge", block->name);
        __wt_blkcache_set_readonly(session);
    }
    __wt_spin_unlock(session, &block->live_lock);

    /* Discard the lists remaining after the checkpoint call. */
    __wt_block_extlist_free(session, &ci->ckpt_avail);
    __wt_block_extlist_free(session, &ci->ckpt_alloc);
    __wt_block_extlist_free(session, &ci->ckpt_discard);

    __wt_spin_lock(session, &block->live_lock);
done:
    block->ckpt_state = WT_CKPT_NONE;
err:
    __wt_spin_unlock(session, &block->live_lock);

    return (ret);
}

#ifdef HAVE_UNITTEST
int
__ut_ckpt_add_blkmod_entry(
  WT_SESSION_IMPL *session, WT_BLOCK_MODS *blk_mod, wt_off_t offset, wt_off_t len)
{
    return (__ckpt_add_blkmod_entry(session, blk_mod, offset, len));
}
#endif