summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/block/block_ckpt.c
blob: adbcf0e3fdceda1605146e9d5780fd9ec5421765 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
/*-
 * Copyright (c) 2014-2015 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __ckpt_string(
	WT_SESSION_IMPL *, WT_BLOCK *, const uint8_t *, WT_ITEM *);
static int __ckpt_update(
	WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, bool);

/*
 * __wt_block_ckpt_init --
 *	Initialize a checkpoint structure.
 */
int
__wt_block_ckpt_init(
    WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci, const char *name)
{
	WT_CLEAR(*ci);

	ci->version = WT_BM_CHECKPOINT_VERSION;
	ci->root_offset = WT_BLOCK_INVALID_OFFSET;

	WT_RET(__wt_block_extlist_init(
	    session, &ci->alloc, name, "alloc", false));
	WT_RET(__wt_block_extlist_init(
	    session, &ci->avail, name, "avail", true));
	WT_RET(__wt_block_extlist_init(
	    session, &ci->discard, name, "discard", false));
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, name, "ckpt_avail", true));

	return (0);
}

/*
 * __wt_block_checkpoint_load --
 *	Load a checkpoint.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size,
    uint8_t *root_addr, size_t *root_addr_sizep, bool checkpoint)
{
	WT_BLOCK_CKPT *ci, _ci;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

	WT_UNUSED(addr_size);
	ci = NULL;

	/*
	 * Sometimes we don't find a root page (we weren't given a checkpoint,
	 * or the checkpoint was empty).  In that case we return an empty root
	 * address, set that up now.
	 */
	*root_addr_sizep = 0;

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		if (addr != NULL) {
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(session, block, addr, tmp));
		}
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "%s: load-checkpoint: %s", block->name,
		    addr == NULL ? "[Empty]" : (const char *)tmp->data));
	}

	/*
	 * There's a single checkpoint in the file that can be written, all of
	 * the others are read-only.  We use the same initialization calls for
	 * readonly checkpoints, but the information doesn't persist.
	 */
	if (checkpoint) {
		ci = &_ci;
		WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
	} else {
		/*
		 * We depend on the btree level for locking: things will go bad
		 * fast if we open the live system in two handles, or salvage,
		 * truncate or verify the live/running file.
		 */
#ifdef HAVE_DIAGNOSTIC
		__wt_spin_lock(session, &block->live_lock);
		WT_ASSERT(session, block->live_open == false);
		block->live_open = true;
		__wt_spin_unlock(session, &block->live_lock);
#endif
		ci = &block->live;
		WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
	}

	/*
	 * If the checkpoint has an on-disk root page, load it.  Otherwise, size
	 * the file past the description information.
	 */
	if (addr == NULL || addr_size == 0)
		ci->file_size = block->allocsize;
	else {
		/* Crack the checkpoint cookie. */
		WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

		/* Verify sets up next. */
		if (block->verify)
			WT_ERR(__wt_verify_ckpt_load(session, block, ci));

		/* Read any root page. */
		if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
			endp = root_addr;
			WT_ERR(__wt_block_addr_to_buffer(block, &endp,
			    ci->root_offset, ci->root_size, ci->root_cksum));
			*root_addr_sizep = WT_PTRDIFF(endp, root_addr);
		}

		/*
		 * Rolling a checkpoint forward requires the avail list, the
		 * blocks from which we can allocate.
		 */
		if (!checkpoint)
			WT_ERR(__wt_block_extlist_read_avail(
			    session, block, &ci->avail, ci->file_size));
	}

	/*
	 * If the checkpoint can be written, that means anything written after
	 * the checkpoint is no longer interesting, truncate the file.  Don't
	 * bother checking the avail list for a block at the end of the file,
	 * that was done when the checkpoint was first written (re-writing the
	 * checkpoint might possibly make it relevant here, but it's unlikely
	 * enough I don't bother).
	 */
	if (!checkpoint) {
		/*
		 * The truncate might fail if there's a file mapping (if there's
		 * an open checkpoint on the file), that's OK.
		 */
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "truncate file to %" PRIuMAX, (uintmax_t)ci->file_size));
		WT_ERR_BUSY_OK(
		    __wt_block_truncate(session, block->fh, ci->file_size));
	}

	if (0) {
err:		/*
		 * Don't call checkpoint-unload: unload does real work including
		 * file truncation.  If we fail early enough that the checkpoint
		 * information isn't correct, bad things would happen.  The only
		 * allocated memory was in the service of verify, clean that up.
		 */
		if (block->verify)
			WT_TRET(__wt_verify_ckpt_unload(session, block));
	}

	/* Checkpoints don't need the original information, discard it. */
	if (checkpoint && ci != NULL)
		__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __wt_block_checkpoint_unload --
 *	Unload a checkpoint.
 */
int
__wt_block_checkpoint_unload(
    WT_SESSION_IMPL *session, WT_BLOCK *block, bool checkpoint)
{
	WT_DECL_RET;

	/* Verify cleanup. */
	if (block->verify)
		WT_TRET(__wt_verify_ckpt_unload(session, block));

	/*
	 * If it's the live system, truncate to discard any extended blocks and
	 * discard the active extent lists.  Hold the lock even though we're
	 * unloading the live checkpoint, there could be readers active in other
	 * checkpoints.
	 */
	if (!checkpoint) {
		/*
		 * The truncate might fail if there's a file mapping (if there's
		 * an open checkpoint on the file), that's OK.
		 */
		WT_TRET_BUSY_OK(
		    __wt_block_truncate(session, block->fh, block->fh->size));

		__wt_spin_lock(session, &block->live_lock);
		__wt_block_ckpt_destroy(session, &block->live);
#ifdef HAVE_DIAGNOSTIC
		block->live_open = false;
#endif
		__wt_spin_unlock(session, &block->live_lock);
	}

	return (ret);
}

/*
 * __wt_block_ckpt_destroy --
 *	Clear a checkpoint structure.
 */
void
__wt_block_ckpt_destroy(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
{
	/* Discard the extent lists. */
	__wt_block_extlist_free(session, &ci->alloc);
	__wt_block_extlist_free(session, &ci->avail);
	__wt_block_extlist_free(session, &ci->discard);
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	__wt_block_extlist_free(session, &ci->ckpt_discard);
}

/*
 * __wt_block_checkpoint --
 *	Create a new checkpoint.
 */
int
__wt_block_checkpoint(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, WT_CKPT *ckptbase, bool data_cksum)
{
	WT_BLOCK_CKPT *ci;
	WT_DECL_RET;

	ci = &block->live;

	/* Switch to first-fit allocation. */
	__wt_block_configure_first_fit(block, true);

	/*
	 * Write the root page: it's possible for there to be a checkpoint of
	 * an empty tree, in which case, we store an illegal root offset.
	 *
	 * !!!
	 * We happen to know that checkpoints are single-threaded above us in
	 * the btree engine.  That's probably something we want to guarantee
	 * for any WiredTiger block manager.
	 */
	if (buf == NULL) {
		ci->root_offset = WT_BLOCK_INVALID_OFFSET;
		ci->root_size = ci->root_cksum = 0;
	} else
		WT_ERR(__wt_block_write_off(session, block, buf,
		    &ci->root_offset, &ci->root_size, &ci->root_cksum,
		    data_cksum, false));

	/*
	 * Checkpoints are potentially reading/writing/merging lots of blocks,
	 * pre-allocate structures for this thread's use.
	 */
	WT_ERR(__wt_block_ext_prealloc(session, 250));

	/* Process the checkpoint list, deleting and updating as required. */
	ret = __ckpt_process(session, block, ckptbase);

	/* Discard any excessive memory we've allocated. */
	WT_TRET(__wt_block_ext_discard(session, 250));

	/* Restore the original allocation plan. */
err:	__wt_block_configure_first_fit(block, false);

	return (ret);
}

/*
 * __ckpt_extlist_read --
 *	Read a checkpoints extent lists and copy
 */
static int
__ckpt_extlist_read(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckpt)
{
	WT_BLOCK_CKPT *ci;

	/*
	 * Allocate a checkpoint structure, crack the cookie and read the
	 * checkpoint's extent lists.
	 *
	 * Ignore the avail list: checkpoint avail lists are only useful if we
	 * are rolling forward from the particular checkpoint and they represent
	 * our best understanding of what blocks can be allocated.  If we are
	 * not operating on the live checkpoint, subsequent checkpoints might
	 * have allocated those blocks, and the avail list is useless.  We don't
	 * discard it, because it is useful as part of verification, but we
	 * don't re-write it either.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_BLOCK_CKPT), &ckpt->bpriv));

	ci = ckpt->bpriv;
	WT_RET(__wt_block_ckpt_init(session, ci, ckpt->name));
	WT_RET(__wt_block_buffer_to_ckpt(session, block, ckpt->raw.data, ci));
	WT_RET(__wt_block_extlist_read(
	    session, block, &ci->alloc, ci->file_size));
	WT_RET(__wt_block_extlist_read(
	    session, block, &ci->discard, ci->file_size));

	return (0);
}

/*
 * __ckpt_extlist_fblocks --
 *	If a checkpoint's extent list is going away, free its blocks.
 */
static int
__ckpt_extlist_fblocks(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_EXTLIST *el)
{
	if (el->offset == WT_BLOCK_INVALID_OFFSET)
		return (0);

	/*
	 * Free blocks used to write checkpoint extents into the live system's
	 * checkpoint avail list (they were never on any alloc list). Do not
	 * use the live system's avail list because that list is used to decide
	 * if the file can be truncated, and we can't truncate any part of the
	 * file that contains a previous checkpoint's extents.
	 */
	return (__wt_block_insert_ext(
	    session, block, &block->live.ckpt_avail, el->offset, el->size));
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __ckpt_verify --
 *	Diagnostic code, confirm we get what we expect in the checkpoint array.
 */
static int
__ckpt_verify(WT_SESSION_IMPL *session, WT_CKPT *ckptbase)
{
	WT_CKPT *ckpt;

	/*
	 * Fast check that we're seeing what we expect to see: some number of
	 * checkpoints to add, delete or ignore, terminated by a new checkpoint.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		switch (ckpt->flags) {
		case 0:
		case WT_CKPT_DELETE:
		case WT_CKPT_DELETE | WT_CKPT_FAKE:
		case WT_CKPT_FAKE:
			break;
		case WT_CKPT_ADD:
			if (ckpt[1].name == NULL)
				break;
			/* FALLTHROUGH */
		default:
			return (
			    __wt_illegal_value(session, "checkpoint array"));
		}
	return (0);
}
#endif

/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, locked;

	ci = &block->live;
	locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice, without the second
	 * resolution step, it's an error at an upper level and our choices are
	 * all bad: either leak blocks or risk crashing with our caller not
	 * having saved the checkpoint information to stable storage.  Leaked
	 * blocks are a safer choice, but that means file verify will fail for
	 * the rest of "forever", and the chance of us allocating a block and
	 * then crashing such that it matters is reasonably low: don't leak the
	 * blocks.
	 */
	if (block->ckpt_inprogress) {
		__wt_errx(session,
		    "%s: checkpointed without first resolving the previous "
		    "checkpoint",
		    block->name);

		WT_RET(__wt_block_checkpoint_resolve(session, block));
	}

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data));
		}

		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it, it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * Set the checkpoint size for the live system.
			 *
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ci->ckpt_size = ckpt_size;

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

	block->ckpt_inprogress = true;

err:	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __ckpt_update --
 *	Update a checkpoint.
 */
static int
__ckpt_update(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_CKPT *ckpt, WT_BLOCK_CKPT *ci, bool is_live)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint8_t *endp;

#ifdef HAVE_DIAGNOSTIC
	/* Check the extent list combinations for overlaps. */
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->discard, &ci->avail));
	WT_RET(__wt_block_extlist_check(session, &ci->alloc, &ci->discard));
#endif
	/*
	 * Write the checkpoint's alloc and discard extent lists.  After each
	 * write, remove any allocated blocks from the system's allocation
	 * list, checkpoint extent blocks don't appear on any extent lists.
	 */
	WT_RET(__wt_block_extlist_write(session, block, &ci->alloc, NULL));
	WT_RET(__wt_block_extlist_write(session, block, &ci->discard, NULL));

	/*
	 * We only write an avail list for the live system, other checkpoint's
	 * avail lists are static and never change.
	 *
	 * Write the avail list last so it reflects changes due to allocating
	 * blocks for the alloc and discard lists.  Second, when we write the
	 * live system's avail list, it's two lists: the current avail list
	 * plus the list of blocks to be made available when the new checkpoint
	 * completes.  We can't merge that second list into the real list yet,
	 * it's not truly available until the new checkpoint locations have been
	 * saved to the metadata.
	 */
	if (is_live)
		WT_RET(__wt_block_extlist_write(
		    session, block, &ci->avail, &ci->ckpt_avail));

	/*
	 * Set the file size for the live system.
	 *
	 * !!!
	 * We do NOT set the file size when re-writing checkpoints because we
	 * want to test the checkpoint's blocks against a reasonable maximum
	 * file size during verification.  This is bad: imagine a checkpoint
	 * appearing early in the file, re-written, and then the checkpoint
	 * requires blocks at the end of the file, blocks after the listed file
	 * size.  If the application opens that checkpoint for writing
	 * (discarding subsequent checkpoints), we would truncate the file to
	 * the early chunk, discarding the re-written checkpoint information.
	 * The alternative, updating the file size has its own problems, in
	 * that case we'd work correctly, but we'd lose all of the blocks
	 * between the original checkpoint and the re-written checkpoint.
	 * Currently, there's no API to roll-forward intermediate checkpoints,
	 * if there ever is, this will need to be fixed.
	 */
	if (is_live)
		ci->file_size = block->fh->size;

	/*
	 * Copy the checkpoint information into the checkpoint array's address
	 * cookie.
	 */
	WT_RET(__wt_buf_init(session, &ckpt->raw, WT_BTREE_MAX_ADDR_COOKIE));
	endp = ckpt->raw.mem;
	WT_RET(__wt_block_ckpt_to_buffer(session, block, &endp, ci));
	ckpt->raw.size = WT_PTRDIFF(endp, ckpt->raw.mem);

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(__ckpt_string(session, block, ckpt->raw.data, tmp));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "%s: create-checkpoint: %s: %s",
		    block->name, ckpt->name, (const char *)tmp->data));
	}

err:	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __wt_block_checkpoint_resolve --
 *	Resolve a checkpoint.
 */
int
__wt_block_checkpoint_resolve(WT_SESSION_IMPL *session, WT_BLOCK *block)
{
	WT_BLOCK_CKPT *ci;
	WT_DECL_RET;

	ci = &block->live;

	/*
	 * Resolve the checkpoint after our caller has written the checkpoint
	 * information to stable storage.
	 */
	if (!block->ckpt_inprogress)
		WT_RET_MSG(session, WT_ERROR,
		    "%s: checkpoint resolved, but no checkpoint in progress",
		    block->name);
	block->ckpt_inprogress = false;

	__wt_spin_lock(session, &block->live_lock);
	ret = __wt_block_extlist_merge(
	    session, block, &ci->ckpt_avail, &ci->avail);
	__wt_spin_unlock(session, &block->live_lock);

	/* Discard the lists remaining after the checkpoint call. */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	return (ret);
}

/*
 * __ckpt_string --
 *	Return a printable string representation of a checkpoint address cookie.
 */
static int
__ckpt_string(WT_SESSION_IMPL *session,
    WT_BLOCK *block, const uint8_t *addr, WT_ITEM *buf)
{
	WT_BLOCK_CKPT *ci, _ci;

	/* Initialize the checkpoint, crack the cookie. */
	ci = &_ci;
	WT_RET(__wt_block_ckpt_init(session, ci, "string"));
	WT_RET(__wt_block_buffer_to_ckpt(session, block, addr, ci));

	WT_RET(__wt_buf_fmt(session, buf,
	    "version=%d",
	    ci->version));
	if (ci->root_offset == WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_buf_catfmt(session, buf, ", root=[Empty]"));
	else
		WT_RET(__wt_buf_catfmt(session, buf,
		    ", root=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)ci->root_offset,
		    (uintmax_t)(ci->root_offset + ci->root_size),
		    ci->root_size, ci->root_cksum));
	if (ci->alloc.offset == WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_buf_catfmt(session, buf, ", alloc=[Empty]"));
	else
		WT_RET(__wt_buf_catfmt(session, buf,
		    ", alloc=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)ci->alloc.offset,
		    (uintmax_t)(ci->alloc.offset + ci->alloc.size),
		    ci->alloc.size, ci->alloc.cksum));
	if (ci->avail.offset == WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_buf_catfmt(session, buf, ", avail=[Empty]"));
	else
		WT_RET(__wt_buf_catfmt(session, buf,
		    ", avail=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)ci->avail.offset,
		    (uintmax_t)(ci->avail.offset + ci->avail.size),
		    ci->avail.size, ci->avail.cksum));
	if (ci->discard.offset == WT_BLOCK_INVALID_OFFSET)
		WT_RET(__wt_buf_catfmt(session, buf, ", discard=[Empty]"));
	else
		WT_RET(__wt_buf_catfmt(session, buf,
		    ", discard=[%"
		    PRIuMAX "-%" PRIuMAX ", %" PRIu32 ", %" PRIu32 "]",
		    (uintmax_t)ci->discard.offset,
		    (uintmax_t)(ci->discard.offset + ci->discard.size),
		    ci->discard.size, ci->discard.cksum));
	WT_RET(__wt_buf_catfmt(session, buf,
	    ", file size=%" PRIuMAX, (uintmax_t)ci->file_size));

	__wt_block_ckpt_destroy(session, ci);

	return (0);
}