summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2021-12-21 17:27:48 +1100
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-12-21 07:28:22 +0000
commit: 4c3518a6ca6e669a42ebf845092a578f2a8a3503 (patch)
tree: 57187e5a07b94f5c69b14234928da81d9876c695 /src
parent: 4eee45a1167346286f9d7ddfd3cd636c2fdcbd1b (diff)
download: mongo-4c3518a6ca6e669a42ebf845092a578f2a8a3503.tar.gz
Import wiredtiger: 991d999fc83966b38669db49368eff20ed06a522 from branch mongodb-5.2
ref: 33d207d74c..991d999fc8
for: 5.2.0-rc2
WT-8513 Conditionally eliminate the empty space on FLCS pages
Diffstat (limited to 'src')
-rw-r--r-- src/third_party/wiredtiger/import.data | 2
-rw-r--r-- src/third_party/wiredtiger/src/include/extern.h | 4
-rw-r--r-- src/third_party/wiredtiger/src/include/reconcile.h | 3
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_col.c | 24
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c | 80
5 files changed, 88 insertions, 25 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 1fd2ed9154b..c93b907bbbc 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-5.2",
- "commit": "33d207d74c7ba955c8b98286072b0922047c7bae"
+ "commit": "991d999fc83966b38669db49368eff20ed06a522"
}
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 2690ca9d748..ce333968311 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -1843,8 +1843,8 @@ extern void __wt_read_row_time_window(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw);
extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l);
-extern void __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r,
- uint32_t entries, uint32_t auxentries, uint8_t *image, size_t size);
+extern void __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, uint32_t entries,
+ uint32_t aux_start_offset, uint32_t auxentries, uint8_t *image, size_t size);
extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r);
extern void __wt_rec_dictionary_reset(WT_RECONCILE *r);
extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref);
diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h
index 85ac075d6d5..075d08beb49 100644
--- a/src/third_party/wiredtiger/src/include/reconcile.h
+++ b/src/third_party/wiredtiger/src/include/reconcile.h
@@ -59,7 +59,8 @@ struct __wt_rec_chunk {
WT_ITEM image; /* disk-image */
- /* For fixed-length column store, track how many time windows we have. */
+ /* For fixed-length column store, track where the time windows start and how many we have. */
+ uint32_t aux_start_offset;
uint32_t auxentries;
};
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c
index af38b0a0971..1b71533546e 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_col.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c
@@ -958,11 +958,11 @@ err:
* Write the auxiliary header into the page image.
*/
void
-__wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t entries,
- uint32_t auxentries, uint8_t *image, size_t size)
+__wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, uint32_t entries,
+ uint32_t aux_start_offset, uint32_t auxentries, uint8_t *image, size_t size)
{
WT_BTREE *btree;
- uint32_t auxdataoffset, auxheaderoffset, bitmapsize, offset, space;
+ uint32_t auxheaderoffset, bitmapsize, offset, space;
uint8_t *endp, *p;
btree = S2BT(session);
@@ -1009,6 +1009,10 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint
*
* However, this means that we should not assume the bitmap size is given by the btree maximum
* leaf page size but get it from the reconciliation info.
+ *
+ * Note: it is important to use *this* chunk's auxiliary start offset (passed in) and not read
+ * the auxiliary start offset from the WT_RECONCILE, as we may be writing the previous chunk and
+ * the latter describes the current chunk.
*/
/* Figure how much primary data we have. */
@@ -1017,14 +1021,8 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint
/* The auxiliary header goes after the bitmap, which goes after the page header. */
auxheaderoffset = WT_PAGE_HEADER_BYTE_SIZE(btree) + bitmapsize;
- /* The auxiliary data goes wherever we have been writing it. */
- auxdataoffset = r->aux_start_offset;
-
- /* This should be at or after the place it goes on a normal-sized page. */
- WT_ASSERT(session, auxdataoffset >= btree->maxleafpage + WT_COL_FIX_AUXHEADER_RESERVATION);
-
/* This should also have left sufficient room for the header. */
- WT_ASSERT(session, auxdataoffset >= auxheaderoffset + WT_COL_FIX_AUXHEADER_RESERVATION);
+ WT_ASSERT(session, aux_start_offset >= auxheaderoffset + WT_COL_FIX_AUXHEADER_RESERVATION);
/*
* If there is no auxiliary data, we will have already shortened the image size to discard the
@@ -1033,12 +1031,12 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint
* last page in the tree, this also avoids the space wastage described above.
*/
if (auxentries == 0) {
- WT_ASSERT(session, auxdataoffset >= size);
+ WT_ASSERT(session, aux_start_offset >= size);
return;
}
/* The offset we're going to write is the distance from the header start to the data. */
- offset = auxdataoffset - auxheaderoffset;
+ offset = aux_start_offset - auxheaderoffset;
/*
* Encoding the offset should fit -- either it is less than what encodes to 1 byte or greater
@@ -1050,7 +1048,7 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint
WT_STATIC_ASSERT(WT_COL_FIX_AUXHEADER_SIZE_MAX < POS_1BYTE_MAX);
p = image + auxheaderoffset;
- endp = image + auxdataoffset;
+ endp = image + aux_start_offset;
*(p++) = WT_COL_FIX_VERSION_TS;
WT_IGNORE_RET(__wt_vpack_uint(&p, WT_PTRDIFF32(endp, p), auxentries));
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index 1c65d9210a8..ea861779401 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -1329,6 +1329,54 @@ __wt_rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len)
return (0);
}
+/*
+ * __rec_split_fix_shrink --
+ * Consider eliminating the empty space on an FLCS page.
+ *
+ * Works on the chunk currently being built (r->cur_ptr). It measures the gap between the end of
+ * the primary data (plus the auxiliary header reservation) and the start of the auxiliary data.
+ * If that gap is larger than the auxiliary data itself, the auxiliary data is moved down to sit
+ * directly after the reservation and the reconciliation tracking fields are adjusted to match;
+ * otherwise nothing is changed.
+ *
+ * Because r->aux_start_offset and r->aux_first_free may be modified here, callers must read
+ * them (for example to record the chunk's auxiliary start offset and image size) only after
+ * this function returns.
+ */
+static void
+__rec_split_fix_shrink(WT_SESSION_IMPL *session, WT_RECONCILE *r)
+{
+ uint32_t auxsize, emptysize, primarysize, totalsize;
+ uint8_t *src, *dst;
+
+ /*
+ * Total size of page: the distance from the start of the chunk image to the auxiliary data
+ * free pointer, that is, everything up to the end of the auxiliary data written so far.
+ */
+ totalsize = WT_PTRDIFF32(r->aux_first_free, r->cur_ptr->image.mem);
+
+ /*
+ * Size of the entire primary data area, including headers: the distance from the start of
+ * the chunk image to the primary data free pointer.
+ */
+ primarysize = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem);
+
+ /*
+ * Size of the empty space: whatever lies between the end of the auxiliary header reservation
+ * (which follows the primary data) and the current start of the auxiliary data.
+ *
+ * NOTE(review): this is unsigned arithmetic, so it relies on the auxiliary start offset being
+ * at least primarysize plus the reservation -- presumably guaranteed by how the auxiliary
+ * start is chosen; confirm against the code that sets r->aux_start_offset.
+ */
+ emptysize = r->aux_start_offset - (primarysize + WT_COL_FIX_AUXHEADER_RESERVATION);
+
+ /* Size of the auxiliary data: from the auxiliary start offset to the end of the page. */
+ auxsize = totalsize - r->aux_start_offset;
+
+ /*
+ * Arbitrary criterion: if the empty space is bigger than the auxiliary data, memmove the
+ * auxiliary data, on the assumption that the cost of the memmove is outweighed by the cost of
+ * taking checksums of, writing out, and reading back in a bunch of useless empty space.
+ */
+ if (emptysize > auxsize) {
+ /* Source: current auxiliary start. */
+ src = (uint8_t *)r->cur_ptr->image.mem + r->aux_start_offset;
+
+ /* Destination: immediately after the primary data with space for the auxiliary header. */
+ dst = r->first_free + WT_COL_FIX_AUXHEADER_RESERVATION;
+
+ /*
+ * The move span should be the empty data size. This follows from the definitions above:
+ * src minus dst is the auxiliary start offset minus (primarysize plus the reservation).
+ */
+ WT_ASSERT(session, src == dst + emptysize);
+
+ /*
+ * Do the move. Since the move distance (emptysize) exceeds the length moved (auxsize) in
+ * this branch, the source and destination ranges do not overlap.
+ */
+ memmove(dst, src, auxsize);
+
+ /*
+ * Update the tracking information: the auxiliary start and free pointer both slide down by
+ * the size of the gap that was closed. The primary space counter is reduced and the
+ * auxiliary space counter is increased by the same amount.
+ *
+ * NOTE(review): presumably this hands the reclaimed gap from the primary area's remaining
+ * space to the auxiliary area's remaining space -- confirm against the definitions of
+ * space_avail and aux_space_avail.
+ */
+ r->aux_start_offset -= emptysize;
+ r->aux_first_free -= emptysize;
+ r->space_avail -= emptysize;
+ r->aux_space_avail += emptysize;
+ }
+}
+
/* The minimum number of entries before we'll split a row-store internal page. */
#define WT_PAGE_INTL_MINIMUM_ENTRIES 20
@@ -1378,9 +1426,17 @@ __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len)
/* Set the entries, timestamps and size for the just finished chunk. */
r->cur_ptr->entries = r->entries;
- if (r->page->type == WT_PAGE_COL_FIX && (r->cur_ptr->auxentries = r->aux_entries) != 0)
- r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem);
- else
+ if (r->page->type == WT_PAGE_COL_FIX) {
+ if ((r->cur_ptr->auxentries = r->aux_entries) != 0) {
+ __rec_split_fix_shrink(session, r);
+ /* This must come after the shrink call, which can change the offset. */
+ r->cur_ptr->aux_start_offset = r->aux_start_offset;
+ r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem);
+ } else {
+ r->cur_ptr->aux_start_offset = r->aux_start_offset;
+ r->cur_ptr->image.size = inuse;
+ }
+ } else
r->cur_ptr->image.size = inuse;
/*
@@ -1619,9 +1675,17 @@ __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r)
/* Set the number of entries and size for the just finished chunk. */
r->cur_ptr->entries = r->entries;
- if (r->page->type == WT_PAGE_COL_FIX && (r->cur_ptr->auxentries = r->aux_entries) != 0)
- r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem);
- else
+ if (r->page->type == WT_PAGE_COL_FIX) {
+ if ((r->cur_ptr->auxentries = r->aux_entries) != 0) {
+ __rec_split_fix_shrink(session, r);
+ /* This must come after the shrink call, which can change the offset. */
+ r->cur_ptr->aux_start_offset = r->aux_start_offset;
+ r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem);
+ } else {
+ r->cur_ptr->aux_start_offset = r->aux_start_offset;
+ r->cur_ptr->image.size = WT_PTRDIFF(r->first_free, r->cur_ptr->image.mem);
+ }
+ } else
r->cur_ptr->image.size = WT_PTRDIFF(r->first_free, r->cur_ptr->image.mem);
/*
@@ -2044,8 +2108,8 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk
/* Initialize the page header(s). */
__rec_split_write_header(session, r, chunk, multi, chunk->image.mem);
if (r->page->type == WT_PAGE_COL_FIX)
- __wt_rec_col_fix_write_auxheader(
- session, r, chunk->entries, chunk->auxentries, chunk->image.mem, chunk->image.size);
+ __wt_rec_col_fix_write_auxheader(session, chunk->entries, chunk->aux_start_offset,
+ chunk->auxentries, chunk->image.mem, chunk->image.size);
if (compressed_image != NULL)
__rec_split_write_header(session, r, chunk, multi, compressed_image->mem);