diff options
author | Luke Chen <luke.chen@mongodb.com> | 2021-12-21 17:27:48 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-12-21 07:28:22 +0000 |
commit | 4c3518a6ca6e669a42ebf845092a578f2a8a3503 (patch) | |
tree | 57187e5a07b94f5c69b14234928da81d9876c695 /src | |
parent | 4eee45a1167346286f9d7ddfd3cd636c2fdcbd1b (diff) | |
download | mongo-4c3518a6ca6e669a42ebf845092a578f2a8a3503.tar.gz |
Import wiredtiger: 991d999fc83966b38669db49368eff20ed06a522 from branch mongodb-5.2
ref: 33d207d74c..991d999fc8
for: 5.2.0-rc2
WT-8513 Conditionally eliminate the empty space on FLCS pages
Diffstat (limited to 'src')
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/extern.h | 4 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/reconcile.h | 3 |
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_col.c | 24 |
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 80 |
5 files changed, 88 insertions, 25 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 1fd2ed9154b..c93b907bbbc 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.2", - "commit": "33d207d74c7ba955c8b98286072b0922047c7bae" + "commit": "991d999fc83966b38669db49368eff20ed06a522" } diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 2690ca9d748..ce333968311 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -1843,8 +1843,8 @@ extern void __wt_read_row_time_window( WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_TIME_WINDOW *tw); extern void __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); -extern void __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, - uint32_t entries, uint32_t auxentries, uint8_t *image, size_t size); +extern void __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, uint32_t entries, + uint32_t aux_start_offset, uint32_t auxentries, uint8_t *image, size_t size); extern void __wt_rec_dictionary_free(WT_SESSION_IMPL *session, WT_RECONCILE *r); extern void __wt_rec_dictionary_reset(WT_RECONCILE *r); extern void __wt_ref_addr_free(WT_SESSION_IMPL *session, WT_REF *ref); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 85ac075d6d5..075d08beb49 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -59,7 +59,8 @@ struct __wt_rec_chunk { WT_ITEM image; /* disk-image */ - /* For fixed-length column store, track how many time windows we have. 
*/ + /* For fixed-length column store, track where the time windows start and how many we have. */ + uint32_t aux_start_offset; uint32_t auxentries; }; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index af38b0a0971..1b71533546e 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -958,11 +958,11 @@ err: * Write the auxiliary header into the page image. */ void -__wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint32_t entries, - uint32_t auxentries, uint8_t *image, size_t size) +__wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, uint32_t entries, + uint32_t aux_start_offset, uint32_t auxentries, uint8_t *image, size_t size) { WT_BTREE *btree; - uint32_t auxdataoffset, auxheaderoffset, bitmapsize, offset, space; + uint32_t auxheaderoffset, bitmapsize, offset, space; uint8_t *endp, *p; btree = S2BT(session); @@ -1009,6 +1009,10 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint * * However, this means that we should not assume the bitmap size is given by the btree maximum * leaf page size but get it from the reconciliation info. + * + * Note: it is important to use *this* chunk's auxiliary start offset (passed in) and not read + * the auxiliary start offset from the WT_RECONCILE, as we may be writing the previous chunk and + * the latter describes the current chunk. */ /* Figure how much primary data we have. */ @@ -1017,14 +1021,8 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint /* The auxiliary header goes after the bitmap, which goes after the page header. */ auxheaderoffset = WT_PAGE_HEADER_BYTE_SIZE(btree) + bitmapsize; - /* The auxiliary data goes wherever we have been writing it. */ - auxdataoffset = r->aux_start_offset; - - /* This should be at or after the place it goes on a normal-sized page. 
*/ - WT_ASSERT(session, auxdataoffset >= btree->maxleafpage + WT_COL_FIX_AUXHEADER_RESERVATION); - /* This should also have left sufficient room for the header. */ - WT_ASSERT(session, auxdataoffset >= auxheaderoffset + WT_COL_FIX_AUXHEADER_RESERVATION); + WT_ASSERT(session, aux_start_offset >= auxheaderoffset + WT_COL_FIX_AUXHEADER_RESERVATION); /* * If there is no auxiliary data, we will have already shortened the image size to discard the @@ -1033,12 +1031,12 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint * last page in the tree, this also avoids the space wastage described above. */ if (auxentries == 0) { - WT_ASSERT(session, auxdataoffset >= size); + WT_ASSERT(session, aux_start_offset >= size); return; } /* The offset we're going to write is the distance from the header start to the data. */ - offset = auxdataoffset - auxheaderoffset; + offset = aux_start_offset - auxheaderoffset; /* * Encoding the offset should fit -- either it is less than what encodes to 1 byte or greater @@ -1050,7 +1048,7 @@ __wt_rec_col_fix_write_auxheader(WT_SESSION_IMPL *session, WT_RECONCILE *r, uint WT_STATIC_ASSERT(WT_COL_FIX_AUXHEADER_SIZE_MAX < POS_1BYTE_MAX); p = image + auxheaderoffset; - endp = image + auxdataoffset; + endp = image + aux_start_offset; *(p++) = WT_COL_FIX_VERSION_TS; WT_IGNORE_RET(__wt_vpack_uint(&p, WT_PTRDIFF32(endp, p), auxentries)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 1c65d9210a8..ea861779401 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1329,6 +1329,54 @@ __wt_rec_split_grow(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t add_len) return (0); } +/* + * __rec_split_fix_shrink -- + * Consider eliminating the empty space on an FLCS page. 
+ */ +static void +__rec_split_fix_shrink(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + uint32_t auxsize, emptysize, primarysize, totalsize; + uint8_t *src, *dst; + + /* Total size of page. */ + totalsize = WT_PTRDIFF32(r->aux_first_free, r->cur_ptr->image.mem); + + /* Size of the entire primary data area, including headers. */ + primarysize = WT_PTRDIFF32(r->first_free, r->cur_ptr->image.mem); + + /* Size of the empty space. */ + emptysize = r->aux_start_offset - (primarysize + WT_COL_FIX_AUXHEADER_RESERVATION); + + /* Size of the auxiliary data. */ + auxsize = totalsize - r->aux_start_offset; + + /* + * Arbitrary criterion: if the empty space is bigger than the auxiliary data, memmove the + * auxiliary data, on the assumption that the cost of the memmove is outweighed by the cost of + * taking checksums of, writing out, and reading back in a bunch of useless empty space. + */ + if (emptysize > auxsize) { + /* Source: current auxiliary start. */ + src = (uint8_t *)r->cur_ptr->image.mem + r->aux_start_offset; + + /* Destination: immediately after the primary data with space for the auxiliary header. */ + dst = r->first_free + WT_COL_FIX_AUXHEADER_RESERVATION; + + /* The move span should be the empty data size. */ + WT_ASSERT(session, src == dst + emptysize); + + /* Do the move. */ + memmove(dst, src, auxsize); + + /* Update the tracking information. */ + r->aux_start_offset -= emptysize; + r->aux_first_free -= emptysize; + r->space_avail -= emptysize; + r->aux_space_avail += emptysize; + } +} + /* The minimum number of entries before we'll split a row-store internal page. */ #define WT_PAGE_INTL_MINIMUM_ENTRIES 20 @@ -1378,9 +1426,17 @@ __wt_rec_split(WT_SESSION_IMPL *session, WT_RECONCILE *r, size_t next_len) /* Set the entries, timestamps and size for the just finished chunk. 
*/ r->cur_ptr->entries = r->entries; - if (r->page->type == WT_PAGE_COL_FIX && (r->cur_ptr->auxentries = r->aux_entries) != 0) - r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem); - else + if (r->page->type == WT_PAGE_COL_FIX) { + if ((r->cur_ptr->auxentries = r->aux_entries) != 0) { + __rec_split_fix_shrink(session, r); + /* This must come after the shrink call, which can change the offset. */ + r->cur_ptr->aux_start_offset = r->aux_start_offset; + r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem); + } else { + r->cur_ptr->aux_start_offset = r->aux_start_offset; + r->cur_ptr->image.size = inuse; + } + } else r->cur_ptr->image.size = inuse; /* @@ -1619,9 +1675,17 @@ __wt_rec_split_finish(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Set the number of entries and size for the just finished chunk. */ r->cur_ptr->entries = r->entries; - if (r->page->type == WT_PAGE_COL_FIX && (r->cur_ptr->auxentries = r->aux_entries) != 0) - r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem); - else + if (r->page->type == WT_PAGE_COL_FIX) { + if ((r->cur_ptr->auxentries = r->aux_entries) != 0) { + __rec_split_fix_shrink(session, r); + /* This must come after the shrink call, which can change the offset. */ + r->cur_ptr->aux_start_offset = r->aux_start_offset; + r->cur_ptr->image.size = WT_PTRDIFF(r->aux_first_free, r->cur_ptr->image.mem); + } else { + r->cur_ptr->aux_start_offset = r->aux_start_offset; + r->cur_ptr->image.size = WT_PTRDIFF(r->first_free, r->cur_ptr->image.mem); + } + } else r->cur_ptr->image.size = WT_PTRDIFF(r->first_free, r->cur_ptr->image.mem); /* @@ -2044,8 +2108,8 @@ __rec_split_write(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_CHUNK *chunk /* Initialize the page header(s). 
*/ __rec_split_write_header(session, r, chunk, multi, chunk->image.mem); if (r->page->type == WT_PAGE_COL_FIX) - __wt_rec_col_fix_write_auxheader( - session, r, chunk->entries, chunk->auxentries, chunk->image.mem, chunk->image.size); + __wt_rec_col_fix_write_auxheader(session, chunk->entries, chunk->aux_start_offset, + chunk->auxentries, chunk->image.mem, chunk->image.size); if (compressed_image != NULL) __rec_split_write_header(session, r, chunk, multi, compressed_image->mem); |