diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-10-20 16:38:49 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-10-20 06:35:06 +0000 |
commit | d0c4d5770387b73d8074e24ac0134f6e70d34078 (patch) | |
tree | b0a447e9280f9665cc7981bc64087a29130b55e9 /src/third_party/wiredtiger | |
parent | 94da5dc43e80e969099dbd86a24c8d34e1c2d372 (diff) | |
download | mongo-d0c4d5770387b73d8074e24ac0134f6e70d34078.tar.gz |
Import wiredtiger: f827562f2b486e000665c97ea81674012d5a765b from branch mongodb-5.0
ref: 6614fa3dca..f827562f2b
for: 4.9.0
WT-6798 Utilize Arm LSE atomics and the correct strength barriers
WT-6808 Documentation: add top level architecture picture
WT-6812 Fix "out-of-order fixup" potentially corrupting historical values
Diffstat (limited to 'src/third_party/wiredtiger')
15 files changed, 269 insertions, 15 deletions
diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index 0cbf422de08..a18834185d0 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -102,6 +102,12 @@ if test "$GCC" = "yes"; then # instructions. if test "$wt_cv_arm64" = "yes"; then AM_CFLAGS="$AM_CFLAGS -march=armv8-a+crc" + # moutline-atomics preserves backwards compatibility with Arm v8.0 + # systems but also supports using Arm v8.1 atomics. The latter can + # massively improve performance on larger Arm systems. The flag was + # back ported to gcc8, 9 and is the default in gcc10+. See if the + # compiler supports the flag. + AX_CHECK_COMPILE_FLAG([-moutline-atomics], [AM_CFLAGS="$AM_CFLAGS -moutline-atomics"]) fi else # The Solaris native compiler gets the additional -mt flag. diff --git a/src/third_party/wiredtiger/dist/s_docs b/src/third_party/wiredtiger/dist/s_docs index 095ada474d3..a8b1b2eb487 100755 --- a/src/third_party/wiredtiger/dist/s_docs +++ b/src/third_party/wiredtiger/dist/s_docs @@ -166,6 +166,23 @@ EOF sed -i~ -e 's,/\.html,/,' -e 's,\.html\.html,.html,' navtree.js && rm -f navtree.js~) + # Any cmapx files that are generated by plantuml need to be referred to + CMAPX=`find ../src/docs/ -type f -name "*.cmapx" 2>/dev/null` + if [ "$CMAPX" != '' ]; then + cd ../docs + for f in $CMAPX; do + b=`echo $f | sed -e 's:.*/::' -e 's/.cmapx$//'` + for html in `grep -l $b.png *.html`; do + # There's an image in this HTML file that has a map file + # generated by plantuml. Refer to the map by its name + # and insert the contents of the map into the HTML source. + sed -e "/<img.*=\"$b[.]png\"/s/\(\"$b[.]png\"\)/\1 usemap=\"#${b}_map\"/" \ + -e "/<img.*=\"$b[.]png\"/r $f" $html > $html.NEW + mv $html.NEW $html + done + done + fi + # Fixup the man pages generated by Doxygen. 
We want the command line # documentation to be the main man page, but also install a man page # for the WiredTiger header into the library section. diff --git a/src/third_party/wiredtiger/dist/s_docs_plantuml b/src/third_party/wiredtiger/dist/s_docs_plantuml index c646739d7ba..43c669591a7 100755 --- a/src/third_party/wiredtiger/dist/s_docs_plantuml +++ b/src/third_party/wiredtiger/dist/s_docs_plantuml @@ -35,7 +35,7 @@ test -f "../dist/plantuml.jar" || { if [ $download_plantuml -eq 1 ] then echo 'Downloading plantuml:' - wget $PLANTUML_URL -O ../dist/plantuml.jar + curl -Li $PLANTUML_URL -o ../dist/plantuml.jar else echo 'plantuml can be downloaded from:' echo 'https://sourceforge.net/projects/plantuml/files/plantuml.jar/download' diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index be6ae0893fe..3df400be17d 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -698,12 +698,14 @@ dll dlopen dlsym dmalloc +dmb dmsg doxgen doxygen drealloc ds dsb +dsbs dsk dsrc dst @@ -894,6 +896,9 @@ isalpha iscntrl isdigit isgraph +ish +ishld +ishst islocked islower ispo @@ -1224,6 +1229,7 @@ setstr setv setvbuf sfence +shareability signalled sii sizeof @@ -1306,6 +1312,7 @@ timestamp timestamped timestamps tinfo +tlb tmp todo tokenizer diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 99cbeecfe50..71f75ec285e 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "6614fa3dcaceaa95da289bbffe3d96eecb1c972c" + "commit": "f827562f2b486e000665c97ea81674012d5a765b" } diff --git a/src/third_party/wiredtiger/src/docs/arch-index.dox b/src/third_party/wiredtiger/src/docs/arch-index.dox index 53a49de7ed3..f5430ccb7e7 100644 --- a/src/third_party/wiredtiger/src/docs/arch-index.dox 
+++ b/src/third_party/wiredtiger/src/docs/arch-index.dox @@ -1,15 +1,136 @@ /*! @page arch-index WiredTiger Architecture Guide +WiredTiger is built as a library that is linked into the application. Applications +use the API interface to direct WiredTiger's operations (see @ref basic_api). +Here is an overview of the software components in WiredTiger and how they are organized. +An arrow indicates the "from" component uses "to" component. + +@plantuml_start{wt_diagram.png} +@startuml{wt_diagram.png} + +' We add spacing to the diagram in places to influence the layout. +' To do this, we create some invisible components with hidden arrows +' pointing to them. Since we don't otherwise use the "file" component, +' we set all its parts to be transparent, and any use of "file" results +' in an invisible spacer whose width is directed by the length of its label. +' When modifying this diagram, it's sometimes useful to comment out the +' following lines, and any [hidden] directives used below, to see how +' the spacers influence the layout. Note that this may be fragile; +' the spacers give hints to the layout, such hints will not always be honored. + +skinparam fileBorderColor Transparent +skinparam fileBackgroundColor Transparent +skinparam fileFontColor Transparent +skinparam fileShadowing false + +' Our diagram is simple. First, we define lots of labeled rectangles +' with most nesting within the "engine" rectangle. + +together { + rectangle "Python API" as python_api + ' "storage" displays as an oval. + storage " C/C++ \n applications " as application + rectangle "wt Utility" as utility +} + +' Trailing spaces for this label puts the text to the left. +rectangle "**WiredTiger Engine** " as wt_engine { + ' Leading and trailing spaces make a wide rectangle. 
+ together { + file "____" as SPACE_api + rectangle " C API " as c_api + file "____" as SPACE_api2 + } + rectangle "[[#component-schema Schema]]" as schema + rectangle "Cursor" as cursor + rectangle "Transactions" as txn + rectangle "Metadata" as meta + rectangle "[[#component-dhandle dhandle/\n Btree]]" as btree + rectangle " Row\n storage" as row + rectangle " Column\n storage" as column + rectangle "History\n Store" as history + rectangle "Snapshots" as snapshot + rectangle "Cache" as cache + rectangle "Eviction" as evict + + together { + rectangle " Block\n Manager" as block + file "__________" as SPACE_log + rectangle "Logging" as log + file "___" as SPACE_log2 + } + rectangle " File System & OS \n interface" as os +} +together { + database "Database\n Files" as wt_file + database " Log \n Files" as log_file +} + +' Influence the ordering at the top using (hidden) directed labels +python_api -[hidden]right-> application +application -[hidden]right-> utility + +python_api -down-> c_api +application -down-> c_api +utility -down-> c_api + +c_api -down-> schema +c_api -down-> cursor +c_api -down-> txn +SPACE_api -[hidden]right-> c_api +c_api -[hidden]right-> SPACE_api2 + +schema -down-> meta +schema -down-> btree +cursor -down-> btree +btree -down-> row +btree -down-> column +meta -up-> cursor +' The hidden arrow helps our boxes to line up in a better way. 
+meta -[hidden]right-> btree +cursor -[hidden]right-> txn +txn -down-> snapshot +row -down-> cache +column -down-> cache +cache -down-> history +evict -down-> history +history -up-> cursor +snapshot -down-> evict +cache -right-> evict +cache -down-> block +evict -down-> block +txn -down-> log + +block -[hidden]right-> SPACE_log +cache -[hidden]down-> SPACE_log +evict -[hidden]down-> SPACE_log +SPACE_log -[hidden]right-> log +log -[hidden]right-> SPACE_log2 + +block -down-> os +log -down-> os +os -down-> wt_file +os -down-> log_file + +wt_file -[hidden]right-> log_file + +@enduml +@plantuml_end + +We go into some detail for some of the internal components. + @subpage arch-glossary WiredTiger assigns specific meanings to certain words. Here we decode them. +\anchor component-schema @subpage arch-schema Most applications begin to make use of WiredTiger by creating a table (or other data object) to store their data in. Create is one of several schema operations available in WiredTiger. +\anchor component-dhandle @subpage arch-dhandle-lifecycle An internal structure called Data Handle (dhandle) is used to represent and diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png Binary files differ index d2699646e9c..d2fc00448f0 100644 --- a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png +++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_create.png diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png Binary files differ index 790584b3f7b..f046734ce52 100644 --- a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png +++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_generic.png diff --git
a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png Binary files differ index b0aa560946e..65ac52d6fa1 100644 --- a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png +++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/schema_rename.png diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.cmapx b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.cmapx new file mode 100644 index 00000000000..411d3905b8e --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.cmapx @@ -0,0 +1,4 @@ +<map id="wt_diagram_map" name="wt_diagram_map"> +<area shape="rect" id="id1" href="#component-schema" title="#component-schema" alt="" coords="98,224,151,241"/> +<area shape="rect" id="id2" href="#component-dhandle" title="#component-dhandle" alt="" coords="181,320,244,353"/> +</map> diff --git a/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.png b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.png Binary files differ new file mode 100644 index 00000000000..57480c2156f --- /dev/null +++ b/src/third_party/wiredtiger/src/docs/images/plantuml_gen_img/wt_diagram.png diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index 26f40b5e099..b153ba12668 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -246,6 +246,10 @@ fdatasync fextend fh fieldname +fileBackgroundColor +fileBorderColor +fileFontColor +fileShadowing fileID fileformats fileid @@ -481,6 +485,7 @@ seqno serializable sess sid +skinparam skiplist spinlock spinlocks diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index a03b8e31739..2c5e48e1ca6 100644 ---
a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -847,10 +847,11 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_HS_TIME_POINT start_time_point, stop_time_point; - WT_ITEM hs_key; + WT_ITEM hs_key, hs_value; + WT_TIME_WINDOW tw; WT_UPDATE *tombstone; wt_timestamp_t hs_ts; - uint64_t hs_counter; + uint64_t hs_counter, hs_upd_type; uint32_t hs_btree_id; int cmp; char ts_string[5][WT_TS_INT_STRING_SIZE]; @@ -859,6 +860,7 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, insert_cursor = NULL; hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; WT_CLEAR(hs_key); + WT_CLEAR(hs_value); tombstone = NULL; /* @@ -973,9 +975,13 @@ __hs_fixup_out_of_order_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, stop_time_point.ts = stop_time_point.durable_ts = ts; stop_time_point.txnid = hs_cbt->upd_value->tw.stop_txn; + /* Extract the underlying value for reinsertion. */ + WT_ERR(hs_cursor->get_value( + hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); + /* Reinsert entry with earlier timestamp. 
*/ while ((ret = __hs_insert_record_with_btree_int(session, insert_cursor, btree, key, - WT_UPDATE_STANDARD, &hs_cursor->value, &start_time_point, &stop_time_point, + (uint8_t)hs_upd_type, &hs_value, &start_time_point, &stop_time_point, *counter)) == WT_RESTART) ; WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/include/gcc.h b/src/third_party/wiredtiger/src/include/gcc.h index be4503f3492..85ba3214d3e 100644 --- a/src/third_party/wiredtiger/src/include/gcc.h +++ b/src/third_party/wiredtiger/src/include/gcc.h @@ -218,17 +218,28 @@ WT_ATOMIC_FUNC(size, size_t, size_t *vp, size_t v) #elif defined(__aarch64__) #define WT_PAUSE() __asm__ volatile("yield" ::: "memory") -#define WT_FULL_BARRIER() \ - do { \ - __asm__ volatile("dsb sy" ::: "memory"); \ + +/* + * dmb are chosen here because they are sufficient to guarantee the ordering described above. We + * don't want to use dsbs because they provide a much stronger guarantee of completion which isn't + * required. Additionally, dsbs synchronize other system activities such as tlb and cache + * maintenance instructions which is not required in this case. + * + * A shareability domain of inner-shareable is selected because all the entities participating in + * the ordering requirements are CPUs and ordering with respect to other devices or memory-types + * isn't required. 
+ */ +#define WT_FULL_BARRIER() \ + do { \ + __asm__ volatile("dmb ish" ::: "memory"); \ } while (0) -#define WT_READ_BARRIER() \ - do { \ - __asm__ volatile("dsb ld" ::: "memory"); \ +#define WT_READ_BARRIER() \ + do { \ + __asm__ volatile("dsb ishld" ::: "memory"); \ } while (0) -#define WT_WRITE_BARRIER() \ - do { \ - __asm__ volatile("dsb st" ::: "memory"); \ +#define WT_WRITE_BARRIER() \ + do { \ + __asm__ volatile("dsb ishst" ::: "memory"); \ } while (0) #elif defined(__s390x__) diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp20.py b/src/third_party/wiredtiger/test/suite/test_timestamp20.py index 69aec301a47..63a2503d915 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp20.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp20.py @@ -37,7 +37,7 @@ class test_timestamp20(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB' session_config = 'isolation=snapshot' - def test_timestamp20(self): + def test_timestamp20_standard(self): uri = 'table:test_timestamp20' self.session.create(uri, 'key_format=S,value_format=S') self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) @@ -86,3 +86,80 @@ class test_timestamp20(wttest.WiredTigerTestCase): for i in range(1, 10000): self.assertEqual(old_reader_cursor[str(i)], value3) old_reader_session.rollback_transaction() + + # In this test we're using modifies since they are more sensitive to corruptions. + # + # Corruptions to string types may go undetected since non-ASCII characters won't be included in + # the conversion to a Python string. + def test_timestamp20_modify(self): + uri = 'table:test_timestamp20' + self.session.create(uri, 'key_format=S,value_format=S') + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(1)) + cursor = self.session.open_cursor(uri) + + value1 = 'a' * 500 + value2 = 'b' * 500 + value3 = 'c' * 500 + + # Apply the base value. 
+ for i in range(1, 10000): + self.session.begin_transaction() + cursor[str(i)] = value1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(10)) + + # Now apply a series of modifies. + for i in range(1, 10000): + self.session.begin_transaction() + cursor.set_key(str(i)) + self.assertEqual(cursor.modify([wiredtiger.Modify('B', 100, 1)]), 0) + self.session.commit_transaction('commit_timestamp=' + timestamp_str(20)) + + for i in range(1, 10000): + self.session.begin_transaction() + cursor.set_key(str(i)) + self.assertEqual(cursor.modify([wiredtiger.Modify('C', 200, 1)]), 0) + self.session.commit_transaction('commit_timestamp=' + timestamp_str(30)) + + # Open an old reader at this point. + # + # I'm trying to read the middle modify because I specifically don't want to read one that + # has been squashed into a full update. + old_reader_session = self.conn.open_session() + old_reader_cursor = old_reader_session.open_cursor(uri) + old_reader_session.begin_transaction('read_timestamp=' + timestamp_str(30)) + + # Now apply the last modify. + # This will be the end of the chain of modifies. + for i in range(1, 10000): + self.session.begin_transaction() + cursor.set_key(str(i)) + self.assertEqual(cursor.modify([wiredtiger.Modify('D', 300, 1)]), 0) + self.session.commit_transaction('commit_timestamp=' + timestamp_str(40)) + + # Now put two updates out of order. 5 will go to the history store and will trigger a + # correction to the existing contents. + for i in range(1, 10000): + self.session.begin_transaction() + cursor[str(i)] = value2 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(5)) + self.session.begin_transaction() + cursor[str(i)] = value3 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(50)) + + # Open up a new transaction and read at 30. + # We shouldn't be able to see past 5 due to txnid visibility. 
+ self.session.begin_transaction('read_timestamp=' + timestamp_str(30)) + for i in range(1, 10000): + self.assertEqual(cursor[str(i)], value2) + self.session.rollback_transaction() + + # Put together expected value. + expected = list(value1) + expected[100] = 'B' + expected[200] = 'C' + expected = str().join(expected) + + # On the other hand, this older transaction SHOULD be able to read past the 5. + for i in range(1, 10000): + self.assertEqual(old_reader_cursor[str(i)], expected) + old_reader_session.rollback_transaction() |