diff options
author | Chenhao Qu <chenhao.qu@mongodb.com> | 2022-11-02 11:57:09 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-11-02 02:00:19 +0000 |
commit | 3bc6b5148c197c5874a69c43002958c633525819 (patch) | |
tree | 7a9196e3f6786a00c711d12b99478cda4d7618a9 | |
parent | 0c425f5959a796ee1328fa4b467af93978c04e09 (diff) | |
download | mongo-3bc6b5148c197c5874a69c43002958c633525819.tar.gz |
Import wiredtiger: bba610ce5d597ee4f67a67569c283dc035020d05 from branch mongodb-master
ref: 4754eda597..bba610ce5d
for: 6.2.0-rc0
WT-9988 Upgrade the wt util bson parsing python script.
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/tools/wt_to_mdb_bson.py | 166 |
2 files changed, 149 insertions, 19 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 237c53c09e2..3c260c84c65 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "4754eda597d63203388a316c7510b44c00ab2b21" + "commit": "bba610ce5d597ee4f67a67569c283dc035020d05" } diff --git a/src/third_party/wiredtiger/tools/wt_to_mdb_bson.py b/src/third_party/wiredtiger/tools/wt_to_mdb_bson.py index 353ee8c672d..eb6470f2fa8 100755 --- a/src/third_party/wiredtiger/tools/wt_to_mdb_bson.py +++ b/src/third_party/wiredtiger/tools/wt_to_mdb_bson.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Public Domain 2014-present MongoDB, Inc. # Public Domain 2008-2014 WiredTiger, Inc. @@ -26,23 +26,109 @@ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. -import bson, codecs, pprint, subprocess, sys +import bson, codecs, pprint, subprocess, sys, re +from enum import Enum + +# This script is intended to parse the output of three wt util commands and convert MongoDB bson +# from hexadecimal or byte format into ascii. +# +# It currently works with the following wt util commands: +# - dump +# - verify +# - printlog +# +# Those tools each perform a different function, and their usages are varied as such the script +# needs to handle their output separately. Originally many scripts existed for this purpose, the +# intent of this script is to provide a single place to perform all bson conversions. +# +# This script takes input of two forms, either through stdin or the user can pass the wt util +# location and the filename and the script will execute the required wt util command. When running +# with -f the script must be executed in the same directory as the database. +# +# Some example usages are: +# - ./wt -r dump -x file:foo.wt | ./wt_to_mdb_bson -m dump +# - ./wt -r verify -d dump_pages file:bar.wt | ./wt_to_mdb_bson -m verify +# - ./wt_to_mdb_bson -m dump -f ./wt file:foo.wt +# - ./wt_to_mdb_bson -m printlog -f ./wt + +# A basic enum to determine which mode we are operating in. +class Mode(Enum): + DUMP = 1 + VERIFY = 2 + PRINTLOG = 3 # Decodes a MongoDB file into a readable format. def util_usage(): - print("Usage: wt_to_mdb_bson <path_to_wt> filename") + print("Usage: wt_to_mdb_bson -m {dump|verify|printlog} [-f] [path_to_wt] [uri]") + print('\t-m the intended mode that the wt util operated in or will be executed using.') + print('\t-f the location of the wt util.') + sys.exit(1) -# Navigate to the data section of the MongoDB file if it exists. +# BSON printer helper. +def print_bson(bson): + return pprint.pformat(bson, indent=1).replace('\n', '\n\t ') + +# A utility function for converting verify byte output into parsible hex. +def convert_byte(inp): + ret = "" + idx = 0 + while True: + if idx >= len(inp): + break + ch = inp[idx] + if ord(ch) != 92: + ret += ch + idx += 1 + continue + lookAhead = inp[idx+1] + if ord(lookAhead) != 92: + ret += ch + 'x' + idx += 1 + continue + ret += ch + ch + idx += 2 + return codecs.escape_decode(ret)[0] + +# Converts the output of ./verify -d dump_pages to bson. +def wt_verify_to_bson(wt_output): + pattern = re.compile('V {(.*?)}$') + for line in wt_output: + print(line, end='') + matches = pattern.findall(line.strip()) + if matches: + obj = bson.decode_all(convert_byte(matches[0]))[0] + print('\t %s' % (print_bson(obj),)) + +# Converts the output of ./wt printlog -x -u. +# Doesn't convert hex keys as I don't think they're bson. +def wt_printlog_to_bson(wt_output): + pattern_value = re.compile('value-hex\": \"(.*)\"') + for line in wt_output: + value_match = pattern_value.search(line) + if value_match: + value_hex_str = value_match.group(1) + value_bytes = bytes.fromhex(value_hex_str) + try: + bson_obj = bson.decode_all(value_bytes) + print('\t\"value-bson\":%s' % (print_bson(bson_obj),)) + except Exception as e: + # If bsons don't appear to be printing uncomment this line for the error reason. + #logging.error('Error at %s', 'division', exc_info=e) + print('\t\"value-hex\": \"' + value_hex_str + '\"') + else: + print(line.rstrip()) + +# Navigate to the data section of the MongoDB file if it exists for ./wt dump. def find_data_section(mdb_file_contents): for i in range(len(mdb_file_contents)): line = mdb_file_contents[i].strip() if line == 'Data': return i + 1 - + # No data section was found, return an invalid index. return -1 -# Decode the keys and values from hex format to a readable BSON format. +# Decode the keys and values from hex format to a readable BSON format for ./wt dump. def decode_data_section(mdb_file_contents, data_index): # Loop through the data section and increment by 2, since we parse the K/V pairs. for i in range(data_index, len(mdb_file_contents), 2): @@ -53,24 +139,68 @@ def decode_data_section(mdb_file_contents, data_index): obj = bson.decode_all(byt)[0] print('Key:\t%s' % key) - print('Value:\n\t%s' % (pprint.pformat(obj, indent=1).replace('\n', '\n\t'),)) + print('Value:\n\t%s' % (print_bson(obj),)) -def dump_mdb_file(wtpath, filename): +# Convert the output of ./wt -r dump -x to bson. +def wt_dump_to_bson(wt_output): # Dump the MongoDB file into hex format. - mdb_hex = subprocess.check_output([wtpath, "dump", "-x", "file:" + filename], universal_newlines=True) - - mdb_file_contents = mdb_hex.splitlines() + mdb_file_contents = wt_output data_index = find_data_section(mdb_file_contents) if data_index > 0: decode_data_section(mdb_file_contents, data_index) else: print("Error: No data section was found in the file.") - exit() + exit() + +# Call the wt util if required. +def execute_wt(mode, wtpath, uri): + if mode == Mode.DUMP: + return subprocess.check_output( + [wtpath, "-r", "dump", "-x", uri], universal_newlines=True).splitlines() + elif mode == Mode.VERIFY: + return subprocess.check_output( + [wtpath, "-r", "verify", "-d", "dump_pages", uri], universal_newlines=True).splitlines() + else: + return subprocess.check_output( + [wtpath, "-r", "-C", "log=(compressor=snappy,path=journal/)", "printlog", "-u", "-x"], universal_newlines=True).splitlines() + +def main(): + if len(sys.argv) < 3: + util_usage() + exit() + + if sys.argv[1] != '-m': + print('A mode must be specified with -m.') + util_usage() -if len(sys.argv) != 3: - util_usage() - exit() + mode_str = sys.argv[2] + if mode_str == 'dump': + mode = Mode.DUMP + elif mode_str == 'verify': + mode = Mode.VERIFY + elif mode_str == 'printlog': + mode = Mode.PRINTLOG + else: + print('Invalid mode specified.') + util_usage() + + # Does the user plan on passing wt's location and a file? + if len(sys.argv) > 3: + if sys.argv[3] != '-f': + print('Invalid option specified.') + util_usage() + uri = None if mode == Mode.PRINTLOG else sys.argv[5] + wt_output = execute_wt(mode, sys.argv[4], uri) + else: + # Read in stdout to a string then pass it like the wt_output. + wt_output = sys.stdin.readlines() + + if mode == Mode.DUMP: + wt_dump_to_bson(wt_output) + elif mode == Mode.VERIFY: + wt_verify_to_bson(wt_output) + else: + wt_printlog_to_bson(wt_output) -wtpath = sys.argv[1] -filename = sys.argv[2] -dump_mdb_file(wtpath, filename) +if __name__ == "__main__": + main() |