WT-2268 WT-2597 JSON load/dump Unicode fixes (#2749)

* WT-2268 Change dump representation so every byte represents a single 0x00-0xff Unicode character. Otherwise arbitrary binary data (that does not conform to Unicode) cannot be represented. Fix an off-by-one error in counting bytes on input. Added some better Unicode tests.
* JSON Dump now uses a 'dump version' stamp, as well as showing WT version. Dump input that is too old or too new will be rejected.
* In setting a key for dump, converting a JSON string must always occur first.
* When loading JSON, treat binary data just like strings. Fixed an error in determining the string length of JSON in the presence of Unicode strings.
* Add LSM tests to JSON dump/load testing.
* Add more extensive testing for dump/reload of JSON binary data.
* Byte arrays differ from strings for JSON input: they are not null terminated. Handle escapes like '\n', '\t', etc. on input.
* Added tests of JSON dump/load of all byte codes.
* whitespace
author: Don Anderson <dda@mongodb.com> 2016-05-30 21:39:51 -0400
committer: Michael Cahill <michael.cahill@mongodb.com> 2016-05-31 11:39:51 +1000
commit: 27981762bac2b4b1d854826f845866f4f523a270 (patch)
tree: 1171c55e2bc95c42c088395be4ef8da892bd8ac1
parent: 12b772612a180db5db82f68920b00e297c43da2a (diff)
download: mongo-27981762bac2b4b1d854826f845866f4f523a270.tar.gz
9 files changed, 310 insertions, 90 deletions
diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c
index a7b1c98871a..32353e0a28d 100644
--- a/src/cursor/cur_dump.c
+++ b/src/cursor/cur_dump.c
@@ -155,7 +155,9 @@ __curdump_set_key(WT_CURSOR *cursor, ...)
 	WT_SESSION_IMPL *session;
 	uint64_t recno;
 	va_list ap;
+	const uint8_t *up;
 	const char *p;
+	bool json;
 
 	cdump = (WT_CURSOR_DUMP *)cursor;
 	child = cdump->child;
@@ -168,16 +170,23 @@ __curdump_set_key(WT_CURSOR *cursor, ...)
 		p = va_arg(ap, const char *);
 	va_end(ap);
 
+	json = F_ISSET(cursor, WT_CURSTD_DUMP_JSON);
+	if (json)
+		WT_ERR(__wt_json_to_item(session, p, cursor->key_format,
+		    (WT_CURSOR_JSON *)cursor->json_private, true,
+		    &cursor->key));
+
 	if (WT_CURSOR_RECNO(cursor) && !F_ISSET(cursor, WT_CURSTD_RAW)) {
-		WT_ERR(str2recno(session, p, &recno));
+		if (json) {
+			up = (const uint8_t *)cursor->key.data;
+			WT_ERR(__wt_vunpack_uint(&up, cursor->key.size,
+			    &recno));
+		} else
+			WT_ERR(str2recno(session, p, &recno));
 
 		child->set_key(child, recno);
 	} else {
-		if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON))
-			WT_ERR(__wt_json_to_item(session, p, cursor->key_format,
-			    (WT_CURSOR_JSON *)cursor->json_private, true,
-			    &cursor->key));
-		else
+		if (!json)
 			WT_ERR(__dump_to_raw(session, p, &cursor->key,
 			    F_ISSET(cursor, WT_CURSTD_DUMP_HEX)));
 
diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c
index fcb66d3e8b3..133b7b9ac9b 100644
--- a/src/cursor/cur_json.c
+++ b/src/cursor/cur_json.c
@@ -48,6 +48,10 @@ static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *,
 	case 't':							\
 		WT_RET(json_uint_arg(session, &jstr, &pv.u.u));		\
 		break;							\
+	case 'u':							\
+		WT_RET(json_string_arg(session, &jstr, &pv.u.item));	\
+		pv.type = 'K';						\
+		break;							\
 	/* User format strings have already been validated. */		\
 	WT_ILLEGAL_VALUE(session);					\
 	}								\
@@ -493,7 +497,7 @@ __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype,
 				    "invalid Unicode within JSON string");
 						return (-1);
 					}
-					src += 5;
+					src += 4;
 				}
 				backslash = false;
 			}
@@ -840,20 +844,17 @@ __wt_json_strlen(const char *src, size_t srclen)
 				if (__wt_hex2byte((const u_char *)src, &lo))
 					return (-1);
 				src += 2;
-				/* RFC 3629 */
-				if (hi >= 0x8) {
-					/* 3 bytes total */
-					dstlen += 2;
-				}
-				else if (hi != 0 || lo >= 0x80) {
-					/* 2 bytes total */
-					dstlen++;
-				}
-				/* else 1 byte total */
+				if (hi != 0)
+					/*
+					 * For our dump representation,
+					 * every Unicode character on input
+					 * represents a single byte.
+					 */
+					return (-1);
 			}
-		}
+		} else
+			src++;
 		dstlen++;
-		src++;
 	}
 	if (src != srcend)
 		return (-1);   /* invalid input, e.g. final char is '\\' */
@@ -867,55 +868,58 @@ __wt_json_strlen(const char *src, size_t srclen)
  *	the result if zero padded.
  */
 int
-__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen)
+__wt_json_strncpy(WT_SESSION *wt_session, char **pdst, size_t dstlen,
+    const char *src, size_t srclen)
 {
-	char *dst;
+	WT_SESSION_IMPL *session;
+	char ch, *dst;
 	const char *dstend, *srcend;
 	u_char hi, lo;
 
+	session = (WT_SESSION_IMPL *)wt_session;
+
 	dst = *pdst;
 	dstend = dst + dstlen;
 	srcend = src + srclen;
 	while (src < srcend && dst < dstend) {
 		/* JSON can include any UTF-8 expressed in 4 hex chars. */
-		if (*src == '\\') {
-			if (*++src == 'u') {
-				if (__wt_hex2byte((const u_char *)++src, &hi))
+		if ((ch = *src++) == '\\')
+			switch (ch = *src++) {
+			case 'u':
+				if (__wt_hex2byte((const u_char *)src, &hi))
 					return (EINVAL);
 				src += 2;
 				if (__wt_hex2byte((const u_char *)src, &lo))
 					return (EINVAL);
 				src += 2;
-				/* RFC 3629 */
-				if (hi >= 0x8) {
-					/* 3 bytes total */
-					/* byte 0: 1110HHHH */
-					/* byte 1: 10HHHHLL */
-					/* byte 2: 10LLLLLL */
-					*dst++ = (char)(0xe0 |
-					    ((hi >> 4) & 0x0f));
-					*dst++ = (char)(0x80 |
-					    ((hi << 2) & 0x3c) |
-					    ((lo >> 6) & 0x03));
-					*dst++ = (char)(0x80 | (lo & 0x3f));
-				} else if (hi != 0 || lo >= 0x80) {
-					/* 2 bytes total */
-					/* byte 0: 110HHHLL */
-					/* byte 1: 10LLLLLL */
-					*dst++ = (char)(0xc0 |
-					    (hi << 2) |
-					    ((lo >> 6) & 0x03));
-					*dst++ = (char)(0x80 | (lo & 0x3f));
-				} else
-					/* else 1 byte total */
-					/* byte 0: 0LLLLLLL */
-					*dst++ = (char)lo;
+				if (hi != 0) {
+					__wt_errx(NULL, "Unicode \"%6.6s\""
+					    " byte out of range in JSON",
+					    src - 6);
+					return (EINVAL);
+				}
+				*dst++ = (char)lo;
+				break;
+			case 'f':
+				*dst++ = '\f';
+				break;
+			case 'n':
+				*dst++ = '\n';
+				break;
+			case 'r':
+				*dst++ = '\r';
+				break;
+			case 't':
+				*dst++ = '\t';
+				break;
+			case '"':
+			case '\\':
+				*dst++ = ch;
+				break;
+			WT_ILLEGAL_VALUE(session);
 			}
-			else
-				*dst++ = *src;
-		} else
-			*dst++ = *src;
-		src++;
+		else
+			*dst++ = ch;
 	}
 	if (src != srcend)
 		return (ENOMEM);
diff --git a/src/include/extern.h b/src/include/extern.h
index e8c20930aaf..bb2e6ae47cc 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -297,7 +297,7 @@ extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype
 extern const char *__wt_json_tokname(int toktype);
 extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, bool iskey, WT_ITEM *item);
 extern ssize_t __wt_json_strlen(const char *src, size_t srclen);
-extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen);
+extern int __wt_json_strncpy(WT_SESSION *wt_session, char **pdst, size_t dstlen, const char *src, size_t srclen);
 extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp);
 extern int __wt_schema_create_final( WT_SESSION_IMPL *session, char *cfg_arg[], char **value_ret);
 extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp);
diff --git a/src/include/packing.i b/src/include/packing.i
index 35b2ddc43db..9d5971ed99f 100644
--- a/src/include/packing.i
+++ b/src/include/packing.i
@@ -260,6 +260,8 @@ __pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
 		return (pv->size);
 	case 'j':
 	case 'J':
+	case 'K':
+		/* These formats are only used internally. */
 		if (pv->type == 'j' || pv->havesize)
 			s = pv->size;
 		else {
@@ -269,7 +271,7 @@ __pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv)
 			len = __wt_json_strlen(pv->u.item.data,
 			    pv->u.item.size);
 			WT_ASSERT(session, len >= 0);
-			s = (size_t)len + 1;
+			s = (size_t)len + (pv->type == 'K' ? 0 : 1);
 		}
 		return (s);
 	case 's':
@@ -357,18 +359,22 @@ __pack_write(
 		break;
 	case 'j':
 	case 'J':
+	case 'K':
+		/* These formats are only used internally. */
 		s = pv->u.item.size;
 		if ((pv->type == 'j' || pv->havesize) && pv->size < s) {
 			s = pv->size;
 			pad = 0;
 		} else if (pv->havesize)
 			pad = pv->size - s;
+		else if (pv->type == 'K')
+			pad = 0;
 		else
 			pad = 1;
 		if (s > 0) {
 			oldp = *pp;
-			WT_RET(__wt_json_strncpy((char **)pp, maxlen,
-			    pv->u.item.data, s));
+			WT_RET(__wt_json_strncpy((WT_SESSION *)session,
+			    (char **)pp, maxlen, pv->u.item.data, s));
 			maxlen -= (size_t)(*pp - oldp);
 		}
 		if (pad > 0) {
diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c
index 0f09009cd4c..3314b5ba485 100644
--- a/src/utilities/util_dump.c
+++ b/src/utilities/util_dump.c
@@ -7,6 +7,7 @@
  */
 
 #include "util.h"
+#include "util_dump.h"
 
 static int dump_config(WT_SESSION *, const char *, bool, bool);
 static int dump_json_begin(WT_SESSION *);
@@ -73,7 +74,9 @@ util_dump(WT_SESSION *session, int argc, char *argv[])
 	if (argc < 1 || (argc != 1 && !json))
 		return (usage());
 
-	if (json && (ret = dump_json_begin(session)) != 0)
+	if (json &&
+	    ((ret = dump_json_begin(session)) != 0 ||
+	    (ret = dump_prefix(session, hex, json)) != 0))
 		goto err;
 
 	for (i = 0; i < argc; i++) {
@@ -155,7 +158,7 @@ dump_config(WT_SESSION *session, const char *uri, bool hex, bool json)
 	 */
 	cursor->set_key(cursor, uri);
 	if ((ret = cursor->search(cursor)) == 0) {
-		if (dump_prefix(session, hex, json) != 0 ||
+		if ((!json && dump_prefix(session, hex, json) != 0) ||
 		    dump_table_config(session, cursor, uri, json) != 0 ||
 		    dump_suffix(session, json) != 0)
 			ret = 1;
@@ -456,17 +459,20 @@ dump_prefix(WT_SESSION *session, bool hex, bool json)
 {
 	int vmajor, vminor, vpatch;
 
-	if (json)
-		return (0);
-
 	(void)wiredtiger_version(&vmajor, &vminor, &vpatch);
 
-	if (printf(
+	if (!json && (printf(
 	    "WiredTiger Dump (WiredTiger Version %d.%d.%d)\n",
 	    vmajor, vminor, vpatch) < 0 ||
 	    printf("Format=%s\n", hex ? "hex" : "print") < 0 ||
-	    printf("Header\n") < 0)
+	    printf("Header\n") < 0))
 		return (util_err(session, EIO, NULL));
+	else if (json && printf(
+	    "    \"%s\" : \"%d (%d.%d.%d)\",\n",
+	    DUMP_JSON_VERSION_MARKER, DUMP_JSON_CURRENT_VERSION,
+	    vmajor, vminor, vpatch) < 0)
+		return (util_err(session, EIO, NULL));
+
 	return (0);
 }
 
diff --git a/src/utilities/util_dump.h b/src/utilities/util_dump.h
new file mode 100644
index 00000000000..e3fd8e6a501
--- /dev/null
+++ b/src/utilities/util_dump.h
@@ -0,0 +1,11 @@
+/*-
+ * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ *	All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#define	DUMP_JSON_VERSION_MARKER	"WiredTiger Dump Version"
+#define	DUMP_JSON_CURRENT_VERSION	1
+#define	DUMP_JSON_SUPPORTED_VERSION	1
diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c
index 3a1f847a95f..f1f6675e99c 100644
--- a/src/utilities/util_load_json.c
+++ b/src/utilities/util_load_json.c
@@ -7,6 +7,7 @@
  */
 
 #include "util.h"
+#include "util_dump.h"
 #include "util_load.h"
 
 /*
@@ -186,9 +187,8 @@ json_strdup(WT_SESSION *session, JSON_INPUT_STATE *ins, char **resultp)
 	}
 	*resultp = result;
 	resultcpy = result;
-	if ((ret = __wt_json_strncpy(&resultcpy, (size_t)resultlen, src,
-	    srclen))
-	    != 0) {
+	if ((ret = __wt_json_strncpy(
+	    session, &resultcpy, (size_t)resultlen, src, srclen)) != 0) {
 		ret = util_err(session, ret, NULL);
 		goto err;
 	}
@@ -344,13 +344,16 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
 {
 	CONFIG_LIST cl;
 	WT_DECL_RET;
-	int toktype;
 	static const char *json_markers[] = {
 	    "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL };
 	char *config, *tableuri;
+	int curversion, toktype;
+	bool hasversion;
 
 	memset(&cl, 0, sizeof(cl));
 	tableuri = NULL;
+	hasversion = false;
+
 	JSON_EXPECT(session, ins, '{');
 	while (json_peek(session, ins) == 's') {
 		JSON_EXPECT(session, ins, 's');
@@ -358,6 +361,24 @@ json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags)
 		snprintf(tableuri, ins->toklen, "%.*s",
 		    (int)(ins->toklen - 2), ins->tokstart + 1);
 		JSON_EXPECT(session, ins, ':');
+		if (!hasversion) {
+			if (strcmp(tableuri, DUMP_JSON_VERSION_MARKER) != 0) {
+				ret = util_err(session, ENOTSUP,
+				    "missing \"%s\"", DUMP_JSON_VERSION_MARKER);
+				goto err;
+			}
+			hasversion = true;
+			JSON_EXPECT(session, ins, 's');
+			if ((curversion = atoi(ins->tokstart + 1)) <= 0 ||
+			    curversion > DUMP_JSON_SUPPORTED_VERSION) {
+				ret = util_err(session, ENOTSUP,
+				    "unsupported JSON dump version \"%.*s\"",
+				    (int)(ins->toklen - 1), ins->tokstart + 1);
+				goto err;
+			}
+			JSON_EXPECT(session, ins, ',');
+			continue;
+		}
 
 		/*
 		 * Allow any ordering of 'config', 'colgroups',
diff --git a/test/suite/test_jsondump01.py b/test/suite/test_jsondump01.py
index ddf871d9a24..10262edc777 100644
--- a/test/suite/test_jsondump01.py
+++ b/test/suite/test_jsondump01.py
@@ -77,16 +77,22 @@ class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess):
         ('string', dict(keyfmt='S'))
     ]
     types = [
-        ('file', dict(type='file:',
-          name='file',
+        ('file', dict(uri='file:', config='', lsm=False,
           populate=simple_populate,
           populate_check=simple_populate_check_cursor)),
-        ('table-simple', dict(type='table:',
-          name='table-simple',
+        ('lsm', dict(uri='lsm:', config='', lsm=True,
           populate=simple_populate,
           populate_check=simple_populate_check_cursor)),
-        ('table-complex', dict(type='table:',
-          name='table-complex',
+        ('table-simple', dict(uri='table:', config='', lsm=False,
+          populate=simple_populate,
+          populate_check=simple_populate_check_cursor)),
+        ('table-simple-lsm', dict(uri='table:', config='type=lsm', lsm=True,
+          populate=simple_populate,
+          populate_check=simple_populate_check_cursor)),
+        ('table-complex', dict(uri='table:', config='', lsm=False,
+          populate=complex_populate,
+          populate_check=complex_populate_check_cursor)),
+        ('table-complex-lsm', dict(uri='table:', config='type=lsm', lsm=True,
           populate=complex_populate,
           populate_check=complex_populate_check_cursor))
     ]
@@ -95,9 +101,14 @@ class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess):
 
     # Dump using util, re-load using python's JSON, and do a content comparison.
     def test_jsondump_util(self):
+        # LSM and column-store aren't a valid combination.
+        if self.lsm and self.keyfmt == 'r':
+            return
+
         # Create the object.
-        uri = self.type + self.name
-        self.populate(self, uri, 'key_format=' + self.keyfmt, self.nentries)
+        uri = self.uri + self.name
+        self.populate(self, uri, self.config + ',key_format=' + self.keyfmt,
+            self.nentries)
 
         # Dump the object.
         self.runWt(['dump', '-j', uri], outfilename='jsondump.out')
@@ -125,9 +136,13 @@ class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess):
 
     # Dump using util, re-load using python's JSON, and do a content comparison.
     def test_jsonload_util(self):
+        # LSM and column-store aren't a valid combination.
+        if self.lsm and self.keyfmt == 'r':
+            return
+
         # Create the object.
-        uri = self.type + self.name
-        uri2 = self.type + self.name2
+        uri = self.uri + self.name
+        uri2 = self.uri + self.name2
         self.populate(self, uri, 'key_format=' + self.keyfmt, self.nentries)
 
         # Dump the object.
diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py
index c6cd464e453..251237f3faf 100644
--- a/test/suite/test_jsondump02.py
+++ b/test/suite/test_jsondump02.py
@@ -28,16 +28,19 @@
 
 import os
 import wiredtiger, wttest
+from suite_subprocess import suite_subprocess
 
 # test_jsondump.py
 # Test dump output from json cursors.
-class test_jsondump02(wttest.WiredTigerTestCase):
+class test_jsondump02(wttest.WiredTigerTestCase, suite_subprocess):
 
     table_uri1 = 'table:jsondump02a.wt'
     table_uri2 = 'table:jsondump02b.wt'
     table_uri3 = 'table:jsondump02c.wt'
     basename_uri4 = 'jsondump02d.wt'
     table_uri4 = 'table:' + basename_uri4
+    table_uri5 = 'table:jsondump02e.wt'
+    table_uri6 = 'table:jsondump02f.wt'
 
     def set_kv(self, uri, key, val):
         cursor = self.session.open_cursor(uri, None, None)
@@ -80,15 +83,14 @@ class test_jsondump02(wttest.WiredTigerTestCase):
         pos = 0
         try:
             for insert in inserts:
-                #tty_pr('Insert: ' + str(insert))
                 cursor[insert[0]] = insert[1]
         finally:
             cursor.close()
 
-    # Create JSON cursors and test them directly.
     def test_json_cursor(self):
         """
-        Create a table, add a key, get it back
+        Create JSON cursors and test them directly, also test
+        dump/load commands.
         """
         extra_params = ',allocation_size=512,' +\
             'internal_page_max=16384,leaf_page_max=131072'
@@ -112,7 +114,12 @@ class test_jsondump02(wttest.WiredTigerTestCase):
         self.session.create(uri4index3, "columns=(i2,i4)")
 
         self.set_kv(self.table_uri1, 'KEY000', 'string value')
-        self.set_kv(self.table_uri1, 'KEY001', '\'\"({[]})\"\', etc. allowed')
+        self.set_kv(self.table_uri1, 'KEY001', '\'\"({[]})\"\'\\, etc. allowed')
+        # \u03c0 is pi in Unicode, converted by Python to UTF-8: 0xcf 0x80.
+        # Here's how UTF-8 might be used.
+        self.set_kv(self.table_uri1, 'KEY002', u'\u03c0'.encode('utf-8'))
+        # 0xf5-0xff are illegal in UTF-8, but may occur legally in C strings.
+        self.set_kv(self.table_uri1, 'KEY003', '\xff\xfe')
         self.set_kv2(self.table_uri2, 'KEY000', 123, 'str0')
         self.set_kv2(self.table_uri2, 'KEY001', 234, 'str1')
         self.set_kv(self.table_uri3, 1, '\x01\x02\x03')
@@ -122,7 +129,9 @@ class test_jsondump02(wttest.WiredTigerTestCase):
         table1_json =  (
             ('"key0" : "KEY000"', '"value0" : "string value"'),
             ('"key0" : "KEY001"', '"value0" : ' +
-             '"\'\\\"({[]})\\\"\', etc. allowed"'))
+             '"\'\\\"({[]})\\\"\'\\\\, etc. allowed"'),
+            ('"key0" : "KEY002"', '"value0" : "\\u00cf\\u0080"'),
+            ('"key0" : "KEY003"', '"value0" : "\\u00ff\\u00fe"'))
         self.check_json(self.table_uri1, table1_json)
 
         self.session.truncate(self.table_uri1, None, None, None)
@@ -206,11 +215,12 @@ class test_jsondump02(wttest.WiredTigerTestCase):
               (('  "key0"\n:\t"KEY003"    ',
                 '"value0":456,"value1"\n\n\r\n:\t\n"str3"'),))
 
-        self.check_json(self.table_uri3, (
-                ('"key0" : 1', '"value0" : "\\u0001\\u0002\\u0003"'),
-                ('"key0" : 2',
-                 '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00ff\\u00fe"')))
-        self.check_json(self.table_uri4, (
+        table3_json =  (
+            ('"key0" : 1', '"value0" : "\\u0001\\u0002\\u0003"'),
+            ('"key0" : 2',
+             '"value0" : "\\u0077\\u0088\\u0099\\u0000\\u00ff\\u00fe"'))
+        self.check_json(self.table_uri3, table3_json)
+        table4_json = (
                 ('"ikey" : 1,\n"Skey" : "key1"',
                  '"S1" : "val1",\n"i2" : 1,\n"S3" : "val1",\n"i4" : 1'),
                 ('"ikey" : 2,\n"Skey" : "key2"',
@@ -218,7 +228,8 @@ class test_jsondump02(wttest.WiredTigerTestCase):
                 ('"ikey" : 3,\n"Skey" : "key3"',
                  '"S1" : "val9",\n"i2" : 9,\n"S3" : "val27",\n"i4" : 27'),
                 ('"ikey" : 4,\n"Skey" : "key4"',
-                 '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64')))
+                 '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64'))
+        self.check_json(self.table_uri4, table4_json)
         # The dump config currently is not supported for the index type.
         self.check_json(uri4index1, (
                 ('"Skey" : "key1"',
@@ -248,5 +259,142 @@ class test_jsondump02(wttest.WiredTigerTestCase):
                 ('"i2" : 16,\n"i4" : 64',
                  '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64')))
 
+        # Dump all the tables into a single file, and also each
+        # table into its own file.
+        self.runWt(['dump', '-j',
+                    self.table_uri1,
+                    self.table_uri2,
+                    self.table_uri3,
+                    self.table_uri4],
+                   outfilename='jsondump-all.out')
+        self.runWt(['dump', '-j', self.table_uri1], outfilename='jsondump1.out')
+        self.runWt(['dump', '-j', self.table_uri2], outfilename='jsondump2.out')
+        self.runWt(['dump', '-j', self.table_uri3], outfilename='jsondump3.out')
+        self.runWt(['dump', '-j', self.table_uri4], outfilename='jsondump4.out')
+        self.session.drop(self.table_uri1)
+        self.session.drop(self.table_uri2)
+        self.session.drop(self.table_uri3)
+        self.session.drop(self.table_uri4)
+        self.runWt(['load', '-jf', 'jsondump1.out'])
+        self.session.drop(self.table_uri1)
+        self.runWt(['load', '-jf', 'jsondump2.out'])
+        self.session.drop(self.table_uri2)
+        self.runWt(['load', '-jf', 'jsondump3.out'])
+        self.session.drop(self.table_uri3)
+        self.runWt(['load', '-jf', 'jsondump4.out'])
+        self.session.drop(self.table_uri4)
+
+        # Note: only the first table is loaded.
+        self.runWt(['load', '-jf', 'jsondump-all.out'])
+        self.check_json(self.table_uri1, table1_json)
+        #self.check_json(self.table_uri2, table2_json)
+        #self.check_json(self.table_uri3, table3_json)
+        #self.check_json(self.table_uri4, table4_json)
+
+    # Generate two byte keys that cover some range of byte values.
+    # For simplicity, the keys are monotonically increasing.
+    # A null byte is disallowed in a string key, so we don't use it.
+    def generate_key(self, i, k):
+        k[0] = ((i & 0xffc0) >> 6) + 1
+        k[1] = (i & 0x3f) + 1
+
+    # Generate three byte values:
+    #    i==0  :  v:[0x00, 0x01, 0x02]
+    #    i==1  :  v:[0x01, 0x02, 0x03]
+    # etc.
+    # A null byte is disallowed in a string value, it is replaced by 'X'
+    def generate_value(self, i, v, isstring):
+        for j in range(0, 3):
+            val = (i + j) % 256
+            if isstring and val == 0:
+                val = 88  # 'X'
+            v[j] = val
+
+    def test_json_all_bytes(self):
+        """
+        Test the generated JSON for all byte values in byte array and
+        string formats.
+        """
+        self.session.create(self.table_uri5, 'key_format=u,value_format=u')
+        self.session.create(self.table_uri6, 'key_format=S,value_format=S')
+
+        c5 = self.session.open_cursor(self.table_uri5, None, None)
+        c6 = self.session.open_cursor(self.table_uri6, None, None)
+        k = bytearray(b'\x00\x00')
+        v = bytearray(b'\x00\x00\x00')
+        for i in range(0, 512):
+            self.generate_key(i, k)
+            self.generate_value(i, v, False)
+            c5[str(k)] = str(v)
+            self.generate_value(i, v, True)   # no embedded nuls
+            c6[str(k)] = str(v)
+        c5.close()
+        c6.close()
+
+        # Build table5_json, we want it to look like this:
+        #    ('"key0" : "\u0001\u0001"', '"value0" : "\u0000\u0001\u0002"'),
+        #    ('"key0" : "\u0001\u0002"', '"value0" : "\u0001\u0002\u0003"'),
+        #    ('"key0" : "\u0001\u0003"', '"value0" : "\u0002\u0003\u0004"'),
+        #    ...
+        # table6_json is similar, except that printable values like '\u0041'
+        # would appear as 'A'.  The string type cannot have embedded nulls,
+        # so '\u0000' in table6_json appears instead as an 'X'.
+        #
+        # Start by creating two tables of individual Unicode values.
+        # bin_unicode[] contains only the \u escape sequences.
+        # mix_unicode[] contains printable characters or \t \n etc. escapes
+        bin_unicode = []
+        mix_unicode = []
+        for i in range(0, 256):
+            u = "\\u00" + hex(256 + i)[3:]  # e.g. "\u00ab"
+            bin_unicode.append(u)
+            mix_unicode.append(u)
+        for i in range(0x20, 0x7f):
+            mix_unicode[i] = chr(i)
+        mix_unicode[ord('"')] = '\\"'
+        mix_unicode[ord('\\')] = '\\\\'
+        mix_unicode[ord('\f')] = '\\f'
+        mix_unicode[ord('\n')] = '\\n'
+        mix_unicode[ord('\r')] = '\\r'
+        mix_unicode[ord('\t')] = '\\t'
+
+        table5_json = []
+        table6_json = []
+        for i in range(0, 512):
+            self.generate_key(i, k)
+            self.generate_value(i, v, False)
+            j = i if (i > 0 and i < 254) or (i > 256 and i < 510) else 88
+            table5_json.append(('"key0" : "' + bin_unicode[k[0]] +
+                                bin_unicode[k[1]] + '"',
+                                '"value0" : "' + bin_unicode[v[0]] +
+                                bin_unicode[v[1]] +
+                                bin_unicode[v[2]] + '"'))
+            self.generate_value(i, v, True)
+            table6_json.append(('"key0" : "' + mix_unicode[k[0]] +
+                                mix_unicode[k[1]] + '"',
+                                '"value0" : "' + mix_unicode[v[0]] +
+                                mix_unicode[v[1]] +
+                                mix_unicode[v[2]] + '"'))
+
+        self.check_json(self.table_uri5, table5_json)
+        self.check_json(self.table_uri6, table6_json)
+
+        self.session.truncate(self.table_uri5, None, None, None)
+        self.session.truncate(self.table_uri6, None, None, None)
+        self.load_json(self.table_uri5, table5_json)
+        self.load_json(self.table_uri6, table6_json)
+        self.check_json(self.table_uri5, table5_json)
+        self.check_json(self.table_uri6, table6_json)
+
+        self.runWt(['dump', '-j', self.table_uri5], outfilename='jsondump5.out')
+        self.runWt(['dump', '-j', self.table_uri6], outfilename='jsondump6.out')
+        self.session.drop(self.table_uri5)
+        self.session.drop(self.table_uri6)
+        self.runWt(['load', '-jf', 'jsondump5.out'])
+        self.runWt(['load', '-jf', 'jsondump6.out'])
+        self.session.drop(self.table_uri5)
+        self.session.drop(self.table_uri6)
+
+
 if __name__ == '__main__':
     wttest.run()
author	Don Anderson <dda@mongodb.com>	2016-05-30 21:39:51 -0400
committer	Michael Cahill <michael.cahill@mongodb.com>	2016-05-31 11:39:51 +1000
commit	27981762bac2b4b1d854826f845866f4f523a270 (patch)
tree	1171c55e2bc95c42c088395be4ef8da892bd8ac1
parent	12b772612a180db5db82f68920b00e297c43da2a (diff)
download	mongo-27981762bac2b4b1d854826f845866f4f523a270.tar.gz