diff options
author | Alex Gorrod <alexg@wiredtiger.com> | 2014-10-15 11:55:19 +1100 |
---|---|---|
committer | Alex Gorrod <alexg@wiredtiger.com> | 2014-10-15 11:55:19 +1100 |
commit | 9ef6222eb484e2328e90f639e49bf64584a92a38 (patch) | |
tree | bc1338268962976bbe00c1af9ffcd7a52e808cbe | |
parent | ba4f6023c5c580b5f3be1d5538f57c03a8c49fe8 (diff) | |
parent | 2bca93d54b3b3c3ad01f1fe932a783e83495701e (diff) | |
download | mongo-9ef6222eb484e2328e90f639e49bf64584a92a38.tar.gz |
Merge pull request #1154 from wiredtiger/json-load
Add JSON loading to cursors and wt load utility. refs #740.
-rw-r--r-- | build_posix/Make.base | 1 | ||||
-rw-r--r-- | dist/api_data.py | 7 | ||||
-rw-r--r-- | dist/s_string.ok | 26 | ||||
-rw-r--r-- | lang/python/wiredtiger.i | 27 | ||||
-rw-r--r-- | src/cursor/cur_dump.c | 25 | ||||
-rw-r--r-- | src/cursor/cur_json.c | 561 | ||||
-rw-r--r-- | src/cursor/cur_std.c | 8 | ||||
-rw-r--r-- | src/cursor/cur_table.c | 2 | ||||
-rw-r--r-- | src/docs/command-line.dox | 4 | ||||
-rw-r--r-- | src/include/extern.h | 6 | ||||
-rw-r--r-- | src/include/packing.i | 36 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 7 | ||||
-rw-r--r-- | src/support/hex.c | 10 | ||||
-rw-r--r-- | src/utilities/util_dump.c | 8 | ||||
-rw-r--r-- | src/utilities/util_load.c | 221 | ||||
-rw-r--r-- | src/utilities/util_load.h | 30 | ||||
-rw-r--r-- | src/utilities/util_load_json.c | 567 | ||||
-rw-r--r-- | test/suite/test_jsondump01.py | 28 | ||||
-rw-r--r-- | test/suite/test_jsondump02.py | 143 |
19 files changed, 1577 insertions, 140 deletions
diff --git a/build_posix/Make.base b/build_posix/Make.base index 3340bd8ad80..51a8e77cebe 100644 --- a/build_posix/Make.base +++ b/build_posix/Make.base @@ -25,6 +25,7 @@ wt_SOURCES =\ src/utilities/util_dump.c \ src/utilities/util_list.c \ src/utilities/util_load.c \ + src/utilities/util_load_json.c \ src/utilities/util_loadtext.c \ src/utilities/util_main.c \ src/utilities/util_misc.c \ diff --git a/dist/api_data.py b/dist/api_data.py index e92db02c6e6..8e42ba72b88 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -532,11 +532,10 @@ methods = { Config('dump', '', r''' configure the cursor for dump format inputs and outputs: "hex" selects a simple hexadecimal format, "json" selects a JSON format - with each record formats as fields named by column names if + with each record formatted as fields named by column names if available, and "print" selects a format where only non-printing - characters are hexadecimal encoded, and "json" produces a JSON - encoding of the data. The "hex" and "print" dump format are - compatible with the @ref util_dump and @ref util_load commands''', + characters are hexadecimal encoded. These formats are compatible + with the @ref util_dump and @ref util_load commands''', choices=['hex', 'json', 'print']), Config('next_random', 'false', r''' configure the cursor to return a pseudo-random record from diff --git a/dist/s_string.ok b/dist/s_string.ok index a6cddfa8a72..69545dbda1f 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -68,6 +68,7 @@ CreateFileMapping CreateThread CustomersPhone DATAITEMs +DECL DESC DHANDLE DLFCN @@ -137,6 +138,9 @@ GetModuleHandleEx GetProcAddress Givargis Google +HHHH +HHHHLL +HHHLL HYPERLEVELDB HyperLevelDB IEC @@ -174,6 +178,8 @@ LIBPTHREAD LIBRT LIBSNAPPY LIBZ +LLLLLL +LLLLLLL LNO LOGREC LOGSCAN @@ -233,6 +239,8 @@ PADDR PAGE's PARAM POSIX +PRIu +PRIu64 PSIZE PTHREAD PTR @@ -523,6 +531,7 @@ ds dsk dsrc dst +dstlen dsync dt dtype @@ -580,6 +589,7 @@ fillms firstfit fixup flcs +floatnum fmt fmterr fnv @@ -656,6 +666,7 @@ insertK insertV instantiation intl +intnum intpack ints inttypes @@ -664,6 +675,7 @@ io ip ispo iteratively +jnr jrx json kb @@ -673,10 +685,13 @@ keygen keyname keyv kv +kvraw kvs kvsbdb lang latencies +lbrace +lbracket lbz ld ldl @@ -817,6 +832,7 @@ os ovfl ownp packv +parens parserp patchp pathname @@ -838,6 +854,7 @@ primary's printf printlog priv +progname ps pse psp @@ -855,6 +872,8 @@ qsort quartile qup rS +rbrace +rbracket rdlock rduppo readlock @@ -950,6 +969,7 @@ strerror strftime strget stringin +strlen strncmp strncpy strndup @@ -994,6 +1014,11 @@ tlist tload tmp toffpage +tokenizer +toklen +tokname +tokstart +toktype toverflow tparent tprintlog @@ -1053,6 +1078,7 @@ unpackv unreferenced unregister unsized +unterminated untyped upd update's diff --git a/lang/python/wiredtiger.i b/lang/python/wiredtiger.i index 0c228c56e5f..be55845a7b2 100644 --- a/lang/python/wiredtiger.i +++ b/lang/python/wiredtiger.i @@ -388,7 +388,9 @@ COMPARE_OK(__wt_cursor::search_near) %exception __wt_async_op::_set_key; %exception __wt_async_op::_set_value; %exception __wt_cursor::_set_key; +%exception __wt_cursor::_set_key_str; %exception __wt_cursor::_set_value; +%exception __wt_cursor::_set_value_str; %exception wiredtiger_strerror; %exception wiredtiger_version; @@ -577,6 +579,11 @@ typedef int int_void; $self->set_key($self, &k); } + /* Get / set keys and values */ + void _set_key_str(char *str) { + $self->set_key($self, str); + } + int_void _set_recno(uint64_t recno) { WT_ITEM k; uint8_t recno_buf[20]; @@ -601,6 +608,11 @@ typedef int int_void; $self->set_value($self, &v); } + /* Get / set keys and values */ + void _set_value_str(char *str) { + $self->set_value($self, str); + } + /* Don't return values, just throw exceptions on failure. */ int_void _get_key(char **datap, int *sizep) { WT_ITEM k; @@ -739,6 +751,8 @@ typedef int int_void; args = args[0] if self.is_column: self._set_recno(long(args[0])) + elif self.is_json: + self._set_key_str(args[0]) else: # Keep the Python string pinned self._key = pack(self.key_format, *args) @@ -748,11 +762,14 @@ typedef int int_void; '''set_value(self) -> None @copydoc WT_CURSOR::set_value''' - if len(args) == 1 and type(args[0]) == tuple: - args = args[0] - # Keep the Python string pinned - self._value = pack(self.value_format, *args) - self._set_value(self._value) + if self.is_json: + self._set_value_str(args[0]) + else: + if len(args) == 1 and type(args[0]) == tuple: + args = args[0] + # Keep the Python string pinned + self._value = pack(self.value_format, *args) + self._set_value(self._value) def __iter__(self): '''Cursor objects support iteration, equivalent to calling diff --git a/src/cursor/cur_dump.c b/src/cursor/cur_dump.c index 31d40d32060..003b7e1f961 100644 --- a/src/cursor/cur_dump.c +++ b/src/cursor/cur_dump.c @@ -160,10 +160,6 @@ __curdump_set_key(WT_CURSOR *cursor, ...) child = cdump->child; CURSOR_API_CALL(cursor, session, set_key, NULL); - if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - WT_ERR_MSG(session, EINVAL, - "Setting keys for JSON cursors not permitted"); - va_start(ap, cursor); if (F_ISSET(cursor, WT_CURSTD_RAW)) p = va_arg(ap, WT_ITEM *)->data; @@ -176,8 +172,13 @@ __curdump_set_key(WT_CURSOR *cursor, ...) child->set_key(child, recno); } else { - WT_ERR(__dump_to_raw(session, p, &cursor->key, - F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) + WT_ERR(__wt_json_to_item(session, p, cursor->key_format, + (WT_CURSOR_JSON *)cursor->json_private, 1, + &cursor->key)); + else + WT_ERR(__dump_to_raw(session, p, &cursor->key, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); child->set_key(child, &cursor->key); } @@ -255,10 +256,6 @@ __curdump_set_value(WT_CURSOR *cursor, ...) child = cdump->child; CURSOR_API_CALL(cursor, session, set_value, NULL); - if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) - WT_ERR_MSG(session, EINVAL, - "Setting values for JSON cursors not permitted"); - va_start(ap, cursor); if (F_ISSET(cursor, WT_CURSTD_RAW)) p = va_arg(ap, WT_ITEM *)->data; @@ -266,8 +263,12 @@ __curdump_set_value(WT_CURSOR *cursor, ...) p = va_arg(ap, const char *); va_end(ap); - WT_ERR(__dump_to_raw(session, - p, &cursor->value, F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); + if (F_ISSET(cursor, WT_CURSTD_DUMP_JSON)) + WT_ERR(__wt_json_to_item(session, p, cursor->value_format, + (WT_CURSOR_JSON *)cursor->json_private, 0, &cursor->value)); + else + WT_ERR(__dump_to_raw(session, p, &cursor->value, + F_ISSET(cursor, WT_CURSTD_DUMP_HEX) ? 1 : 0)); child->set_value(child, &cursor->value); diff --git a/src/cursor/cur_json.c b/src/cursor/cur_json.c index 618596e39b8..4a4ae7544b1 100644 --- a/src/cursor/cur_json.c +++ b/src/cursor/cur_json.c @@ -7,6 +7,51 @@ #include "wt_internal.h" +static size_t __json_unpack_put(WT_SESSION_IMPL *, void *, u_char *, size_t, + WT_CONFIG_ITEM *); +static inline int __json_struct_size(WT_SESSION_IMPL *, const void *, size_t, + const char *, WT_CONFIG_ITEM *, int, size_t *); +static inline int __json_struct_unpackv(WT_SESSION_IMPL *, const void *, size_t, + const char *, WT_CONFIG_ITEM *, u_char *, size_t, int, va_list); +static int json_string_arg(WT_SESSION_IMPL *, const char **, WT_ITEM *); +static int json_int_arg(WT_SESSION_IMPL *, const char **, int64_t *); +static int json_uint_arg(WT_SESSION_IMPL *, const char **, uint64_t *); +static int __json_pack_struct(WT_SESSION_IMPL *, void *, size_t, const char *, + const char *); +static int __json_pack_size(WT_SESSION_IMPL *, const char *, WT_CONFIG_ITEM *, + int, const char *, size_t *); + +#define WT_PACK_JSON_GET(session, pv, jstr) do { \ + switch (pv.type) { \ + case 'x': \ + break; \ + case 's': \ + case 'S': \ + WT_RET(json_string_arg(session, &jstr, &pv.u.item)); \ + pv.type = pv.type == 's' ? 'j' : 'J'; \ + break; \ + case 'b': \ + case 'h': \ + case 'i': \ + case 'l': \ + case 'q': \ + WT_RET(json_int_arg(session, &jstr, &pv.u.i)); \ + break; \ + case 'B': \ + case 'H': \ + case 'I': \ + case 'L': \ + case 'Q': \ + case 'r': \ + case 'R': \ + case 't': \ + WT_RET(json_uint_arg(session, &jstr, &pv.u.u)); \ + break; \ + /* User format strings have already been validated. */ \ + WT_ILLEGAL_VALUE(session); \ + } \ +} while (0) + /* * __json_unpack_put -- * Calculate the size of a packed byte string as formatted for JSON. @@ -367,3 +412,519 @@ __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, } return (0); } + +#define MATCH_KEYWORD(session, in, result, keyword, matchval) do { \ + size_t _kwlen = strlen(keyword); \ + if (strncmp(in, keyword, _kwlen) == 0 && !isalnum(in[_kwlen])) { \ + in += _kwlen; \ + result = matchval; \ + } else { \ + const char *_bad = in; \ + while (isalnum(*in)) \ + in++; \ + __wt_errx(session, "unknown keyword \"%.*s\" in JSON", \ + (int)(in - _bad), _bad); \ + } \ +} while (0) + +/* + * __wt_json_token -- + * Return the type, start position and length of the next JSON + * token in the input. String tokens include the quotes. JSON + * can be entirely parsed using calls to this tokenizer, each + * call using a src pointer that is the previously returned + * tokstart + toklen. + * + * The token type returned is one of: + * 0 : EOF + * 's' : string + * 'i' : intnum + * 'f' : floatnum + * ':' : colon + * ',' : comma + * '{' : lbrace + * '}' : rbrace + * '[' : lbracket + * ']' : rbracket + * 'N' : null + * 'T' : true + * 'F' : false + */ +int +__wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, + const char **tokstart, size_t *toklen) +{ + WT_SESSION_IMPL *session; + char ch; + const char *bad; + int backslash, isalph, isfloat, result; + + result = -1; + session = (WT_SESSION_IMPL *)wt_session; + while (isspace(*src)) + src++; + *tokstart = src; + + if (*src == '\0') { + *toktype = 0; + *toklen = 0; + return (0); + } + + /* JSON is specified in RFC 4627. */ + switch (*src) { + case '"': + backslash = 0; + src++; + while ((ch = *src) != '\0') { + if (!backslash) { + if (ch == '"') { + src++; + result = 's'; + break; + } + if (ch == '\\') + backslash = 1; + } else { + /* We validate Unicode on this pass. */ + if (ch == 'u') { + u_char ignored; + const u_char *uc; + + uc = (const u_char *)src; + if (__wt_hex2byte(&uc[1], &ignored) || + __wt_hex2byte(&uc[3], &ignored)) { + __wt_errx(session, + "invalid Unicode within JSON string"); + return (-1); + } + src += 5; + } + backslash = 0; + } + src++; + } + if (result != 's') + __wt_errx(session, "unterminated string in JSON"); + break; + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + isfloat = 0; + if (*src == '-') + src++; + while ((ch = *src) != '\0' && isdigit(ch)) + src++; + if (*src == '.') { + isfloat = 1; + src++; + while ((ch = *src) != '\0' && + isdigit(ch)) + src++; + } + if (*src == 'e' || *src == 'E') { + isfloat = 1; + src++; + if (*src == '+' || *src == '-') + src++; + while ((ch = *src) != '\0' && + isdigit(ch)) + src++; + } + result = isfloat ? 'f' : 'i'; + break; + case ':': + case ',': + case '{': + case '}': + case '[': + case ']': + result = *src++; + break; + case 'n': + MATCH_KEYWORD(session, src, result, "null", 'N'); + break; + case 't': + MATCH_KEYWORD(session, src, result, "true", 'T'); + break; + case 'f': + MATCH_KEYWORD(session, src, result, "false", 'F'); + break; + default: + /* An illegal token, move past it anyway */ + bad = src; + isalph = isalnum(*src); + src++; + if (isalph) + while (*src != '\0' && isalnum(*src)) + src++; + __wt_errx(session, "unknown token \"%.*s\" in JSON", + (int)(src - bad), bad); + break; + } + *toklen = (size_t)(src - *tokstart); + *toktype = result; + return (result < 0 ? EINVAL : 0); +} + +/* + * __wt_json_tokname + * Return a descriptive name from the token type returned by + * __wt_json_token + */ +const char * +__wt_json_tokname(int toktype) +{ + switch (toktype) { + case 0: return ("<EOF>"); + case 's': return ("<string>"); + case 'i': return ("<integer>"); + case 'f': return ("<float>"); + case ':': return ("':'"); + case ',': return ("','"); + case '{': return ("'{'"); + case '}': return ("'}'"); + case '[': return ("'['"); + case ']': return ("']'"); + case 'N': return ("'null'"); + case 'T': return ("'true'"); + case 'F': return ("'false'"); + default: return ("<UNKNOWN>"); + } +} + +/* + * json_string_arg -- + * Returns a first cut of the needed string in item. + * The result has not been stripped of escapes. + */ +static int +json_string_arg(WT_SESSION_IMPL *session, const char **jstr, WT_ITEM *item) +{ + const char *tokstart; + int tok; + WT_DECL_RET; + + WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart, + &item->size)); + if (tok == 's') { + *jstr = tokstart + item->size; + /* The tokenizer includes the '"' chars */ + item->data = tokstart + 1; + item->size -= 2; + ret = 0; + } else { + __wt_errx(session, "expected JSON <string>, got %s", + __wt_json_tokname(tok)); + ret = EINVAL; + } + return (ret); +} + +/* + * json_int_arg -- + * Returns a signed integral value from the current position + * in the JSON string. + */ +static int +json_int_arg(WT_SESSION_IMPL *session, const char **jstr, int64_t *ip) +{ + char *end; + const char *tokstart; + int tok; + size_t toksize; + + WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart, + &toksize)); + if (tok == 'i') { + /* JSON only allows decimal */ + *ip = strtoll(tokstart, &end, 10); + if (end != tokstart + toksize) + WT_RET_MSG(session, EINVAL, + "JSON <int> extraneous input"); + *jstr = tokstart + toksize; + } else { + __wt_errx(session, "expected JSON <int>, got %s", + __wt_json_tokname(tok)); + return (EINVAL); + } + return (0); +} + +/* + * json_uint_arg -- + * Returns an unsigned integral value from the current position + * in the JSON string. + */ +static int +json_uint_arg(WT_SESSION_IMPL *session, const char **jstr, uint64_t *up) +{ + char *end; + const char *tokstart; + int tok; + size_t toksize; + + WT_RET(__wt_json_token((WT_SESSION *)session, *jstr, &tok, &tokstart, + &toksize)); + if (tok == 'i' && *tokstart != '-') { + /* JSON only allows decimal */ + *up = strtoull(tokstart, &end, 10); + if (end != tokstart + toksize) + WT_RET_MSG(session, EINVAL, + "JSON <int> extraneous input"); + *jstr = tokstart + toksize; + } else { + __wt_errx(session, "expected unsigned JSON <int>, got %s", + __wt_json_tokname(tok)); + return (EINVAL); + } + return (0); +} + +#define JSON_EXPECT_TOKEN_GET(session, jstr, tokval, start, sz) do { \ + int __tok; \ + WT_RET(__wt_json_token((WT_SESSION *)session, jstr, &__tok, &start, &sz));\ + if (__tok != tokval) { \ + __wt_errx(session, "expected JSON %s, got %s", \ + __wt_json_tokname(tokval), __wt_json_tokname(__tok)); \ + return (EINVAL); \ + } \ + jstr = start + sz; \ +} while (0) + +#define JSON_EXPECT_TOKEN(session, jstr, tokval) do { \ + const char *__start; \ + size_t __sz; \ + JSON_EXPECT_TOKEN_GET(session, jstr, tokval, __start, __sz); \ +} while (0) + +/* + * __json_pack_struct -- + * Pack a byte string from a JSON string. + */ +static int +__json_pack_struct(WT_SESSION_IMPL *session, void *buffer, size_t size, + const char *fmt, const char *jstr) +{ + WT_DECL_PACK_VALUE(pv); + WT_DECL_RET; + WT_PACK pack; + const char *tokstart; + int multi; + size_t toksize; + uint8_t *p, *end; + + p = buffer; + end = p + size; + multi = 0; + + if (fmt[0] != '\0' && fmt[1] == '\0') { + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + /* the key name was verified in __json_pack_size */ + JSON_EXPECT_TOKEN(session, jstr, ':'); + pv.type = fmt[0]; + WT_PACK_JSON_GET(session, pv, jstr); + return (__pack_write(session, &pv, &p, size)); + } + + WT_RET(__pack_init(session, &pack, fmt)); + while ((ret = __pack_next(&pack, &pv)) == 0) { + if (multi) + JSON_EXPECT_TOKEN(session, jstr, ','); + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + /* the key name was verified in __json_pack_size */ + JSON_EXPECT_TOKEN(session, jstr, ':'); + WT_PACK_JSON_GET(session, pv, jstr); + WT_RET(__pack_write(session, &pv, &p, (size_t)(end - p))); + multi = 1; + } + + /* Be paranoid - __pack_write should never overflow. */ + WT_ASSERT(session, p <= end); + + if (ret != WT_NOTFOUND) + return (ret); + + return (0); +} + +/* + * __json_pack_size -- + * Calculate the size of a packed byte string from a JSON string. + * We verify that the names and value types provided in JSON match + * the column names and type from the schema format, returning error + * if not. + */ +static int +__json_pack_size( + WT_SESSION_IMPL *session, const char *fmt, WT_CONFIG_ITEM *names, + int iskey, const char *jstr, size_t *sizep) +{ + WT_CONFIG_ITEM name; + WT_DECL_PACK_VALUE(pv); + WT_PACK pack; + WT_PACK_NAME packname; + const char *tokstart; + int multi; + size_t toksize, total; + + WT_RET(__pack_name_init(session, names, iskey, &packname)); + multi = 0; + WT_RET(__pack_init(session, &pack, fmt)); + for (total = 0; __pack_next(&pack, &pv) == 0;) { + if (multi) + JSON_EXPECT_TOKEN(session, jstr, ','); + JSON_EXPECT_TOKEN_GET(session, jstr, 's', tokstart, toksize); + WT_RET(__pack_name_next(&packname, &name)); + if (toksize - 2 != name.len || + strncmp(tokstart + 1, name.str, toksize - 2) != 0) { + __wt_errx(session, "JSON expected %s name: \"%.*s\"", + iskey ? "key" : "value", (int)name.len, name.str); + return (EINVAL); + } + JSON_EXPECT_TOKEN(session, jstr, ':'); + WT_PACK_JSON_GET(session, pv, jstr); + total += __pack_size(session, &pv); + multi = 1; + } + /* check end of string */ + JSON_EXPECT_TOKEN(session, jstr, 0); + + *sizep = total; + return (0); +} + +/* + * __wt_json_to_item -- + * Convert a JSON input string for either key/value to a raw WT_ITEM. + * Checks that the input matches the expected format. + */ +int +__wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, + const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item) +{ + size_t sz; + + WT_RET(__json_pack_size(session, format, + iskey ? &json->key_names : &json->value_names, iskey, jstr, &sz)); + WT_RET(__wt_buf_initsize(session, item, sz)); + WT_RET(__json_pack_struct(session, item->mem, sz, format, jstr)); + return (0); +} + +/* + * __wt_json_strlen -- + * Return the number of bytes represented by a string in JSON format, + * or -1 if the format is incorrect. + */ +ssize_t +__wt_json_strlen(const char *src, size_t srclen) +{ + const char *srcend; + size_t dstlen; + u_char hi, lo; + + dstlen = 0; + srcend = src + srclen; + while (src < srcend) { + /* JSON can include any UTF-8 expressed in 4 hex chars. */ + if (*src == '\\') { + if (*++src == 'u') { + if (__wt_hex2byte((const u_char *)++src, &hi)) + return (-1); + src += 2; + if (__wt_hex2byte((const u_char *)src, &lo)) + return (-1); + src += 2; + /* RFC 3629 */ + if (hi >= 0x8) { + /* 3 bytes total */ + dstlen += 2; + } + else if (hi != 0 || lo >= 0x80) { + /* 2 bytes total */ + dstlen++; + } + /* else 1 byte total */ + } + } + dstlen++; + src++; + } + if (src != srcend) + return (-1); /* invalid input, e.g. final char is '\\' */ + return ((ssize_t)dstlen); +} + +/* + * __wt_json_strncpy -- + * Copy bytes of string in JSON format to a destination, + * up to dstlen bytes. If dstlen is greater than the needed size, + * the result if zero padded. + */ +int +__wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen) +{ + char *dst; + const char *dstend, *srcend; + u_char hi, lo; + + dst = *pdst; + dstend = dst + dstlen; + srcend = src + srclen; + while (src < srcend && dst < dstend) { + /* JSON can include any UTF-8 expressed in 4 hex chars. */ + if (*src == '\\') { + if (*++src == 'u') { + if (__wt_hex2byte((const u_char *)++src, &hi)) + return (EINVAL); + src += 2; + if (__wt_hex2byte((const u_char *)src, &lo)) + return (EINVAL); + src += 2; + /* RFC 3629 */ + if (hi >= 0x8) { + /* 3 bytes total */ + /* byte 0: 1110HHHH */ + /* byte 1: 10HHHHLL */ + /* byte 2: 10LLLLLL */ + *dst++ = (char)(0xe0 | + ((hi << 4) & 0x0f)); + *dst++ = (char)(0x80 | + ((hi << 2) & 0x3c) | + ((lo >> 6) & 0x03)); + *dst++ = (char)(0x80 | (lo & 0x3f)); + } else if (hi != 0 || lo >= 0x80) { + /* 2 bytes total */ + /* byte 0: 110HHHLL */ + /* byte 1: 10LLLLLL */ + *dst++ = (char)(0xc0 | + (hi << 2) | + ((lo >> 6) & 0x03)); + *dst++ = (char)(0x80 | (lo & 0x3f)); + } else + /* else 1 byte total */ + /* byte 0: 0LLLLLLL */ + *dst++ = (char)lo; + } + else + *dst++ = *src; + } else + *dst++ = *src; + src++; + } + if (src != srcend) + return (ENOMEM); + *pdst = dst; + while (dst < dstend) + *dst++ = '\0'; + return (0); +} diff --git a/src/cursor/cur_std.c b/src/cursor/cur_std.c index 52cdf232279..cfaf83824fd 100644 --- a/src/cursor/cur_std.c +++ b/src/cursor/cur_std.c @@ -276,9 +276,10 @@ __wt_cursor_set_keyv(WT_CURSOR *cursor, uint32_t flags, va_list ap) cursor->key.data = &cursor->recno; sz = sizeof(cursor->recno); } else { - /* Fast path some common cases. */ + /* Fast path some common cases and special case WT_ITEMs. */ fmt = cursor->key_format; - if (LF_ISSET(WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) { + if (LF_ISSET(WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) || + WT_STREQ(fmt, "u")) { item = va_arg(ap, WT_ITEM *); sz = item->size; cursor->key.data = item->data; @@ -399,7 +400,8 @@ __wt_cursor_set_valuev(WT_CURSOR *cursor, va_list ap) /* Fast path some common cases. */ fmt = cursor->value_format; - if (F_ISSET(cursor, WT_CURSOR_RAW_OK) || WT_STREQ(fmt, "u")) { + if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON) || + WT_STREQ(fmt, "u")) { item = va_arg(ap, WT_ITEM *); sz = item->size; cursor->value.data = item->data; diff --git a/src/cursor/cur_table.c b/src/cursor/cur_table.c index 21a1b6e07e4..aa336805d06 100644 --- a/src/cursor/cur_table.c +++ b/src/cursor/cur_table.c @@ -146,7 +146,7 @@ __wt_curtable_set_value(WT_CURSOR *cursor, ...) CURSOR_API_CALL(cursor, session, set_value, NULL); va_start(ap, cursor); - if (F_ISSET(cursor, WT_CURSOR_RAW_OK)) { + if (F_ISSET(cursor, WT_CURSOR_RAW_OK | WT_CURSTD_DUMP_JSON)) { item = va_arg(ap, WT_ITEM *); cursor->value.data = item->data; cursor->value.size = item->size; diff --git a/src/docs/command-line.dox b/src/docs/command-line.dox index ee51cc21c2a..a4de4d85e71 100644 --- a/src/docs/command-line.dox +++ b/src/docs/command-line.dox @@ -175,6 +175,10 @@ column store. By default, the \c load command reads from the standard input; the \c -f option reads the input from the specified file. +@par <code>-j</code> +Load input in the JSON (<a href="http://www.json.org">JavaScript Object +Notation</a>) format that was created by the <code>dump -j</code> command. + @par <code>-n</code> By default, input data will overwrite existing data where the key/value pair already exists in the data source; the \c -n option causes the \c diff --git a/src/include/extern.h b/src/include/extern.h index d9f0dd48abb..807092f6060 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -270,6 +270,11 @@ extern int __wt_json_alloc_unpack(WT_SESSION_IMPL *session, const void *buffer, extern void __wt_json_close(WT_SESSION_IMPL *session, WT_CURSOR *cursor); extern size_t __wt_json_unpack_char(char ch, u_char *buf, size_t bufsz, int force_unicode); extern int __wt_json_column_init(WT_CURSOR *cursor, const char *keyformat, const WT_CONFIG_ITEM *idxconf, const WT_CONFIG_ITEM *colconf); +extern int __wt_json_token(WT_SESSION *wt_session, const char *src, int *toktype, const char **tokstart, size_t *toklen); +extern const char *__wt_json_tokname(int toktype); +extern int __wt_json_to_item(WT_SESSION_IMPL *session, const char *jstr, const char *format, WT_CURSOR_JSON *json, int iskey, WT_ITEM *item); +extern ssize_t __wt_json_strlen(const char *src, size_t srclen); +extern int __wt_json_strncpy(char **pdst, size_t dstlen, const char *src, size_t srclen); extern int __wt_curlog_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_curmetadata_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern void __wt_curstat_dsrc_final(WT_CURSOR_STAT *cst); @@ -569,6 +574,7 @@ extern void __wt_hazard_close(WT_SESSION_IMPL *session); extern int __wt_raw_to_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); extern void __wt_raw_to_hex_mem( const uint8_t *from, size_t size, uint8_t *dest, size_t dest_size); extern int __wt_raw_to_esc_hex( WT_SESSION_IMPL *session, const uint8_t *from, size_t size, WT_ITEM *to); +extern int __wt_hex2byte(const u_char *from, u_char *to); extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to); extern int __wt_nhex_to_raw( WT_SESSION_IMPL *session, const char *from, size_t size, WT_ITEM *to); extern int __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to); diff --git a/src/include/packing.i b/src/include/packing.i index 7178052ed91..6e0e7be13eb 100644 --- a/src/include/packing.i +++ b/src/include/packing.i @@ -248,6 +248,20 @@ __pack_size(WT_SESSION_IMPL *session, WT_PACK_VALUE *pv) switch (pv->type) { case 'x': return (pv->size); + case 'j': + case 'J': + if (pv->type == 'j' || pv->havesize) + s = pv->size; + else { + ssize_t len; + + /* The string was previously validated. */ + len = __wt_json_strlen(pv->u.item.data, + pv->u.item.size); + WT_ASSERT(session, len >= 0); + s = (size_t)len + 1; + } + return (s); case 's': case 'S': if (pv->type == 's' || pv->havesize) @@ -329,6 +343,28 @@ __pack_write( *pp += pad; } break; + case 'j': + case 'J': + s = pv->u.item.size; + if ((pv->type == 'j' || pv->havesize) && pv->size < s) { + s = pv->size; + pad = 0; + } else if (pv->havesize) + pad = pv->size - s; + else + pad = 1; + if (s > 0) { + oldp = *pp; + WT_RET(__wt_json_strncpy((char **)pp, maxlen, + pv->u.item.data, s)); + maxlen -= (size_t)(*pp - oldp); + } + if (pad > 0) { + WT_SIZE_CHECK(pad, maxlen); + memset(*pp, 0, pad); + *pp += pad; + } + break; case 'U': case 'u': s = pv->u.item.size; diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index f985fc062c4..c83c5f49144 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -851,10 +851,9 @@ struct __wt_session { * modification., a string; default empty.} * @config{dump, configure the cursor for dump format inputs and * outputs: "hex" selects a simple hexadecimal format\, "json" selects a - * JSON format with each record formats as fields named by column names - * if available\, and "print" selects a format where only non-printing - * characters are hexadecimal encoded\, and "json" produces a JSON - * encoding of the data. The "hex" and "print" dump format are + * JSON format with each record formatted as fields named by column + * names if available\, and "print" selects a format where only + * non-printing characters are hexadecimal encoded. These formats are * compatible with the @ref util_dump and @ref util_load commands., a * string\, chosen from the following options: \c "hex"\, \c "json"\, \c * "print"; default empty.} diff --git a/src/support/hex.c b/src/support/hex.c index 552fbfa1375..96cf5ecc4d4 100644 --- a/src/support/hex.c +++ b/src/support/hex.c @@ -106,11 +106,11 @@ __wt_raw_to_esc_hex( } /* - * hex2byte -- + * __wt_hex2byte -- * Convert a pair of hex characters into a byte. */ -static inline int -hex2byte(const u_char *from, u_char *to) +int +__wt_hex2byte(const u_char *from, u_char *to) { uint8_t byte; @@ -196,7 +196,7 @@ __wt_nhex_to_raw( WT_RET(__wt_buf_init(session, to, size / 2)); for (p = (u_char *)from, t = to->mem; size > 0; p += 2, size -= 2, ++t) - if (hex2byte(p, t)) + if (__wt_hex2byte(p, t)) return (__hex_fmterr(session)); to->size = WT_PTRDIFF(t, to->mem); @@ -220,7 +220,7 @@ __wt_esc_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM *to) continue; ++p; if (p[0] != '\\') { - if (p[0] == '\0' || p[1] == '\0' || hex2byte(p, t)) + if (p[0] == '\0' || p[1] == '\0' || __wt_hex2byte(p, t)) return (__hex_fmterr(session)); ++p; } diff --git a/src/utilities/util_dump.c b/src/utilities/util_dump.c index 85b63b6ab9c..bd0590948b4 100644 --- a/src/utilities/util_dump.c +++ b/src/utilities/util_dump.c @@ -251,7 +251,7 @@ dump_json_table_begin(WT_CURSOR *cursor, const char *uri, const char *config) dump_json_table_cg(cursor, uri, name, "index:", "indices"); } - if (printf("\n },\n [") < 0) + if (printf("\n },\n {\n \"data\" : [") < 0) goto eio; if (0) { @@ -422,7 +422,7 @@ dump_json_table_config(WT_SESSION *session, const char *uri) static int dump_json_table_end(void) { - if (printf(" ]\n ]") < 0) + if (printf(" ]\n }\n ]") < 0) return (util_err(EIO, NULL)); return (0); } @@ -595,9 +595,9 @@ dump_record(WT_CURSOR *cursor, const char *name, int reverse, int json) once = 0; if (json) { - prefix = "\n {\n"; + prefix = "\n{\n"; infix = ",\n"; - suffix = "\n }"; + suffix = "\n}"; } else { prefix = ""; infix = "\n"; diff --git a/src/utilities/util_load.c b/src/utilities/util_load.c index 4bdf356cfd6..1a7e71571a6 100644 --- a/src/utilities/util_load.c +++ b/src/utilities/util_load.c @@ -6,26 +6,30 @@ */ #include "util.h" +#include "util_load.h" static int format(void); static int insert(WT_CURSOR *, const char *); static int load_dump(WT_SESSION *); -static int config_read(char ***, int *); -static int config_rename(char **, const char *); -static int config_update(WT_SESSION *, char **); static int usage(void); static int append; /* -a append (ignore record number keys) */ static char *cmdname; /* -r rename */ static char **cmdconfig; /* configuration pairs */ +static int json; /* -j input is JSON format */ static int no_overwrite; /* -n don't overwrite existing data */ int util_load(WT_SESSION *session, int argc, char *argv[]) { int ch; + const char *filename; + uint32_t flags; - while ((ch = __wt_getopt(progname, argc, argv, "af:nr:")) != EOF) + flags = 0; + + filename = "<stdin>"; + while ((ch = __wt_getopt(progname, argc, argv, "af:jnr:")) != EOF) switch (ch) { case 'a': /* append (ignore record number keys) */ append = 1; @@ -34,6 +38,11 @@ util_load(WT_SESSION *session, int argc, char *argv[]) if (freopen(__wt_optarg, "r", stdin) == NULL) return ( util_err(errno, "%s: reopen", __wt_optarg)); + else + filename = __wt_optarg; + break; + case 'j': /* input is JSON */ + json = 1; break; case 'n': /* don't overwrite existing data */ no_overwrite = 1; @@ -61,7 +70,14 @@ util_load(WT_SESSION *session, int argc, char *argv[]) cmdconfig = argv; } - return (load_dump(session)); + if (json) { + if (append) + flags |= LOAD_JSON_APPEND; + if (no_overwrite) + flags |= LOAD_JSON_NO_OVERWRITE; + return (util_load_json(session, filename, flags)); + } else + return (load_dump(session)); } /* @@ -74,7 +90,7 @@ load_dump(WT_SESSION *session) WT_CURSOR *cursor; WT_DECL_RET; int hex, tret; - char **entry, **list, *p, **tlist, *uri, config[64]; + char **list, **tlist, *uri, config[64]; cursor = NULL; list = NULL; /* -Wuninitialized */ @@ -85,48 +101,18 @@ load_dump(WT_SESSION *session) if ((ret = config_read(&list, &hex)) != 0) return (ret); - /* - * Search for a table name -- if we find one, then it's table dump, - * otherwise, it's a single file dump. - */ - for (entry = list; *entry != NULL; ++entry) - if (WT_PREFIX_MATCH(*entry, "table:")) - break; - if (*entry == NULL) { - /* - * Single file dumps can only have two lines, the file name and - * the configuration information. - */ - if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) || - (WT_PREFIX_MATCH(list[0], "file:") && - WT_PREFIX_MATCH(list[0], "lsm:"))) { - ret = format(); - goto err; - } - - entry = list; - } - - /* - * Make sure the table key/value pair comes first, then we can just - * run through the array in order. (We already checked that we had - * a multiple of 2 entries, so this is safe.) - */ - if (entry != list) { - p = list[0]; list[0] = entry[0]; entry[0] = p; - p = list[1]; list[1] = entry[1]; entry[1] = p; - } + /* Reorder and check the list. */ + if ((ret = config_reorder(list)) != 0) + return (ret); /* Update the config based on any command-line configuration. */ if ((ret = config_update(session, list)) != 0) goto err; uri = list[0]; - for (entry = list; *entry != NULL; entry += 2) - if ((ret = session->create(session, entry[0], entry[1])) != 0) { - ret = util_err(ret, "%s: session.create", entry[0]); - goto err; - } + /* Create the items in the list. */ + if ((ret = config_exec(session, list)) != 0) + goto err; /* Open the insert cursor. */ (void)snprintf(config, sizeof(config), @@ -173,10 +159,51 @@ err: /* } /* + * config_exec -- + * Create the tables/indices/colgroups implied by the list. + */ +int +config_exec(WT_SESSION *session, char **list) +{ + WT_DECL_RET; + + for (; *list != NULL; list += 2) + if ((ret = session->create(session, list[0], list[1])) != 0) + return (util_err(ret, "%s: session.create", list[0])); + return (0); +} + +int +config_list_add(CONFIG_LIST *clp, char *val) +{ + if (clp->entry + 1 >= clp->max_entry) + if ((clp->list = realloc(clp->list, (size_t) + (clp->max_entry += 100) * sizeof(char *))) == NULL) + /* List already freed by realloc. */ + return (util_err(errno, NULL)); + + clp->list[clp->entry++] = val; + clp->list[clp->entry] = NULL; + return (0); +} + +void +config_list_free(CONFIG_LIST *clp) +{ + char **entry; + + if (clp->list != NULL) + for (entry = &clp->list[0]; *entry != NULL; entry++) + free(*entry); + free(clp->list); + clp->list = NULL; +} + +/* * config_read -- * Read the config lines and do some basic validation. */ -static int +int config_read(char ***listp, int *hexp) { ULINE l; @@ -260,16 +287,62 @@ err: if (list != NULL) { } /* + * config_reorder -- + * For table dumps, reorder the list so tables are first. + * For other dumps, make any needed checks. + */ +int +config_reorder(char **list) +{ + char **entry, *p; + + /* + * Search for a table name -- if we find one, then it's table dump, + * otherwise, it's a single file dump. + */ + for (entry = list; *entry != NULL; ++entry) + if (WT_PREFIX_MATCH(*entry, "table:")) + break; + if (*entry == NULL) { + /* + * Single file dumps can only have two lines, the file name and + * the configuration information. + */ + if ((list[0] == NULL || list[1] == NULL || list[2] != NULL) || + (WT_PREFIX_MATCH(list[0], "file:") && + WT_PREFIX_MATCH(list[0], "lsm:"))) + return (format()); + + entry = list; + } + + /* + * Make sure the table key/value pair comes first, then we can just + * run through the array in order. (We already checked that we had + * a multiple of 2 entries, so this is safe.) + */ + if (entry != list) { + p = list[0]; list[0] = entry[0]; entry[0] = p; + p = list[1]; list[1] = entry[1]; entry[1] = p; + } + return (0); +} + +/* * config_update -- * Reconcile and update the command line configuration against the - * config we found. + * config we found. */ -static int +int config_update(WT_SESSION *session, char **list) { int found; const char *cfg[] = { NULL, NULL, NULL }; - char **configp, **listp, *p, *t; + char **configp, **listp; + const char **rm; + static const char *rmnames[] = { + "filename", "id", "checkpoint", "checkpoint_lsn", + "version", "source", NULL }; /* * If the object has been renamed, replace all of the column group, @@ -296,16 +369,14 @@ config_update(WT_SESSION *session, char **list) } /* - * Remove all "filename=" configurations from the values, new filenames - * are chosen as part of table load. + * Remove all "filename=", "source=" and other configurations + * that foil loading from the values. New filenames are chosen + * as part of table load. */ for (listp = list; *listp != NULL; listp += 2) - if ((p = strstr(listp[1], "filename=")) != NULL) { - if ((t = strchr(p, ',')) == NULL) - *p = '\0'; - else - memmove(p, t + 1, strlen(t + 1) + 1); - } + for (rm = rmnames; *rm != NULL; rm++) + if (strstr(listp[1], *rm) != NULL) + config_remove(listp[1], *rm); /* * It's possible to update everything except the key/value formats. @@ -375,7 +446,7 @@ config_update(WT_SESSION *session, char **list) * config_rename -- * Update the URI name. */ -static int +int config_rename(char **urip, const char *name) { size_t len; @@ -403,6 +474,46 @@ config_rename(char **urip, const char *name) } /* + * config_remove -- + * Remove a single config key and its value. + */ +void +config_remove(char *config, const char *ckey) +{ + int parens, quoted; + char *begin, match[100], *next, *p; + + snprintf(match, sizeof(match), "%s=", ckey); + if ((begin = strstr(config, match)) != NULL) { + parens = 0; + quoted = 0; + next = NULL; + for (p = begin + strlen(match); !next && *p; p++) + switch (*p) { + case '(': + if (!quoted) + parens++; + break; + case ')': + if (!quoted) + parens--; + break; + case '"': + quoted = !quoted; + break; + case ',': + if (!quoted && parens == 0) + next = p + 1; + break; + } + if (next) + memmove(begin, next, strlen(next) + 1); + else + *begin = '\0'; + } +} + +/* * format -- * The input doesn't match the dump format. */ diff --git a/src/utilities/util_load.h b/src/utilities/util_load.h new file mode 100644 index 00000000000..13174b95c72 --- /dev/null +++ b/src/utilities/util_load.h @@ -0,0 +1,30 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +/* + * A list of configuration strings. + */ +typedef struct { + char **list; /* array of alternating (uri, config) values */ + int entry; /* next entry available in list */ + int max_entry; /* how many allocated in list */ +} CONFIG_LIST; + +int config_exec(WT_SESSION *, char **); +int config_list_add(CONFIG_LIST *, char *); +void config_list_free(CONFIG_LIST *); +int config_read(char ***, int *); +int config_rename(char **, const char *); +void config_remove(char *, const char *); +int config_reorder(char **); +int config_update(WT_SESSION *, char **); + +/* Flags for util_load_json */ +#define LOAD_JSON_APPEND 0x0001 /* append (ignore record number keys) */ +#define LOAD_JSON_NO_OVERWRITE 0x0002 /* don't overwrite existing data */ + +int util_load_json(WT_SESSION *, const char *, uint32_t); diff --git a/src/utilities/util_load_json.c b/src/utilities/util_load_json.c new file mode 100644 index 00000000000..9fba6b73948 --- /dev/null +++ b/src/utilities/util_load_json.c @@ -0,0 +1,567 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "util.h" +#include "util_load.h" + +/* + * Encapsulates the input state for parsing JSON. + * + * At any time, we may be peeking at an unconsumed token; this is + * indicated by 'peeking' as true. toktype, tokstart, toklen will be + * set in this case. + * + * Generally we are collecting and processing tokens one by one. + * In JSON, tokens never span lines so this makes processing easy. + * The exception is that a JSON dump cursor takes the complete + * set of keys or values during cursor->set_key/set_value calls, + * which may contain many tokens and span lines. E.g. + * cursor->set_value("\"name\" : \"John\", \"phone\" : 2348765"); + * The raw key/value string is collected in the kvraw field. + */ +typedef struct { + WT_SESSION *session; /* associated session */ + ULINE line; /* current line */ + const char *p; /* points to cur position in line.mem */ + int ateof; /* current token is EOF */ + int peeking; /* peeking at next token */ + int toktype; /* next token, defined by __wt_json_token() */ + const char *tokstart; /* next token start (points into line.mem) */ + size_t toklen; /* next token length */ + char *kvraw; /* multiple line raw content collected so far */ + size_t kvrawstart; /* pos on cur line that JSON key/value starts */ + const char *filename; /* filename for error reporting */ + int linenum; /* line number for error reporting */ +} JSON_INPUT_STATE; + +static int json_column_group_index(WT_SESSION *, JSON_INPUT_STATE *, + CONFIG_LIST *, int); +static int json_data(WT_SESSION *, JSON_INPUT_STATE *, CONFIG_LIST *, uint32_t); +static int json_expect(WT_SESSION *, JSON_INPUT_STATE *, int); +static int json_peek(WT_SESSION *, JSON_INPUT_STATE *); +static int json_skip(WT_SESSION *, JSON_INPUT_STATE *, const char **); +static int json_kvraw_append(JSON_INPUT_STATE *, const char *, size_t); +static int json_strdup(JSON_INPUT_STATE *, char **); +static int json_top_level(WT_SESSION *, JSON_INPUT_STATE *, uint32_t); + +#define JSON_STRING_MATCH(ins, match) \ + ((ins)->toklen - 2 == strlen(match) && \ + strncmp((ins)->tokstart + 1, (match), (ins)->toklen - 2) == 0) + +#define JSON_INPUT_POS(ins) \ + ((size_t)((ins)->p - (const char *)(ins)->line.mem)) + +#define JSON_EXPECT(session, ins, tok) do { \ + if (json_expect(session, ins, tok)) \ + goto err; \ +} while (0) + +/* + * json_column_group_index -- + * Parse a column group or index entry from JSON input. + */ +static int +json_column_group_index(WT_SESSION *session, JSON_INPUT_STATE *ins, + CONFIG_LIST *clp, int idx) +{ + WT_DECL_RET; + char *config, *p, *uri; + int isconfig; + + uri = NULL; + config = NULL; + + while (json_peek(session, ins) == '{') { + JSON_EXPECT(session, ins, '{'); + JSON_EXPECT(session, ins, 's'); + isconfig = JSON_STRING_MATCH(ins, "config"); + if (!isconfig && !JSON_STRING_MATCH(ins, "uri")) + goto err; + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, 's'); + + if ((ret = json_strdup(ins, &p)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + if (isconfig) + config = p; + else + uri = p; + + isconfig = !isconfig; + JSON_EXPECT(session, ins, ','); + JSON_EXPECT(session, ins, 's'); + if (!JSON_STRING_MATCH(ins, isconfig ? "config" : "uri")) + goto err; + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, 's'); + + if ((ret = json_strdup(ins, &p)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + if (isconfig) + config = p; + else + uri = p; + JSON_EXPECT(session, ins, '}'); + if ((idx && strncmp(uri, "index:", 6) != 0) || + (!idx && strncmp(uri, "colgroup:", 9) != 0)) { + ret = util_err(EINVAL, + "%s: misplaced colgroup or index", uri); + goto err; + } + if ((ret = config_list_add(clp, uri)) != 0 || + (ret = config_list_add(clp, config)) != 0) + goto err; + + if (json_peek(session, ins) != ',') + break; + JSON_EXPECT(session, ins, ','); + if (json_peek(session, ins) != '{') + goto err; + } + if (0) { +err: if (ret == 0) + ret = EINVAL; + } + return (ret); +} + +/* + * json_kvraw_append -- + * Append to the kvraw buffer, which is used to collect all the + * raw key/value pairs from JSON input. + */ +static int json_kvraw_append(JSON_INPUT_STATE *ins, const char *str, size_t len) +{ + char *tmp; + size_t needsize; + + if (len > 0) { + needsize = strlen(ins->kvraw) + len + 2; + if ((tmp = malloc(needsize)) == NULL) + return (util_err(errno, NULL)); + snprintf(tmp, needsize, "%s %.*s", ins->kvraw, (int)len, str); + free(ins->kvraw); + ins->kvraw = tmp; + } + return (0); +} + +/* + * json_strdup -- + * Return a string, with no escapes or other JSON-isms, from the + * JSON string at the current input position. + */ +static int +json_strdup(JSON_INPUT_STATE *ins, char **resultp) +{ + WT_DECL_RET; + char *result, *resultcpy; + const char *src; + ssize_t resultlen; + size_t srclen; + + result = NULL; + src = ins->tokstart + 1; /*strip "" from token */ + srclen = ins->toklen - 2; + if ((resultlen = __wt_json_strlen(src, srclen)) < 0) { + ret = util_err(EINVAL, "Invalid config string"); + goto err; + } + resultlen += 1; + if ((result = (char *)malloc((size_t)resultlen)) == NULL) { + ret = util_err(errno, NULL); + goto err; + } + *resultp = result; + resultcpy = result; + if ((ret = __wt_json_strncpy(&resultcpy, (size_t)resultlen, src, + srclen)) + != 0) { + ret = util_err(ret, NULL); + goto err; + } + + if (0) { +err: if (ret == 0) + ret = EINVAL; + if (result != NULL) + free(result); + *resultp = NULL; + } + return (ret); +} + +/* + * json_data -- + * Parse the data portion of the JSON input, and insert all + * values. + */ +static int +json_data(WT_SESSION *session, JSON_INPUT_STATE *ins, CONFIG_LIST *clp, + uint32_t flags) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + char config[64], *endp, *uri; + const char *keyformat; + int isrec, nfield, nkeys, toktype, tret; + size_t keystrlen; + ssize_t gotnolen; + uint64_t gotno, recno; + + cursor = NULL; + uri = NULL; + + /* Reorder and check the list. */ + if ((ret = config_reorder(clp->list)) != 0) + goto err; + + /* Update config based on command-line configuration. */ + if ((ret = config_update(session, clp->list)) != 0) + goto err; + + /* Create the items collected. */ + if ((ret = config_exec(session, clp->list)) != 0) + goto err; + + uri = clp->list[0]; + (void)snprintf(config, sizeof(config), + "dump=json%s%s", + LF_ISSET(LOAD_JSON_APPEND) ? ",append" : "", + LF_ISSET(LOAD_JSON_NO_OVERWRITE) ? ",overwrite=false" : ""); + if ((ret = session->open_cursor( + session, uri, NULL, config, &cursor)) != 0) { + ret = util_err(ret, "%s: session.open", uri); + goto err; + } + keyformat = cursor->key_format; + isrec = (strcmp(keyformat, "r") == 0); + for (nkeys = 0; *keyformat; keyformat++) + if (!isdigit(*keyformat)) + nkeys++; + + recno = 0; + while (json_peek(session, ins) == '{') { + nfield = 0; + JSON_EXPECT(session, ins, '{'); + if ((ins)->kvraw == NULL) + (ins)->kvraw = (char *)malloc(1); + (ins)->kvraw[0] = '\0'; + (ins)->kvrawstart = JSON_INPUT_POS(ins); + keystrlen = 0; + while (json_peek(session, ins) == 's') { + JSON_EXPECT(session, ins, 's'); + JSON_EXPECT(session, ins, ':'); + toktype = json_peek(session, ins); + JSON_EXPECT(session, ins, toktype); + if (isrec && nfield == 0) { + /* Verify the dump has recnos in order. */ + recno++; + gotno = __wt_strtouq(ins->tokstart, &endp, 0); + gotnolen = (endp - ins->tokstart); + if (recno != gotno || + ins->toklen != (size_t)gotnolen) { + ret = util_err(0, + "%s: recno out of order", uri); + goto err; + } + } + if (++nfield == nkeys) { + size_t curpos = JSON_INPUT_POS(ins); + if ((ret = json_kvraw_append(ins, + (char *)(ins)->line.mem + (ins)->kvrawstart, + curpos - (ins)->kvrawstart)) != 0) + goto err; + ins->kvrawstart = curpos; + keystrlen = strlen(ins->kvraw); + } + if (json_peek(session, ins) != ',') + break; + JSON_EXPECT(session, ins, ','); + if (json_peek(session, ins) != 's') + goto err; + } + if (json_kvraw_append(ins, ins->line.mem, JSON_INPUT_POS(ins))) + goto err; + + ins->kvraw[keystrlen] = '\0'; + if (!LF_ISSET(LOAD_JSON_APPEND)) + cursor->set_key(cursor, ins->kvraw); + /* skip over inserted space and comma */ + cursor->set_value(cursor, &ins->kvraw[keystrlen+2]); + if ((ret = cursor->insert(cursor)) != 0) { + ret = util_err(ret, "%s: cursor.insert", uri); + goto err; + } + + JSON_EXPECT(session, ins, '}'); + if (json_peek(session, ins) != ',') + break; + JSON_EXPECT(session, ins, ','); + if (json_peek(session, ins) != '{') + goto err; + } + if (0) { +err: if (ret == 0) + ret = EINVAL; + } + /* + * Technically, we don't have to close the cursor because the session + * handle will do it for us, but I'd like to see the flush to disk and + * the close succeed, it's better to fail early when loading files. + */ + if (cursor != NULL && (tret = cursor->close(cursor)) != 0) { + tret = util_err(tret, "%s: cursor.close", uri); + if (ret == 0) + ret = tret; + } + if (ret == 0) + ret = util_flush(session, uri); + return (ret); +} + +/* + * json_top_level -- + * Parse the top level JSON input. + */ +static int +json_top_level(WT_SESSION *session, JSON_INPUT_STATE *ins, uint32_t flags) +{ + CONFIG_LIST cl; + WT_DECL_RET; + char *config, *tableuri; + int toktype; + static const char *json_markers[] = { + "\"config\"", "\"colgroups\"", "\"indices\"", "\"data\"", NULL }; + + memset(&cl, 0, sizeof(cl)); + tableuri = NULL; + JSON_EXPECT(session, ins, '{'); + while (json_peek(session, ins) == 's') { + JSON_EXPECT(session, ins, 's'); + tableuri = realloc(tableuri, ins->toklen); + snprintf(tableuri, ins->toklen, "%.*s", + (int)(ins->toklen - 2), ins->tokstart + 1); + JSON_EXPECT(session, ins, ':'); + + /* + * Allow any ordering of 'config', 'colgroups', + * 'indices' before 'data', which must appear last. + * The non-'data' items build up a list of entries + * that created in our session before the data is + * inserted. + */ + for (;;) { + if (json_skip(session, ins, json_markers) != 0) + goto err; + JSON_EXPECT(session, ins, 's'); + if (JSON_STRING_MATCH(ins, "config")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, 's'); + if ((ret = json_strdup(ins, &config)) != 0) { + ret = util_err(ret, NULL); + goto err; + } + config_list_add(&cl, tableuri); + config_list_add(&cl, config); + tableuri = NULL; + } else if (JSON_STRING_MATCH(ins, "colgroups")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_column_group_index( + session, ins, &cl, 0)) != 0) + goto err; + JSON_EXPECT(session, ins, ']'); + } else if (JSON_STRING_MATCH(ins, "indices")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_column_group_index( + session, ins, &cl, 1)) != 0) + goto err; + JSON_EXPECT(session, ins, ']'); + } else if (JSON_STRING_MATCH(ins, "data")) { + JSON_EXPECT(session, ins, ':'); + JSON_EXPECT(session, ins, '['); + if ((ret = json_data(session, ins, &cl, + flags)) != 0) + goto err; + config_list_free(&cl); + break; + } + else + goto err; + } + + while ((toktype = json_peek(session, ins)) == '}' || + toktype == ']') + JSON_EXPECT(session, ins, toktype); + if (toktype == 0) /* Check EOF. */ + break; + if (toktype == ',') { + JSON_EXPECT(session, ins, ','); + if (json_peek(session, ins) != 's') + goto err; + continue; + } + } + JSON_EXPECT(session, ins, 0); + + if (0) { +err: if (ret == 0) + ret = EINVAL; + } + config_list_free(&cl); + if (tableuri != NULL) + free(tableuri); + return (ret); +} + +/* + * json_peek -- + * Set the input state to the next available token in the input + * and return its token type, a code defined by __wt_json_token(). + */ +static int +json_peek(WT_SESSION *session, JSON_INPUT_STATE *ins) +{ + WT_DECL_RET; + + if (!ins->peeking) { + while (!ins->ateof) { + while (isspace(*ins->p)) + ins->p++; + if (*ins->p) + break; + if (ins->kvraw != NULL) { + if (json_kvraw_append(ins, + (char *)ins->line.mem + ins->kvrawstart, + strlen(ins->line.mem) - ins->kvrawstart)) { + ret = -1; + goto err; + } + ins->kvrawstart = 0; + } + if (util_read_line(&ins->line, 1, + &ins->ateof)) { + ins->toktype = -1; + ret = -1; + goto err; + } + ins->linenum++; + ins->p = (const char *)ins->line.mem; + } + if (ins->ateof) + ins->toktype = 0; + else if (__wt_json_token(session, ins->p, + &ins->toktype, &ins->tokstart, + &ins->toklen) != 0) + ins->toktype = -1; + ins->peeking = 1; + } + if (0) { + err: if (ret == 0) + ret = -1; + } + return (ret == 0 ? ins->toktype : -1); +} + +/* + * json_expect -- + * Ensure that the type of the next token in the input matches + * the wanted value, and advance past it. The values of the + * input state will be set so specific string or integer values + * can be pulled out after this call. + */ +static int +json_expect(WT_SESSION *session, JSON_INPUT_STATE *ins, int wanttok) +{ + if (json_peek(session, ins) < 0) + return (1); + ins->p += ins->toklen; + ins->peeking = 0; + if (ins->toktype != wanttok) { + fprintf(stderr, + "%s: %d: %ld: expected %s, got %s\n", + ins->filename, + ins->linenum, + JSON_INPUT_POS(ins) + 1, + __wt_json_tokname(wanttok), + __wt_json_tokname(ins->toktype)); + return (1); + } + return (0); +} + +/* + * json_skip -- + * Skip over JSON input until one of the specified strings appears. + * The tokenizer will be set to point to the beginning of + * that string. + */ +static int +json_skip(WT_SESSION *session, JSON_INPUT_STATE *ins, const char **matches) +{ + char *hit; + const char **match; + + if (ins->kvraw != NULL) + return (1); + + hit = NULL; + while (!ins->ateof) { + for (match = matches; *match != NULL; match++) + if ((hit = strstr(ins->p, *match)) != NULL) + goto out; + if (util_read_line(&ins->line, 1, &ins->ateof)) { + ins->toktype = -1; + return (1); + } + ins->linenum++; + ins->p = (const char *)ins->line.mem; + } +out: + if (hit == NULL) + return (1); + + /* Set to this token. */ + ins->p = hit; + ins->peeking = 0; + ins->toktype = 0; + (void)json_peek(session, ins); + return (0); +} + +/* + * load_json -- + * Load from the JSON format produced by 'wt dump -j'. + */ +int +util_load_json(WT_SESSION *session, const char *filename, uint32_t flags) +{ + JSON_INPUT_STATE instate; + WT_DECL_RET; + + memset(&instate, 0, sizeof(instate)); + instate.session = session; + if (util_read_line(&instate.line, 0, &instate.ateof)) + return (1); + instate.p = (const char *)instate.line.mem; + instate.linenum = 1; + instate.filename = filename; + + if ((ret = json_top_level(session, &instate, flags)) != 0) + goto err; + +err: if (instate.line.mem != NULL) + free(instate.line.mem); + free(instate.kvraw); + return (ret); +} diff --git a/test/suite/test_jsondump01.py b/test/suite/test_jsondump01.py index d7c83b1e7ff..730fbf0a05a 100644 --- a/test/suite/test_jsondump01.py +++ b/test/suite/test_jsondump01.py @@ -63,12 +63,11 @@ class FakeCursor: return tup # test_jsondump.py -# Utilities: wt jsondump -# Test the jsondump utility (I'm not testing the 'json' cursors, -# that's what the utility uses underneath). +# Utilities: wt dump +# Test the dump utility with the -j option. class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess): - name = 'test_jsondump01' + name2 = 'test_jsondump01b' nentries = 2500 keyfmt = [ @@ -109,7 +108,7 @@ class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess): # spot check configs = tables[uri][0] - data = tables[uri][1] + data = tables[uri][1]["data"] d = data[24] if 'column5' in d: self.assertEqual(d['column5'], '25: abcde') @@ -123,5 +122,24 @@ class test_jsondump01(wttest.WiredTigerTestCase, suite_subprocess): cursor.close() self.populate_check(self, fake, self.nentries) + # Dump using util, re-load using python's JSON, and do a content comparison. + def test_jsonload_util(self): + # Create the object. + uri = self.type + self.name + uri2 = self.type + self.name2 + self.populate(self, uri, 'key_format=' + self.keyfmt, self.nentries) + + # Dump the object. + self.runWt(['dump', '-j', uri], outfilename='jsondump.out') + + loadcmd = ['load', '-jf', 'jsondump.out', '-r', self.name2] + if self.keyfmt == 'r': + loadcmd.append('-a') + self.runWt(loadcmd) + + # check the contents of the data we read. + cursor = self.session.open_cursor(uri2, None) + self.populate_check(self, cursor, self.nentries) + if __name__ == '__main__': wttest.run() diff --git a/test/suite/test_jsondump02.py b/test/suite/test_jsondump02.py index 3ed073cf022..2eff8b755a7 100644 --- a/test/suite/test_jsondump02.py +++ b/test/suite/test_jsondump02.py @@ -29,9 +29,7 @@ import os import wiredtiger, wttest # test_jsondump.py -# Utilities: wt jsondump -# Test the jsondump utility (I'm not testing the 'json' cursors, -# that's what the utility uses underneath). +# Test dump output from json cursors. class test_jsondump02(wttest.WiredTigerTestCase): table_uri1 = 'table:jsondump02a.wt' @@ -79,6 +77,19 @@ class test_jsondump02(wttest.WiredTigerTestCase): pos += 1 self.assertEqual(pos, len(expect)) cursor.close() + + # Check the result of using a JSON cursor on the URI. + def load_json(self, uri, inserts): + cursor = self.session.open_cursor(uri, None, 'dump=json') + pos = 0 + try: + for insert in inserts: + #tty_pr('Insert: ' + str(insert)) + cursor.set_key(insert[0]) + cursor.set_value(insert[1]) + cursor.insert() + finally: + cursor.close() # Create JSON cursors and test them directly. def test_json_cursor(self): @@ -114,13 +125,93 @@ class test_jsondump02(wttest.WiredTigerTestCase): self.set_kv(self.table_uri3, 2, '\x77\x88\x99\x00\xff\xfe') self.populate_squarecube(self.table_uri4) - self.check_json(self.table_uri1, ( - ('"key0" : "KEY000"', '"value0" : "string value"'), - ('"key0" : "KEY001"', '"value0" : ' + - '"\'\\\"({[]})\\\"\', etc. allowed"'))) - self.check_json(self.table_uri2, ( - ('"key0" : "KEY000"', '"value0" : 123,\n"value1" : "str0"'), - ('"key0" : "KEY001"', '"value0" : 234,\n"value1" : "str1"'))) + table1_json = ( + ('"key0" : "KEY000"', '"value0" : "string value"'), + ('"key0" : "KEY001"', '"value0" : ' + + '"\'\\\"({[]})\\\"\', etc. allowed"')) + self.check_json(self.table_uri1, table1_json) + + self.session.truncate(self.table_uri1, None, None, None) + self.load_json(self.table_uri1, table1_json) + self.check_json(self.table_uri1, table1_json) + + table2_json = ( + ('"key0" : "KEY000"', '"value0" : 123,\n"value1" : "str0"'), + ('"key0" : "KEY001"', '"value0" : 234,\n"value1" : "str1"')) + self.check_json(self.table_uri2, table2_json) + self.session.truncate(self.table_uri2, None, None, None) + self.load_json(self.table_uri2, table2_json) + self.check_json(self.table_uri2, table2_json) + self.session.truncate(self.table_uri2, None, None, None) + + # bad tokens + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('<>abc?', '9'),)), + '/unknown token/') + + # bad tokens + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"abc\u"', ''),)), + '/invalid Unicode/') + + # bad tokens + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"abc', ''),)), + '/unterminated string/') + + # bad syntax + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"stuff" "jibberish"', '"value0" "more jibberish"'),)), + '/expected key name.*\"key0\"/') + + # bad types + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"key0" : "KEY002"', '"value0" : "xyz",\n"value1" : "str0"'),)), + '/expected unsigned JSON <int>, got <string>/') + + # bad types + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"key0" : "KEY002"', '"value0" : 123,\n"value1" : 456'),)), + '/expected JSON <string>, got <integer>/') + + # extra stuff + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"key0" : "KEY002"', + '"value0" : 123,\n"value1" : "str0",'),)), + '/expected JSON <EOF>, got \',\'/') + + # fields out of order currently not supported + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"key0" : "KEY002"', '"value1" : "str0",\n"value0" : 123'),)), + '/expected value name.*\"value0\"/') + + # various invalid unicode + invalid_unicode = ( + '\\u', '\\ux', '\\u0', '\\u0F', '\\u0FA', '\\u0FAx', '\\u0FA\\x') + for uni in invalid_unicode: + self.assertRaisesWithMessage(wiredtiger.WiredTigerError, + lambda: self.load_json(self.table_uri2, + (('"key0" : "KEY002"', '"value0" : 123,\n"value1" : "' + + uni + '"'),)), + '/invalid Unicode/') + + # this one should work + self.load_json(self.table_uri2, + (('"key0" : "KEY002"', '"value0" : 345,\n"value1" : "str2"'),)) + + # extraneous/missing space is okay + self.load_json(self.table_uri2, + ((' "key0"\n:\t"KEY003" ', + '"value0":456,"value1"\n\n\r\n:\t\n"str3"'),)) + self.check_json(self.table_uri3, ( ('"key0" : 1', '"value0" : "\\u0001\\u0002\\u0003"'), ('"key0" : 2', @@ -163,37 +254,5 @@ class test_jsondump02(wttest.WiredTigerTestCase): ('"i2" : 16,\n"i4" : 64', '"S1" : "val16",\n"i2" : 16,\n"S3" : "val64",\n"i4" : 64'))) - def test_json_illegal(self): - """ - Create JSON cursors and use them illegally - """ - extra_params = ',allocation_size=512,' +\ - 'internal_page_max=16384,leaf_page_max=131072' - self.session.create(self.table_uri1, - 'key_format=S,value_format=S' + extra_params) - - self.set_kv(self.table_uri1, 'A', 'aaaa') - self.check_json(self.table_uri1, ( - ('"key0" : "A"', '"value0" : "aaaa"'),)) - - self.set_kv(self.table_uri1, 'B', 'bbbb') - self.check_json(self.table_uri1, ( - ('"key0" : "A"', '"value0" : "aaaa"'), - ('"key0" : "B"', '"value0" : "bbbb"'))) - - cursor = self.session.open_cursor(self.table_uri1, None, 'dump=json') - cursor.next() - - with self.expectedStderrPattern('Setting keys for JSON cursors not permitted'): - cursor.set_key('stuff') - with self.expectedStderrPattern('Setting values for JSON cursors not permitted'): - cursor.set_value('other stuff') - cursor.close() - - self.check_json(self.table_uri1, ( - ('"key0" : "A"', '"value0" : "aaaa"'), - ('"key0" : "B"', '"value0" : "bbbb"'))) - - if __name__ == '__main__': wttest.run() |