summaryrefslogtreecommitdiff
path: root/src/mongo/bson/json.h
blob: 5e5516607a12e910b8f25f5f14aa50313d3ab882 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
/**
*    Copyright (C) 2008 10gen Inc.
*
*    This program is free software: you can redistribute it and/or  modify
*    it under the terms of the GNU Affero General Public License, version 3,
*    as published by the Free Software Foundation.
*
*    This program is distributed in the hope that it will be useful,
*    but WITHOUT ANY WARRANTY; without even the implied warranty of
*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*    GNU Affero General Public License for more details.
*
*    You should have received a copy of the GNU Affero General Public License
*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*
*    As a special exception, the copyright holders give permission to link the
*    code of portions of this program with the OpenSSL library under certain
*    conditions as described in each individual source file and distribute
*    linked combinations including the program with the OpenSSL library. You
*    must comply with the GNU Affero General Public License in all respects for
*    all of the code used other than as permitted herein. If you modify file(s)
*    with this exception, you may extend this exception to your version of the
*    file(s), but you are not obligated to do so. If you do not wish to do so,
*    delete this exception statement from your version. If you delete this
*    exception statement from all source files in the program, then also delete
*    it in the license file.
*/

#pragma once

#include <string>

#include "mongo/base/status.h"
#include "mongo/bson/bsonobj.h"

namespace mongo {

/**
 * Create a BSONObj from a JSON <http://www.json.org>,
 * <http://www.ietf.org/rfc/rfc4627.txt> string.  In addition to the JSON
 * extensions extensions described here
 * <http://dochub.mongodb.org/core/mongodbextendedjson>, this function
 * accepts unquoted field names and allows single quotes to optionally be
 * used when specifying field names and std::string values instead of double
 * quotes.  JSON unicode escape sequences (of the form \uXXXX) are
 * converted to utf8.
 *
 * @throws MsgAssertionException if parsing fails.  The message included with
 * this assertion includes the character offset where parsing failed.
 */
BSONObj fromjson(const std::string& str);

/** @param len will be size of JSON object in text chars. */
BSONObj fromjson(const char* str, int* len = NULL);

/**
 * Tests whether the JSON string is an Array.
 *
 * Useful for assigning the result of fromjson to the right object type. Either:
 *  BSONObj
 *  BSONArray
 *
 * @example Using the method to select the proper type.
 *  If this method returns true, the user could store the result of fromjson
 *  inside a BSONArray, rather than a BSONObj, in order to have it print as an
 *  array when passed to tojson.
 *
 * @param obj The JSON string to test.
 */
bool isArray(StringData str);

/**
 * Convert a BSONArray to a JSON string.
 *
 * @param arr The BSON Array.
 * @param format The JSON format (JS, TenGen, Strict).
 * @param pretty Enables pretty output.
 */
std::string tojson(const BSONArray& arr, JsonStringFormat format = Strict, bool pretty = false);

/**
 * Convert a BSONObj to a JSON string.
 *
 * @param obj The BSON Object.
 * @param format The JSON format (JS, TenGen, Strict).
 * @param pretty Enables pretty output.
 */
std::string tojson(const BSONObj& obj, JsonStringFormat format = Strict, bool pretty = false);

/**
 * Parser class.  A BSONObj is constructed incrementally by passing a
 * BSONObjBuilder to the recursive parsing methods.  The grammar for the
 * element parsed is described before each function.
 */
class JParse {
public:
    explicit JParse(StringData str);

    /*
     * Notation: All-uppercase symbols denote non-terminals; all other
     * symbols are literals.
     */

    /*
     * VALUE :
     *     STRING
     *   | NUMBER
     *   | NUMBERINT
     *   | NUMBERLONG
     *   | NUMBERDECIMAL
     *   | OBJECT
     *   | ARRAY
     *
     *   | true
     *   | false
     *   | null
     *   | undefined
     *
     *   | NaN
     *   | Infinity
     *   | -Infinity
     *
     *   | DATE
     *   | TIMESTAMP
     *   | REGEX
     *   | OBJECTID
     *   | DBREF
     *
     *   | new CONSTRUCTOR
     */
private:
    Status value(StringData fieldName, BSONObjBuilder&);

    /*
     * OBJECT :
     *     {}
     *   | { MEMBERS }
     *   | SPECIALOBJECT
     *
     * MEMBERS :
     *     PAIR
     *   | PAIR , MEMBERS
     *
     * PAIR :
     *     FIELD : VALUE
     *
     * SPECIALOBJECT :
     *     OIDOBJECT
     *   | BINARYOBJECT
     *   | DATEOBJECT
     *   | TIMESTAMPOBJECT
     *   | REGEXOBJECT
     *   | REFOBJECT
     *   | UNDEFINEDOBJECT
     *   | NUMBERLONGOBJECT
     *   | NUMBERDECIMALOBJECT
     *   | MINKEYOBJECT
     *   | MAXKEYOBJECT
     *
     */
public:
    Status object(StringData fieldName, BSONObjBuilder&, bool subObj = true);
    Status parse(BSONObjBuilder& builder);
    bool isArray();

private:
    /* The following functions are called with the '{' and the first
     * field already parsed since they are both implied given the
     * context. */
    /*
     * OIDOBJECT :
     *     { FIELD("$oid") : <24 character hex std::string> }
     */
    Status objectIdObject(StringData fieldName, BSONObjBuilder&);

    /*
     * BINARYOBJECT :
     *     { FIELD("$binary") : <base64 representation of a binary std::string>,
     *          FIELD("$type") : <hexadecimal representation of a single byte
     *              indicating the data type> }
     */
    Status binaryObject(StringData fieldName, BSONObjBuilder&);

    /*
     * DATEOBJECT :
     *     { FIELD("$date") : <64 bit signed integer for milliseconds since epoch> }
     */
    Status dateObject(StringData fieldName, BSONObjBuilder&);

    /*
     * TIMESTAMPOBJECT :
     *     { FIELD("$timestamp") : {
     *         FIELD("t") : <32 bit unsigned integer for seconds since epoch>,
     *         FIELD("i") : <32 bit unsigned integer for the increment> } }
     */
    Status timestampObject(StringData fieldName, BSONObjBuilder&);

    /*
     *     NOTE: the rules for the body of the regex are different here,
     *     since it is quoted instead of surrounded by slashes.
     * REGEXOBJECT :
     *     { FIELD("$regex") : <string representing body of regex> }
     *   | { FIELD("$regex") : <string representing body of regex>,
     *          FIELD("$options") : <string representing regex options> }
     */
    Status regexObject(StringData fieldName, BSONObjBuilder&);

    /*
     * REFOBJECT :
     *     { FIELD("$ref") : <string representing collection name>,
     *          FIELD("$id") : <24 character hex std::string> }
     *   | { FIELD("$ref") : std::string , FIELD("$id") : OBJECTID }
     *   | { FIELD("$ref") : std::string , FIELD("$id") : OIDOBJECT }
     */
    Status dbRefObject(StringData fieldName, BSONObjBuilder&);

    /*
     * UNDEFINEDOBJECT :
     *     { FIELD("$undefined") : true }
     */
    Status undefinedObject(StringData fieldName, BSONObjBuilder&);

    /*
     * NUMBERLONGOBJECT :
     *     { FIELD("$numberLong") : "<number>" }
     */
    Status numberLongObject(StringData fieldName, BSONObjBuilder&);

    /*
     * NUMBERDECIMALOBJECT :
     *     { FIELD("$numberDecimal") : "<number>" }
     */
    Status numberDecimalObject(StringData fieldName, BSONObjBuilder&);

    /*
     * MINKEYOBJECT :
     *     { FIELD("$minKey") : 1 }
     */
    Status minKeyObject(StringData fieldName, BSONObjBuilder& builder);

    /*
     * MAXKEYOBJECT :
     *     { FIELD("$maxKey") : 1 }
     */
    Status maxKeyObject(StringData fieldName, BSONObjBuilder& builder);

    /*
     * ARRAY :
     *     []
     *   | [ ELEMENTS ]
     *
     * ELEMENTS :
     *     VALUE
     *   | VALUE , ELEMENTS
     */
    Status array(StringData fieldName, BSONObjBuilder&, bool subObj = true);

    /*
     * NOTE: Currently only Date can be preceded by the "new" keyword
     * CONSTRUCTOR :
     *     DATE
     */
    Status constructor(StringData fieldName, BSONObjBuilder&);

    /* The following functions only parse the body of the constructor
     * between the parentheses, not including the constructor name */
    /*
     * DATE :
     *     Date( <64 bit signed integer for milliseconds since epoch> )
     */
    Status date(StringData fieldName, BSONObjBuilder&);

    /*
     * TIMESTAMP :
     *     Timestamp( <32 bit unsigned integer for seconds since epoch>,
     *          <32 bit unsigned integer for the increment> )
     */
    Status timestamp(StringData fieldName, BSONObjBuilder&);

    /*
     * OBJECTID :
     *     ObjectId( <24 character hex std::string> )
     */
    Status objectId(StringData fieldName, BSONObjBuilder&);

    /*
     * NUMBERLONG :
     *     NumberLong( <number> )
     */
    Status numberLong(StringData fieldName, BSONObjBuilder&);

    /*
     * NUMBERDECIMAL :
     *     NumberDecimal( <number> )
     */
    Status numberDecimal(StringData fieldName, BSONObjBuilder&);

    /*
     * NUMBERINT :
     *     NumberInt( <number> )
     */
    Status numberInt(StringData fieldName, BSONObjBuilder&);

    /*
     * DBREF :
     *     Dbref( <namespace std::string> , <24 character hex std::string> )
     */
    Status dbRef(StringData fieldName, BSONObjBuilder&);

    /*
     * REGEX :
     *     / REGEXCHARS / REGEXOPTIONS
     *
     * REGEXCHARS :
     *     REGEXCHAR
     *   | REGEXCHAR REGEXCHARS
     *
     * REGEXCHAR :
     *     any-Unicode-character-except-/-or-\-or-CONTROLCHAR
     *   | \"
     *   | \'
     *   | \\
     *   | \/
     *   | \b
     *   | \f
     *   | \n
     *   | \r
     *   | \t
     *   | \v
     *   | \u HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
     *   | \any-Unicode-character-except-x-or-[0-7]
     *
     * REGEXOPTIONS :
     *     REGEXOPTION
     *   | REGEXOPTION REGEXOPTIONS
     *
     * REGEXOPTION :
     *     g | i | m | s
     */
    Status regex(StringData fieldName, BSONObjBuilder&);
    Status regexPat(std::string* result);
    Status regexOpt(std::string* result);
    Status regexOptCheck(StringData opt);

    /*
     * NUMBER :
     *
     * NOTE: Number parsing is based on standard library functions, not
     * necessarily on the JSON numeric grammar.
     *
     * Number as value - strtoll and strtod
     * Date - strtoll
     * Timestamp - strtoul for both timestamp and increment and '-'
     * before a number explicity disallowed
     */
    Status number(StringData fieldName, BSONObjBuilder&);

    /*
     * FIELD :
     *     STRING
     *   | [a-zA-Z$_] FIELDCHARS
     *
     * FIELDCHARS :
     *     [a-zA-Z0-9$_]
     *   | [a-zA-Z0-9$_] FIELDCHARS
     */
    Status field(std::string* result);

    /*
     * std::string :
     *     " "
     *   | ' '
     *   | " CHARS "
     *   | ' CHARS '
     */
    Status quotedString(std::string* result);

    /*
     * CHARS :
     *     CHAR
     *   | CHAR CHARS
     *
     * Note: " or ' may be allowed depending on whether the std::string is
     * double or single quoted
     *
     * CHAR :
     *     any-Unicode-character-except-"-or-'-or-\-or-CONTROLCHAR
     *   | \"
     *   | \'
     *   | \\
     *   | \/
     *   | \b
     *   | \f
     *   | \n
     *   | \r
     *   | \t
     *   | \v
     *   | \u HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
     *   | \any-Unicode-character-except-x-or-[0-9]
     *
     * HEXDIGIT : [0..9a..fA..F]
     *
     * per http://www.ietf.org/rfc/rfc4627.txt, control characters are
     * (U+0000 through U+001F).  U+007F is not mentioned as a control
     * character.
     * CONTROLCHAR : [0x00..0x1F]
     *
     * If there is not an error, result will contain a null terminated
     * string, but there is no guarantee that it will not contain other
     * null characters.
     */
    Status chars(std::string* result, const char* terminalSet, const char* allowedSet = NULL);

    /**
     * Converts the two byte Unicode code point to its UTF8 character
     * encoding representation.  This function returns a std::string because
     * UTF8 encodings for code points from 0x0000 to 0xFFFF can range
     * from one to three characters.
     */
    std::string encodeUTF8(unsigned char first, unsigned char second) const;

    /**
     * @return true if the given token matches the next non whitespace
     * sequence in our buffer, and false if the token doesn't match or
     * we reach the end of our buffer.  Do not update the pointer to our
     * buffer (same as calling readTokenImpl with advance=false).
     */
    inline bool peekToken(const char* token);

    /**
     * @return true if the given token matches the next non whitespace
     * sequence in our buffer, and false if the token doesn't match or
     * we reach the end of our buffer.  Updates the pointer to our
     * buffer (same as calling readTokenImpl with advance=true).
     */
    inline bool readToken(const char* token);

    /**
     * @return true if the given token matches the next non whitespace
     * sequence in our buffer, and false if the token doesn't match or
     * we reach the end of our buffer.  Do not update the pointer to our
     * buffer if advance is false.
     */
    bool readTokenImpl(const char* token, bool advance = true);

    /**
     * @return true if the next field in our stream matches field.
     * Handles single quoted, double quoted, and unquoted field names
     */
    bool readField(StringData field);

    /**
     * @return true if matchChar is in matchSet
     * @return true if matchSet is NULL and false if it is an empty string
     */
    bool match(char matchChar, const char* matchSet) const;

    /**
     * @return true if every character in the std::string is a hex digit
     */
    bool isHexString(StringData) const;

    /**
     * @return true if every character in the std::string is a valid base64
     * character
     */
    bool isBase64String(StringData) const;

    /**
     * @return FailedToParse status with the given message and some
     * additional context information
     */
    Status parseError(StringData msg);

public:
    inline int offset() {
        return (_input - _buf);
    }

private:
    /*
     * _buf - start of our input buffer
     * _input - cursor we advance in our input buffer
     * _input_end - sentinel for the end of our input buffer
     *
     * _buf is the null terminated buffer containing the JSON std::string we
     * are parsing.  _input_end points to the null byte at the end of
     * the buffer.  strtoll, strtol, and strtod will access the null
     * byte at the end of the buffer because they are assuming a c-style
     * string.
     */
    const char* const _buf;
    const char* _input;
    const char* const _input_end;
};

}  // namespace mongo