SERVER-9666 Move json utils into bson library

author: Andrew Morrow <acm@mongodb.com> 2015-04-30 11:49:52 -0400
committer: Andrew Morrow <acm@mongodb.com> 2015-05-06 15:47:21 -0400
commit: 543ca54c22e13056b4f278e36b4c1b6436c2f1cb (patch)
tree: ce86697434bce1152c31df200341405e618472f8 /src/mongo/bson/json.h
parent: 0ec9948134ca39df062d59e7eaa212100631ecac (diff)
download: mongo-543ca54c22e13056b4f278e36b4c1b6436c2f1cb.tar.gz
1 files changed, 488 insertions, 0 deletions
diff --git a/src/mongo/bson/json.h b/src/mongo/bson/json.h
new file mode 100644
index 00000000000..34564765242
--- /dev/null
+++ b/src/mongo/bson/json.h
@@ -0,0 +1,488 @@
+/**
+*    Copyright (C) 2008 10gen Inc.
+*
+*    This program is free software: you can redistribute it and/or  modify
+*    it under the terms of the GNU Affero General Public License, version 3,
+*    as published by the Free Software Foundation.
+*
+*    This program is distributed in the hope that it will be useful,
+*    but WITHOUT ANY WARRANTY; without even the implied warranty of
+*    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+*    GNU Affero General Public License for more details.
+*
+*    You should have received a copy of the GNU Affero General Public License
+*    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*
+*    As a special exception, the copyright holders give permission to link the
+*    code of portions of this program with the OpenSSL library under certain
+*    conditions as described in each individual source file and distribute
+*    linked combinations including the program with the OpenSSL library. You
+*    must comply with the GNU Affero General Public License in all respects for
+*    all of the code used other than as permitted herein. If you modify file(s)
+*    with this exception, you may extend this exception to your version of the
+*    file(s), but you are not obligated to do so. If you do not wish to do so,
+*    delete this exception statement from your version. If you delete this
+*    exception statement from all source files in the program, then also delete
+*    it in the license file.
+*/
+
+#pragma once
+
+#include <string>
+
+#include "mongo/bson/bsonobj.h"
+#include "mongo/base/status.h"
+
+namespace mongo {
+
+    /**
+     * Create a BSONObj from a JSON <http://www.json.org>,
+     * <http://www.ietf.org/rfc/rfc4627.txt> string.  In addition to the JSON
+     * extensions described here
+     * <http://dochub.mongodb.org/core/mongodbextendedjson>, this function
+     * accepts unquoted field names and allows single quotes to optionally be
+     * used when specifying field names and string values instead of double
+     * quotes.  JSON unicode escape sequences (of the form \uXXXX) are
+     * converted to utf8.
+     *
+     * @throws MsgAssertionException if parsing fails.  The message included with
+     * this assertion includes the character offset where parsing failed.
+     */
+    BSONObj fromjson(const std::string& str);
+
+    /** @param len will be size of JSON object in text chars. */
+    BSONObj fromjson(const char* str, int* len=NULL);
+
+    /**
+     * Tests whether the JSON string is an Array.
+     *
+     * Useful for assigning the result of fromjson to the right object type. Either:
+     *  BSONObj
+     *  BSONArray
+     *
+     * @example Using the method to select the proper type.
+     *  If this method returns true, the user could store the result of fromjson
+     *  inside a BSONArray, rather than a BSONObj, in order to have it print as an
+     *  array when passed to tojson.
+     *
+     * @param str The JSON string to test.
+     */
+    bool isArray(StringData str);
+
+    /**
+     * Convert a BSONArray to a JSON string.
+     *
+     * @param arr The BSON Array.
+     * @param format The JSON format (JS, TenGen, Strict).
+     * @param pretty Enables pretty output.
+     */
+    std::string tojson(
+        const BSONArray& arr,
+        JsonStringFormat format = Strict,
+        bool pretty = false
+    );
+
+    /**
+     * Convert a BSONObj to a JSON string.
+     *
+     * @param obj The BSON Object.
+     * @param format The JSON format (JS, TenGen, Strict).
+     * @param pretty Enables pretty output.
+     */
+    std::string tojson(
+        const BSONObj& obj,
+        JsonStringFormat format = Strict,
+        bool pretty = false
+    );
+
+    /**
+     * Parser class.  A BSONObj is constructed incrementally by passing a
+     * BSONObjBuilder to the recursive parsing methods.  The grammar for the
+     * element parsed is described before each function.
+     */
+    class JParse {
+        public:
+            explicit JParse(StringData str);
+
+            /*
+             * Notation: All-uppercase symbols denote non-terminals; all other
+             * symbols are literals.
+             */
+
+            /*
+             * VALUE :
+             *     STRING
+             *   | NUMBER
+             *   | NUMBERINT
+             *   | NUMBERLONG
+             *   | OBJECT
+             *   | ARRAY
+             *
+             *   | true
+             *   | false
+             *   | null
+             *   | undefined
+             *
+             *   | NaN
+             *   | Infinity
+             *   | -Infinity
+             *
+             *   | DATE
+             *   | TIMESTAMP
+             *   | REGEX
+             *   | OBJECTID
+             *   | DBREF
+             *
+             *   | new CONSTRUCTOR
+             */
+        private:
+            Status value(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * OBJECT :
+             *     {}
+             *   | { MEMBERS }
+             *   | SPECIALOBJECT
+             *
+             * MEMBERS :
+             *     PAIR
+             *   | PAIR , MEMBERS
+             *
+             * PAIR :
+             *     FIELD : VALUE
+             *
+             * SPECIALOBJECT :
+             *     OIDOBJECT
+             *   | BINARYOBJECT
+             *   | DATEOBJECT
+             *   | TIMESTAMPOBJECT
+             *   | REGEXOBJECT
+             *   | REFOBJECT
+             *   | UNDEFINEDOBJECT
+             *   | NUMBERLONGOBJECT
+             *   | MINKEYOBJECT
+             *   | MAXKEYOBJECT
+             *
+             */
+        public:
+            Status object(StringData fieldName, BSONObjBuilder&, bool subObj=true);
+            Status parse(BSONObjBuilder& builder);
+            bool isArray();
+
+        private:
+            /* The following functions are called with the '{' and the first
+             * field already parsed since they are both implied given the
+             * context. */
+            /*
+             * OIDOBJECT :
+             *     { FIELD("$oid") : <24 character hex string> }
+             */
+            Status objectIdObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * BINARYOBJECT :
+             *     { FIELD("$binary") : <base64 representation of a binary string>,
+             *          FIELD("$type") : <hexadecimal representation of a single byte
+             *              indicating the data type> }
+             */
+            Status binaryObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * DATEOBJECT :
+             *     { FIELD("$date") : <64 bit signed integer for milliseconds since epoch> }
+             */
+            Status dateObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * TIMESTAMPOBJECT :
+             *     { FIELD("$timestamp") : {
+             *         FIELD("t") : <32 bit unsigned integer for seconds since epoch>,
+             *         FIELD("i") : <32 bit unsigned integer for the increment> } }
+             */
+            Status timestampObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             *     NOTE: the rules for the body of the regex are different here,
+             *     since it is quoted instead of surrounded by slashes.
+             * REGEXOBJECT :
+             *     { FIELD("$regex") : <string representing body of regex> }
+             *   | { FIELD("$regex") : <string representing body of regex>,
+             *          FIELD("$options") : <string representing regex options> }
+             */
+            Status regexObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * REFOBJECT :
+             *     { FIELD("$ref") : <string representing collection name>,
+             *          FIELD("$id") : <24 character hex string> }
+             *   | { FIELD("$ref") : STRING , FIELD("$id") : OBJECTID }
+             *   | { FIELD("$ref") : STRING , FIELD("$id") : OIDOBJECT }
+             */
+            Status dbRefObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * UNDEFINEDOBJECT :
+             *     { FIELD("$undefined") : true }
+             */
+            Status undefinedObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * NUMBERLONGOBJECT :
+             *     { FIELD("$numberLong") : "<number>" }
+             */
+            Status numberLongObject(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * MINKEYOBJECT :
+             *     { FIELD("$minKey") : 1 }
+             */
+            Status minKeyObject(StringData fieldName, BSONObjBuilder& builder);
+
+            /*
+             * MAXKEYOBJECT :
+             *     { FIELD("$maxKey") : 1 }
+             */
+            Status maxKeyObject(StringData fieldName, BSONObjBuilder& builder);
+
+            /*
+             * ARRAY :
+             *     []
+             *   | [ ELEMENTS ]
+             *
+             * ELEMENTS :
+             *     VALUE
+             *   | VALUE , ELEMENTS
+             */
+            Status array(StringData fieldName, BSONObjBuilder&, bool subObj=true);
+
+            /*
+             * NOTE: Currently only Date can be preceded by the "new" keyword
+             * CONSTRUCTOR :
+             *     DATE
+             */
+            Status constructor(StringData fieldName, BSONObjBuilder&);
+
+            /* The following functions only parse the body of the constructor
+             * between the parentheses, not including the constructor name */
+            /*
+             * DATE :
+             *     Date( <64 bit signed integer for milliseconds since epoch> )
+             */
+            Status date(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * TIMESTAMP :
+             *     Timestamp( <32 bit unsigned integer for seconds since epoch>,
+             *          <32 bit unsigned integer for the increment> )
+             */
+            Status timestamp(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * OBJECTID :
+             *     ObjectId( <24 character hex string> )
+             */
+            Status objectId(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * NUMBERLONG :
+             *     NumberLong( <number> )
+             */
+            Status numberLong(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * NUMBERINT :
+             *     NumberInt( <number> )
+             */
+            Status numberInt(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * DBREF :
+             *     Dbref( <namespace string> , <24 character hex string> )
+             */
+            Status dbRef(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * REGEX :
+             *     / REGEXCHARS / REGEXOPTIONS
+             *
+             * REGEXCHARS :
+             *     REGEXCHAR
+             *   | REGEXCHAR REGEXCHARS
+             *
+             * REGEXCHAR :
+             *     any-Unicode-character-except-/-or-\-or-CONTROLCHAR
+             *   | \"
+             *   | \'
+             *   | \\
+             *   | \/
+             *   | \b
+             *   | \f
+             *   | \n
+             *   | \r
+             *   | \t
+             *   | \v
+             *   | \u HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
+             *   | \any-Unicode-character-except-x-or-[0-7]
+             *
+             * REGEXOPTIONS :
+             *     REGEXOPTION
+             *   | REGEXOPTION REGEXOPTIONS
+             *
+             * REGEXOPTION :
+             *     g | i | m | s
+             */
+            Status regex(StringData fieldName, BSONObjBuilder&);
+            Status regexPat(std::string* result);
+            Status regexOpt(std::string* result);
+            Status regexOptCheck(StringData opt);
+
+            /*
+             * NUMBER :
+             *
+             * NOTE: Number parsing is based on standard library functions, not
+             * necessarily on the JSON numeric grammar.
+             *
+             * Number as value - strtoll and strtod
+             * Date - strtoll
+             * Timestamp - strtoul for both timestamp and increment and '-'
+             * before a number explicitly disallowed
+             */
+            Status number(StringData fieldName, BSONObjBuilder&);
+
+            /*
+             * FIELD :
+             *     STRING
+             *   | [a-zA-Z$_] FIELDCHARS
+             *
+             * FIELDCHARS :
+             *     [a-zA-Z0-9$_]
+             *   | [a-zA-Z0-9$_] FIELDCHARS
+             */
+            Status field(std::string* result);
+
+            /*
+             * STRING :
+             *     " "
+             *   | ' '
+             *   | " CHARS "
+             *   | ' CHARS '
+             */
+            Status quotedString(std::string* result);
+
+            /*
+             * CHARS :
+             *     CHAR
+             *   | CHAR CHARS
+             *
+             * Note: " or ' may be allowed depending on whether the string is
+             * double or single quoted
+             *
+             * CHAR :
+             *     any-Unicode-character-except-"-or-'-or-\-or-CONTROLCHAR
+             *   | \"
+             *   | \'
+             *   | \\
+             *   | \/
+             *   | \b
+             *   | \f
+             *   | \n
+             *   | \r
+             *   | \t
+             *   | \v
+             *   | \u HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
+             *   | \any-Unicode-character-except-x-or-[0-9]
+             *
+             * HEXDIGIT : [0..9a..fA..F]
+             *
+             * per http://www.ietf.org/rfc/rfc4627.txt, control characters are
+             * (U+0000 through U+001F).  U+007F is not mentioned as a control
+             * character.
+             * CONTROLCHAR : [0x00..0x1F]
+             *
+             * If there is not an error, result will contain a null terminated
+             * string, but there is no guarantee that it will not contain other
+             * null characters.
+             */
+            Status chars(std::string* result, const char* terminalSet, const char* allowedSet=NULL);
+
+            /**
+             * Converts the two byte Unicode code point to its UTF8 character
+             * encoding representation.  This function returns a std::string because
+             * UTF8 encodings for code points from 0x0000 to 0xFFFF can range
+             * from one to three characters.
+             */
+            std::string encodeUTF8(unsigned char first, unsigned char second) const;
+
+            /**
+             * @return true if the given token matches the next non whitespace
+             * sequence in our buffer, and false if the token doesn't match or
+             * we reach the end of our buffer.  Do not update the pointer to our
+             * buffer (same as calling readTokenImpl with advance=false).
+             */
+            inline bool peekToken(const char* token);
+
+            /**
+             * @return true if the given token matches the next non whitespace
+             * sequence in our buffer, and false if the token doesn't match or
+             * we reach the end of our buffer.  Updates the pointer to our
+             * buffer (same as calling readTokenImpl with advance=true).
+             */
+            inline bool readToken(const char* token);
+
+            /**
+             * @return true if the given token matches the next non whitespace
+             * sequence in our buffer, and false if the token doesn't match or
+             * we reach the end of our buffer.  Do not update the pointer to our
+             * buffer if advance is false.
+             */
+            bool readTokenImpl(const char* token, bool advance=true);
+
+            /**
+             * @return true if the next field in our stream matches field.
+             * Handles single quoted, double quoted, and unquoted field names
+             */
+            bool readField(StringData field);
+
+            /**
+             * @return true if matchChar is in matchSet
+             * @return true if matchSet is NULL and false if it is an empty string
+             */
+            bool match(char matchChar, const char* matchSet) const;
+
+            /**
+             * @return true if every character in the string is a hex digit
+             */
+            bool isHexString(StringData) const;
+
+            /**
+             * @return true if every character in the string is a valid base64
+             * character
+             */
+            bool isBase64String(StringData) const;
+
+            /**
+             * @return FailedToParse status with the given message and some
+             * additional context information
+             */
+            Status parseError(StringData msg);
+        public:
+            inline int offset() { return (_input - _buf); } // distance of the cursor (_input) from the start of the buffer (_buf)
+
+        private:
+            /*
+             * _buf - start of our input buffer
+             * _input - cursor we advance in our input buffer
+             * _input_end - sentinel for the end of our input buffer
+             *
+             * _buf is the null terminated buffer containing the JSON string we
+             * are parsing.  _input_end points to the null byte at the end of
+             * the buffer.  strtoll, strtol, and strtod will access the null
+             * byte at the end of the buffer because they are assuming a c-style
+             * string.
+             */
+            const char* const _buf;
+            const char* _input;
+            const char* const _input_end;
+    };
+
+} // namespace mongo
author	Andrew Morrow <acm@mongodb.com>	2015-04-30 11:49:52 -0400
committer	Andrew Morrow <acm@mongodb.com>	2015-05-06 15:47:21 -0400
commit	543ca54c22e13056b4f278e36b4c1b6436c2f1cb (patch)
tree	ce86697434bce1152c31df200341405e618472f8 /src/mongo/bson/json.h
parent	0ec9948134ca39df062d59e7eaa212100631ecac (diff)
download	mongo-543ca54c22e13056b4f278e36b4c1b6436c2f1cb.tar.gz