diff options
Diffstat (limited to 'src/mongo/util/str_escape.h')
-rw-r--r-- | src/mongo/util/str_escape.h | 74 |
1 files changed, 70 insertions, 4 deletions
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h index 47fe3d30060..2d82e5697cd 100644 --- a/src/mongo/util/str_escape.h +++ b/src/mongo/util/str_escape.h @@ -35,9 +35,75 @@ #include <string> namespace mongo::str { -void escapeForText(fmt::memory_buffer& buffer, StringData str); -std::string escapeForText(StringData str); -void escapeForJSON(fmt::memory_buffer& buffer, StringData str); -std::string escapeForJSON(StringData str); +/** + * Escapes the special characters in 'str' for use as printable text. + * + * The backslash (`\`) character is escaped with another backslash, yielding the + * 2-character sequence {`\`, `\`}. + * + * The single-byte control characters (octets 0x00-0x1f, 0x7f) are generally escaped + * using the format "\xHH", where the 2 `H` characters are replaced by the 2 hex digits + * of the octet. For instance, the octet 0x7f would yield the sequence: {`\`, `x`, `7`, `f`}. + * Exceptions to this rule are the following octets, which are escaped using C-style escape + * sequences: + * 0x00 -> {`\`, `0`} + * 0x07 -> {`\`, `a`} + * 0x08 -> {`\`, `b`} + * 0x09 -> {`\`, `t`} + * 0x0a -> {`\`, `n`} + * 0x0b -> {`\`, `v`} + * 0x0c -> {`\`, `f`} + * 0x0d -> {`\`, `r`} + * 0x1b -> {`\`, `e`} + * + * The two-byte UTF-8 sequences between 0xC280 (U+0080) and 0xC29F (U+009F), inclusive, are + * also escaped as they are considered control characters. The escape sequence for these has + * the format: "\xC2\xHH", where the 2 `H` characters are replaced by the 2 hex digits of the + * second octet. + * + * Invalid bytes found are replaced with the escape sequence following the format: "\xHH", + * similar to how single-byte control characters are escaped. + * + * This writes the escaped output to 'buffer', and stops writing either when the output + * length reaches 'maxLength' or when appending the next escape sequence would cause the + * output to exceed 'maxLength'. A 'maxLength' value of std::string::npos means unbounded. 
+ * + * The optional 'wouldWrite' output is updated to contain the total number of bytes that would + * have been written if there were no length limit. + */ +void escapeForText(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); +std::string escapeForText(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); + +/** + * Escapes the special characters in 'str' for use in JSON. + * + * This differs from escapeForText in that the double-quote character (`"`) is escaped + * with a backslash, yielding the 2-character sequence {`\`, `"`}. + * + * The general format of the escape sequences for single-byte control characters becomes + * "\u00HH", where the 2 `H` characters are replaced by the 2 hex digits of the octet. + * For example, the octet 0x7f would yield the sequence: {`\`, `u`, `0`, `0`, `7`, `f`}. + * The list of octets escaped using C-style escape sequences is also shortened to: + * 0x08 -> {`\`, `b`} + * 0x09 -> {`\`, `t`} + * 0x0a -> {`\`, `n`} + * 0x0c -> {`\`, `f`} + * 0x0d -> {`\`, `r`} + * For two-byte control characters, the format of the escape sequence becomes "\uc2HH", + * where the 2 `H` characters are replaced by the 2 hex digits of the second octet. + * Invalid bytes found are replaced with the sequence: "\ufffd" (the Unicode replacement character). + */ +void escapeForJSON(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); +std::string escapeForJSON(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); } // namespace mongo::str |