summaryrefslogtreecommitdiff
path: root/src/mongo/util/str_escape.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/mongo/util/str_escape.h')
-rw-r--r--src/mongo/util/str_escape.h74
1 files changed, 70 insertions, 4 deletions
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h
index 47fe3d30060..2d82e5697cd 100644
--- a/src/mongo/util/str_escape.h
+++ b/src/mongo/util/str_escape.h
@@ -35,9 +35,75 @@
#include <string>
namespace mongo::str {
-void escapeForText(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForText(StringData str);
-void escapeForJSON(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForJSON(StringData str);
+/**
+ * Escapes the special characters in 'str' for use as printable text.
+ *
+ * The backslash (`\`) character is escaped with another backslash, yielding the
+ * 2-character sequence {`\`, `\`}.
+ *
+ * The single-byte control characters (octets 0x00-0x1f, 0x7f) are generally escaped
+ * using the format "\xHH", where the 2 `H` characters are replaced by the 2 hex digits
+ * of the octet. For instance, the octet 0x7f would yield the sequence: {`\`, `x`, `7`, `f`}.
+ * Exemptions to this rule are the following octets, which are escaped using C-style escape
+ * sequences:
+ * 0x00 -> {`\`, `0`}
+ * 0x07 -> {`\`, `a`}
+ * 0x08 -> {`\`, `b`}
+ * 0x09 -> {`\`, `t`}
+ * 0x0a -> {`\`, `n`}
+ * 0x0b -> {`\`, `v`}
+ * 0x0c -> {`\`, `f`}
+ * 0x0d -> {`\`, `r`}
+ * 0x1b -> {`\`, `e`}
+ *
+ * The two-byte UTF-8 sequences between 0xC280 (U+0080) and 0xC29F (U+009F), inclusive, are
+ * also escaped as they are considered control characters. The escape sequence for these has
+ * the format: "\xC2\xHH", where the 2 `H` characters are replaced by the 2 hex digits of the
+ * second octet.
+ *
+ * Invalid bytes found are replaced with the escape sequence following the format: "\xHH",
+ * similar to how single-byte control characters are escaped.
+ *
+ * This writes the escaped output to 'buffer', and stops writing when either the output
+ * length reaches the 'maxLength', or if appending the next escape sequence will cause the
+ * output to exceed 'maxLength'. A 'maxLength' value of std::string::npos means unbounded.
+ *
+ * The 'wouldWrite' output is updated to contain the total bytes that would have been written
+ * if there was no length limit.
+ */
+void escapeForText(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+std::string escapeForText(StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+
+/**
+ * Escapes the special characters in 'str' for use in JSON.
+ *
+ * This differs from escapeForText in that the double-quote character (`"`) is escaped
+ * with a backslash, yielding the 2-character sequence {`\`, `"`}.
+ *
+ * The general format of the escape sequences for single-byte control characters becomes
+ * "\u00HH", where the 2 `H` characters are replaced by the 2 hex digits of the octet.
+ * For example, the octet 0x7f would yield the sequence: {`\`, `u`, `0`, `0`, `7`, `f`}.
+ * The list of octets escaped using C-style escape sequences is also shortened to:
+ * 0x08 -> {`\`, `b`}
+ * 0x09 -> {`\`, `t`}
+ * 0x0a -> {`\`, `n`}
+ * 0x0c -> {`\`, `f`}
+ * 0x0d -> {`\`, `r`}
+ * For two-byte control characters, the format of the escape sequence becomes "\uc2HH",
+ * where the 2 `H` characters are replaced by the 2 hex digits of the second octet.
+ * Invalid bytes found are replaced with the sequence: "\ufffd".
+ */
+void escapeForJSON(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+std::string escapeForJSON(StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
} // namespace mongo::str