SERVER-66841 Fix LOGV2 invalid JSON when truncation happens at a backslash character

(cherry picked from commit b795776ac3c1599528a5772825f16f88ecb5b1c9)
author: Erwin Pe <erwin.pe@mongodb.com> 2022-08-09 15:17:40 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-09 16:03:43 +0000
commit: d56d5295aed750c2d9d585d78d1d224955b40a62 (patch)
tree: 416dc8fd7d470b1c1cba5a5224e88ea691118db9
parent: c6e439a9ab036e9f20003623f6d95d9864349ed7 (diff)
download: mongo-d56d5295aed750c2d9d585d78d1d224955b40a62.tar.gz
4 files changed, 230 insertions, 42 deletions
diff --git a/src/mongo/logv2/json_formatter.cpp b/src/mongo/logv2/json_formatter.cpp
index 2dc65831014..12e3a685e2c 100644
--- a/src/mongo/logv2/json_formatter.cpp
+++ b/src/mongo/logv2/json_formatter.cpp
@@ -168,16 +168,27 @@ private:
     void storeQuoted(StringData name, const T& value) {
         format_to(std::back_inserter(_buffer), FMT_COMPILE(R"({}"{}":")"), _separator, name);
         std::size_t before = _buffer.size();
-        str::escapeForJSON(_buffer, value);
-        if (_attributeMaxSize != 0) {
+        std::size_t wouldWrite = 0;
+        std::size_t written = 0;
+        str::escapeForJSON(
+            _buffer, value, _attributeMaxSize ? _attributeMaxSize : std::string::npos, &wouldWrite);
+        written = _buffer.size() - before;
+
+        if (wouldWrite > written) {
+            // The bounded escape may have reached the limit and
+            // stopped writing while in the middle of a UTF-8 sequence,
+            // in which case the incomplete UTF-8 octets at the tail of the
+            // buffer have to be trimmed.
+            // Push a dummy byte so that the UTF-8 safe truncation
+            // will truncate back down to the correct size.
+            _buffer.push_back('x');
             auto truncatedEnd =
-                str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), _attributeMaxSize);
-            if (truncatedEnd != _buffer.end()) {
-                BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
-                truncationInfo.append("type"_sd, typeName(BSONType::String));
-                truncationInfo.append("size"_sd, static_cast<int64_t>(_buffer.size() - before));
-                truncationInfo.done();
-            }
+                str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), written);
+
+            BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
+            truncationInfo.append("type"_sd, typeName(BSONType::String));
+            truncationInfo.append("size"_sd, static_cast<int64_t>(wouldWrite));
+            truncationInfo.done();
 
             _buffer.resize(truncatedEnd - _buffer.begin());
         }
diff --git a/src/mongo/logv2/logv2_test.cpp b/src/mongo/logv2/logv2_test.cpp
index ac1ab0d25ea..a20f202b3e2 100644
--- a/src/mongo/logv2/logv2_test.cpp
+++ b/src/mongo/logv2/logv2_test.cpp
@@ -60,6 +60,7 @@
 #include "mongo/stdx/thread.h"
 #include "mongo/unittest/temp_dir.h"
 #include "mongo/unittest/unittest.h"
+#include "mongo/util/str_escape.h"
 #include "mongo/util/string_map.h"
 #include "mongo/util/uuid.h"
 
@@ -1474,6 +1475,54 @@ TEST_F(LogV2Test, JsonTruncation) {
     validateArrayTruncation(mongo::fromjson(lines.back()));
 }
 
+TEST_F(LogV2Test, StringTruncation) {
+    const AtomicWord<int32_t> maxAttributeSizeKB(1);
+    auto lines = makeLineCapture(JSONFormatter(&maxAttributeSizeKB));
+
+    std::size_t maxLength = maxAttributeSizeKB.load() << 10;
+    std::string prefix(maxLength - 3, 'a');
+
+    struct TestCase {
+        std::string input;
+        std::string suffix;
+        std::string note;
+    };
+
+    TestCase tests[] = {
+        {prefix + "LMNOPQ", "LMN", "unescaped 1-byte octet"},
+        // "\n\"NOPQ" expands to "\\n\\\"NOPQ" after escape, and the limit
+        // is reached at the 2nd '\\' octet, but since it splits the "\\\""
+        // sequence, the actual truncation happens after the 'n' octet.
+        {prefix + "\n\"NOPQ", "\n", "2-byte escape sequence"},
+        // "L\vNOPQ" expands to "L\\u000bNOPQ" after escape, and the limit
+        // is reached at the 'u' octet, so the entire sequence is truncated.
+        {prefix + "L\vNOPQ", "L", "multi-byte escape sequence"},
+        {prefix + "LM\xC3\xB1PQ", "LM", "2-byte UTF-8 sequence"},
+        {prefix + "L\xE1\x9B\x8FPQ", "L", "3-byte UTF-8 sequence"},
+        {prefix + "L\xF0\x90\x8C\xBCQ", "L", "4-byte UTF-8 sequence"},
+        {prefix + "\xE1\x9B\x8E\xE1\x9B\x8F", "\xE1\x9B\x8E", "UTF-8 codepoint boundary"},
+        // Invalid byte 0xC3 is escaped as "\\ufffd", which cannot fit and so is dropped entirely
+        {prefix + "L\xC3NOPQ", "L", "escaped invalid codepoint"},
+        {std::string(maxLength, '\\'), "\\", "escaped backslash"},
+    };
+
+    for (const auto& [input, suffix, note] : tests) {
+        LOGV2(6694001, "name", "name"_attr = input);
+        BSONObj obj = fromjson(lines.back());
+
+        auto str = obj[constants::kAttributesFieldName]["name"].checkAndGetStringData();
+        std::string context = "Failed test: " + note;
+
+        ASSERT_LTE(str.size(), maxLength) << context;
+        ASSERT(str.endsWith(suffix))
+            << context << " - string " << str << " does not end with " << suffix;
+
+        auto trunc = obj[constants::kTruncatedFieldName]["name"];
+        ASSERT_EQUALS(trunc["type"].String(), typeName(BSONType::String)) << context;
+        ASSERT_EQUALS(trunc["size"].numberLong(), str::escapeForJSON(input).size()) << context;
+    }
+}
+
 TEST_F(LogV2Test, Threads) {
     auto linesPlain = makeLineCapture(PlainFormatter());
     auto linesText = makeLineCapture(TextFormatter());
diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp
index d191fb92252..c42a916bab0 100644
--- a/src/mongo/util/str_escape.cpp
+++ b/src/mongo/util/str_escape.cpp
@@ -37,45 +37,82 @@ namespace mongo::str {
 namespace {
 constexpr char kHexChar[] = "0123456789abcdef";
 
+// Appends the bytes in the range [begin, end) to the output buffer,
+// which can either be a fmt::memory_buffer, or a std::string.
+template <typename Buffer, typename Iterator>
+void appendBuffer(Buffer& buffer, Iterator begin, Iterator end) {
+    buffer.append(begin, end);
+}
+
 // 'singleHandler' Function to write a valid single byte UTF-8 sequence with desired escaping.
 // 'invalidByteHandler' Function to write a byte of invalid UTF-8 encoding
 // 'twoEscaper' Function to write a valid two byte UTF-8 sequence with desired escaping, for C1
 // control codes.
+// 'maxLength' Max length to write into output buffer; a value of std::string::npos means unbounded.
+// An escape sequence will not be written if appending the entire sequence would exceed this limit.
+// 'wouldWrite' Optional output (may be null) set to the total bytes that would have been written
+// to the buffer if no size limit were in place.
+//
 // All these functions take a function object as their first parameter to perform the
 // writing of any escaped data. This function expects the number of handled bytes as its first
 // parameter and the corresponding escaped string as the second. They are templates to they can be
 // inlined.
-template <typename SingleByteHandler, typename InvalidByteHandler, typename TwoByteEscaper>
-void escape(fmt::memory_buffer& buffer,
+template <typename Buffer,
+          typename SingleByteHandler,
+          typename InvalidByteHandler,
+          typename TwoByteEscaper>
+void escape(Buffer& buffer,
             StringData str,
             SingleByteHandler singleHandler,
             InvalidByteHandler invalidByteHandler,
-            TwoByteEscaper twoEscaper) {
-    // The range [begin, it) contains input that does not need to be escaped and that has not been
+            TwoByteEscaper twoEscaper,
+            size_t maxLength,
+            size_t* wouldWrite) {
+    // The range [inFirst, it) contains input that does not need to be escaped and that has not been
     // written to output yet.
-    // The range [it end) contains remaining input to scan 'begin' is pointing to the beginning of
-    // the input that has not yet been written to 'escaped'.
-    // 'it' is pointing to the beginning of the unicode code point we're currently processing in the
-    // while-loop below. 'end' is the end of the input sequence.
-    auto begin = str.begin();
-    auto it = str.begin();
-    auto end = str.end();
+    // The range [it, inLast) contains remaining input to scan. 'inFirst' is pointing to the
+    // beginning of the input that has not yet been written to 'escaped'. 'it' is pointing to the
+    // beginning of the unicode code point we're currently processing in the while-loop below.
+    // 'inLast' is the end of the input sequence.
+    auto inFirst = str.begin();
+    auto inLast = str.end();
+    auto it = inFirst;
+    size_t cap = maxLength;
+    size_t total = 0;
 
     // Writes an escaped sequence to output after flushing pending input that does not need to be
     // escaped. 'it' is assumed to be at the beginning of the input sequence represented by the
     // escaped data.
     // 'numHandled' the number of bytes of unescaped data being written escaped in 'escapeSequence'
     auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
+        // Appends [wFirst, wLast) to the output while always adding its full length to 'total'.
+        // If the range exceeds the remaining capacity: 'canTruncate' == true appends only what
+        // fits; false appends nothing. Either way the capacity drops to 0, ending all output.
+        auto boundedWrite = [&](auto wFirst, auto wLast, bool canTruncate) {
+            size_t len = std::distance(wFirst, wLast);
+            total += len;
+            if (maxLength != std::string::npos) {
+                if (len > cap) {
+                    if (!canTruncate) {
+                        cap = 0;
+                    }
+                    len = cap;
+                }
+                cap -= len;
+            }
+            appendBuffer(buffer, wFirst, wFirst + len);
+        };
+
         // Flush range of unmodified input
-        buffer.append(begin, it);
-        begin = it + numHandled;
+        boundedWrite(inFirst, it, true);
+        inFirst = it + numHandled;
 
         // Write escaped data
-        buffer.append(escapeSequence.rawData(), escapeSequence.rawData() + escapeSequence.size());
+        boundedWrite(escapeSequence.begin(), escapeSequence.end(), false);
     };
 
     auto isValidCodePoint = [&](auto pos, int len) {
-        return std::distance(pos, end) >= len &&
+        return std::distance(pos, inLast) >= len &&
             std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; });
     };
 
@@ -98,7 +135,7 @@ void escape(fmt::memory_buffer& buffer,
     auto writeInvalid = [&](uint8_t c) { invalidByteHandler(flushAndWrite, c); };
 
 
-    while (it != end) {
+    while (it != inLast) {
         uint8_t c = *it;
         bool bit7 = (c >> 7) & 1;
         if (MONGO_likely(!bit7)) {
@@ -156,10 +193,15 @@ void escape(fmt::memory_buffer& buffer,
         }
     }
     // Write last block
-    buffer.append(begin, it);
+    flushAndWrite(0, {});
+    if (wouldWrite) {
+        *wouldWrite = total;
+    }
 }
 }  // namespace
-void escapeForText(fmt::memory_buffer& buffer, StringData str) {
+
+template <typename Buffer>
+void escapeForTextCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) {
     auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
         switch (unescaped) {
             case '\0':
@@ -287,16 +329,26 @@ void escapeForText(fmt::memory_buffer& buffer, StringData str) {
                   str,
                   std::move(singleByteHandler),
                   std::move(invalidByteHandler),
-                  std::move(twoByteEscaper));
+                  std::move(twoByteEscaper),
+                  maxLength,
+                  wouldWrite);
 }
 
-std::string escapeForText(StringData str) {
-    fmt::memory_buffer buffer;
-    escapeForText(buffer, str);
-    return fmt::to_string(buffer);
+void escapeForText(fmt::memory_buffer& buffer,
+                   StringData str,
+                   size_t maxLength,
+                   size_t* wouldWrite) {
+    escapeForTextCommon(buffer, str, maxLength, wouldWrite);
 }
 
-void escapeForJSON(fmt::memory_buffer& buffer, StringData str) {
+std::string escapeForText(StringData str, size_t maxLength, size_t* wouldWrite) {
+    std::string buffer;
+    escapeForTextCommon(buffer, str, maxLength, wouldWrite);
+    return buffer;
+}
+
+template <typename Buffer>
+void escapeForJSONCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) {
     auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
         switch (unescaped) {
             case '\0':
@@ -427,11 +479,21 @@ void escapeForJSON(fmt::memory_buffer& buffer, StringData str) {
                   str,
                   std::move(singleByteHandler),
                   std::move(invalidByteHandler),
-                  std::move(twoByteEscaper));
+                  std::move(twoByteEscaper),
+                  maxLength,
+                  wouldWrite);
 }
-std::string escapeForJSON(StringData str) {
-    fmt::memory_buffer buffer;
-    escapeForJSON(buffer, str);
-    return fmt::to_string(buffer);
+
+void escapeForJSON(fmt::memory_buffer& buffer,
+                   StringData str,
+                   size_t maxLength,
+                   size_t* wouldWrite) {
+    escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
+}
+
+std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite) {
+    std::string buffer;
+    escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
+    return buffer;
 }
 }  // namespace mongo::str
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h
index 47fe3d30060..2d82e5697cd 100644
--- a/src/mongo/util/str_escape.h
+++ b/src/mongo/util/str_escape.h
@@ -35,9 +35,75 @@
 #include <string>
 
 namespace mongo::str {
-void escapeForText(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForText(StringData str);
 
-void escapeForJSON(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForJSON(StringData str);
+/**
+ * Escapes the special characters in 'str' for use as printable text.
+ *
+ * The backslash (`\`) character is escaped with another backslash, yielding the
+ * 2-character sequence {`\`, `\`}.
+ *
+ * The single-byte control characters (octets 0x00-0x1f, 0x7f) are generally escaped
+ * using the format "\xHH", where the 2 `H` characters are replaced by the 2 hex digits
+ * of the octet. For instance, the octet 0x7f would yield the sequence: {`\`, `x`, `7`, `f`}.
+ * Exceptions to this rule are the following octets, which are escaped using C-style escape
+ * sequences:
+ *   0x00  ->  {`\`, `0`}
+ *   0x07  ->  {`\`, `a`}
+ *   0x08  ->  {`\`, `b`}
+ *   0x09  ->  {`\`, `t`}
+ *   0x0a  ->  {`\`, `n`}
+ *   0x0b  ->  {`\`, `v`}
+ *   0x0c  ->  {`\`, `f`}
+ *   0x0d  ->  {`\`, `r`}
+ *   0x1b  ->  {`\`, `e`}
+ *
+ * The two-byte UTF-8 sequences between 0xC280 (U+0080) and 0xC29F (U+009F), inclusive, are
+ * also escaped as they are considered control characters. The escape sequence for these has
+ * the format: "\xC2\xHH", where the 2 `H` characters are replaced by the 2 hex digits of the
+ * second octet.
+ *
+ * Invalid bytes found are replaced with the escape sequence following the format: "\xHH",
+ * similar to how single-byte control characters are escaped.
+ *
+ * This writes the escaped output to 'buffer', and stops writing when either the output
+ * length reaches 'maxLength', or appending the next escape sequence would cause the
+ * output to exceed 'maxLength'. A 'maxLength' value of std::string::npos means unbounded.
+ *
+ * If 'wouldWrite' is non-null, it is set to the total number of bytes that would have been
+ * written had there been no length limit.
+ */
+void escapeForText(fmt::memory_buffer& buffer,
+                   StringData str,
+                   size_t maxLength = std::string::npos,
+                   size_t* wouldWrite = nullptr);
+std::string escapeForText(StringData str,
+                          size_t maxLength = std::string::npos,
+                          size_t* wouldWrite = nullptr);
+
+/**
+ * Escapes the special characters in 'str' for use in JSON.
+ *
+ * This differs from escapeForText in that the double-quote character (`"`) is escaped
+ * with a backslash, yielding the 2-character sequence {`\`, `"`}.
+ *
+ * The general format of the escape sequences for single-byte control characters becomes
+ * "\u00HH", where the 2 `H` characters are replaced by the 2 hex digits of the octet.
+ * For example, the octet 0x7f would yield the sequence: {`\`, `u`, `0`, `0`, `7`, `f`}.
+ * The list of octets escaped using C-style escape sequences is also shortened to:
+ *   0x08  ->  {`\`, `b`}
+ *   0x09  ->  {`\`, `t`}
+ *   0x0a  ->  {`\`, `n`}
+ *   0x0c  ->  {`\`, `f`}
+ *   0x0d  ->  {`\`, `r`}
+ * For two-byte control characters, the format of the escape sequence becomes "\uc2HH",
+ * where the 2 `H` characters are replaced by the 2 hex digits of the second octet.
+ * Invalid bytes found are replaced with the sequence: "\ufffd".
+ */
+void escapeForJSON(fmt::memory_buffer& buffer,
+                   StringData str,
+                   size_t maxLength = std::string::npos,
+                   size_t* wouldWrite = nullptr);
+std::string escapeForJSON(StringData str,
+                          size_t maxLength = std::string::npos,
+                          size_t* wouldWrite = nullptr);
 }  // namespace mongo::str
author	Erwin Pe <erwin.pe@mongodb.com>	2022-08-09 15:17:40 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-08-09 16:03:43 +0000
commit	d56d5295aed750c2d9d585d78d1d224955b40a62 (patch)
tree	416dc8fd7d470b1c1cba5a5224e88ea691118db9
parent	c6e439a9ab036e9f20003623f6d95d9864349ed7 (diff)
download	mongo-d56d5295aed750c2d9d585d78d1d224955b40a62.tar.gz