SERVER-66841 Fix LOGV2 invalid JSON when truncation happens at a backslash character

author: Erwin Pe <erwin.pe@mongodb.com> 2022-08-01 14:17:53 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-01 15:01:29 +0000
commit: b795776ac3c1599528a5772825f16f88ecb5b1c9 (patch)
tree: b5759670113e3655f80a2f3b1673aabd6087ff94 /src/mongo/logv2
parent: 9a775c3090716d55f3a5aa0d5027fad7af32acd0 (diff)
download: mongo-b795776ac3c1599528a5772825f16f88ecb5b1c9.tar.gz
2 files changed, 69 insertions, 9 deletions
diff --git a/src/mongo/logv2/json_formatter.cpp b/src/mongo/logv2/json_formatter.cpp
index cbcfc85f121..58111121423 100644
--- a/src/mongo/logv2/json_formatter.cpp
+++ b/src/mongo/logv2/json_formatter.cpp
@@ -168,16 +168,27 @@ private:
     void storeQuoted(StringData name, const T& value) {
         format_to(std::back_inserter(_buffer), FMT_COMPILE(R"({}"{}":")"), _separator, name);
         std::size_t before = _buffer.size();
-        str::escapeForJSON(_buffer, value);
-        if (_attributeMaxSize != 0) {
+        std::size_t wouldWrite = 0;
+        std::size_t written = 0;
+        str::escapeForJSON(
+            _buffer, value, _attributeMaxSize ? _attributeMaxSize : std::string::npos, &wouldWrite);
+        written = _buffer.size() - before;
+
+        if (wouldWrite > written) {
+            // The bounded escape may have reached the limit and
+            // stopped writing while in the middle of a UTF-8 sequence,
+            // in which case the incomplete UTF-8 octets at the tail of the
+            // buffer have to be trimmed.
+            // Push a dummy byte so that the UTF-8 safe truncation
+            // will truncate back down to the correct size.
+            _buffer.push_back('x');
             auto truncatedEnd =
-                str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), _attributeMaxSize);
-            if (truncatedEnd != _buffer.end()) {
-                BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
-                truncationInfo.append("type"_sd, typeName(BSONType::String));
-                truncationInfo.append("size"_sd, static_cast<int64_t>(_buffer.size() - before));
-                truncationInfo.done();
-            }
+                str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), written);
+
+            BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
+            truncationInfo.append("type"_sd, typeName(BSONType::String));
+            truncationInfo.append("size"_sd, static_cast<int64_t>(wouldWrite));
+            truncationInfo.done();
 
             _buffer.resize(truncatedEnd - _buffer.begin());
         }
diff --git a/src/mongo/logv2/logv2_test.cpp b/src/mongo/logv2/logv2_test.cpp
index 91d9327f06a..fde66599224 100644
--- a/src/mongo/logv2/logv2_test.cpp
+++ b/src/mongo/logv2/logv2_test.cpp
@@ -64,6 +64,7 @@
 #include "mongo/unittest/temp_dir.h"
 #include "mongo/unittest/unittest.h"
 #include "mongo/util/exit_code.h"
+#include "mongo/util/str_escape.h"
 #include "mongo/util/string_map.h"
 #include "mongo/util/uuid.h"
 
@@ -1568,6 +1569,54 @@ TEST_F(LogV2Test, JsonTruncation) {
     validateArrayTruncation(mongo::fromjson(lines.back()));
 }
 
+TEST_F(LogV2Test, StringTruncation) {
+    const AtomicWord<int32_t> maxAttributeSizeKB(1);
+    auto lines = makeLineCapture(JSONFormatter(&maxAttributeSizeKB));
+
+    std::size_t maxLength = maxAttributeSizeKB.load() << 10;
+    std::string prefix(maxLength - 3, 'a');
+
+    struct TestCase {
+        std::string input;
+        std::string suffix;
+        std::string note;
+    };
+
+    TestCase tests[] = {
+        {prefix + "LMNOPQ", "LMN", "unescaped 1-byte octet"},
+        // "\n\"NOPQ" expands to "\\n\\\"NOPQ" after escape, and the limit
+        // is reached at the 2nd '\\' octet, but since it splits the "\\\""
+        // sequence, the actual truncation happens after the 'n' octet.
+        {prefix + "\n\"NOPQ", "\n", "2-byte escape sequence"},
+        // "L\vNOPQ" expands to "L\\u000bNOPQ" after escape, and the limit
+        // is reached at the 'u' octet, so the entire sequence is truncated.
+        {prefix + "L\vNOPQ", "L", "multi-byte escape sequence"},
+        {prefix + "LM\xC3\xB1PQ", "LM", "2-byte UTF-8 sequence"},
+        {prefix + "L\xE1\x9B\x8FPQ", "L", "3-byte UTF-8 sequence"},
+        {prefix + "L\xF0\x90\x8C\xBCQ", "L", "4-byte UTF-8 sequence"},
+        {prefix + "\xE1\x9B\x8E\xE1\x9B\x8F", "\xE1\x9B\x8E", "UTF-8 codepoint boundary"},
+        // The invalid UTF-8 codepoint 0xC3 is replaced with "\\ufffd", and truncated entirely
+        {prefix + "L\xC3NOPQ", "L", "escaped invalid codepoint"},
+        {std::string(maxLength, '\\'), "\\", "escaped backslash"},
+    };
+
+    for (const auto& [input, suffix, note] : tests) {
+        LOGV2(6694001, "name", "name"_attr = input);
+        BSONObj obj = fromjson(lines.back());
+
+        auto str = obj[constants::kAttributesFieldName]["name"].checkAndGetStringData();
+        std::string context = "Failed test: " + note;
+
+        ASSERT_LTE(str.size(), maxLength) << context;
+        ASSERT(str.endsWith(suffix))
+            << context << " - string " << str << " does not end with " << suffix;
+
+        auto trunc = obj[constants::kTruncatedFieldName]["name"];
+        ASSERT_EQUALS(trunc["type"].String(), typeName(BSONType::String)) << context;
+        ASSERT_EQUALS(trunc["size"].numberLong(), str::escapeForJSON(input).size()) << context;
+    }
+}
+
 TEST_F(LogV2Test, Threads) {
     auto linesPlain = makeLineCapture(PlainFormatter());
     auto linesText = makeLineCapture(TextFormatter());
author	Erwin Pe <erwin.pe@mongodb.com>	2022-08-01 14:17:53 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2022-08-01 15:01:29 +0000
commit	b795776ac3c1599528a5772825f16f88ecb5b1c9 (patch)
tree	b5759670113e3655f80a2f3b1673aabd6087ff94 /src/mongo/logv2
parent	9a775c3090716d55f3a5aa0d5027fad7af32acd0 (diff)
download	mongo-b795776ac3c1599528a5772825f16f88ecb5b1c9.tar.gz