From b795776ac3c1599528a5772825f16f88ecb5b1c9 Mon Sep 17 00:00:00 2001
From: Erwin Pe <erwin.pe@mongodb.com>
Date: Mon, 1 Aug 2022 14:17:53 +0000
Subject: SERVER-66841 Fix LOGV2 invalid JSON when truncation happens at a
 backslash character

---
 src/mongo/logv2/logv2_test.cpp | 49 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'src/mongo/logv2/logv2_test.cpp')

diff --git a/src/mongo/logv2/logv2_test.cpp b/src/mongo/logv2/logv2_test.cpp
index 91d9327f06a..fde66599224 100644
--- a/src/mongo/logv2/logv2_test.cpp
+++ b/src/mongo/logv2/logv2_test.cpp
@@ -64,6 +64,7 @@
 #include "mongo/unittest/temp_dir.h"
 #include "mongo/unittest/unittest.h"
 #include "mongo/util/exit_code.h"
+#include "mongo/util/str_escape.h"
 #include "mongo/util/string_map.h"
 #include "mongo/util/uuid.h"
 
@@ -1568,6 +1569,54 @@ TEST_F(LogV2Test, JsonTruncation) {
     validateArrayTruncation(mongo::fromjson(lines.back()));
 }
 
+TEST_F(LogV2Test, StringTruncation) {
+    const AtomicWord<int32_t> maxAttributeSizeKB(1);
+    auto lines = makeLineCapture(JSONFormatter(&maxAttributeSizeKB));
+
+    std::size_t maxLength = maxAttributeSizeKB.load() << 10;
+    std::string prefix(maxLength - 3, 'a');
+
+    struct TestCase {
+        std::string input;
+        std::string suffix;
+        std::string note;
+    };
+
+    TestCase tests[] = {
+        {prefix + "LMNOPQ", "LMN", "unescaped 1-byte octet"},
+        // "\n\"NOPQ" expands to "\\n\\\"NOPQ" after escape, and the limit
+        // is reached at the 2nd '\\' octet, but since it splits the "\\\""
+        // sequence, the actual truncation happens after the 'n' octet.
+        {prefix + "\n\"NOPQ", "\n", "2-byte escape sequence"},
+        // "L\vNOPQ" expands to "L\\u000bNOPQ" after escape, and the limit
+        // is reached at the 'u' octet, so the entire sequence is truncated.
+        {prefix + "L\vNOPQ", "L", "multi-byte escape sequence"},
+        {prefix + "LM\xC3\xB1PQ", "LM", "2-byte UTF-8 sequence"},
+        {prefix + "L\xE1\x9B\x8FPQ", "L", "3-byte UTF-8 sequence"},
+        {prefix + "L\xF0\x90\x8C\xBCQ", "L", "4-byte UTF-8 sequence"},
+        {prefix + "\xE1\x9B\x8E\xE1\x9B\x8F", "\xE1\x9B\x8E", "UTF-8 codepoint boundary"},
+        // The invalid UTF-8 codepoint 0xC3 is replaced with "\\ufffd", and truncated entirely
+        {prefix + "L\xC3NOPQ", "L", "escaped invalid codepoint"},
+        {std::string(maxLength, '\\'), "\\", "escaped backslash"},
+    };
+
+    for (const auto& [input, suffix, note] : tests) {
+        LOGV2(6694001, "name", "name"_attr = input);
+        BSONObj obj = fromjson(lines.back());
+
+        auto str = obj[constants::kAttributesFieldName]["name"].checkAndGetStringData();
+        std::string context = "Failed test: " + note;
+
+        ASSERT_LTE(str.size(), maxLength) << context;
+        ASSERT(str.endsWith(suffix))
+            << context << " - string " << str << " does not end with " << suffix;
+
+        auto trunc = obj[constants::kTruncatedFieldName]["name"];
+        ASSERT_EQUALS(trunc["type"].String(), typeName(BSONType::String)) << context;
+        ASSERT_EQUALS(trunc["size"].numberLong(), str::escapeForJSON(input).size()) << context;
+    }
+}
+
 TEST_F(LogV2Test, Threads) {
     auto linesPlain = makeLineCapture(PlainFormatter());
     auto linesText = makeLineCapture(TextFormatter());
-- 
cgit v1.2.1