diff options
-rw-r--r-- | src/mongo/logv2/json_formatter.cpp | 29 | ||||
-rw-r--r-- | src/mongo/logv2/logv2_test.cpp | 49 | ||||
-rw-r--r-- | src/mongo/util/str_escape.cpp | 120 | ||||
-rw-r--r-- | src/mongo/util/str_escape.h | 74 |
4 files changed, 230 insertions, 42 deletions
diff --git a/src/mongo/logv2/json_formatter.cpp b/src/mongo/logv2/json_formatter.cpp index cbcfc85f121..58111121423 100644 --- a/src/mongo/logv2/json_formatter.cpp +++ b/src/mongo/logv2/json_formatter.cpp @@ -168,16 +168,27 @@ private: void storeQuoted(StringData name, const T& value) { format_to(std::back_inserter(_buffer), FMT_COMPILE(R"({}"{}":")"), _separator, name); std::size_t before = _buffer.size(); - str::escapeForJSON(_buffer, value); - if (_attributeMaxSize != 0) { + std::size_t wouldWrite = 0; + std::size_t written = 0; + str::escapeForJSON( + _buffer, value, _attributeMaxSize ? _attributeMaxSize : std::string::npos, &wouldWrite); + written = _buffer.size() - before; + + if (wouldWrite > written) { + // The bounded escape may have reached the limit and + // stopped writing while in the middle of a UTF-8 sequence, + // in which case the incomplete UTF-8 octets at the tail of the + // buffer have to be trimmed. + // Push a dummy byte so that the UTF-8 safe truncation + // will truncate back down to the correct size. 
+ _buffer.push_back('x'); auto truncatedEnd = - str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), _attributeMaxSize); - if (truncatedEnd != _buffer.end()) { - BSONObjBuilder truncationInfo = _truncated.subobjStart(name); - truncationInfo.append("type"_sd, typeName(BSONType::String)); - truncationInfo.append("size"_sd, static_cast<int64_t>(_buffer.size() - before)); - truncationInfo.done(); - } + str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), written); + + BSONObjBuilder truncationInfo = _truncated.subobjStart(name); + truncationInfo.append("type"_sd, typeName(BSONType::String)); + truncationInfo.append("size"_sd, static_cast<int64_t>(wouldWrite)); + truncationInfo.done(); _buffer.resize(truncatedEnd - _buffer.begin()); } diff --git a/src/mongo/logv2/logv2_test.cpp b/src/mongo/logv2/logv2_test.cpp index 91d9327f06a..fde66599224 100644 --- a/src/mongo/logv2/logv2_test.cpp +++ b/src/mongo/logv2/logv2_test.cpp @@ -64,6 +64,7 @@ #include "mongo/unittest/temp_dir.h" #include "mongo/unittest/unittest.h" #include "mongo/util/exit_code.h" +#include "mongo/util/str_escape.h" #include "mongo/util/string_map.h" #include "mongo/util/uuid.h" @@ -1568,6 +1569,54 @@ TEST_F(LogV2Test, JsonTruncation) { validateArrayTruncation(mongo::fromjson(lines.back())); } +TEST_F(LogV2Test, StringTruncation) { + const AtomicWord<int32_t> maxAttributeSizeKB(1); + auto lines = makeLineCapture(JSONFormatter(&maxAttributeSizeKB)); + + std::size_t maxLength = maxAttributeSizeKB.load() << 10; + std::string prefix(maxLength - 3, 'a'); + + struct TestCase { + std::string input; + std::string suffix; + std::string note; + }; + + TestCase tests[] = { + {prefix + "LMNOPQ", "LMN", "unescaped 1-byte octet"}, + // "\n\"NOPQ" expands to "\\n\\\"NOPQ" after escape, and the limit + // is reached at the 2nd '\\' octet, but since it splits the "\\\"" + // sequence, the actual truncation happens after the 'n' octet. 
+ {prefix + "\n\"NOPQ", "\n", "2-byte escape sequence"}, + // "L\vNOPQ" expands to "L\\u000bNOPQ" after escape, and the limit + // is reached at the 'u' octet, so the entire sequence is truncated. + {prefix + "L\vNOPQ", "L", "multi-byte escape sequence"}, + {prefix + "LM\xC3\xB1PQ", "LM", "2-byte UTF-8 sequence"}, + {prefix + "L\xE1\x9B\x8FPQ", "L", "3-byte UTF-8 sequence"}, + {prefix + "L\xF0\x90\x8C\xBCQ", "L", "4-byte UTF-8 sequence"}, + {prefix + "\xE1\x9B\x8E\xE1\x9B\x8F", "\xE1\x9B\x8E", "UTF-8 codepoint boundary"}, + // The invalid UTF-8 codepoint 0xC3 is replaced with "\\ufffd", and truncated entirely + {prefix + "L\xC3NOPQ", "L", "escaped invalid codepoint"}, + {std::string(maxLength, '\\'), "\\", "escaped backslash"}, + }; + + for (const auto& [input, suffix, note] : tests) { + LOGV2(6694001, "name", "name"_attr = input); + BSONObj obj = fromjson(lines.back()); + + auto str = obj[constants::kAttributesFieldName]["name"].checkAndGetStringData(); + std::string context = "Failed test: " + note; + + ASSERT_LTE(str.size(), maxLength) << context; + ASSERT(str.endsWith(suffix)) + << context << " - string " << str << " does not end with " << suffix; + + auto trunc = obj[constants::kTruncatedFieldName]["name"]; + ASSERT_EQUALS(trunc["type"].String(), typeName(BSONType::String)) << context; + ASSERT_EQUALS(trunc["size"].numberLong(), str::escapeForJSON(input).size()) << context; + } +} + TEST_F(LogV2Test, Threads) { auto linesPlain = makeLineCapture(PlainFormatter()); auto linesText = makeLineCapture(TextFormatter()); diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp index d191fb92252..c42a916bab0 100644 --- a/src/mongo/util/str_escape.cpp +++ b/src/mongo/util/str_escape.cpp @@ -37,45 +37,82 @@ namespace mongo::str { namespace { constexpr char kHexChar[] = "0123456789abcdef"; +// Appends the bytes in the range [begin, end) to the output buffer, +// which can either be a fmt::memory_buffer, or a std::string. 
+template <typename Buffer, typename Iterator> +void appendBuffer(Buffer& buffer, Iterator begin, Iterator end) { + buffer.append(begin, end); +} + // 'singleHandler' Function to write a valid single byte UTF-8 sequence with desired escaping. // 'invalidByteHandler' Function to write a byte of invalid UTF-8 encoding // 'twoEscaper' Function to write a valid two byte UTF-8 sequence with desired escaping, for C1 // control codes. +// 'maxLength' Max length to write into output buffer; A value of std::string::npos means unbounded. +// An escape sequence will not be written if appending the entire sequence will exceed this limit. +// 'wouldWrite' Output to contain the total bytes that would have been written to the buffer if no +// size limit is in place. +// // All these functions take a function object as their first parameter to perform the // writing of any escaped data. This function expects the number of handled bytes as its first // parameter and the corresponding escaped string as the second. They are templates so they can be // inlined. -template <typename SingleByteHandler, typename InvalidByteHandler, typename TwoByteEscaper> -void escape(fmt::memory_buffer& buffer, +template <typename Buffer, + typename SingleByteHandler, + typename InvalidByteHandler, + typename TwoByteEscaper> +void escape(Buffer& buffer, StringData str, SingleByteHandler singleHandler, InvalidByteHandler invalidByteHandler, - TwoByteEscaper twoEscaper) { - // The range [begin, it) contains input that does not need to be escaped and that has not been + TwoByteEscaper twoEscaper, + size_t maxLength, + size_t* wouldWrite) { + // The range [inFirst, it) contains input that does not need to be escaped and that has not been // written to output yet. - // The range [it end) contains remaining input to scan 'begin' is pointing to the beginning of - // the input that has not yet been written to 'escaped'. 
- // 'it' is pointing to the beginning of the unicode code point we're currently processing in the - // while-loop below. 'end' is the end of the input sequence. - auto begin = str.begin(); - auto it = str.begin(); - auto end = str.end(); + // The range [it, inLast) contains remaining input to scan. 'inFirst' is pointing to the + // beginning of the input that has not yet been written to 'escaped'. 'it' is pointing to the + // beginning of the unicode code point we're currently processing in the while-loop below. + // 'inLast' is the end of the input sequence. + auto inFirst = str.begin(); + auto inLast = str.end(); + auto it = inFirst; + size_t cap = maxLength; + size_t total = 0; // Writes an escaped sequence to output after flushing pending input that does not need to be // escaped. 'it' is assumed to be at the beginning of the input sequence represented by the // escaped data. // 'numHandled' the number of bytes of unescaped data being written escaped in 'escapeSequence' auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) { + // Appends the range [wFirst, wLast) to the output if the result is within the max length. + // 'canTruncate' controls the behavior if appending the entire range would exceed the limit. + // If true, this appends input up to the length limit. Otherwise, none is appended. 
+ auto boundedWrite = [&](auto wFirst, auto wLast, bool canTruncate) { + size_t len = std::distance(wFirst, wLast); + total += len; + if (maxLength != std::string::npos) { + if (len > cap) { + if (!canTruncate) { + cap = 0; + } + len = cap; + } + cap -= len; + } + appendBuffer(buffer, wFirst, wFirst + len); + }; + // Flush range of unmodified input - buffer.append(begin, it); - begin = it + numHandled; + boundedWrite(inFirst, it, true); + inFirst = it + numHandled; // Write escaped data - buffer.append(escapeSequence.rawData(), escapeSequence.rawData() + escapeSequence.size()); + boundedWrite(escapeSequence.begin(), escapeSequence.end(), false); }; auto isValidCodePoint = [&](auto pos, int len) { - return std::distance(pos, end) >= len && + return std::distance(pos, inLast) >= len && std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; }); }; @@ -98,7 +135,7 @@ void escape(fmt::memory_buffer& buffer, auto writeInvalid = [&](uint8_t c) { invalidByteHandler(flushAndWrite, c); }; - while (it != end) { + while (it != inLast) { uint8_t c = *it; bool bit7 = (c >> 7) & 1; if (MONGO_likely(!bit7)) { @@ -156,10 +193,15 @@ void escape(fmt::memory_buffer& buffer, } } // Write last block - buffer.append(begin, it); + flushAndWrite(0, {}); + if (wouldWrite) { + *wouldWrite = total; + } } } // namespace -void escapeForText(fmt::memory_buffer& buffer, StringData str) { + +template <typename Buffer> +void escapeForTextCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) { auto singleByteHandler = [](const auto& writer, uint8_t unescaped) { switch (unescaped) { case '\0': @@ -287,16 +329,26 @@ void escapeForText(fmt::memory_buffer& buffer, StringData str) { str, std::move(singleByteHandler), std::move(invalidByteHandler), - std::move(twoByteEscaper)); + std::move(twoByteEscaper), + maxLength, + wouldWrite); } -std::string escapeForText(StringData str) { - fmt::memory_buffer buffer; - escapeForText(buffer, str); - return 
fmt::to_string(buffer); +void escapeForText(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength, + size_t* wouldWrite) { + escapeForTextCommon(buffer, str, maxLength, wouldWrite); } -void escapeForJSON(fmt::memory_buffer& buffer, StringData str) { +std::string escapeForText(StringData str, size_t maxLength, size_t* wouldWrite) { + std::string buffer; + escapeForTextCommon(buffer, str, maxLength, wouldWrite); + return buffer; +} + +template <typename Buffer> +void escapeForJSONCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) { auto singleByteHandler = [](const auto& writer, uint8_t unescaped) { switch (unescaped) { case '\0': @@ -427,11 +479,21 @@ void escapeForJSON(fmt::memory_buffer& buffer, StringData str) { str, std::move(singleByteHandler), std::move(invalidByteHandler), - std::move(twoByteEscaper)); + std::move(twoByteEscaper), + maxLength, + wouldWrite); } -std::string escapeForJSON(StringData str) { - fmt::memory_buffer buffer; - escapeForJSON(buffer, str); - return fmt::to_string(buffer); + +void escapeForJSON(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength, + size_t* wouldWrite) { + escapeForJSONCommon(buffer, str, maxLength, wouldWrite); +} + +std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite) { + std::string buffer; + escapeForJSONCommon(buffer, str, maxLength, wouldWrite); + return buffer; } } // namespace mongo::str diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h index 47fe3d30060..2d82e5697cd 100644 --- a/src/mongo/util/str_escape.h +++ b/src/mongo/util/str_escape.h @@ -35,9 +35,75 @@ #include <string> namespace mongo::str { -void escapeForText(fmt::memory_buffer& buffer, StringData str); -std::string escapeForText(StringData str); -void escapeForJSON(fmt::memory_buffer& buffer, StringData str); -std::string escapeForJSON(StringData str); +/** + * Escapes the special characters in 'str' for use as printable text. 
+ * + * The backslash (`\`) character is escaped with another backslash, yielding the + * 2-character sequence {`\`, `\`}. + * + * The single-byte control characters (octets 0x00-0x1f, 0x7f) are generally escaped + * using the format "\xHH", where the 2 `H` characters are replaced by the 2 hex digits + * of the octet. For instance, the octet 0x7f would yield the sequence: {`\`, `x`, `7`, `f`}. + * Exemptions to this rule are the following octets, which are escaped using C-style escape + * sequences: + * 0x00 -> {`\`, `0`} + * 0x07 -> {`\`, `a`} + * 0x08 -> {`\`, `b`} + * 0x09 -> {`\`, `t`} + * 0x0a -> {`\`, `n`} + * 0x0b -> {`\`, `v`} + * 0x0c -> {`\`, `f`} + * 0x0d -> {`\`, `r`} + * 0x1b -> {`\`, `e`} + * + * The two-byte UTF-8 sequences between 0xC280 (U+0080) and 0xC29F (U+009F), inclusive, are + * also escaped as they are considered control characters. The escape sequence for these has + * the format: "\xC2\xHH", where the 2 `H` characters are replaced by the 2 hex digits of the + * second octet. + * + * Invalid bytes found are replaced with the escape sequence following the format: "\xHH", + * similar to how single-byte control characters are escaped. + * + * This writes the escaped output to 'buffer', and stops writing when either the output + * length reaches the 'maxLength', or if appending the next escape sequence will cause the + * output to exceed 'maxLength'. A 'maxLength' value of std::string::npos means unbounded. + * + * The 'wouldWrite' output is updated to contain the total bytes that would have been written + * if there was no length limit. + */ +void escapeForText(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); +std::string escapeForText(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); + +/** + * Escapes the special characters in 'str' for use in JSON. 
+ * + * This differs from escapeForText in that the double-quote character (`"`) is escaped + * with a backslash, yielding the 2-character sequence {`\`, `"`}. + * + * The general format of the escape sequences for single-byte control characters becomes + * "\u00HH", where the 2 `H` characters are replaced by the 2 hex digits of the octet. + * For example, the octet 0x7f would yield the sequence: {`\`, `u`, `0`, `0`, `7`, `f`}. + * The list of octets escaped using C-style escape sequences is also shortened to: + * 0x08 -> {`\`, `b`} + * 0x09 -> {`\`, `t`} + * 0x0a -> {`\`, `n`} + * 0x0c -> {`\`, `f`} + * 0x0d -> {`\`, `r`} + * For two-byte control characters, the format of the escape sequence becomes "\uc2HH", + * where the 2 `H` characters are replaced by the 2 hex digits of the second octet. + * Invalid bytes found are replaced with the sequence: "\ufffd". + */ +void escapeForJSON(fmt::memory_buffer& buffer, + StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); +std::string escapeForJSON(StringData str, + size_t maxLength = std::string::npos, + size_t* wouldWrite = nullptr); } // namespace mongo::str |