summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Erwin Pe <erwin.pe@mongodb.com> 2022-08-09 15:17:40 +0000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2022-08-09 16:03:43 +0000
commit d56d5295aed750c2d9d585d78d1d224955b40a62 (patch)
tree 416dc8fd7d470b1c1cba5a5224e88ea691118db9
parent c6e439a9ab036e9f20003623f6d95d9864349ed7 (diff)
download mongo-d56d5295aed750c2d9d585d78d1d224955b40a62.tar.gz
SERVER-66841 Fix LOGV2 invalid JSON when truncation happens at a backslash character
(cherry picked from commit b795776ac3c1599528a5772825f16f88ecb5b1c9)
-rw-r--r-- src/mongo/logv2/json_formatter.cpp 29
-rw-r--r-- src/mongo/logv2/logv2_test.cpp 49
-rw-r--r-- src/mongo/util/str_escape.cpp 120
-rw-r--r-- src/mongo/util/str_escape.h 74
4 files changed, 230 insertions, 42 deletions
diff --git a/src/mongo/logv2/json_formatter.cpp b/src/mongo/logv2/json_formatter.cpp
index 2dc65831014..12e3a685e2c 100644
--- a/src/mongo/logv2/json_formatter.cpp
+++ b/src/mongo/logv2/json_formatter.cpp
@@ -168,16 +168,27 @@ private:
void storeQuoted(StringData name, const T& value) {
format_to(std::back_inserter(_buffer), FMT_COMPILE(R"({}"{}":")"), _separator, name);
std::size_t before = _buffer.size();
- str::escapeForJSON(_buffer, value);
- if (_attributeMaxSize != 0) {
+ std::size_t wouldWrite = 0;
+ std::size_t written = 0;
+ str::escapeForJSON(
+ _buffer, value, _attributeMaxSize ? _attributeMaxSize : std::string::npos, &wouldWrite);
+ written = _buffer.size() - before;
+
+ if (wouldWrite > written) {
+ // The bounded escape may have reached the limit and
+ // stopped writing while in the middle of a UTF-8 sequence,
+ // in which case the incomplete UTF-8 octets at the tail of the
+ // buffer have to be trimmed.
+ // Push a dummy byte so that the UTF-8 safe truncation
+ // will truncate back down to the correct size.
+ _buffer.push_back('x');
auto truncatedEnd =
- str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), _attributeMaxSize);
- if (truncatedEnd != _buffer.end()) {
- BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
- truncationInfo.append("type"_sd, typeName(BSONType::String));
- truncationInfo.append("size"_sd, static_cast<int64_t>(_buffer.size() - before));
- truncationInfo.done();
- }
+ str::UTF8SafeTruncation(_buffer.begin() + before, _buffer.end(), written);
+
+ BSONObjBuilder truncationInfo = _truncated.subobjStart(name);
+ truncationInfo.append("type"_sd, typeName(BSONType::String));
+ truncationInfo.append("size"_sd, static_cast<int64_t>(wouldWrite));
+ truncationInfo.done();
_buffer.resize(truncatedEnd - _buffer.begin());
}
diff --git a/src/mongo/logv2/logv2_test.cpp b/src/mongo/logv2/logv2_test.cpp
index ac1ab0d25ea..a20f202b3e2 100644
--- a/src/mongo/logv2/logv2_test.cpp
+++ b/src/mongo/logv2/logv2_test.cpp
@@ -60,6 +60,7 @@
#include "mongo/stdx/thread.h"
#include "mongo/unittest/temp_dir.h"
#include "mongo/unittest/unittest.h"
+#include "mongo/util/str_escape.h"
#include "mongo/util/string_map.h"
#include "mongo/util/uuid.h"
@@ -1474,6 +1475,54 @@ TEST_F(LogV2Test, JsonTruncation) {
validateArrayTruncation(mongo::fromjson(lines.back()));
}
+TEST_F(LogV2Test, StringTruncation) {
+ const AtomicWord<int32_t> maxAttributeSizeKB(1);
+ auto lines = makeLineCapture(JSONFormatter(&maxAttributeSizeKB));
+
+ std::size_t maxLength = maxAttributeSizeKB.load() << 10;
+ std::string prefix(maxLength - 3, 'a');
+
+ struct TestCase {
+ std::string input;
+ std::string suffix;
+ std::string note;
+ };
+
+ TestCase tests[] = {
+ {prefix + "LMNOPQ", "LMN", "unescaped 1-byte octet"},
+ // "\n\"NOPQ" expands to "\\n\\\"NOPQ" after escape, and the limit
+ // is reached at the 2nd '\\' octet, but since it splits the "\\\""
+ // sequence, the actual truncation happens after the 'n' octet.
+ {prefix + "\n\"NOPQ", "\n", "2-byte escape sequence"},
+ // "L\vNOPQ" expands to "L\\u000bNOPQ" after escape, and the limit
+ // is reached at the 'u' octet, so the entire sequence is truncated.
+ {prefix + "L\vNOPQ", "L", "multi-byte escape sequence"},
+ {prefix + "LM\xC3\xB1PQ", "LM", "2-byte UTF-8 sequence"},
+ {prefix + "L\xE1\x9B\x8FPQ", "L", "3-byte UTF-8 sequence"},
+ {prefix + "L\xF0\x90\x8C\xBCQ", "L", "4-byte UTF-8 sequence"},
+ {prefix + "\xE1\x9B\x8E\xE1\x9B\x8F", "\xE1\x9B\x8E", "UTF-8 codepoint boundary"},
+ // The stray UTF-8 lead byte 0xC3 is escaped as "\\ufffd", which is then truncated entirely
+ {prefix + "L\xC3NOPQ", "L", "escaped invalid codepoint"},
+ {std::string(maxLength, '\\'), "\\", "escaped backslash"},
+ };
+
+ for (const auto& [input, suffix, note] : tests) {
+ LOGV2(6694001, "name", "name"_attr = input);
+ BSONObj obj = fromjson(lines.back());
+
+ auto str = obj[constants::kAttributesFieldName]["name"].checkAndGetStringData();
+ std::string context = "Failed test: " + note;
+
+ ASSERT_LTE(str.size(), maxLength) << context;
+ ASSERT(str.endsWith(suffix))
+ << context << " - string " << str << " does not end with " << suffix;
+
+ auto trunc = obj[constants::kTruncatedFieldName]["name"];
+ ASSERT_EQUALS(trunc["type"].String(), typeName(BSONType::String)) << context;
+ ASSERT_EQUALS(trunc["size"].numberLong(), str::escapeForJSON(input).size()) << context;
+ }
+}
+
TEST_F(LogV2Test, Threads) {
auto linesPlain = makeLineCapture(PlainFormatter());
auto linesText = makeLineCapture(TextFormatter());
diff --git a/src/mongo/util/str_escape.cpp b/src/mongo/util/str_escape.cpp
index d191fb92252..c42a916bab0 100644
--- a/src/mongo/util/str_escape.cpp
+++ b/src/mongo/util/str_escape.cpp
@@ -37,45 +37,82 @@ namespace mongo::str {
namespace {
constexpr char kHexChar[] = "0123456789abcdef";
+// Appends the bytes in the range [begin, end) to the output buffer,
+// which can either be a fmt::memory_buffer, or a std::string.
+template <typename Buffer, typename Iterator>
+void appendBuffer(Buffer& buffer, Iterator begin, Iterator end) {
+ buffer.append(begin, end);
+}
+
// 'singleHandler' Function to write a valid single byte UTF-8 sequence with desired escaping.
// 'invalidByteHandler' Function to write a byte of invalid UTF-8 encoding
// 'twoEscaper' Function to write a valid two byte UTF-8 sequence with desired escaping, for C1
// control codes.
+// 'maxLength' Max bytes to write into the output buffer; std::string::npos means unbounded.
+// An escape sequence is not written at all if appending the entire sequence would exceed this limit.
+// 'wouldWrite' Optional output receiving the total bytes that would have been written to the buffer
+// if no size limit were in place.
+//
// All these functions take a function object as their first parameter to perform the
// writing of any escaped data. This function expects the number of handled bytes as its first
// parameter and the corresponding escaped string as the second. They are templates so they can be
// inlined.
-template <typename SingleByteHandler, typename InvalidByteHandler, typename TwoByteEscaper>
-void escape(fmt::memory_buffer& buffer,
+template <typename Buffer,
+ typename SingleByteHandler,
+ typename InvalidByteHandler,
+ typename TwoByteEscaper>
+void escape(Buffer& buffer,
StringData str,
SingleByteHandler singleHandler,
InvalidByteHandler invalidByteHandler,
- TwoByteEscaper twoEscaper) {
- // The range [begin, it) contains input that does not need to be escaped and that has not been
+ TwoByteEscaper twoEscaper,
+ size_t maxLength,
+ size_t* wouldWrite) {
+ // The range [inFirst, it) contains input that does not need to be escaped and that has not been
// written to output yet.
- // The range [it end) contains remaining input to scan 'begin' is pointing to the beginning of
- // the input that has not yet been written to 'escaped'.
- // 'it' is pointing to the beginning of the unicode code point we're currently processing in the
- // while-loop below. 'end' is the end of the input sequence.
- auto begin = str.begin();
- auto it = str.begin();
- auto end = str.end();
+ // The range [it, inLast) contains remaining input to scan. 'inFirst' is pointing to the
+ // beginning of the input that has not yet been written to 'buffer'. 'it' is pointing to the
+ // beginning of the unicode code point we're currently processing in the while-loop below.
+ // 'inLast' is the end of the input sequence.
+ auto inFirst = str.begin();
+ auto inLast = str.end();
+ auto it = inFirst;
+ size_t cap = maxLength;
+ size_t total = 0;
// Writes an escaped sequence to output after flushing pending input that does not need to be
// escaped. 'it' is assumed to be at the beginning of the input sequence represented by the
// escaped data.
// 'numHandled' the number of bytes of unescaped data being written escaped in 'escapeSequence'
auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
+ // Appends the range [wFirst, wLast) to the output if the result is within the max length.
+ // 'canTruncate' controls the behavior if appending the entire range would exceed the limit.
+ // If true, this appends input up to the length limit. Otherwise, none is appended.
+ auto boundedWrite = [&](auto wFirst, auto wLast, bool canTruncate) {
+ size_t len = std::distance(wFirst, wLast);
+ total += len;
+ if (maxLength != std::string::npos) {
+ if (len > cap) {
+ if (!canTruncate) {
+ cap = 0;
+ }
+ len = cap;
+ }
+ cap -= len;
+ }
+ appendBuffer(buffer, wFirst, wFirst + len);
+ };
+
// Flush range of unmodified input
- buffer.append(begin, it);
- begin = it + numHandled;
+ boundedWrite(inFirst, it, true);
+ inFirst = it + numHandled;
// Write escaped data
- buffer.append(escapeSequence.rawData(), escapeSequence.rawData() + escapeSequence.size());
+ boundedWrite(escapeSequence.begin(), escapeSequence.end(), false);
};
auto isValidCodePoint = [&](auto pos, int len) {
- return std::distance(pos, end) >= len &&
+ return std::distance(pos, inLast) >= len &&
std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; });
};
@@ -98,7 +135,7 @@ void escape(fmt::memory_buffer& buffer,
auto writeInvalid = [&](uint8_t c) { invalidByteHandler(flushAndWrite, c); };
- while (it != end) {
+ while (it != inLast) {
uint8_t c = *it;
bool bit7 = (c >> 7) & 1;
if (MONGO_likely(!bit7)) {
@@ -156,10 +193,15 @@ void escape(fmt::memory_buffer& buffer,
}
}
// Write last block
- buffer.append(begin, it);
+ flushAndWrite(0, {});
+ if (wouldWrite) {
+ *wouldWrite = total;
+ }
}
} // namespace
-void escapeForText(fmt::memory_buffer& buffer, StringData str) {
+
+template <typename Buffer>
+void escapeForTextCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) {
auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
switch (unescaped) {
case '\0':
@@ -287,16 +329,26 @@ void escapeForText(fmt::memory_buffer& buffer, StringData str) {
str,
std::move(singleByteHandler),
std::move(invalidByteHandler),
- std::move(twoByteEscaper));
+ std::move(twoByteEscaper),
+ maxLength,
+ wouldWrite);
}
-std::string escapeForText(StringData str) {
- fmt::memory_buffer buffer;
- escapeForText(buffer, str);
- return fmt::to_string(buffer);
+void escapeForText(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength,
+ size_t* wouldWrite) {
+ escapeForTextCommon(buffer, str, maxLength, wouldWrite);
}
-void escapeForJSON(fmt::memory_buffer& buffer, StringData str) {
+std::string escapeForText(StringData str, size_t maxLength, size_t* wouldWrite) {
+ std::string buffer;
+ escapeForTextCommon(buffer, str, maxLength, wouldWrite);
+ return buffer;
+}
+
+template <typename Buffer>
+void escapeForJSONCommon(Buffer& buffer, StringData str, size_t maxLength, size_t* wouldWrite) {
auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
switch (unescaped) {
case '\0':
@@ -427,11 +479,21 @@ void escapeForJSON(fmt::memory_buffer& buffer, StringData str) {
str,
std::move(singleByteHandler),
std::move(invalidByteHandler),
- std::move(twoByteEscaper));
+ std::move(twoByteEscaper),
+ maxLength,
+ wouldWrite);
}
-std::string escapeForJSON(StringData str) {
- fmt::memory_buffer buffer;
- escapeForJSON(buffer, str);
- return fmt::to_string(buffer);
+
+void escapeForJSON(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength,
+ size_t* wouldWrite) {
+ escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
+}
+
+std::string escapeForJSON(StringData str, size_t maxLength, size_t* wouldWrite) {
+ std::string buffer;
+ escapeForJSONCommon(buffer, str, maxLength, wouldWrite);
+ return buffer;
}
} // namespace mongo::str
diff --git a/src/mongo/util/str_escape.h b/src/mongo/util/str_escape.h
index 47fe3d30060..2d82e5697cd 100644
--- a/src/mongo/util/str_escape.h
+++ b/src/mongo/util/str_escape.h
@@ -35,9 +35,75 @@
#include <string>
namespace mongo::str {
-void escapeForText(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForText(StringData str);
-void escapeForJSON(fmt::memory_buffer& buffer, StringData str);
-std::string escapeForJSON(StringData str);
+/**
+ * Escapes the special characters in 'str' for use as printable text.
+ *
+ * The backslash (`\`) character is escaped with another backslash, yielding the
+ * 2-character sequence {`\`, `\`}.
+ *
+ * The single-byte control characters (octets 0x00-0x1f, 0x7f) are generally escaped
+ * using the format "\xHH", where the 2 `H` characters are replaced by the 2 hex digits
+ * of the octet. For instance, the octet 0x7f would yield the sequence: {`\`, `x`, `7`, `f`}.
+ * Exemptions to this rule are the following octets, which are escaped using C-style escape
+ * sequences:
+ * 0x00 -> {`\`, `0`}
+ * 0x07 -> {`\`, `a`}
+ * 0x08 -> {`\`, `b`}
+ * 0x09 -> {`\`, `t`}
+ * 0x0a -> {`\`, `n`}
+ * 0x0b -> {`\`, `v`}
+ * 0x0c -> {`\`, `f`}
+ * 0x0d -> {`\`, `r`}
+ * 0x1b -> {`\`, `e`}
+ *
+ * The two-byte UTF-8 sequences between 0xC280 (U+0080) and 0xC29F (U+009F), inclusive, are
+ * also escaped as they are considered control characters. The escape sequence for these has
+ * the format: "\xC2\xHH", where the 2 `H` characters are replaced by the 2 hex digits of the
+ * second octet.
+ *
+ * Invalid bytes found are replaced with the escape sequence following the format: "\xHH",
+ * similar to how single-byte control characters are escaped.
+ *
+ * This writes the escaped output to 'buffer', and stops writing either when the output
+ * length reaches 'maxLength', or when appending the next escape sequence would cause the
+ * output to exceed 'maxLength'. A 'maxLength' value of std::string::npos means unbounded.
+ *
+ * If 'wouldWrite' is non-null, it is set to the total number of bytes that would have been
+ * written if there were no length limit.
+ */
+void escapeForText(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+std::string escapeForText(StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+
+/**
+ * Escapes the special characters in 'str' for use in JSON.
+ *
+ * This differs from escapeForText in that the double-quote character (`"`) is escaped
+ * with a backslash, yielding the 2-character sequence {`\`, `"`}.
+ *
+ * The general format of the escape sequences for single-byte control characters becomes
+ * "\u00HH", where the 2 `H` characters are replaced by the 2 hex digits of the octet.
+ * For example, the octet 0x7f would yield the sequence: {`\`, `u`, `0`, `0`, `7`, `f`}.
+ * The list of octets escaped using C-style escape sequences is also shortened to:
+ * 0x08 -> {`\`, `b`}
+ * 0x09 -> {`\`, `t`}
+ * 0x0a -> {`\`, `n`}
+ * 0x0c -> {`\`, `f`}
+ * 0x0d -> {`\`, `r`}
+ * For two-byte control characters, the format of the escape sequence becomes "\uc2HH",
+ * where the 2 `H` characters are replaced by the 2 hex digits of the second octet.
+ * Invalid bytes found are replaced with the sequence: "\ufffd".
+ */
+void escapeForJSON(fmt::memory_buffer& buffer,
+ StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
+std::string escapeForJSON(StringData str,
+ size_t maxLength = std::string::npos,
+ size_t* wouldWrite = nullptr);
} // namespace mongo::str