summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorHenrik Edin <henrik.edin@mongodb.com>2019-11-26 15:29:49 +0000
committerevergreen <evergreen@mongodb.com>2019-11-26 15:29:49 +0000
commit5e2967253a16233076ffcc64839c4cba4706e1e8 (patch)
tree6427fe892f36cfbe68584697e7706af934b9a0b5 /src/mongo
parent272c89db8935802eb43535382960dd7fe24326d9 (diff)
downloadmongo-5e2967253a16233076ffcc64839c4cba4706e1e8.tar.gz
SERVER-44621 Add string escaping to text formatter in logv2
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/SConscript1
-rw-r--r--src/mongo/logv2/constants.h3
-rw-r--r--src/mongo/logv2/log_test_v2.cpp47
-rw-r--r--src/mongo/logv2/plain_formatter.cpp8
-rw-r--r--src/mongo/logv2/string_escape.cpp427
-rw-r--r--src/mongo/logv2/string_escape.h39
6 files changed, 524 insertions, 1 deletions
diff --git a/src/mongo/SConscript b/src/mongo/SConscript
index b085df33adf..706d19eab53 100644
--- a/src/mongo/SConscript
+++ b/src/mongo/SConscript
@@ -123,6 +123,7 @@ baseEnv.Library(
'logv2/log_tag.cpp',
'logv2/plain_formatter.cpp',
'logv2/ramlog.cpp',
+ 'logv2/string_escape.cpp',
'logv2/text_formatter.cpp',
'platform/decimal128.cpp',
'platform/mutex.cpp',
diff --git a/src/mongo/logv2/constants.h b/src/mongo/logv2/constants.h
index a235bd8081b..7029e8209e8 100644
--- a/src/mongo/logv2/constants.h
+++ b/src/mongo/logv2/constants.h
@@ -35,4 +35,7 @@ namespace mongo::logv2::constants {
// memory.
constexpr size_t kNumStaticAttrs = 16;
+// Number of extra characters reserved, on top of the input length, when allocating the output
+// buffer for an escaped string, so that a few escape sequences fit without a reallocation.
+constexpr size_t kReservedSpaceForEscaping = 16;
+
} // namespace mongo::logv2::constants
diff --git a/src/mongo/logv2/log_test_v2.cpp b/src/mongo/logv2/log_test_v2.cpp
index b0c9afa827b..92d122fd514 100644
--- a/src/mongo/logv2/log_test_v2.cpp
+++ b/src/mongo/logv2/log_test_v2.cpp
@@ -412,6 +412,53 @@ TEST_F(LogTestV2, JSONFormat) {
ASSERT(log.getField("attr"_sd).Obj().getField("name").String() == t.toString());
}
+// Logs strings through the plain formatter and checks that control characters and malformed
+// UTF-8 come out escaped while well-formed multi-byte sequences are passed through unchanged.
+TEST_F(LogTestV2, Unicode) {
+    std::vector<std::string> lines;
+    auto sink = LogTestBackend::create(lines);
+    sink->set_filter(ComponentSettingsFilter(LogManager::global().getGlobalDomain(),
+                                             LogManager::global().getGlobalSettings()));
+    sink->set_formatter(PlainFormatter());
+    attach(sink);
+
+    // Each entry is {logged input, expected formatted output}.
+    std::pair<StringData, StringData> strs[] = {
+        // Single byte characters that need to be escaped
+        {"\a\b\f\n\r\t\v\\\0\x7f\x1b"_sd, "\\a\\b\\f\\n\\r\\t\\v\\\\\\0\\x7f\\e"_sd},
+        // Multi byte characters that need to be escaped (unicode control characters)
+        {"\u0080\u009f"_sd, "\\xc2\\x80\\xc2\\x9f"_sd},
+        // Valid 2 Octet sequence, LATIN SMALL LETTER N WITH TILDE
+        {"\u00f1"_sd, "\u00f1"_sd},
+        // Invalid 2 Octet Sequence, result is escaped
+        {"\xc3\x28"_sd, "\\xc3\x28"_sd},
+        // Invalid Sequence Identifier, result is escaped
+        {"\xa0\xa1"_sd, "\\xa0\\xa1"_sd},
+        // Valid 3 Octet sequence, RUNIC LETTER TIWAZ TIR TYR T
+        {"\u16cf"_sd, "\u16cf"_sd},
+        // Invalid 3 Octet Sequence (in 2nd Octet), result is escaped
+        {"\xe2\x28\xa1"_sd, "\\xe2\x28\\xa1"_sd},
+        // Invalid 3 Octet Sequence (in 3rd Octet), result is escaped
+        {"\xe2\x82\x28"_sd, "\\xe2\\x82\x28"_sd},
+        // Valid 4 Octet sequence, GOTHIC LETTER MANNA
+        {"\U0001033c"_sd, "\U0001033c"_sd},
+        // Invalid 4 Octet Sequence (in 2nd Octet), result is escaped
+        {"\xf0\x28\x8c\xbc"_sd, "\\xf0\x28\\x8c\\xbc"_sd},
+        // Invalid 4 Octet Sequence (in 3rd Octet), result is escaped
+        {"\xf0\x90\x28\xbc"_sd, "\\xf0\\x90\x28\\xbc"_sd},
+        // Invalid 4 Octet Sequence (in 4th Octet only), result is escaped. The 2nd and 3rd
+        // octets are valid continuation bytes so that only the last octet breaks the sequence.
+        {"\xf0\x90\x8c\x28"_sd, "\\xf0\\x90\\x8c\x28"_sd},
+        // Valid 5 Octet Sequence (but not Unicode!), result is escaped
+        {"\xf8\xa1\xa1\xa1\xa1"_sd, "\\xf8\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Valid 6 Octet Sequence (but not Unicode!), result is escaped
+        {"\xfc\xa1\xa1\xa1\xa1\xa1"_sd, "\\xfc\\xa1\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Invalid 3 Octet sequence, buffer ends prematurely, result is escaped
+        {"\xe2\x82"_sd, "\\xe2\\x82"_sd},
+    };
+
+    for (const auto& pair : strs) {
+        LOGV2("{}", "name"_attr = pair.first);
+        ASSERT_EQUALS(lines.back(), pair.second);
+    }
+}
+
TEST_F(LogTestV2, Threads) {
std::vector<std::string> linesPlain;
auto plainSink = LogTestBackend::create(linesPlain);
diff --git a/src/mongo/logv2/plain_formatter.cpp b/src/mongo/logv2/plain_formatter.cpp
index 235172e6dda..82caf3ecf14 100644
--- a/src/mongo/logv2/plain_formatter.cpp
+++ b/src/mongo/logv2/plain_formatter.cpp
@@ -33,6 +33,7 @@
#include "mongo/logv2/attribute_storage.h"
#include "mongo/logv2/attributes.h"
#include "mongo/logv2/constants.h"
+#include "mongo/logv2/string_escape.h"
#include <boost/container/small_vector.hpp>
#include <boost/log/attributes/value_extraction.hpp>
@@ -47,7 +48,7 @@ namespace {
struct TextValueExtractor {
void operator()(StringData name, CustomAttributeValue const& val) {
- _storage.push_back(val.toString());
+ _storage.push_back(escapeForText(val.toString()));
operator()(name, _storage.back());
}
@@ -56,6 +57,11 @@ struct TextValueExtractor {
operator()(name, _storage.back());
}
+ // Handles string attributes: the value is escaped for text output, the escaped copy is
+ // appended to _storage, and the call is re-dispatched with that stored element as the value.
+ // NOTE(review): presumably _storage owns the escaped string so the formatting argument built
+ // from it stays valid - confirm against the declaration of _storage.
+ void operator()(StringData name, StringData val) {
+ _storage.push_back(escapeForText(val));
+ operator()(name, _storage.back());
+ }
+
template <typename T>
void operator()(StringData name, const T& val) {
args.push_back(fmt::internal::make_arg<fmt::format_context>(val));
diff --git a/src/mongo/logv2/string_escape.cpp b/src/mongo/logv2/string_escape.cpp
new file mode 100644
index 00000000000..c270c031668
--- /dev/null
+++ b/src/mongo/logv2/string_escape.cpp
@@ -0,0 +1,427 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/logv2/string_escape.h"
+
+#include "mongo/logv2/constants.h"
+
+#include <algorithm>
+#include <array>
+#include <iterator>
+
+namespace mongo::logv2 {
+namespace {
+constexpr char kHexChar[] = "0123456789abcdef";
+
+// Scans 'str' as UTF-8 and returns a copy in which selected input is replaced by escape
+// sequences. Input that needs no escaping is copied to the output in bulk.
+//
+// 'singleHandler' Function called for every single byte (ASCII) code point. It writes the escaped
+// form if the byte needs escaping and writes nothing otherwise.
+// 'singleEscaper' Function to write a byte that is not part of a well-formed UTF-8 sequence.
+// 'twoEscaper' Function to write a valid two byte UTF-8 sequence with desired escaping, for C1
+// control codes (U+0080..U+009F).
+// All these functions take a function object as their first parameter to perform the
+// writing of any escaped data. This function expects the number of handled bytes as its first
+// parameter and the corresponding escaped string as the second. They are templates so they can
+// be inlined.
+//
+// A multi byte sequence is only treated as valid when it is well-formed UTF-8: overlong
+// encodings, UTF-16 surrogates and values above U+10FFFF are rejected and escaped byte by byte,
+// so that e.g. an overlong encoded control character cannot bypass escaping.
+template <typename SingleByteHandler, typename SingleByteEscaper, typename TwoByteEscaper>
+std::string escape(StringData str,
+                   SingleByteHandler singleHandler,
+                   SingleByteEscaper singleEscaper,
+                   TwoByteEscaper twoEscaper) {
+    std::string escaped;
+    // If input string is over the SSO size and we're going to need to allocate memory, add some
+    // extra to fit a couple of eventual escape sequences.
+    if (str.size() > escaped.capacity())
+        escaped.reserve(str.size() + constants::kReservedSpaceForEscaping);
+
+    // The range [begin, it) contains input that does not need to be escaped and that has not been
+    // written to output yet.
+    // The range [it, end) contains remaining input to scan.
+    // 'begin' is pointing to the beginning of the input that has not yet been written to
+    // 'escaped'.
+    // 'it' is pointing to the beginning of the unicode code point we're currently processing in the
+    // while-loop below. 'end' is the end of the input sequence.
+    auto begin = str.begin();
+    auto it = str.begin();
+    auto end = str.end();
+
+    // Writes an escaped sequence to output after flushing pending input that does not need to be
+    // escaped. 'it' is assumed to be at the beginning of the input sequence represented by the
+    // escaped data.
+    // 'numHandled' the number of bytes of unescaped data being written escaped in 'escapeSequence'
+    auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
+        // Flush range of unmodified input
+        escaped.append(begin, it);
+        begin = it + numHandled;
+
+        // Write escaped data
+        escaped.append(escapeSequence.rawData(), escapeSequence.size());
+    };
+
+    // Returns true if the 'len' bytes starting at 'pos' form a well-formed UTF-8 sequence. The
+    // caller has already derived 'len' (2..4) from the bit pattern of the lead byte.
+    auto isValidCodePoint = [&](auto pos, int len) {
+        // The sequence must fit in the remaining input...
+        if (std::distance(pos, end) < len)
+            return false;
+        // ...and every trailing byte must be a continuation byte (10xxxxxx).
+        if (!std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; }))
+            return false;
+
+        // Some lead bytes further restrict the range of the second byte, or are never valid.
+        const uint8_t lead = *pos;
+        const uint8_t second = *(pos + 1);
+        switch (lead) {
+            case 0xc0:
+            case 0xc1:
+                // Overlong encoding of an ASCII character
+                return false;
+            case 0xe0:
+                // Below 0xa0 is an overlong encoding of a two byte code point
+                return second >= 0xa0;
+            case 0xed:
+                // 0xa0 and above encodes the UTF-16 surrogates U+D800..U+DFFF
+                return second < 0xa0;
+            case 0xf0:
+                // Below 0x90 is an overlong encoding of a three byte code point
+                return second >= 0x90;
+            case 0xf4:
+                // 0x90 and above encodes values beyond U+10FFFF
+                return second < 0x90;
+            case 0xf5:
+            case 0xf6:
+            case 0xf7:
+                // Always beyond U+10FFFF
+                return false;
+            default:
+                return true;
+        }
+    };
+
+    // Helper function to write a valid one byte UTF-8 sequence from the input stream
+    auto writeValid1Byte = [&]() { singleHandler(flushAndWrite, *it); };
+
+    // Helper function to write a valid two byte UTF-8 sequence from the input stream. Only C1
+    // control codes (0xc2 0x80 .. 0xc2 0x9f) are escaped, anything else is left pending.
+    auto writeValid2Byte = [&]() {
+        uint8_t first = *it;
+        uint8_t second = *(it + 1);
+
+        if (MONGO_unlikely(first == 0xc2 && second >= 0x80 && second < 0xa0)) {
+            twoEscaper(flushAndWrite, first, second);
+        }
+    };
+
+    // Helper function to write a single byte of invalid UTF-8 from the input stream in escaped
+    // form. Does not advance 'it'; the caller steps past the escaped byte.
+    auto writeInvalid = [&](uint8_t c) { singleEscaper(flushAndWrite, c); };
+
+
+    while (it != end) {
+        uint8_t c = *it;
+        bool bit7 = (c >> 7) & 1;
+        if (MONGO_likely(!bit7)) {
+            // 0xxxxxxx: single byte (ASCII)
+            writeValid1Byte();
+            ++it;
+            continue;
+        }
+
+        bool bit6 = (c >> 6) & 1;
+        if (MONGO_unlikely(!bit6)) {
+            // 10xxxxxx: continuation byte without a lead byte
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        bool bit5 = (c >> 5) & 1;
+        if (!bit5) {
+            // 110xxxxx: 2 byte sequence
+            if (MONGO_likely(isValidCodePoint(it, 2))) {
+                writeValid2Byte();
+                it += 2;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+
+            continue;
+        }
+
+        bool bit4 = (c >> 4) & 1;
+        if (!bit4) {
+            // 1110xxxx: 3 byte sequence
+            if (MONGO_likely(isValidCodePoint(it, 3))) {
+                it += 3;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+            continue;
+        }
+
+        bool bit3 = (c >> 3) & 1;
+        if (bit3) {
+            // 11111xxx: never valid in UTF-8
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        // 11110xxx: 4 byte sequence
+        if (MONGO_likely(isValidCodePoint(it, 4))) {
+            it += 4;
+        } else {
+            writeInvalid(c);
+            ++it;
+        }
+    }
+    // Write last block
+    escaped.append(begin, it);
+    return escaped;
+}
+} // namespace
+// Returns a copy of 'str' escaped for plain text output.
+//  - ASCII control characters with a conventional short form are written as that form
+//    (\0 \a \b \t \n \v \f \r \e) and a backslash is doubled.
+//  - Any other byte below 0x20, and 0x7f (DEL), is written as \xNN.
+//  - Bytes of invalid UTF-8 and both bytes of a C1 control code are written as \xNN each.
+std::string escapeForText(StringData str) {
+    // Writes one input byte as a four character "\xNN" hex escape.
+    const auto writeHexEscape = [](const auto& writer, uint8_t byte) {
+        const std::array<char, 4> hex = {'\\', 'x', kHexChar[byte >> 4], kHexChar[byte & 0xf]};
+        writer(1, StringData(hex.data(), hex.size()));
+    };
+
+    return escape(
+        str,
+        // Valid single byte code point: escape only what needs it, otherwise write nothing.
+        [&](const auto& writer, uint8_t ch) {
+            switch (ch) {
+                case '\0':
+                    writer(1, "\\0"_sd);
+                    return;
+                case '\a':
+                    writer(1, "\\a"_sd);
+                    return;
+                case '\b':
+                    writer(1, "\\b"_sd);
+                    return;
+                case '\t':
+                    writer(1, "\\t"_sd);
+                    return;
+                case '\n':
+                    writer(1, "\\n"_sd);
+                    return;
+                case '\v':
+                    writer(1, "\\v"_sd);
+                    return;
+                case '\f':
+                    writer(1, "\\f"_sd);
+                    return;
+                case '\r':
+                    writer(1, "\\r"_sd);
+                    return;
+                case 0x1b:
+                    writer(1, "\\e"_sd);
+                    return;
+                case '\\':
+                    writer(1, "\\\\"_sd);
+                    return;
+                default:
+                    // Remaining control characters and DEL have no short form.
+                    if (ch < 0x20 || ch == 0x7f) {
+                        writeHexEscape(writer, ch);
+                    }
+                    return;
+            }
+        },
+        // Byte of invalid UTF-8
+        writeHexEscape,
+        // C1 control code: both bytes of the sequence are written as hex escapes.
+        [](const auto& writer, uint8_t first, uint8_t second) {
+            const std::array<char, 8> hex = {'\\',
+                                             'x',
+                                             kHexChar[first >> 4],
+                                             kHexChar[first & 0xf],
+                                             '\\',
+                                             'x',
+                                             kHexChar[second >> 4],
+                                             kHexChar[second & 0xf]};
+            writer(2, StringData(hex.data(), hex.size()));
+        });
+}
+// Returns a copy of 'str' escaped for use inside a JSON string.
+//  - Backslash and double quote are prefixed with a backslash.
+//  - \b \t \n \f \r use their JSON short form.
+//  - Any other byte below 0x20, and 0x7f (DEL), is written as \u00NN.
+//  - A C1 control code (two byte UTF-8 sequence for U+0080..U+009F) is written as the \uXXXX
+//    escape of its code point, e.g. the bytes 0xc2 0x80 become \u0080.
+//  - A byte of invalid UTF-8 is written as \u00NN where NN is the byte value.
+std::string escapeForJSON(StringData str) {
+    // Writes 'codePoint' (at most 0xffff) as a six character "\uXXXX" escape that replaces
+    // 'numHandled' bytes of input.
+    const auto writeUnicodeEscape = [](const auto& writer, size_t numHandled, unsigned codePoint) {
+        const std::array<char, 6> buffer = {'\\',
+                                            'u',
+                                            kHexChar[(codePoint >> 12) & 0xf],
+                                            kHexChar[(codePoint >> 8) & 0xf],
+                                            kHexChar[(codePoint >> 4) & 0xf],
+                                            kHexChar[codePoint & 0xf]};
+        writer(numHandled, StringData(buffer.data(), buffer.size()));
+    };
+
+    return escape(
+        str,
+        // Valid single byte code point: escape only what needs it, otherwise write nothing.
+        [&](const auto& writer, uint8_t unescaped) {
+            switch (unescaped) {
+                case '\b':
+                    writer(1, "\\b"_sd);
+                    break;
+                case '\t':
+                    writer(1, "\\t"_sd);
+                    break;
+                case '\n':
+                    writer(1, "\\n"_sd);
+                    break;
+                case '\f':
+                    writer(1, "\\f"_sd);
+                    break;
+                case '\r':
+                    writer(1, "\\r"_sd);
+                    break;
+                case '\\':
+                    writer(1, "\\\\"_sd);
+                    break;
+                case '\"':
+                    writer(1, "\\\""_sd);
+                    break;
+                default:
+                    // Remaining control characters and DEL have no short form in JSON.
+                    // Generating the digits from the value keeps every escape in sync with the
+                    // byte it represents (0x1c is written as \u001c).
+                    if (unescaped < 0x20 || unescaped == 0x7f) {
+                        writeUnicodeEscape(writer, 1, unescaped);
+                    }
+                    break;
+            }
+        },
+        // Byte of invalid UTF-8
+        [&](const auto& writer, uint8_t unescaped) { writeUnicodeEscape(writer, 1, unescaped); },
+        // C1 control code: \u takes the code point, not the UTF-8 bytes, so decode the two byte
+        // sequence (110xxxxx 10yyyyyy -> xxxxxyyyyyy) before writing it.
+        [&](const auto& writer, uint8_t first, uint8_t second) {
+            const unsigned codePoint = ((first & 0x1fu) << 6) | (second & 0x3fu);
+            writeUnicodeEscape(writer, 2, codePoint);
+        });
+}
+} // namespace mongo::logv2
diff --git a/src/mongo/logv2/string_escape.h b/src/mongo/logv2/string_escape.h
new file mode 100644
index 00000000000..5cf392e32b6
--- /dev/null
+++ b/src/mongo/logv2/string_escape.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/string_data.h"
+
+#include <string>
+
+namespace mongo::logv2 {
+// Returns a copy of 'str' escaped for plain text output. ASCII control characters, DEL and
+// backslash are replaced by backslash escapes (for example \n, \\ or \x1f). Bytes that do not
+// form a valid UTF-8 sequence, and the bytes of C1 control codes, are written as \xNN.
+std::string escapeForText(StringData str);
+// Returns a copy of 'str' escaped for use inside a JSON string. ASCII control characters, DEL,
+// backslash and double quote are replaced by JSON escapes (for example \n, \" or \u001f). Bytes
+// that do not form a valid UTF-8 sequence, and C1 control codes, are written as \u escapes.
+std::string escapeForJSON(StringData str);
+} // namespace mongo::logv2