diff options
author | Henrik Edin <henrik.edin@mongodb.com> | 2019-11-26 15:29:49 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-11-26 15:29:49 +0000 |
commit | 5e2967253a16233076ffcc64839c4cba4706e1e8 (patch) | |
tree | 6427fe892f36cfbe68584697e7706af934b9a0b5 /src/mongo | |
parent | 272c89db8935802eb43535382960dd7fe24326d9 (diff) | |
download | mongo-5e2967253a16233076ffcc64839c4cba4706e1e8.tar.gz |
SERVER-44621 Add string escaping to text formatter in logv2
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/SConscript | 1 | ||||
-rw-r--r-- | src/mongo/logv2/constants.h | 3 | ||||
-rw-r--r-- | src/mongo/logv2/log_test_v2.cpp | 47 | ||||
-rw-r--r-- | src/mongo/logv2/plain_formatter.cpp | 8 | ||||
-rw-r--r-- | src/mongo/logv2/string_escape.cpp | 427 | ||||
-rw-r--r-- | src/mongo/logv2/string_escape.h | 39 |
6 files changed, 524 insertions, 1 deletions
diff --git a/src/mongo/SConscript b/src/mongo/SConscript
index b085df33adf..706d19eab53 100644
--- a/src/mongo/SConscript
+++ b/src/mongo/SConscript
@@ -123,6 +123,7 @@ baseEnv.Library(
         'logv2/log_tag.cpp',
         'logv2/plain_formatter.cpp',
         'logv2/ramlog.cpp',
+        'logv2/string_escape.cpp',
         'logv2/text_formatter.cpp',
         'platform/decimal128.cpp',
         'platform/mutex.cpp',
diff --git a/src/mongo/logv2/constants.h b/src/mongo/logv2/constants.h
index a235bd8081b..7029e8209e8 100644
--- a/src/mongo/logv2/constants.h
+++ b/src/mongo/logv2/constants.h
@@ -35,4 +35,7 @@ namespace mongo::logv2::constants {
 // memory.
 constexpr size_t kNumStaticAttrs = 16;
 
+// Allocate extra space to fit some escape sequences
+constexpr size_t kReservedSpaceForEscaping = 16;
+
 } // namespace mongo::logv2::constants
diff --git a/src/mongo/logv2/log_test_v2.cpp b/src/mongo/logv2/log_test_v2.cpp
index b0c9afa827b..92d122fd514 100644
--- a/src/mongo/logv2/log_test_v2.cpp
+++ b/src/mongo/logv2/log_test_v2.cpp
@@ -412,6 +412,53 @@ TEST_F(LogTestV2, JSONFormat) {
     ASSERT(log.getField("attr"_sd).Obj().getField("name").String() == t.toString());
 }
 
+TEST_F(LogTestV2, Unicode) {
+    std::vector<std::string> lines;
+    auto sink = LogTestBackend::create(lines);
+    sink->set_filter(ComponentSettingsFilter(LogManager::global().getGlobalDomain(),
+                                             LogManager::global().getGlobalSettings()));
+    sink->set_formatter(PlainFormatter());
+    attach(sink);
+
+    std::pair<StringData, StringData> strs[] = {
+        // Single byte characters that need to be escaped
+        {"\a\b\f\n\r\t\v\\\0\x7f\x1b"_sd, "\\a\\b\\f\\n\\r\\t\\v\\\\\\0\\x7f\\e"_sd},
+        // Multi byte characters that need to be escaped (unicode control characters)
+        {"\u0080\u009f"_sd, "\\xc2\\x80\\xc2\\x9f"_sd},
+        // Valid 2 Octet sequence, LATIN SMALL LETTER N WITH TILDE
+        {"\u00f1"_sd, "\u00f1"_sd},
+        // Invalid 2 Octet Sequence, result is escaped
+        {"\xc3\x28"_sd, "\\xc3\x28"_sd},
+        // Invalid Sequence Identifier, result is escaped
+        {"\xa0\xa1"_sd, "\\xa0\\xa1"_sd},
+        // Valid 3 Octet sequence, RUNIC LETTER TIWAZ TIR TYR T
+        {"\u16cf"_sd, "\u16cf"_sd},
+        // Invalid 3 Octet Sequence (in 2nd Octet), result is escaped
+        {"\xe2\x28\xa1"_sd, "\\xe2\x28\\xa1"_sd},
+        // Invalid 3 Octet Sequence (in 3rd Octet), result is escaped
+        {"\xe2\x82\x28"_sd, "\\xe2\\x82\x28"_sd},
+        // Valid 4 Octet sequence, GOTHIC LETTER MANNA
+        {"\U0001033c"_sd, "\U0001033c"_sd},
+        // Invalid 4 Octet Sequence (in 2nd Octet), result is escaped
+        {"\xf0\x28\x8c\xbc"_sd, "\\xf0\x28\\x8c\\xbc"_sd},
+        // Invalid 4 Octet Sequence (in 3rd Octet), result is escaped
+        {"\xf0\x90\x28\xbc"_sd, "\\xf0\\x90\x28\\xbc"_sd},
+        // Invalid 4 Octet Sequence (in 4th Octet only: octets 2 and 3 are valid), result is escaped
+        {"\xf0\x90\x8c\x28"_sd, "\\xf0\\x90\\x8c\x28"_sd},
+        // Valid 5 Octet Sequence (but not Unicode!), result is escaped
+        {"\xf8\xa1\xa1\xa1\xa1"_sd, "\\xf8\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Valid 6 Octet Sequence (but not Unicode!), result is escaped
+        {"\xfc\xa1\xa1\xa1\xa1\xa1"_sd, "\\xfc\\xa1\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Invalid 3 Octet sequence, buffer ends prematurely, result is escaped
+        {"\xe2\x82"_sd, "\\xe2\\x82"_sd},
+    };
+
+    for (const auto& pair : strs) {
+        LOGV2("{}", "name"_attr = pair.first);
+        ASSERT_EQUALS(lines.back(), pair.second);
+    }
+}
+
 TEST_F(LogTestV2, Threads) {
     std::vector<std::string> linesPlain;
     auto plainSink = LogTestBackend::create(linesPlain);
diff --git a/src/mongo/logv2/plain_formatter.cpp b/src/mongo/logv2/plain_formatter.cpp
index 235172e6dda..82caf3ecf14 100644
--- a/src/mongo/logv2/plain_formatter.cpp
+++ b/src/mongo/logv2/plain_formatter.cpp
@@ -33,6 +33,7 @@
 #include "mongo/logv2/attribute_storage.h"
 #include "mongo/logv2/attributes.h"
 #include "mongo/logv2/constants.h"
+#include "mongo/logv2/string_escape.h"
 
 #include <boost/container/small_vector.hpp>
 #include <boost/log/attributes/value_extraction.hpp>
@@ -47,7 +48,7 @@ namespace {
 
 struct TextValueExtractor {
     void operator()(StringData name, CustomAttributeValue const& val) {
-        _storage.push_back(val.toString());
+        _storage.push_back(escapeForText(val.toString()));
         operator()(name, _storage.back());
     }
 
@@ -56,6 +57,11 @@ struct TextValueExtractor {
         operator()(name, _storage.back());
     }
 
+    void operator()(StringData name, StringData val) {
+        _storage.push_back(escapeForText(val));
+        operator()(name, _storage.back());
+    }
+
     template <typename T>
     void operator()(StringData name, const T& val) {
         args.push_back(fmt::internal::make_arg<fmt::format_context>(val));
diff --git a/src/mongo/logv2/string_escape.cpp b/src/mongo/logv2/string_escape.cpp
new file mode 100644
index 00000000000..c270c031668
--- /dev/null
+++ b/src/mongo/logv2/string_escape.cpp
@@ -0,0 +1,427 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so.
If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#include "mongo/logv2/string_escape.h"
+
+#include "mongo/logv2/constants.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <iterator>
+
+namespace mongo::logv2 {
+namespace {
+constexpr char kHexChar[] = "0123456789abcdef";
+
+// Returns true for the ASCII control characters: the C0 range (0x00-0x1f) and DEL (0x7f).
+constexpr bool isAsciiControl(uint8_t c) {
+    return c < 0x20 || c == 0x7f;
+}
+
+// Returns the four characters '\xNN' where NN is the value of 'byte' in lower case hex.
+constexpr std::array<char, 4> hexEscape(uint8_t byte) {
+    return {{'\\', 'x', kHexChar[byte >> 4], kHexChar[byte & 0xf]}};
+}
+
+// Returns the six characters '\uNNNN' where NNNN is the value of 'codePoint' in lower case hex.
+// Only the low 16 bits of 'codePoint' are used, so it must not exceed 0xffff.
+constexpr std::array<char, 6> unicodeEscape(unsigned codePoint) {
+    return {{'\\',
+             'u',
+             kHexChar[(codePoint >> 12) & 0xf],
+             kHexChar[(codePoint >> 8) & 0xf],
+             kHexChar[(codePoint >> 4) & 0xf],
+             kHexChar[codePoint & 0xf]}};
+}
+
+// Hands a fixed size escape sequence to 'writer' as the replacement for 'numHandled' input bytes.
+template <typename Writer, size_t N>
+void writeSequence(const Writer& writer, size_t numHandled, const std::array<char, N>& sequence) {
+    writer(numHandled, StringData(sequence.data(), sequence.size()));
+}
+
+// Escaping engine shared by escapeForText() and escapeForJSON().
+//
+// Scans 'str' one UTF-8 sequence at a time and returns a copy in which some of the input has been
+// replaced by escape sequences. What gets escaped, and how, is decided by three callbacks:
+// 'singleHandler' Called for every single byte (ASCII) character. Writes an escape sequence for
+//                 the characters that need one and does nothing for characters that are kept.
+// 'singleEscaper' Called for every byte that is not part of a well-formed multi byte sequence.
+//                 Must write that byte escaped.
+// 'twoEscaper'    Called for two byte sequences that encode a C1 control code (U+0080-U+009F).
+//                 Must write the sequence escaped.
+// All these functions take a function object as their first parameter to perform the writing of
+// any escaped data. This function expects the number of handled input bytes as its first parameter
+// and the corresponding escaped string as the second. They are templates so they can be inlined.
+//
+// NOTE(review): multi byte sequences are accepted based on the bit pattern of the lead byte and
+// the continuation bytes only. Overlong encodings (e.g. 0xc0 0x80), UTF-16 surrogates and the lead
+// bytes 0xf5-0xf7 are therefore copied to the output unescaped - confirm that this is acceptable
+// for every consumer of the output.
+template <typename SingleByteHandler, typename SingleByteEscaper, typename TwoByteEscaper>
+std::string escape(StringData str,
+                   SingleByteHandler singleHandler,
+                   SingleByteEscaper singleEscaper,
+                   TwoByteEscaper twoEscaper) {
+    std::string escaped;
+    // If input string is over the SSO size and we're going to need to allocate memory, add some
+    // extra to fit a couple of eventual escape sequences.
+    if (str.size() > escaped.capacity())
+        escaped.reserve(str.size() + constants::kReservedSpaceForEscaping);
+
+    // The range [begin, it) contains input that has been scanned, does not need to be escaped and
+    // has not been written to 'escaped' yet.
+    // The range [it, end) contains the input that remains to be scanned. 'it' is pointing to the
+    // first byte of the sequence currently being processed by the while-loop below.
+    auto begin = str.begin();
+    auto it = str.begin();
+    auto end = str.end();
+
+    // Writes an escape sequence to the output after flushing pending input that does not need to
+    // be escaped. 'it' must be at the first of the 'numHandled' input bytes that 'escapeSequence'
+    // replaces; those bytes are skipped when the next pending range is flushed.
+    auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
+        // Flush range of unmodified input
+        escaped.append(begin, it);
+        begin = it + numHandled;
+
+        // Write escaped data
+        escaped.append(escapeSequence.rawData(), escapeSequence.size());
+    };
+
+    // Returns true if at least 'len' bytes of input remain at 'pos' and the 'len' - 1 bytes after
+    // the lead byte all are continuation bytes (bit pattern 10xxxxxx). The length is checked first
+    // so no byte past 'end' is ever read.
+    auto isValidCodePoint = [&](auto pos, int len) {
+        return std::distance(pos, end) >= len &&
+            std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; });
+    };
+
+    // Lets 'singleHandler' decide if the single byte character at 'it' needs to be escaped.
+    auto writeValid1Byte = [&]() { singleHandler(flushAndWrite, *it); };
+
+    // Escapes the two byte sequence at 'it' if it encodes a C1 control code (0xc2 0x80 up to
+    // 0xc2 0x9f). Any other two byte sequence is left in the pending range.
+    auto writeValid2Byte = [&]() {
+        uint8_t first = *it;
+        uint8_t second = *(it + 1);
+
+        if (MONGO_unlikely(first == 0xc2 && second >= 0x80 && second < 0xa0)) {
+            twoEscaper(flushAndWrite, first, second);
+        }
+    };
+
+    // Escapes the single byte 'c', located at 'it', that is not valid UTF-8 at this position.
+    // The caller is responsible for advancing 'it'.
+    auto writeInvalid = [&](uint8_t c) { singleEscaper(flushAndWrite, c); };
+
+
+    while (it != end) {
+        uint8_t c = *it;
+        bool bit7 = (c >> 7) & 1;
+        if (MONGO_likely(!bit7)) {
+            // 0xxxxxxx, single byte character
+            writeValid1Byte();
+            ++it;
+            continue;
+        }
+
+        bool bit6 = (c >> 6) & 1;
+        if (MONGO_unlikely(!bit6)) {
+            // 10xxxxxx, continuation byte without a lead byte
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        bool bit5 = (c >> 5) & 1;
+        if (!bit5) {
+            // 110xxxxx, 2 byte sequence
+            if (MONGO_likely(isValidCodePoint(it, 2))) {
+                writeValid2Byte();
+                it += 2;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+
+            continue;
+        }
+
+        bool bit4 = (c >> 4) & 1;
+        if (!bit4) {
+            // 1110xxxx, 3 byte sequence
+            if (MONGO_likely(isValidCodePoint(it, 3))) {
+                it += 3;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+            continue;
+        }
+
+        bool bit3 = (c >> 3) & 1;
+        if (bit3) {
+            // 11111xxx, lead byte of a 5 or 6 byte sequence which is not valid UTF-8
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        // 11110xxx, 4 byte sequence
+        if (MONGO_likely(isValidCodePoint(it, 4))) {
+            it += 4;
+        } else {
+            writeInvalid(c);
+            ++it;
+        }
+    }
+    // Write last block
+    escaped.append(begin, it);
+    return escaped;
+}
+} // namespace
+
+// Returns a copy of 'str' escaped for the text log format:
+// - ASCII control characters are written as C-style escapes ('\n', '\t', '\e', ...) where a short
+//   form is used and as '\xNN' otherwise, backslash is written as '\\'.
+// - Bytes that are not part of a well-formed UTF-8 sequence are written as '\xNN'.
+// - UTF-8 encoded C1 control codes are written as '\xc2\xNN', i.e. byte by byte.
+std::string escapeForText(StringData str) {
+    return escape(str,
+                  [](const auto& writer, uint8_t unescaped) {
+                      switch (unescaped) {
+                          case '\0':
+                              writer(1, "\\0"_sd);
+                              break;
+                          case '\a':
+                              writer(1, "\\a"_sd);
+                              break;
+                          case '\b':
+                              writer(1, "\\b"_sd);
+                              break;
+                          case '\t':
+                              writer(1, "\\t"_sd);
+                              break;
+                          case '\n':
+                              writer(1, "\\n"_sd);
+                              break;
+                          case '\v':
+                              writer(1, "\\v"_sd);
+                              break;
+                          case '\f':
+                              writer(1, "\\f"_sd);
+                              break;
+                          case '\r':
+                              writer(1, "\\r"_sd);
+                              break;
+                          case 0x1b:
+                              // ESC, there is no standard C++ character literal for it
+                              writer(1, "\\e"_sd);
+                              break;
+                          case '\\':
+                              writer(1, "\\\\"_sd);
+                              break;
+                          default:
+                              // Remaining control characters have no short form. The escape is
+                              // computed rather than enumerated to rule out typos in a long table.
+                              if (isAsciiControl(unescaped))
+                                  writeSequence(writer, 1, hexEscape(unescaped));
+                              break;
+                      }
+                  },
+                  [](const auto& writer, uint8_t unescaped) {
+                      writeSequence(writer, 1, hexEscape(unescaped));
+                  },
+                  [](const auto& writer, uint8_t first, uint8_t second) {
+                      const auto a = hexEscape(first);
+                      const auto b = hexEscape(second);
+                      writeSequence(
+                          writer,
+                          2,
+                          std::array<char, 8>{{a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]}});
+                  });
+}
+
+// Returns a copy of 'str' escaped for use inside a JSON string:
+// - Backslash and double quote are written as '\\' and '\"'.
+// - ASCII control characters are written as '\b', '\t', '\n', '\f', '\r' where JSON defines a
+//   short form and as '\u00NN' otherwise.
+// - UTF-8 encoded C1 control codes are written as the '\u00NN' escape of their code point.
+// - Bytes that are not part of a well-formed UTF-8 sequence are written as '\u00NN' with NN being
+//   the value of the byte. This cannot be told apart from an escaped code point U+00NN.
+std::string escapeForJSON(StringData str) {
+    return escape(str,
+                  [](const auto& writer, uint8_t unescaped) {
+                      switch (unescaped) {
+                          case '\b':
+                              writer(1, "\\b"_sd);
+                              break;
+                          case '\t':
+                              writer(1, "\\t"_sd);
+                              break;
+                          case '\n':
+                              writer(1, "\\n"_sd);
+                              break;
+                          case '\f':
+                              writer(1, "\\f"_sd);
+                              break;
+                          case '\r':
+                              writer(1, "\\r"_sd);
+                              break;
+                          case '\\':
+                              writer(1, "\\\\"_sd);
+                              break;
+                          case '\"':
+                              writer(1, "\\\""_sd);
+                              break;
+                          default:
+                              // Computed instead of enumerated: the enumerated table this replaces
+                              // mapped 0x1c to "\u000c" (form feed) by mistake.
+                              if (isAsciiControl(unescaped))
+                                  writeSequence(writer, 1, unicodeEscape(unescaped));
+                              break;
+                      }
+                  },
+                  [](const auto& writer, uint8_t unescaped) {
+                      writeSequence(writer, 1, unicodeEscape(unescaped));
+                  },
+                  [](const auto& writer, uint8_t first, uint8_t second) {
+                      // A '\u' escape carries the code point, not the UTF-8 bytes: decode the two
+                      // byte sequence (110xxxxx 10yyyyyy -> xxxxxyyyyyy) before writing it, so
+                      // that 0xc2 0x80 becomes '\u0080' and not '\uc280'.
+                      const unsigned codePoint = ((first & 0x1fu) << 6) | (second & 0x3fu);
+                      writeSequence(writer, 2, unicodeEscape(codePoint));
+                  });
+}
+} // namespace mongo::logv2
diff --git a/src/mongo/logv2/string_escape.h b/src/mongo/logv2/string_escape.h
new file mode 100644
index 00000000000..5cf392e32b6
--- /dev/null
+++ b/src/mongo/logv2/string_escape.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (C) 2019-present MongoDB, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the Server Side Public License, version 1,
+ * as published by MongoDB, Inc.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * Server Side Public License for more details.
+ *
+ * You should have received a copy of the Server Side Public License
+ * along with this program. If not, see
+ * <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ * As a special exception, the copyright holders give permission to link the
+ * code of portions of this program with the OpenSSL library under certain
+ * conditions as described in each individual source file and distribute
+ * linked combinations including the program with the OpenSSL library. You
+ * must comply with the Server Side Public License in all respects for
+ * all of the code used other than as permitted herein. If you modify file(s)
+ * with this exception, you may extend this exception to your version of the
+ * file(s), but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version. If you delete this
+ * exception statement from all source files in the program, then also delete
+ * it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/string_data.h"
+
+#include <string>
+
+namespace mongo::logv2 {
+/**
+ * Returns a copy of 'str' escaped for the plain text log format. ASCII control characters and
+ * backslash are replaced with C-style escape sequences (such as \n, \\ or \x7f). Two byte UTF-8
+ * sequences encoding C1 control codes, and bytes that are not part of a well-formed UTF-8
+ * sequence, are replaced with \xNN escapes. All other input is copied unchanged.
+ */
+std::string escapeForText(StringData str);
+/**
+ * Returns a copy of 'str' escaped for use inside a JSON string. ASCII control characters,
+ * backslash and double quote are replaced with JSON escape sequences (such as \n, \" or \u001f).
+ * Two byte UTF-8 sequences encoding C1 control codes, and bytes that are not part of a
+ * well-formed UTF-8 sequence, are replaced with \uNNNN escapes. All other input is copied
+ * unchanged.
+ */
+std::string escapeForJSON(StringData str);
+} // namespace mongo::logv2 |