SERVER-44621 Add string escaping to text formatter in logv2

author: Henrik Edin <henrik.edin@mongodb.com> 2019-11-26 15:29:49 +0000
committer: evergreen <evergreen@mongodb.com> 2019-11-26 15:29:49 +0000
commit: 5e2967253a16233076ffcc64839c4cba4706e1e8 (patch)
tree: 6427fe892f36cfbe68584697e7706af934b9a0b5 /src/mongo
parent: 272c89db8935802eb43535382960dd7fe24326d9 (diff)
download: mongo-5e2967253a16233076ffcc64839c4cba4706e1e8.tar.gz
6 files changed, 524 insertions, 1 deletions
diff --git a/src/mongo/SConscript b/src/mongo/SConscript
index b085df33adf..706d19eab53 100644
--- a/src/mongo/SConscript
+++ b/src/mongo/SConscript
@@ -123,6 +123,7 @@ baseEnv.Library(
         'logv2/log_tag.cpp',
         'logv2/plain_formatter.cpp',
         'logv2/ramlog.cpp',
+        'logv2/string_escape.cpp',
         'logv2/text_formatter.cpp',
         'platform/decimal128.cpp',
         'platform/mutex.cpp',
diff --git a/src/mongo/logv2/constants.h b/src/mongo/logv2/constants.h
index a235bd8081b..7029e8209e8 100644
--- a/src/mongo/logv2/constants.h
+++ b/src/mongo/logv2/constants.h
@@ -35,4 +35,7 @@ namespace mongo::logv2::constants {
 // memory.
 constexpr size_t kNumStaticAttrs = 16;
 
+// Allocate extra space to fit some escape sequences
+constexpr size_t kReservedSpaceForEscaping = 16;
+
 }  // namespace mongo::logv2::constants
diff --git a/src/mongo/logv2/log_test_v2.cpp b/src/mongo/logv2/log_test_v2.cpp
index b0c9afa827b..92d122fd514 100644
--- a/src/mongo/logv2/log_test_v2.cpp
+++ b/src/mongo/logv2/log_test_v2.cpp
@@ -412,6 +412,53 @@ TEST_F(LogTestV2, JSONFormat) {
     ASSERT(log.getField("attr"_sd).Obj().getField("name").String() == t.toString());
 }
 
+// Verifies that the plain text formatter escapes control characters and malformed UTF-8 while
+// leaving well-formed multi byte sequences untouched.
+TEST_F(LogTestV2, Unicode) {
+    std::vector<std::string> captured;
+    auto backend = LogTestBackend::create(captured);
+    backend->set_filter(ComponentSettingsFilter(LogManager::global().getGlobalDomain(),
+                                                LogManager::global().getGlobalSettings()));
+    backend->set_formatter(PlainFormatter());
+    attach(backend);
+
+    // Each entry is {raw input, expected formatted output}.
+    const std::pair<StringData, StringData> cases[] = {
+        // Single byte characters that need to be escaped
+        {"\a\b\f\n\r\t\v\\\0\x7f\x1b"_sd, "\\a\\b\\f\\n\\r\\t\\v\\\\\\0\\x7f\\e"_sd},
+        // Multi byte characters that need to be escaped (unicode C1 control characters)
+        {"\u0080\u009f"_sd, "\\xc2\\x80\\xc2\\x9f"_sd},
+        // Valid 2 octet sequence, LATIN SMALL LETTER N WITH TILDE
+        {"\u00f1"_sd, "\u00f1"_sd},
+        // Invalid 2 octet sequence, the lead byte is escaped
+        {"\xc3\x28"_sd, "\\xc3\x28"_sd},
+        // Invalid sequence identifier, every byte is escaped
+        {"\xa0\xa1"_sd, "\\xa0\\xa1"_sd},
+        // Valid 3 octet sequence, RUNIC LETTER TIWAZ TIR TYR T
+        {"\u16cf"_sd, "\u16cf"_sd},
+        // Invalid 3 octet sequence (in 2nd octet), non-ASCII bytes are escaped
+        {"\xe2\x28\xa1"_sd, "\\xe2\x28\\xa1"_sd},
+        // Invalid 3 octet sequence (in 3rd octet), non-ASCII bytes are escaped
+        {"\xe2\x82\x28"_sd, "\\xe2\\x82\x28"_sd},
+        // Valid 4 octet sequence, GOTHIC LETTER MANNA
+        {"\U0001033c"_sd, "\U0001033c"_sd},
+        // Invalid 4 octet sequence (in 2nd octet), non-ASCII bytes are escaped
+        {"\xf0\x28\x8c\xbc"_sd, "\\xf0\x28\\x8c\\xbc"_sd},
+        // Invalid 4 octet sequence (in 3rd octet), non-ASCII bytes are escaped
+        {"\xf0\x90\x28\xbc"_sd, "\\xf0\\x90\x28\\xbc"_sd},
+        // Invalid 4 octet sequence (2nd and 4th octets are not continuation bytes),
+        // non-ASCII bytes are escaped
+        {"\xf0\x28\x8c\x28"_sd, "\\xf0\x28\\x8c\x28"_sd},
+        // Well-formed 5 octet pattern (but not Unicode!), every byte is escaped
+        {"\xf8\xa1\xa1\xa1\xa1"_sd, "\\xf8\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Well-formed 6 octet pattern (but not Unicode!), every byte is escaped
+        {"\xfc\xa1\xa1\xa1\xa1\xa1"_sd, "\\xfc\\xa1\\xa1\\xa1\\xa1\\xa1"_sd},
+        // Invalid 3 octet sequence, the buffer ends prematurely, every byte is escaped
+        {"\xe2\x82"_sd, "\\xe2\\x82"_sd},
+    };
+
+    // Log every input as the sole attribute and compare the most recently captured line.
+    for (const auto& testCase : cases) {
+        LOGV2("{}", "name"_attr = testCase.first);
+        ASSERT_EQUALS(captured.back(), testCase.second);
+    }
+}
+
 TEST_F(LogTestV2, Threads) {
     std::vector<std::string> linesPlain;
     auto plainSink = LogTestBackend::create(linesPlain);
diff --git a/src/mongo/logv2/plain_formatter.cpp b/src/mongo/logv2/plain_formatter.cpp
index 235172e6dda..82caf3ecf14 100644
--- a/src/mongo/logv2/plain_formatter.cpp
+++ b/src/mongo/logv2/plain_formatter.cpp
@@ -33,6 +33,7 @@
 #include "mongo/logv2/attribute_storage.h"
 #include "mongo/logv2/attributes.h"
 #include "mongo/logv2/constants.h"
+#include "mongo/logv2/string_escape.h"
 
 #include <boost/container/small_vector.hpp>
 #include <boost/log/attributes/value_extraction.hpp>
@@ -47,7 +48,7 @@ namespace {
 
 struct TextValueExtractor {
     void operator()(StringData name, CustomAttributeValue const& val) {
-        _storage.push_back(val.toString());
+        _storage.push_back(escapeForText(val.toString()));
         operator()(name, _storage.back());
     }
 
@@ -56,6 +57,11 @@ struct TextValueExtractor {
         operator()(name, _storage.back());
     }
 
+    // Handles StringData attribute values: escapes 'val' with escapeForText(), appends the
+    // escaped copy to '_storage', and then forwards the stored element to another operator()
+    // overload under the same 'name'.
+    // NOTE(review): the copy is presumably kept in '_storage' so the forwarded value outlives
+    // this call -- confirm against the rest of the struct, which is not visible here.
+    void operator()(StringData name, StringData val) {
+        _storage.push_back(escapeForText(val));
+        operator()(name, _storage.back());
+    }
+
     template <typename T>
     void operator()(StringData name, const T& val) {
         args.push_back(fmt::internal::make_arg<fmt::format_context>(val));
diff --git a/src/mongo/logv2/string_escape.cpp b/src/mongo/logv2/string_escape.cpp
new file mode 100644
index 00000000000..c270c031668
--- /dev/null
+++ b/src/mongo/logv2/string_escape.cpp
@@ -0,0 +1,427 @@
+/**
+ *    Copyright (C) 2019-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#include "mongo/logv2/string_escape.h"
+
+#include "mongo/logv2/constants.h"
+
+#include <algorithm>
+#include <array>
+#include <iterator>
+
+namespace mongo::logv2 {
+namespace {
+constexpr char kHexChar[] = "0123456789abcdef";
+
+// Scans 'str' as UTF-8 and returns a copy in which selected bytes are replaced by escape
+// sequences chosen by the three callbacks:
+// 'singleHandler' is invoked for every byte with the high bit clear (a one byte UTF-8
+// sequence). It decides whether to write an escape sequence; if it writes nothing the byte is
+// copied through unchanged.
+// 'singleEscaper' is invoked for a single byte that is not part of a well-formed sequence as
+// judged by this function (stray continuation byte, lead byte of the form 11111xxx, or a lead
+// byte that is not followed by enough continuation bytes).
+// 'twoEscaper' is invoked for a well-formed two byte sequence that encodes a C1 control code
+// (first byte 0xc2, second byte in [0x80, 0xa0)).
+// All these functions take a function object as their first parameter to perform the
+// writing of any escaped data. This function expects the number of handled bytes as its first
+// parameter and the corresponding escaped string as the second. They are templates so they can
+// be inlined.
+template <typename SingleByteHandler, typename SingleByteEscaper, typename TwoByteEscaper>
+std::string escape(StringData str,
+                   SingleByteHandler singleHandler,
+                   SingleByteEscaper singleEscaper,
+                   TwoByteEscaper twoEscaper) {
+    std::string escaped;
+    // If the input is larger than the capacity of an empty string (the SSO size) we are going
+    // to allocate anyway, so reserve the input size plus some extra to fit a couple of eventual
+    // escape sequences.
+    if (str.size() > escaped.capacity())
+        escaped.reserve(str.size() + constants::kReservedSpaceForEscaping);
+
+    // The range [begin, it) contains input that does not need to be escaped and that has not
+    // been written to output yet.
+    // The range [it, end) contains the remaining input to scan.
+    // 'begin' is pointing to the beginning of the input that has not yet been written to
+    // 'escaped'.
+    // 'it' is pointing to the beginning of the unicode code point we're currently processing in
+    // the while-loop below. 'end' is the end of the input sequence.
+    auto begin = str.begin();
+    auto it = str.begin();
+    auto end = str.end();
+
+    // Writes an escaped sequence to output after flushing pending input that does not need to be
+    // escaped. 'it' is assumed to be at the beginning of the input sequence represented by the
+    // escaped data; 'it' itself is not modified here, the caller advances it afterwards.
+    // 'numHandled' is the number of bytes of unescaped input that 'escapeSequence' replaces.
+    auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
+        // Flush range of unmodified input
+        escaped.append(begin, it);
+        // The replaced input bytes must not be flushed later, so pending input restarts after them
+        begin = it + numHandled;
+
+        // Write escaped data
+        escaped.append(escapeSequence.rawData(), escapeSequence.size());
+    };
+
+    // Returns true if at least 'len' bytes remain from 'pos' and the 'len' - 1 bytes following
+    // 'pos' all have the continuation byte form 10xxxxxx. The lead byte itself and the decoded
+    // value are not inspected here.
+    auto isValidCodePoint = [&](auto pos, int len) {
+        return std::distance(pos, end) >= len &&
+            std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; });
+    };
+
+    // Helper function to hand a valid one byte UTF-8 sequence from the input stream to
+    // 'singleHandler', which escapes it if desired
+    auto writeValid1Byte = [&]() { singleHandler(flushAndWrite, *it); };
+
+    // Helper function to process a valid two byte UTF-8 sequence from the input stream. Only C1
+    // control codes are passed to 'twoEscaper'; any other two byte sequence is left in the
+    // pending unescaped range.
+    auto writeValid2Byte = [&]() {
+        uint8_t first = *it;
+        uint8_t second = *(it + 1);
+
+        if (MONGO_unlikely(first == 0xc2 && second >= 0x80 && second < 0xa0)) {
+            twoEscaper(flushAndWrite, first, second);
+        }
+    };
+
+    // Helper function to write a single byte of invalid UTF-8 from the input stream using
+    // 'singleEscaper'. It handles exactly one byte and does not move 'it'; the loop below
+    // advances 'it' by one after each call.
+    auto writeInvalid = [&](uint8_t c) { singleEscaper(flushAndWrite, c); };
+
+
+    // Classify each sequence by the leading bits of its first byte.
+    while (it != end) {
+        uint8_t c = *it;
+        bool bit7 = (c >> 7) & 1;
+        if (MONGO_likely(!bit7)) {
+            // 0xxxxxxx: one byte sequence
+            writeValid1Byte();
+            ++it;
+            continue;
+        }
+
+        bool bit6 = (c >> 6) & 1;
+        if (MONGO_unlikely(!bit6)) {
+            // 10xxxxxx: continuation byte without a lead byte
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        bool bit5 = (c >> 5) & 1;
+        if (!bit5) {
+            // 110xxxxx: 2 byte sequence
+            if (MONGO_likely(isValidCodePoint(it, 2))) {
+                writeValid2Byte();
+                it += 2;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+
+            continue;
+        }
+
+        bool bit4 = (c >> 4) & 1;
+        if (!bit4) {
+            // 1110xxxx: 3 byte sequence, passed through unescaped when well-formed
+            if (MONGO_likely(isValidCodePoint(it, 3))) {
+                it += 3;
+            } else {
+                writeInvalid(c);
+                ++it;
+            }
+            continue;
+        }
+
+        bool bit3 = (c >> 3) & 1;
+        if (bit3) {
+            // 11111xxx: not a valid UTF-8 lead byte
+            writeInvalid(c);
+            ++it;
+            continue;
+        }
+
+        // 11110xxx: 4 byte sequence, passed through unescaped when well-formed
+        if (MONGO_likely(isValidCodePoint(it, 4))) {
+            it += 4;
+        } else {
+            writeInvalid(c);
+            ++it;
+        }
+    }
+    // Write last block of pending unescaped input
+    escaped.append(begin, it);
+    return escaped;
+}
+}  // namespace
+// Returns a copy of 'str' escaped for the plain text log format. ASCII control characters,
+// DEL and backslash are written as C-style escapes, bytes that are not part of a well-formed
+// UTF-8 sequence are written as "\xNN", and C1 control codes (0xc2 0x80 .. 0xc2 0x9f) are
+// written as the "\xNN" escapes of both of their bytes.
+std::string escapeForText(StringData str) {
+    // Writes one input byte as the four characters "\xNN" using lowercase hex digits.
+    auto writeHexByte = [](const auto& emit, uint8_t byte) {
+        const std::array<char, 4> seq = {'\\', 'x', kHexChar[byte >> 4], kHexChar[byte & 0xf]};
+        emit(1, StringData(seq.data(), seq.size()));
+    };
+
+    // Handles bytes below 0x80. Characters that have a short mnemonic escape use it; every
+    // other control character (below 0x20, or DEL) falls back to the "\xNN" form. Anything
+    // else is left alone so it is copied through unescaped.
+    auto handleAscii = [writeHexByte](const auto& emit, uint8_t byte) {
+        switch (byte) {
+            case '\0':
+                emit(1, "\\0"_sd);
+                return;
+            case '\a':
+                emit(1, "\\a"_sd);
+                return;
+            case '\b':
+                emit(1, "\\b"_sd);
+                return;
+            case '\t':
+                emit(1, "\\t"_sd);
+                return;
+            case '\n':
+                emit(1, "\\n"_sd);
+                return;
+            case '\v':
+                emit(1, "\\v"_sd);
+                return;
+            case '\f':
+                emit(1, "\\f"_sd);
+                return;
+            case '\r':
+                emit(1, "\\r"_sd);
+                return;
+            case 0x1b:
+                emit(1, "\\e"_sd);
+                return;
+            case '\\':
+                emit(1, "\\\\"_sd);
+                return;
+            default:
+                break;
+        }
+
+        if (byte < 0x20 || byte == 0x7f) {
+            writeHexByte(emit, byte);
+        }
+    };
+
+    // Writes both bytes of a two byte C1 control code as consecutive "\xNN" escapes.
+    auto writeC1Control = [](const auto& emit, uint8_t first, uint8_t second) {
+        const std::array<char, 8> seq = {'\\',
+                                         'x',
+                                         kHexChar[first >> 4],
+                                         kHexChar[first & 0xf],
+                                         '\\',
+                                         'x',
+                                         kHexChar[second >> 4],
+                                         kHexChar[second & 0xf]};
+        emit(2, StringData(seq.data(), seq.size()));
+    };
+
+    return escape(str, handleAscii, writeHexByte, writeC1Control);
+}
+// Returns a copy of 'str' escaped so it can be placed inside a JSON string literal.
+// - '"' and '\' are backslash escaped.
+// - \b, \t, \n, \f and \r use their short JSON escapes.
+// - Every other ASCII control character (below 0x20) and DEL (0x7f) is written as "\u00NN".
+// - Bytes that are not part of a well-formed UTF-8 sequence are written as "\u00NN" where NN
+//   is the byte value.
+// - C1 control codes (encoded as 0xc2 0x80 .. 0xc2 0x9f) are written as the "\uNNNN" escape of
+//   their code point.
+// All other input is copied through unchanged.
+std::string escapeForJSON(StringData str) {
+    // Writes one byte value as the six characters "\u00NN" using lowercase hex digits.
+    auto writeUnicodeByte = [](const auto& emit, uint8_t byte) {
+        const std::array<char, 6> seq = {
+            '\\', 'u', '0', '0', kHexChar[byte >> 4], kHexChar[byte & 0xf]};
+        emit(1, StringData(seq.data(), seq.size()));
+    };
+
+    // Handles bytes below 0x80. The "\u00NN" escapes are generated from the byte value rather
+    // than spelled out per character, so each control character is guaranteed to map to its
+    // own code (previously 0x1c was written as "\u000c", which is form feed).
+    auto handleAscii = [writeUnicodeByte](const auto& emit, uint8_t byte) {
+        switch (byte) {
+            case '\b':
+                emit(1, "\\b"_sd);
+                return;
+            case '\t':
+                emit(1, "\\t"_sd);
+                return;
+            case '\n':
+                emit(1, "\\n"_sd);
+                return;
+            case '\f':
+                emit(1, "\\f"_sd);
+                return;
+            case '\r':
+                emit(1, "\\r"_sd);
+                return;
+            case '\\':
+                emit(1, "\\\\"_sd);
+                return;
+            case '\"':
+                emit(1, "\\\""_sd);
+                return;
+            default:
+                break;
+        }
+
+        if (byte < 0x20 || byte == 0x7f) {
+            writeUnicodeByte(emit, byte);
+        }
+    };
+
+    // Writes a two byte UTF-8 sequence as a single "\uNNNN" escape. JSON's \u escape names a
+    // code point, not the UTF-8 bytes, so the sequence is decoded first: 110xxxxx 10yyyyyy
+    // encodes the code point xxxxxyyyyyy (e.g. 0xc2 0x80 becomes "\u0080", not "\uc280").
+    auto writeC1Control = [](const auto& emit, uint8_t first, uint8_t second) {
+        const unsigned codePoint = ((first & 0x1fu) << 6) | (second & 0x3fu);
+        const std::array<char, 6> seq = {'\\',
+                                         'u',
+                                         kHexChar[(codePoint >> 12) & 0xf],
+                                         kHexChar[(codePoint >> 8) & 0xf],
+                                         kHexChar[(codePoint >> 4) & 0xf],
+                                         kHexChar[codePoint & 0xf]};
+        emit(2, StringData(seq.data(), seq.size()));
+    };
+
+    return escape(str, handleAscii, writeUnicodeByte, writeC1Control);
+}
+}  // namespace mongo::logv2
diff --git a/src/mongo/logv2/string_escape.h b/src/mongo/logv2/string_escape.h
new file mode 100644
index 00000000000..5cf392e32b6
--- /dev/null
+++ b/src/mongo/logv2/string_escape.h
@@ -0,0 +1,39 @@
+/**
+ *    Copyright (C) 2019-present MongoDB, Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the Server Side Public License, version 1,
+ *    as published by MongoDB, Inc.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *    Server Side Public License for more details.
+ *
+ *    You should have received a copy of the Server Side Public License
+ *    along with this program. If not, see
+ *    <http://www.mongodb.com/licensing/server-side-public-license>.
+ *
+ *    As a special exception, the copyright holders give permission to link the
+ *    code of portions of this program with the OpenSSL library under certain
+ *    conditions as described in each individual source file and distribute
+ *    linked combinations including the program with the OpenSSL library. You
+ *    must comply with the Server Side Public License in all respects for
+ *    all of the code used other than as permitted herein. If you modify file(s)
+ *    with this exception, you may extend this exception to your version of the
+ *    file(s), but you are not obligated to do so. If you do not wish to do so,
+ *    delete this exception statement from your version. If you delete this
+ *    exception statement from all source files in the program, then also delete
+ *    it in the license file.
+ */
+
+#pragma once
+
+#include "mongo/base/string_data.h"
+
+#include <string>
+
+namespace mongo::logv2 {
+// Returns a copy of 'str' escaped for the text log format: ASCII control characters, DEL and
+// backslash are written as C-style escape sequences, and bytes that do not form a well-formed
+// UTF-8 sequence, as well as the bytes of C1 control codes, are written as "\xNN". Other input
+// is returned unchanged.
+std::string escapeForText(StringData str);
+
+// Returns a copy of 'str' escaped for use inside a JSON string: ASCII control characters, DEL,
+// backslash and double quote are written as JSON escape sequences, and bytes that do not form
+// a well-formed UTF-8 sequence, as well as C1 control codes, are written as "\u" escapes. Other
+// input is returned unchanged.
+std::string escapeForJSON(StringData str);
+}  // namespace mongo::logv2
author	Henrik Edin <henrik.edin@mongodb.com>	2019-11-26 15:29:49 +0000
committer	evergreen <evergreen@mongodb.com>	2019-11-26 15:29:49 +0000
commit	5e2967253a16233076ffcc64839c4cba4706e1e8 (patch)
tree	6427fe892f36cfbe68584697e7706af934b9a0b5 /src/mongo
parent	272c89db8935802eb43535382960dd7fe24326d9 (diff)
download	mongo-5e2967253a16233076ffcc64839c4cba4706e1e8.tar.gz