/**
 *    Copyright (C) 2019-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/util/str_escape.h"

#include <algorithm>
#include <array>
#include <iterator>

namespace mongo::str {
namespace {

// Lower-case hexadecimal digits, indexed by nibble value.
constexpr char kHexChar[] = "0123456789abcdef";

// Scans 'str' as UTF-8 and appends it to 'buffer', replacing selected input with escape sequences
// chosen by the three handlers:
// 'singleHandler' Function to write a valid single byte UTF-8 sequence with desired escaping.
// 'invalidByteHandler' Function to write a byte of invalid UTF-8 encoding.
// 'twoEscaper' Function to write a valid two byte UTF-8 sequence with desired escaping, for C1
// control codes.
// All these functions take a function object as their first parameter to perform the
// writing of any escaped data. This function expects the number of handled bytes as its first
// parameter and the corresponding escaped string as the second. A handler that does not call the
// writer leaves the input untouched, and it is later copied to the output verbatim.
// The handlers are template parameters so they can be inlined.
template <typename SingleByteHandler, typename InvalidByteHandler, typename TwoByteEscaper>
void escape(fmt::memory_buffer& buffer,
            StringData str,
            SingleByteHandler singleHandler,
            InvalidByteHandler invalidByteHandler,
            TwoByteEscaper twoEscaper) {
    // The range [begin, it) contains input that does not need to be escaped and that has not been
    // written to output yet.
    // The range [it, end) contains the remaining input to scan.
    // 'begin' is pointing to the beginning of the input that has not yet been written to 'buffer'.
    // 'it' is pointing to the beginning of the unicode code point we're currently processing in
    // the while-loop below.
    // 'end' is the end of the input sequence.
    auto begin = str.begin();
    auto it = str.begin();
    auto end = str.end();

    // Writes an escaped sequence to output after flushing pending input that does not need to be
    // escaped. 'it' is assumed to be at the beginning of the input sequence represented by the
    // escaped data.
    // 'numHandled' the number of bytes of unescaped data being written escaped in 'escapeSequence'
    auto flushAndWrite = [&](size_t numHandled, StringData escapeSequence) {
        // Flush range of unmodified input
        buffer.append(begin, it);
        // The escaped input bytes are consumed; pending input restarts right after them.
        begin = it + numHandled;

        // Write escaped data
        buffer.append(escapeSequence.rawData(), escapeSequence.rawData() + escapeSequence.size());
    };

    // Returns true if at least 'len' bytes remain starting at 'pos' and the 'len - 1' bytes
    // following the lead byte are all UTF-8 continuation bytes (bit pattern 10xxxxxx). The lead
    // byte itself is classified by the caller.
    auto isValidCodePoint = [&](auto pos, int len) {
        return std::distance(pos, end) >= len &&
            std::all_of(pos + 1, pos + len, [](uint8_t c) { return (c >> 6) == 0b10; });
    };

    // Helper function to write a valid one byte UTF-8 sequence from the input stream
    auto writeValid1Byte = [&]() { singleHandler(flushAndWrite, *it); };

    // Helper function to write a valid two byte UTF-8 sequence from the input stream. Only the
    // sequences 0xc2 0x80 through 0xc2 0x9f (the C1 control codes) are passed to 'twoEscaper';
    // every other two byte sequence stays in the pending range and is copied verbatim.
    auto writeValid2Byte = [&]() {
        uint8_t first = *it;
        uint8_t second = *(it + 1);

        if (MONGO_unlikely(first == 0xc2 && second >= 0x80 && second < 0xa0)) {
            twoEscaper(flushAndWrite, first, second);
        }
    };

    // Helper function to write a single byte of invalid UTF-8 from the input stream.
    // Does not move 'it'; the caller advances past the byte afterwards.
    auto writeInvalid = [&](uint8_t c) { invalidByteHandler(flushAndWrite, c); };

    while (it != end) {
        uint8_t c = *it;

        // 0xxxxxxx: one byte sequence
        bool bit7 = (c >> 7) & 1;
        if (MONGO_likely(!bit7)) {
            writeValid1Byte();
            ++it;
            continue;
        }

        // 10xxxxxx: continuation byte where a lead byte was expected
        bool bit6 = (c >> 6) & 1;
        if (MONGO_unlikely(!bit6)) {
            writeInvalid(c);
            ++it;
            continue;
        }

        bool bit5 = (c >> 5) & 1;
        if (!bit5) {
            // 110xxxxx: 2 byte sequence
            if (MONGO_likely(isValidCodePoint(it, 2))) {
                writeValid2Byte();
                it += 2;
            } else {
                writeInvalid(c);
                ++it;
            }
            continue;
        }

        bool bit4 = (c >> 4) & 1;
        if (!bit4) {
            // 1110xxxx: 3 byte sequence, never escaped when well formed
            if (MONGO_likely(isValidCodePoint(it, 3))) {
                it += 3;
            } else {
                writeInvalid(c);
                ++it;
            }
            continue;
        }

        // 11111xxx: not a legal lead byte
        bool bit3 = (c >> 3) & 1;
        if (bit3) {
            writeInvalid(c);
            ++it;
            continue;
        }

        // 11110xxx: 4 byte sequence, never escaped when well formed
        if (MONGO_likely(isValidCodePoint(it, 4))) {
            it += 4;
        } else {
            writeInvalid(c);
            ++it;
        }
    }

    // Write last block
    buffer.append(begin, it);
}

}  // namespace

// Appends 'str' to 'buffer' escaped for plain text output.
// Bytes 0x00-0x1f, 0x7f and the backslash are written as a C-style escape (for example "\n",
// "\t", "\\") or as "\xNN". Every byte of invalid UTF-8 is written as "\xNN", and a two byte C1
// control code is written as two "\xNN" escapes, one per encoded byte. All other input is copied
// unchanged.
void escapeForText(fmt::memory_buffer& buffer, StringData str) {
    auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
        switch (unescaped) {
            case '\0':
                writer(1, "\\0"_sd);
                break;
            case 0x01:
                writer(1, "\\x01"_sd);
                break;
            case 0x02:
                writer(1, "\\x02"_sd);
                break;
            case 0x03:
                writer(1, "\\x03"_sd);
                break;
            case 0x04:
                writer(1, "\\x04"_sd);
                break;
            case 0x05:
                writer(1, "\\x05"_sd);
                break;
            case 0x06:
                writer(1, "\\x06"_sd);
                break;
            case 0x07:
                writer(1, "\\a"_sd);
                break;
            case 0x08:
                writer(1, "\\b"_sd);
                break;
            case 0x09:
                writer(1, "\\t"_sd);
                break;
            case 0x0a:
                writer(1, "\\n"_sd);
                break;
            case 0x0b:
                writer(1, "\\v"_sd);
                break;
            case 0x0c:
                writer(1, "\\f"_sd);
                break;
            case 0x0d:
                writer(1, "\\r"_sd);
                break;
            case 0x0e:
                writer(1, "\\x0e"_sd);
                break;
            case 0x0f:
                writer(1, "\\x0f"_sd);
                break;
            case 0x10:
                writer(1, "\\x10"_sd);
                break;
            case 0x11:
                writer(1, "\\x11"_sd);
                break;
            case 0x12:
                writer(1, "\\x12"_sd);
                break;
            case 0x13:
                writer(1, "\\x13"_sd);
                break;
            case 0x14:
                writer(1, "\\x14"_sd);
                break;
            case 0x15:
                writer(1, "\\x15"_sd);
                break;
            case 0x16:
                writer(1, "\\x16"_sd);
                break;
            case 0x17:
                writer(1, "\\x17"_sd);
                break;
            case 0x18:
                writer(1, "\\x18"_sd);
                break;
            case 0x19:
                writer(1, "\\x19"_sd);
                break;
            case 0x1a:
                writer(1, "\\x1a"_sd);
                break;
            case 0x1b:
                writer(1, "\\e"_sd);
                break;
            case 0x1c:
                writer(1, "\\x1c"_sd);
                break;
            case 0x1d:
                writer(1, "\\x1d"_sd);
                break;
            case 0x1e:
                writer(1, "\\x1e"_sd);
                break;
            case 0x1f:
                writer(1, "\\x1f"_sd);
                break;
            case '\\':
                writer(1, "\\\\"_sd);
                break;
            case 0x7f:
                writer(1, "\\x7f"_sd);
                break;
            default:
                // Needs no escaping; left in the pending range and copied verbatim.
                break;
        }
    };

    // An invalid byte is written as "\x" followed by its two hex digits.
    auto invalidByteHandler = [](const auto& writer, uint8_t invalid) {
        std::array<char, 4> buffer = {'\\', 'x', kHexChar[invalid >> 4], kHexChar[invalid & 0xf]};
        writer(1, StringData(buffer.data(), buffer.size()));
    };

    // A C1 control code is written as the "\xNN" escape of each of its two encoded bytes.
    auto twoByteEscaper = [](const auto& writer, uint8_t first, uint8_t second) {
        std::array<char, 8> buffer = {'\\',
                                      'x',
                                      kHexChar[first >> 4],
                                      kHexChar[first & 0xf],
                                      '\\',
                                      'x',
                                      kHexChar[second >> 4],
                                      kHexChar[second & 0xf]};
        writer(2, StringData(buffer.data(), buffer.size()));
    };

    return escape(buffer,
                  str,
                  std::move(singleByteHandler),
                  std::move(invalidByteHandler),
                  std::move(twoByteEscaper));
}

// Returns 'str' escaped for plain text output; see the buffer overload for the escaping rules.
std::string escapeForText(StringData str) {
    fmt::memory_buffer buffer;
    escapeForText(buffer, str);
    return fmt::to_string(buffer);
}

// Appends 'str' to 'buffer' escaped for use inside a JSON string.
// Bytes 0x00-0x1f, 0x7f, the backslash and the double quote are written as a short JSON escape
// (for example "\n", "\t", "\\", "\"") or as "\u00NN". Every byte of invalid UTF-8 is replaced
// by "\ufffd", and a two byte C1 control code is written as the "\uNNNN" escape of its decoded
// code point. All other input is copied unchanged. The surrounding quotes are not written.
void escapeForJSON(fmt::memory_buffer& buffer, StringData str) {
    auto singleByteHandler = [](const auto& writer, uint8_t unescaped) {
        switch (unescaped) {
            case '\0':
                writer(1, "\\u0000"_sd);
                break;
            case 0x01:
                writer(1, "\\u0001"_sd);
                break;
            case 0x02:
                writer(1, "\\u0002"_sd);
                break;
            case 0x03:
                writer(1, "\\u0003"_sd);
                break;
            case 0x04:
                writer(1, "\\u0004"_sd);
                break;
            case 0x05:
                writer(1, "\\u0005"_sd);
                break;
            case 0x06:
                writer(1, "\\u0006"_sd);
                break;
            case 0x07:
                writer(1, "\\u0007"_sd);
                break;
            case 0x08:
                writer(1, "\\b"_sd);
                break;
            case 0x09:
                writer(1, "\\t"_sd);
                break;
            case 0x0a:
                writer(1, "\\n"_sd);
                break;
            case 0x0b:
                writer(1, "\\u000b"_sd);
                break;
            case 0x0c:
                writer(1, "\\f"_sd);
                break;
            case 0x0d:
                writer(1, "\\r"_sd);
                break;
            case 0x0e:
                writer(1, "\\u000e"_sd);
                break;
            case 0x0f:
                writer(1, "\\u000f"_sd);
                break;
            case 0x10:
                writer(1, "\\u0010"_sd);
                break;
            case 0x11:
                writer(1, "\\u0011"_sd);
                break;
            case 0x12:
                writer(1, "\\u0012"_sd);
                break;
            case 0x13:
                writer(1, "\\u0013"_sd);
                break;
            case 0x14:
                writer(1, "\\u0014"_sd);
                break;
            case 0x15:
                writer(1, "\\u0015"_sd);
                break;
            case 0x16:
                writer(1, "\\u0016"_sd);
                break;
            case 0x17:
                writer(1, "\\u0017"_sd);
                break;
            case 0x18:
                writer(1, "\\u0018"_sd);
                break;
            case 0x19:
                writer(1, "\\u0019"_sd);
                break;
            case 0x1a:
                writer(1, "\\u001a"_sd);
                break;
            case 0x1b:
                writer(1, "\\u001b"_sd);
                break;
            case 0x1c:
                writer(1, "\\u001c"_sd);
                break;
            case 0x1d:
                writer(1, "\\u001d"_sd);
                break;
            case 0x1e:
                writer(1, "\\u001e"_sd);
                break;
            case 0x1f:
                writer(1, "\\u001f"_sd);
                break;
            case '\\':
                writer(1, "\\\\"_sd);
                break;
            case '\"':
                writer(1, "\\\""_sd);
                break;
            case 0x7f:
                writer(1, "\\u007f"_sd);
                break;
            default:
                // Needs no escaping; left in the pending range and copied verbatim.
                break;
        }
    };

    auto invalidByteHandler = [](const auto& writer, uint8_t) {
        // Write Unicode Replacement Character when the encoding is bad
        writer(1, "\\ufffd"_sd);
    };

    auto twoByteEscaper = [](const auto& writer, uint8_t first, uint8_t second) {
        // Decode the UTF-8 and write the codepoint with \u: the lead byte carries the upper five
        // payload bits and the continuation byte the lower six.
        uint16_t codepoint = ((first & 0b0001'1111) << 6) | (second & 0b0011'1111);
        std::array<char, 6> buffer = {'\\',
                                      'u',
                                      kHexChar[codepoint >> 12],
                                      kHexChar[(codepoint >> 8) & 0b0000'1111],
                                      kHexChar[(codepoint >> 4) & 0b0000'1111],
                                      kHexChar[codepoint & 0b0000'1111]};
        writer(2, StringData(buffer.data(), buffer.size()));
    };

    return escape(buffer,
                  str,
                  std::move(singleByteHandler),
                  std::move(invalidByteHandler),
                  std::move(twoByteEscaper));
}

// Returns 'str' escaped for use inside a JSON string; see the buffer overload for the escaping
// rules.
std::string escapeForJSON(StringData str) {
    fmt::memory_buffer buffer;
    escapeForJSON(buffer, str);
    return fmt::to_string(buffer);
}

}  // namespace mongo::str