/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/shell/linenoise_utf8.h"

#ifdef _WIN32
#include "mongo/platform/windows_basic.h"
#include "mongo/util/text.h"
#include <io.h>
#else
#include <unistd.h>
#endif

namespace linenoise_utf8 {

namespace {

/** Code point stored in place of every ill-formed UTF-8 sequence. */
const UChar32 kReplacementCharacter = 0xFFFD;

/** Byte Order Mark; it is decoded but never stored in the output. */
const UChar32 kByteOrderMark = 0xFEFF;

/** Returns true when low <= byte <= high. */
inline bool isInRange(UChar8 byte, UChar8 low, UChar8 high) {
    return byte >= low && byte <= high;
}

}  // namespace

/**
 * Convert a null terminated UTF-8 string from UTF-8 and store it in a UChar32 destination buffer
 * Always null terminates the destination string if at least one character position is available
 * Errors in the UTF-8 encoding will be handled in two ways: the erroneous characters will be
 * converted to the Unicode error character U+FFFD and flag bits will be set in the
 * conversionErrorCode int.
 *
 * A decoded Byte Order Mark (U+FEFF) is consumed but not stored. On an ill-formed sequence the
 * input advances past the bytes that were accepted so far (at least one), so decoding resumes at
 * the first offending byte.
 *
 * @param uchar32output                 Destination UChar32 buffer
 * @param utf8input                     Source UTF-8 string
 * @param outputBufferSizeInCharacters  Destination buffer size in characters
 * @param outputUnicodeCharacterCount   Number of UChar32 characters placed in output buffer,
 *                                      not including the null terminator
 * @param conversionErrorCode           Flag bits from enum BadUTF8, or zero if no error
 */
void copyString8to32(UChar32* uchar32output,
                     const UChar8* utf8input,
                     size_t outputBufferSizeInCharacters,
                     size_t& outputUnicodeCharacterCount,
                     int& conversionErrorCode) {
    conversionErrorCode = BadUTF8_no_error;
    if (outputBufferSizeInCharacters == 0) {
        outputUnicodeCharacterCount = 0;
        return;
    }

    const UChar8* pIn = utf8input;
    UChar32* pOut = uchar32output;
    // The last slot is reserved for the null terminator. A pointer limit is used instead of an
    // int counter so that large buffer sizes are not truncated.
    const UChar32* const pOutLimit = uchar32output + (outputBufferSizeInCharacters - 1);

    while (*pIn && pOut < pOutLimit) {
        const UChar8 lead = pIn[0];

        // Classify the lead byte: total sequence length (0 means "not a valid lead byte") and
        // the range allowed for the second byte. The second-byte range is what rejects overlong
        // encodings (E0, F0) and values above U+10FFFF (F4).
        size_t sequenceLength = 0;
        UChar8 secondLow = 0x80;
        UChar8 secondHigh = 0xBF;
        if (lead <= 0x7F) {
            sequenceLength = 1;  // 0x00000000 to 0x0000007F
        } else if (lead < 0xC2) {
            // 0x80..0xBF is a stray continuation byte; 0xC0 and 0xC1 could only start an
            // overlong encoding. Leave sequenceLength at 0.
        } else if (lead <= 0xDF) {
            sequenceLength = 2;  // 0x00000080 to 0x000007FF
        } else if (lead <= 0xEF) {
            sequenceLength = 3;  // 0x00000800 to 0x0000FFFF
            if (lead == 0xE0) {
                secondLow = 0xA0;
            }
        } else if (lead <= 0xF4) {
            sequenceLength = 4;  // 0x00010000 to 0x0010FFFF
            if (lead == 0xF0) {
                secondLow = 0x90;
            }
            if (lead == 0xF4) {
                secondHigh = 0x8F;
            }
        }

        // Validate the trailing bytes one at a time. A byte is only inspected after the previous
        // one was accepted (and is therefore non-zero), so this never reads past the terminator.
        size_t bytesAccepted = 1;
        bool wellFormed = (sequenceLength != 0);
        while (wellFormed && bytesAccepted < sequenceLength) {
            const UChar8 low = (bytesAccepted == 1) ? secondLow : 0x80;
            const UChar8 high = (bytesAccepted == 1) ? secondHigh : 0xBF;
            if (isInRange(pIn[bytesAccepted], low, high)) {
                ++bytesAccepted;
            } else {
                wellFormed = false;
            }
        }

        UChar32 uchar32 = kReplacementCharacter;
        if (!wellFormed) {
            conversionErrorCode |= BadUTF8_invalid_byte;
        } else if (lead == 0xED && pIn[1] >= 0xA0) {
            // 0x0000D800 to 0x0000DFFF -- illegal surrogate value
            conversionErrorCode |= BadUTF8_surrogate;
        } else if (sequenceLength == 1) {
            uchar32 = lead;
        } else {
            // The lead byte carries (7 - sequenceLength) payload bits; each trailing byte adds 6.
            uchar32 = lead & (0xFF >> (sequenceLength + 1));
            for (size_t i = 1; i < sequenceLength; ++i) {
                uchar32 = (uchar32 << 6) | (pIn[i] & 0x3F);
            }
        }
        pIn += bytesAccepted;

        if (uchar32 != kByteOrderMark) {  // do not store Byte Order Mark
            *pOut++ = uchar32;
        }
    }
    *pOut = 0;
    outputUnicodeCharacterCount = static_cast<size_t>(pOut - uchar32output);
}

/**
 * Copy a null terminated UChar32 string to a UChar32 destination buffer
 * Always null terminates the destination string if at least one character position is available
 *
 * @param dest32                    Destination UChar32 buffer
 * @param source32                  Source UChar32 string
 * @param destLengthInCharacters    Destination buffer length in characters
 */
void copyString32(UChar32* dest32, const UChar32* source32, size_t destLengthInCharacters) {
    if (destLengthInCharacters == 0) {
        return;
    }
    // The pre-decrement keeps one slot free for the terminator.
    while (*source32 && --destLengthInCharacters > 0) {
        *dest32++ = *source32++;
    }
    *dest32 = 0;
}

/**
 * Convert a specified number of UChar32 characters from a possibly null terminated UChar32 string
 * to UTF-8 and store it in a UChar8 destination buffer
 * Always null terminates the destination string if at least one character position is available
 *
 * Conversion stops at the first character whose encoding, together with the null terminator,
 * would not fit in the remaining space. Values above 0x1FFFFF have no encoding here; they are
 * consumed without producing output.
 *
 * @param dest8                     Destination UChar8 buffer
 * @param source32                  Source UChar32 string
 * @param outputBufferSizeInBytes   Destination buffer size in bytes
 * @param charCount                 Maximum number of UChar32 characters to process
 * @return                          Count of bytes written to output buffer, not including null
 *                                  terminator
 */
size_t copyString32to8counted(UChar8* dest8,
                              const UChar32* source32,
                              size_t outputBufferSizeInBytes,
                              size_t charCount) {
    if (outputBufferSizeInBytes == 0) {
        return 0;
    }

    size_t outputUTF8ByteCount = 0;
    // charCount is tested before *source32 because the source may not be null terminated.
    for (; charCount != 0 && *source32; --charCount) {
        const UChar32 c = *source32;

        size_t encodedLength = 0;
        if (c <= 0x7F) {
            encodedLength = 1;
        } else if (c <= 0x7FF) {
            encodedLength = 2;
        } else if (c <= 0xFFFF) {
            encodedLength = 3;
        } else if (c <= 0x1FFFFF) {
            encodedLength = 4;
        }

        // outputUTF8ByteCount is always < outputBufferSizeInBytes, so the subtraction cannot
        // wrap. Stop when the character plus the terminator would not fit.
        if (encodedLength >= outputBufferSizeInBytes - outputUTF8ByteCount) {
            break;
        }
        ++source32;

        switch (encodedLength) {
            case 1:
                *dest8++ = static_cast<UChar8>(c);
                break;
            case 2:
                *dest8++ = static_cast<UChar8>(0xC0 | (c >> 6));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & c));
                break;
            case 3:
                *dest8++ = static_cast<UChar8>(0xE0 | (c >> 12));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & (c >> 6)));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & c));
                break;
            case 4:
                *dest8++ = static_cast<UChar8>(0xF0 | (c >> 18));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & (c >> 12)));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & (c >> 6)));
                *dest8++ = static_cast<UChar8>(0x80 | (0x3F & c));
                break;
            default:
                // Not encodable: consumed, nothing written.
                break;
        }
        outputUTF8ByteCount += encodedLength;
    }
    *dest8 = 0;
    return outputUTF8ByteCount;
}

/**
 * Convert a null terminated UChar32 string to UTF-8 and store it in a UChar8 destination buffer
 * Always null terminates the destination string if at least one character position is available
 *
 * @param dest8                     Destination UChar8 buffer
 * @param source32                  Source UChar32 string
 * @param outputBufferSizeInBytes   Destination buffer size in bytes
 * @return                          Count of bytes written to output buffer, not including null
 *                                  terminator
 */
size_t copyString32to8(UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes) {
    // The character limit is effectively "unbounded"; the source terminator ends the conversion.
    return copyString32to8counted(dest8, source32, outputBufferSizeInBytes, 0x7FFFFFFF);
}

/**
 * Count characters (i.e. Unicode code points, array elements) in a null terminated UChar32 string
 *
 * @param str32     Source UChar32 string
 * @return          String length in characters
 */
size_t strlen32(const UChar32* str32) {
    size_t length = 0;
    while (str32[length] != 0) {
        ++length;
    }
    return length;
}

/**
 * Compare two UChar32 null-terminated strings with length parameter
 *
 * @param first32   First string to compare
 * @param second32  Second string to compare
 * @param length    Maximum number of characters to compare
 * @return          Negative if first < second, positive if first > second, zero if equal
 */
int strncmp32(UChar32* first32, UChar32* second32, size_t length) {
    for (; length != 0; --length, ++first32, ++second32) {
        if (*first32 != *second32) {
            // Compare explicitly rather than subtracting, so the sign is right for any values.
            return (*first32 < *second32) ? -1 : 1;
        }
        if (*first32 == 0) {
            return 0;  // both strings ended together
        }
    }
    return 0;
}

/**
 * Internally convert an array of UChar32 characters of specified length to UTF-8 and write it to
 * fileHandle
 *
 * @param fileHandle                File handle to write to
 * @param string32                  Source UChar32 characters, may not be null terminated
 * @param sourceLengthInCharacters  Number of source characters to convert and write
 * @return                          Number of bytes written, -1 on error
 */
int write32(int fileHandle, const UChar32* string32, unsigned int sourceLengthInCharacters) {
    // Up to four UTF-8 bytes per character plus the terminator. Widen before multiplying so the
    // product is not computed (and possibly wrapped) in unsigned int.
    const size_t tempBufferBytes = 4 * static_cast<size_t>(sourceLengthInCharacters) + 1;
    std::unique_ptr<char[]> tempCharString(new char[tempBufferBytes]);
    const size_t count = copyString32to8counted(reinterpret_cast<UChar8*>(tempCharString.get()),
                                                string32,
                                                tempBufferBytes,
                                                sourceLengthInCharacters);
#if defined(_WIN32)
    if (_isatty(fileHandle)) {
        const bool success = mongo::writeUtf8ToWindowsConsole(tempCharString.get(), count);
        if (!success) {
            return -1;
        }
        return static_cast<int>(count);
    } else {
        return _write(fileHandle, tempCharString.get(), count);
    }
#else
    return static_cast<int>(write(fileHandle, tempCharString.get(), count));
#endif
}

}  // namespace linenoise_utf8