/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #include #include #include namespace linenoise_utf8 { typedef unsigned char UChar8; // UTF-8 octet typedef char32_t UChar32; // Unicode code point // Error bits (or-ed together) returned from utf8toUChar32string // enum BadUTF8 { BadUTF8_no_error = 0x00, BadUTF8_invalid_byte = 0x01, BadUTF8_surrogate = 0x02 }; /** * Convert a null terminated UTF-8 std::string from UTF-8 and store it in a UChar32 destination * buffer Always null terminates the destination std::string if at least one character position is * available Errors in the UTF-8 encoding will be handled in two ways: the erroneous characters will * be converted to the Unicode error character U+FFFD and flag bits will be set in the * conversionErrorCode int. * * @param uchar32output Destination UChar32 buffer * @param utf8input Source UTF-8 string * @param outputBufferSizeInCharacters Destination buffer size in characters * @param outputUnicodeCharacterCount Number of UChar32 characters placed in output buffer * @param conversionErrorCode Flag bits from enum BadUTF8, or zero if no error */ void copyString8to32(UChar32* uchar32output, const UChar8* utf8input, size_t outputBufferSizeInCharacters, size_t& outputUnicodeCharacterCount, int& conversionErrorCode); /** * Copy a null terminated UChar32 std::string to a UChar32 destination buffer * Always null terminates the destination std::string if at least one character position is * available * * @param dest32 Destination UChar32 buffer * @param source32 Source UChar32 string * @param destLengthInCharacters Destination buffer length in characters */ void copyString32(UChar32* dest32, const UChar32* source32, size_t destLengthInCharacters); /** * Convert a specified number of UChar32 characters from a possibly null terminated UChar32 * std::string to UTF-8 and store it in a UChar8 destination buffer * Always null terminates the destination std::string if at least one character position is * available * * @param dest8 Destination UChar8 buffer * @param source32 Source UChar32 string * @param outputBufferSizeInBytes Destination buffer size in bytes * @param charCount Maximum number of UChar32 characters to process * @return Count of bytes written to output buffer, not including null * terminator */ size_t copyString32to8counted(UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes, size_t charCount); /** * Convert a null terminated UChar32 std::string to UTF-8 and store it in a UChar8 destination * buffer * Always null terminates the destination std::string if at least one character position is * available * * @param dest8 Destination UChar8 buffer * @param source32 Source UChar32 string * @param outputBufferSizeInBytes Destination buffer size in bytes * @return Count of bytes written to output buffer, not including null * terminator */ size_t copyString32to8(UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes); /** * Count characters (i.e. Unicode code points, array elements) in a null terminated UChar32 string * * @param str32 Source UChar32 string * @return std::string length in characters */ size_t strlen32(const UChar32* str32); /** * Compare two UChar32 null-terminated strings with length parameter * * @param first32 First std::string to compare * @param second32 Second std::string to compare * @param length Maximum number of characters to compare * @return Negative if first < second, positive if first > second, zero if equal */ int strncmp32(UChar32* first32, UChar32* second32, size_t length); /** * Internally convert an array of UChar32 characters of specified length to UTF-8 and write it to * fileHandle * * @param fileHandle File handle to write to * @param string32 Source UChar32 character array, may not be null terminated * @param sourceLengthInCharacters Number of source characters to convert and write * @return Number of bytes written, -1 on error */ int write32(int fileHandle, const UChar32* string32, unsigned int sourceLengthInCharacters); /** * Template and classes for UChar8 and UChar32 strings */ template struct UtfStringMixin { typedef char_type char_t; // inherited UtfStringMixin() : _len(0), _cap(0), _chars(0) {} UtfStringMixin(const UtfStringMixin& other) // copies like std::string : _len(other._len), _cap(other._len + 1), _chars(other._chars), _str(new char_t[_cap]) { memcpy(_str.get(), other._str.get(), _cap * sizeof(char_t)); } UtfStringMixin& operator=(UtfStringMixin copy) { this->swap(copy); return *this; } char_t* get() const { return _str.get(); } char_t& operator[](size_t idx) { return _str[idx]; } const char_t& operator[](size_t idx) const { return _str[idx]; } size_t length() const { return _len; } size_t capacity() const { return _cap; } size_t chars() const { return _chars; } void swap(UtfStringMixin& other) { std::swap(_len, other._len); std::swap(_cap, other._cap); std::swap(_chars, other._chars); _str.swap(other._str); } protected: size_t _len; // in units of char_t without nul size_t _cap; // size of _str buffer including nul size_t _chars; // number of codepoints std::unique_ptr _str; }; struct Utf32String; struct Utf8String : public UtfStringMixin { Utf8String() {} explicit Utf8String(const UChar32* s, int chars = -1) { if (chars == -1) { initFrom32(s, strlen32(s)); } else { initFrom32(s, chars); } } explicit Utf8String(const Utf32String& c); // defined after utf32String private: void initFrom32(const UChar32* s, int chars) { _chars = chars; _cap = _chars * sizeof(UChar32) + 1; _str.reset(new char_t[_cap]); _len = copyString32to8counted(_str.get(), s, _cap, chars); } }; struct Utf32String : public UtfStringMixin { Utf32String() {} explicit Utf32String(const UChar32* s) { _chars = _len = strlen32(s); _cap = _len + 1; _str.reset(new UChar32[_cap]); memcpy(_str.get(), s, _cap * sizeof(UChar32)); } explicit Utf32String(const UChar32* s, int textLen) { _chars = _len = textLen; _cap = _len + 1; _str.reset(new UChar32[_cap]); memcpy(_str.get(), s, _len * sizeof(UChar32)); _str[_len] = 0; } explicit Utf32String(const UChar8* s, int chars = -1) { initFrom8(s, chars); } explicit Utf32String(const Utf8String& c) { initFrom8(c.get(), c.chars()); } explicit Utf32String(size_t reserve) { _len = 0; _cap = reserve; _chars = 0; _str.reset(new UChar32[_cap]); _str[0] = 0; } void initFromBuffer(void) { _chars = _len = strlen32(_str.get()); } private: void initFrom8(const UChar8* s, int chars) { Utf32String temp; if (chars == -1) { temp._cap = strlen(reinterpret_cast(s)) + 1; // worst case ASCII } else { temp._cap = chars + 1; } temp._str.reset(new char_t[temp._cap]); int error; copyString8to32(temp._str.get(), s, temp._cap, temp._chars, error); temp._len = temp._chars; this->swap(temp); } }; inline Utf8String::Utf8String(const Utf32String& s) { initFrom32(s.get(), s.chars()); } } // namespace linenoise_utf8