// linenoise_utf8.h
/*
* Copyright 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see .
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects
* for all of the code used other than as permitted herein. If you modify
* file(s) with this exception, you may extend this exception to your
* version of the file(s), but you are not obligated to do so. If you do not
* wish to do so, delete this exception statement from your version. If you
* delete this exception statement from all source files in the program,
* then also delete it in the license file.
*/
#include <algorithm>
#include <memory>
#include <string.h>
namespace linenoise_utf8 {
using UChar8 = unsigned char;  // a single UTF-8 octet
using UChar32 = char32_t;      // a single Unicode code point

// Bit flags describing problems found while decoding UTF-8. Several flags may be or-ed
// together; they are reported through the conversionErrorCode argument of copyString8to32.
enum BadUTF8 {
    BadUTF8_no_error = 0x00,
    BadUTF8_invalid_byte = 0x01,
    BadUTF8_surrogate = 0x02
};
/**
 * Convert a null terminated UTF-8 string to UChar32 code points and store them in a UChar32
 * destination buffer.
 *
 * Always null terminates the destination string if at least one character position is
 * available.
 *
 * Errors in the UTF-8 encoding are handled in two ways: the erroneous characters are converted
 * to the Unicode error character U+FFFD, and flag bits are set in conversionErrorCode.
 *
 * @param uchar32output                 Destination UChar32 buffer
 * @param utf8input                     Source UTF-8 string (null terminated)
 * @param outputBufferSizeInCharacters  Destination buffer size in characters
 * @param outputUnicodeCharacterCount   [out] Number of UChar32 characters placed in output buffer
 * @param conversionErrorCode           [out] Flag bits from enum BadUTF8, or zero if no error
 */
void copyString8to32(UChar32* uchar32output,
const UChar8* utf8input,
size_t outputBufferSizeInCharacters,
size_t& outputUnicodeCharacterCount,
int& conversionErrorCode);
/**
 * Copy a null terminated UChar32 string to a UChar32 destination buffer.
 *
 * Always null terminates the destination string if at least one character position is
 * available.
 *
 * @param dest32                  Destination UChar32 buffer
 * @param source32                Source UChar32 string (null terminated)
 * @param destLengthInCharacters  Destination buffer length in characters
 */
void copyString32(UChar32* dest32, const UChar32* source32, size_t destLengthInCharacters);
/**
 * Convert a specified number of UChar32 characters from a possibly null terminated UChar32
 * string to UTF-8 and store the result in a UChar8 destination buffer.
 *
 * Always null terminates the destination string if at least one character position is
 * available.
 *
 * @param dest8                    Destination UChar8 buffer
 * @param source32                 Source UChar32 string; need not be null terminated
 * @param outputBufferSizeInBytes  Destination buffer size in bytes
 * @param charCount                Maximum number of UChar32 characters to process
 * @return                         Count of bytes written to output buffer, not including null
 *                                 terminator
 */
size_t copyString32to8counted(UChar8* dest8,
const UChar32* source32,
size_t outputBufferSizeInBytes,
size_t charCount);
/**
 * Convert a null terminated UChar32 string to UTF-8 and store the result in a UChar8
 * destination buffer.
 *
 * Always null terminates the destination string if at least one character position is
 * available.
 *
 * @param dest8                    Destination UChar8 buffer
 * @param source32                 Source UChar32 string (null terminated)
 * @param outputBufferSizeInBytes  Destination buffer size in bytes
 * @return                         Count of bytes written to output buffer, not including null
 *                                 terminator
 */
size_t copyString32to8(UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes);
/**
 * Count characters (i.e. Unicode code points, array elements) in a null terminated UChar32
 * string.
 *
 * @param str32  Source UChar32 string (null terminated)
 * @return       String length in characters; presumably excludes the null terminator, as the
 *               string classes in this header add one to it when sizing buffers -- confirm
 *               against the definition
 */
size_t strlen32(const UChar32* str32);
/**
 * Compare two null terminated UChar32 strings, examining at most `length` characters.
 *
 * NOTE(review): both parameters are non-const pointers even though this is documented as a
 * pure comparison; presumably the strings are not modified -- confirm against the definition.
 *
 * @param first32   First string to compare
 * @param second32  Second string to compare
 * @param length    Maximum number of characters to compare
 * @return          Negative if first < second, positive if first > second, zero if equal
 */
int strncmp32(UChar32* first32, UChar32* second32, size_t length);
/**
 * Internally convert an array of UChar32 characters of specified length to UTF-8 and write it
 * to fileHandle.
 *
 * @param fileHandle                File handle to write to
 * @param string32                  Source UChar32 character array, may not be null terminated
 * @param sourceLengthInCharacters  Number of source characters to convert and write
 * @return                          Number of bytes written, -1 on error
 */
int write32(int fileHandle, const UChar32* string32, unsigned int sourceLengthInCharacters);
/**
 * Template and classes for UChar8 and UChar32 strings.
 *
 * UtfStringMixin owns a heap buffer of char_type units and tracks its length, its capacity and
 * the number of code points it holds. Derived classes fill in the protected members; the mixin
 * supplies copying, assignment, swapping and element access.
 *
 * @tparam char_type  Unit type stored in the buffer
 */
template <typename char_type>
struct UtfStringMixin {
    typedef char_type char_t;  // inherited

    /** Creates an empty object that owns no buffer (get() returns null). */
    UtfStringMixin() : _len(0), _cap(0), _chars(0) {}

    /**
     * Deep copy; copies like std::string. The new buffer is sized to the source length plus a
     * terminator, not to the source capacity.
     *
     * Note: _str is initialized from _cap, which relies on _cap being declared before _str.
     */
    UtfStringMixin(const UtfStringMixin& other)
        : _len(other._len),
          _cap(other._len + 1),
          _chars(other._chars),
          _str(new char_t[_cap]) {
        if (other._str) {
            memcpy(_str.get(), other._str.get(), _cap * sizeof(char_t));
        } else {
            // A default-constructed source owns no buffer; passing its null pointer to memcpy
            // is undefined behavior, so produce an all-zero (i.e. terminated) buffer instead.
            memset(_str.get(), 0, _cap * sizeof(char_t));
        }
    }

    /** Copy-and-swap assignment: the by-value parameter is the copy. */
    UtfStringMixin& operator=(UtfStringMixin copy) {
        this->swap(copy);
        return *this;
    }

    /** @return Pointer to the owned buffer, or null if no buffer has been allocated. */
    char_t* get() const {
        return _str.get();
    }

    /** Unchecked element access; the object must own a buffer and idx must be within it. */
    char_t& operator[](size_t idx) {
        return _str[idx];
    }

    const char_t& operator[](size_t idx) const {
        return _str[idx];
    }

    /** @return Length in units of char_t, not counting the terminator. */
    size_t length() const {
        return _len;
    }

    /** @return Size of the owned buffer in units of char_t, including the terminator. */
    size_t capacity() const {
        return _cap;
    }

    /** @return Number of code points. */
    size_t chars() const {
        return _chars;
    }

    /** Exchanges the contents (buffer and bookkeeping) of the two objects. */
    void swap(UtfStringMixin& other) noexcept {
        std::swap(_len, other._len);
        std::swap(_cap, other._cap);
        std::swap(_chars, other._chars);
        _str.swap(other._str);
    }

protected:
    size_t _len;    // in units of char_t without nul
    size_t _cap;    // size of _str buffer including nul
    size_t _chars;  // number of codepoints
    std::unique_ptr<char_t[]> _str;
};
struct Utf32String;
struct Utf8String : public UtfStringMixin {
Utf8String() {}
explicit Utf8String(const UChar32* s, int chars = -1) {
if (chars == -1) {
initFrom32(s, strlen32(s));
} else {
initFrom32(s, chars);
}
}
explicit Utf8String(const Utf32String& c); // defined after utf32String
private:
void initFrom32(const UChar32* s, int chars) {
_chars = chars;
_cap = _chars * sizeof(UChar32) + 1;
_str.reset(new char_t[_cap]);
_len = copyString32to8counted(_str.get(), s, _cap, chars);
}
};
struct Utf32String : public UtfStringMixin {
Utf32String() {}
explicit Utf32String(const UChar32* s) {
_chars = _len = strlen32(s);
_cap = _len + 1;
_str.reset(new UChar32[_cap]);
memcpy(_str.get(), s, _cap * sizeof(UChar32));
}
explicit Utf32String(const UChar32* s, int textLen) {
_chars = _len = textLen;
_cap = _len + 1;
_str.reset(new UChar32[_cap]);
memcpy(_str.get(), s, _len * sizeof(UChar32));
_str[_len] = 0;
}
explicit Utf32String(const UChar8* s, int chars = -1) {
initFrom8(s, chars);
}
explicit Utf32String(const Utf8String& c) {
initFrom8(c.get(), c.chars());
}
explicit Utf32String(size_t reserve) {
_len = 0;
_cap = reserve;
_chars = 0;
_str.reset(new UChar32[_cap]);
_str[0] = 0;
}
void initFromBuffer(void) {
_chars = _len = strlen32(_str.get());
}
private:
void initFrom8(const UChar8* s, int chars) {
Utf32String temp;
if (chars == -1) {
temp._cap = strlen(reinterpret_cast(s)) + 1; // worst case ASCII
} else {
temp._cap = chars + 1;
}
temp._str.reset(new char_t[temp._cap]);
int error;
copyString8to32(temp._str.get(), s, temp._cap, temp._chars, error);
temp._len = temp._chars;
this->swap(temp);
}
};
// Builds the UTF-8 form of the code points held by a Utf32String.
inline Utf8String::Utf8String(const Utf32String& s) {
    const UChar32* const codePoints = s.get();
    const size_t codePointCount = s.chars();
    initFrom32(codePoints, codePointCount);
}
} // namespace linenoise_utf8