// linenoise_utf8.h
/*
* Copyright 2012 10gen Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects
* for all of the code used other than as permitted herein. If you modify
* file(s) with this exception, you may extend this exception to your
* version of the file(s), but you are not obligated to do so. If you do not
* wish to do so, delete this exception statement from your version. If you
* delete this exception statement from all source files in the program,
* then also delete it in the license file.
*/
#include <string.h>

#include <algorithm>

#include <boost/smart_ptr/scoped_array.hpp>
namespace linenoise_utf8 {
// Element types used by the conversion routines and string classes in this header.
// NOTE(review): UChar32 is a plain unsigned int; presumably it is assumed to be at
// least 32 bits wide on every supported platform -- confirm.
typedef unsigned char UChar8; // UTF-8 octet
typedef unsigned int UChar32; // Unicode code point
// Error flags for UTF-8 decoding.  Each enumerator occupies its own bit so that
// several of them can be or-ed together into a single int; copyString8to32
// reports them through its conversionErrorCode argument.
//
enum BadUTF8 {
    BadUTF8_no_error     = 0,
    BadUTF8_invalid_byte = 1 << 0,
    BadUTF8_surrogate    = 1 << 1
};
/**
 * Convert a null terminated UTF-8 string to UChar32 code points and store them in a
 * UChar32 destination buffer.
 * Always null terminates the destination string if at least one character position is available.
 * Errors in the UTF-8 encoding are handled in two ways: the erroneous characters are
 * converted to the Unicode replacement character U+FFFD and flag bits are set in
 * conversionErrorCode.
 *
 * @param uchar32output                 Destination UChar32 buffer
 * @param utf8input                     Source UTF-8 string
 * @param outputBufferSizeInCharacters  Destination buffer size in characters
 * @param outputUnicodeCharacterCount   (output) Number of UChar32 characters placed in output buffer
 * @param conversionErrorCode           (output) Flag bits from enum BadUTF8, or zero if no error
 */
void copyString8to32(
UChar32* uchar32output,
const UChar8* utf8input,
size_t outputBufferSizeInCharacters,
size_t & outputUnicodeCharacterCount,
int & conversionErrorCode );
/**
 * Copy a null terminated UChar32 string to a UChar32 destination buffer.
 * Always null terminates the destination string if at least one character position is available.
 *
 * @param dest32                  Destination UChar32 buffer
 * @param source32                Source UChar32 string
 * @param destLengthInCharacters  Destination buffer length in characters
 */
void copyString32( UChar32* dest32, const UChar32* source32, size_t destLengthInCharacters );
/**
 * Convert a specified number of UChar32 characters from a possibly null terminated
 * UChar32 string to UTF-8 and store the result in a UChar8 destination buffer.
 * Always null terminates the destination string if at least one character position is available.
 *
 * @param dest8                    Destination UChar8 buffer
 * @param source32                 Source UChar32 string
 * @param outputBufferSizeInBytes  Destination buffer size in bytes
 * @param charCount                Maximum number of UChar32 characters to process
 * @return                         Count of bytes written to output buffer, not including null terminator
 */
size_t copyString32to8counted( UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes, size_t charCount );
/**
 * Convert a null terminated UChar32 string to UTF-8 and store it in a UChar8 destination buffer.
 * Always null terminates the destination string if at least one character position is available.
 *
 * @param dest8                    Destination UChar8 buffer
 * @param source32                 Source UChar32 string
 * @param outputBufferSizeInBytes  Destination buffer size in bytes
 * @return                         Count of bytes written to output buffer, not including null terminator
 */
size_t copyString32to8( UChar8* dest8, const UChar32* source32, size_t outputBufferSizeInBytes );
/**
 * Count characters (i.e. Unicode code points, array elements) in a null terminated UChar32 string.
 *
 * @param str32  Source UChar32 string
 * @return       String length in characters
 */
size_t strlen32( const UChar32* str32 );
/**
 * Compare two null terminated UChar32 strings, looking at no more than a given
 * number of characters.
 *
 * NOTE(review): both pointers are declared non-const; presumably the strings are only
 * read -- confirm against the definition before tightening the signature.
 *
 * @param first32   First string to compare
 * @param second32  Second string to compare
 * @param length    Maximum number of characters to compare
 * @return          Negative if first < second, positive if first > second, zero if equal
 */
int strncmp32( UChar32* first32, UChar32* second32, size_t length );
/**
 * Convert an array of UChar32 characters of specified length to UTF-8 (internally)
 * and write the result to fileHandle.
 *
 * @param fileHandle                File handle to write to
 * @param string32                  Source UChar32 character array, may not be null terminated
 * @param sourceLengthInCharacters  Number of source characters to convert and write
 * @return                          Number of bytes written, -1 on error
 */
int write32( int fileHandle, const UChar32* string32, unsigned int sourceLengthInCharacters );
/**
* Template and classes for UChar8 and UChar32 strings
*/
template
struct UtfStringMixin {
typedef char_type char_t; // inherited
UtfStringMixin() : _len( 0 ), _cap( 0 ), _chars( 0 ) {}
UtfStringMixin( const UtfStringMixin& other ) // copies like std::string
:_len( other._len ), _cap( other._len+1 ), _chars( other._chars ), _str( new char_t[_cap] )
{
memcpy( _str.get(), other._str.get(), _cap * sizeof( char_t ) );
}
UtfStringMixin& operator= (UtfStringMixin copy) {
this->swap( copy );
return *this;
}
char_t* get() const { return _str.get(); }
char_t& operator[](size_t idx) { return _str[idx]; }
const char_t& operator[](size_t idx) const { return _str[idx]; }
size_t length() const { return _len; }
size_t capacity() const { return _cap; }
size_t chars() const { return _chars; }
void swap( UtfStringMixin& other ) {
std::swap( _len, other._len );
std::swap( _cap, other._cap );
std::swap( _chars, other._chars );
_str.swap( other._str );
}
protected:
size_t _len; // in units of char_t without nul
size_t _cap; // size of _str buffer including nul
size_t _chars; // number of codepoints
boost::scoped_array _str;
};
struct Utf32String;
struct Utf8String : public UtfStringMixin {
Utf8String() {}
explicit Utf8String( const UChar32* s, int chars = -1 ) {
if ( chars == -1 ) {
initFrom32( s, strlen32( s ) );
}
else {
initFrom32( s, chars );
}
}
explicit Utf8String( const Utf32String& c ); // defined after utf32String
private:
void initFrom32( const UChar32* s, int chars ) {
_chars = chars;
_cap = _chars * sizeof( UChar32 ) + 1;
_str.reset( new char_t[_cap] );
_len = copyString32to8counted( _str.get(), s, _cap, chars );
}
};
struct Utf32String : public UtfStringMixin {
Utf32String() {}
explicit Utf32String( const UChar32* s ) {
_chars = _len = strlen32( s );
_cap = _len + 1;
_str.reset( new UChar32[_cap] );
memcpy( _str.get(), s, _cap * sizeof( UChar32 ) );
}
explicit Utf32String( const UChar32* s, int textLen ) {
_chars = _len = textLen;
_cap = _len + 1;
_str.reset( new UChar32[_cap] );
memcpy( _str.get(), s, _len * sizeof( UChar32 ) );
_str[_len] = 0;
}
explicit Utf32String( const UChar8* s, int chars = -1 ) {
initFrom8( s, chars );
}
explicit Utf32String( const Utf8String& c ) {
initFrom8( c.get(), c.chars() );
}
explicit Utf32String( size_t reserve ) {
_len = 0;
_cap = reserve;
_chars = 0;
_str.reset( new UChar32[_cap] );
_str[0] = 0;
}
void initFromBuffer( void ) {
_chars = _len = strlen32( _str.get() );
}
private:
void initFrom8( const UChar8* s, int chars ) {
Utf32String temp;
if ( chars == -1 ) {
temp._cap = strlen( reinterpret_cast( s ) ) + 1; // worst case ASCII
}
else {
temp._cap = chars + 1;
}
temp._str.reset( new char_t[temp._cap] );
int error;
copyString8to32( temp._str.get(), s, temp._cap, temp._chars, error );
temp._len = temp._chars;
this->swap( temp );
}
};
// Converting constructor from Utf32String.  It is defined out of line because
// Utf32String is only forward declared at the point where Utf8String is defined.
inline Utf8String::Utf8String( const Utf32String& source ) {
    const UChar32* codePoints = source.get();
    initFrom32( codePoints, source.chars() );
}
} // namespace linenoise_utf8