diff options
Diffstat (limited to 'ACE/ace/UTF16_Encoding_Converter.cpp')
-rw-r--r-- | ACE/ace/UTF16_Encoding_Converter.cpp | 364 |
1 files changed, 364 insertions, 0 deletions
diff --git a/ACE/ace/UTF16_Encoding_Converter.cpp b/ACE/ace/UTF16_Encoding_Converter.cpp new file mode 100644 index 00000000000..d57ab9e5154 --- /dev/null +++ b/ACE/ace/UTF16_Encoding_Converter.cpp @@ -0,0 +1,364 @@ +// $Id$ + +// ====================================================================== +// +// The actual conversion methods are covered by the copyright information +// below. It is not the actual code provided by Unicode, Inc. but is an +// ACE-ified and only slightly modified version. +// Chad Elliott 4/28/2005 +// +// Copyright 2001-2004 Unicode, Inc. +// +// Limitations on Rights to Redistribute This Code +// +// Unicode, Inc. hereby grants the right to freely use the information +// supplied in this file in the creation of products supporting the +// Unicode Standard, and to make copies of this file in any form +// for internal or external distribution as long as this notice +// remains attached. +// +// ====================================================================== + +#include "ace/UTF16_Encoding_Converter.h" + +#if defined (ACE_USES_WCHAR) +#include "ace/OS_NS_stdio.h" +#include "ace/OS_Memory.h" +#include "ace/Min_Max.h" + +#if !defined (__ACE_INLINE__) +#include "ace/UTF16_Encoding_Converter.inl" +#endif /* __ACE_INLINE__ */ + +ACE_BEGIN_VERSIONED_NAMESPACE_DECL + +static const ACE_UINT32 halfShift = 10; +static const ACE_UINT32 halfBase = 0x00010000; +static const ACE_UINT32 halfMask = 0x000003FF; + +static const ACE_UINT32 UNI_SUR_HIGH_START = 0x0000D800; +static const ACE_UINT32 UNI_SUR_HIGH_END = 0x0000DBFF; +static const ACE_UINT32 UNI_SUR_LOW_START = 0x0000DC00; +static const ACE_UINT32 UNI_SUR_LOW_END = 0x0000DFFF; +static const ACE_UINT32 UNI_REPLACEMENT_CHAR = 0x0000FFFD; +static const ACE_UINT32 UNI_MAX_BMP = 0x0000FFFF; +static const ACE_UINT32 UNI_MAX_UTF16 = 0x0010FFFF; + +// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed +// into the first byte, depending on how many bytes follow. There are +// as many entries in this table as there are UTF-8 sequence types. +// (I.e., one byte sequence, two byte... etc.). Remember that sequencs +// for *legal* UTF-8 will be 4 or fewer bytes total. +static const ACE_Byte firstByteMark[7] = { 0x00, 0x00, 0xC0, + 0xE0, 0xF0, 0xF8, 0xFC }; + +// Index into the table below with the first byte of a UTF-8 sequence to +// get the number of trailing bytes that are supposed to follow it. +// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is +// left as-is for anyone who may want to do such conversion, which was +// allowed in earlier algorithms. +static const ACE_Byte trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +// Magic values subtracted from a buffer value during UTF8 conversion. +// This table contains as many values as there might be trailing bytes +// in a UTF-8 sequence. +static const ACE_UINT32 offsetsFromUTF8[6] = { 0x00000000, 0x00003080, + 0x000E2080, 0x03C82080, + 0xFA082080, 0x82082080 }; + + +ACE_UTF16_Encoding_Converter::ACE_UTF16_Encoding_Converter (bool swap) + : swap_ (swap) +{ +} + +ACE_UTF16_Encoding_Converter::~ACE_UTF16_Encoding_Converter (void) +{ +} + +ACE_UTF16_Encoding_Converter::Result +ACE_UTF16_Encoding_Converter::to_utf8 (const void* source, + size_t source_size, + ACE_Byte* target, + size_t target_size, + bool strict) +{ + static const ACE_UINT32 byteMask = 0xBF; + static const ACE_UINT32 byteMark = 0x80; + Result result = CONVERSION_OK; + + ACE_Byte* targetEnd = target + target_size; + const ACE_UINT16* sourceStart = static_cast<const ACE_UINT16*> (source); + const ACE_UINT16* sourceEnd = sourceStart + + (source_size / sizeof (ACE_UINT16)); + + while (sourceStart < sourceEnd) + { + ACE_UINT16 nw = *sourceStart++; + ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_WORD (nw) : nw); + + // If we have a surrogate pair, convert to ACE_UINT32 first. + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) + { + // If the 16 bits following the high surrogate are in the + // sourceStart buffer... + if (sourceStart < sourceEnd) + { + ACE_UINT32 ch2 = (this->swap_ ? ACE_SWAP_WORD (*sourceStart) : + *sourceStart); + // If it's a low surrogate, convert to ACE_UINT32. + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) + { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++sourceStart; + } + else if (strict) + { + // it's an unpaired high surrogate + result = SOURCE_ILLEGAL; + break; + } + } + else + { + // We don't have the 16 bits following the high surrogate. + result = SOURCE_EXHAUSTED; + break; + } + } + else if (strict) + { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) + { + result = SOURCE_ILLEGAL; + break; + } + } + + // Figure out how many bytes the result will require + unsigned short bytesToWrite = 0; + if (ch < 0x80) + bytesToWrite = 1; + else if (ch < 0x800) + bytesToWrite = 2; + else if (ch < 0x10000) + bytesToWrite = 3; + else if (ch < 0x110000) + bytesToWrite = 4; + else + { + bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) + { + result = TARGET_EXHAUSTED; + break; + } + + // NOTE: Everything falls through for efficiency purposes. + switch (bytesToWrite) + { + case 4: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 3: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 2: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 1: + *--target = (ACE_Byte)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + + return result; +} + +ACE_UTF16_Encoding_Converter::Result +ACE_UTF16_Encoding_Converter::from_utf8 (const ACE_Byte* source, + size_t source_size, + void* target, + size_t target_size, + bool strict) +{ + Result result = CONVERSION_OK; + const ACE_Byte* sourceEnd = source + source_size; + ACE_UINT16* targetStart = static_cast<ACE_UINT16*> (target); + ACE_UINT16* targetEnd = targetStart + target_size; + + while (source < sourceEnd) + { + ACE_UINT32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) + { + result = SOURCE_EXHAUSTED; + break; + } + + // Do this check whether lenient or strict + if (!this->is_legal_utf8 (source, extraBytesToRead + 1)) + { + result = SOURCE_ILLEGAL; + break; + } + + // The cases all fall through. See "Note A" below. + switch (extraBytesToRead) + { + case 5: // remember, illegal UTF-8 + ch += *source++; + ch <<= 6; + case 4: // remember, illegal UTF-8 + ch += *source++; + ch <<= 6; + case 3: + ch += *source++; + ch <<= 6; + case 2: + ch += *source++; + ch <<= 6; + case 1: + ch += *source++; + ch <<= 6; + case 0: + ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (targetStart >= targetEnd) + { + result = TARGET_EXHAUSTED; + break; + } + + if (ch <= UNI_MAX_BMP) // Target is a character <= 0xFFFF + { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) + { + if (strict) + { + result = SOURCE_ILLEGAL; + break; + } + else + { + *targetStart++ = UNI_REPLACEMENT_CHAR; + } + } + else + { + *targetStart++ = (ACE_UINT16)ch; + } + } + else if (ch > UNI_MAX_UTF16) + { + if (strict) + { + result = SOURCE_ILLEGAL; + break; + } + else + { + *targetStart++ = UNI_REPLACEMENT_CHAR; + } + } + else + { + // targetStart is a character in range 0xFFFF - 0x10FFFF. + if (targetStart + 1 >= targetEnd) + { + result = TARGET_EXHAUSTED; + break; + } + ch -= halfBase; + *targetStart++ = (ACE_UINT16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *targetStart++ = (ACE_UINT16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + + return result; +} + +ACE_UTF16_Encoding_Converter* +ACE_UTF16_Encoding_Converter::encoded (const ACE_Byte* source, + size_t source_size) +{ + static const size_t begin = 16; + static const size_t converted = begin * 4; + + ACE_Byte target[converted]; + ACE_UTF16_Encoding_Converter* converter; + ACE_NEW_RETURN (converter, + ACE_UTF16_Encoding_Converter (false), + 0); + if (converter->to_utf8 (source, + ACE_MIN (begin, source_size), + target, + converted) == CONVERSION_OK) + { + return converter; + } + else + { + delete converter; + } + + return 0; +} + +ACE_UINT32 +ACE_UTF16_Encoding_Converter::get_UNI_SUR_HIGH_START (void) +{ + return UNI_SUR_HIGH_START; +} + +ACE_UINT32 +ACE_UTF16_Encoding_Converter::get_UNI_SUR_LOW_END (void) +{ + return UNI_SUR_LOW_END; +} + +ACE_UINT32 +ACE_UTF16_Encoding_Converter::get_UNI_REPLACEMENT_CHAR (void) +{ + return UNI_REPLACEMENT_CHAR; +} + +const ACE_Byte* +ACE_UTF16_Encoding_Converter::get_first_byte_mark (void) +{ + return firstByteMark; +} + +const ACE_Byte* +ACE_UTF16_Encoding_Converter::get_trailing_bytes_for_utf8 (void) +{ + return trailingBytesForUTF8; +} + +const ACE_UINT32* +ACE_UTF16_Encoding_Converter::get_offsets_from_utf8 (void) +{ + return offsetsFromUTF8; +} + +ACE_END_VERSIONED_NAMESPACE_DECL +#endif /* ACE_USES_WCHAR */ |