diff options
Diffstat (limited to 'ACE/ace/UTF32_Encoding_Converter.cpp')
-rw-r--r-- | ACE/ace/UTF32_Encoding_Converter.cpp | 254 |
1 files changed, 254 insertions, 0 deletions
diff --git a/ACE/ace/UTF32_Encoding_Converter.cpp b/ACE/ace/UTF32_Encoding_Converter.cpp new file mode 100644 index 00000000000..2280232e58a --- /dev/null +++ b/ACE/ace/UTF32_Encoding_Converter.cpp @@ -0,0 +1,254 @@ +// $Id$ + +// ====================================================================== +// +// The actual conversion methods are covered by the copyright information +// below. It is not the actual code provided by Unicode, Inc. but is an +// ACE-ified and only slightly modified version. +// +// Chad Elliott 4/28/2005 +// +// Copyright 2001-2004 Unicode, Inc. +// +// Limitations on Rights to Redistribute This Code +// +// Unicode, Inc. hereby grants the right to freely use the information +// supplied in this file in the creation of products supporting the +// Unicode Standard, and to make copies of this file in any form +// for internal or external distribution as long as this notice +// remains attached. +// +// ====================================================================== + +#include "ace/UTF32_Encoding_Converter.h" + +#if defined (ACE_USES_WCHAR) +#include "ace/OS_NS_stdio.h" +#include "ace/OS_Memory.h" +#include "ace/Min_Max.h" + +ACE_BEGIN_VERSIONED_NAMESPACE_DECL + +static const ACE_UINT32 UNI_MAX_LEGAL_UTF32 = 0x0010FFFF; + +ACE_UTF32_Encoding_Converter::ACE_UTF32_Encoding_Converter (bool swap) + : ACE_UTF16_Encoding_Converter (swap) +{ +} + +ACE_UTF32_Encoding_Converter::~ACE_UTF32_Encoding_Converter (void) +{ +} + +ACE_UTF32_Encoding_Converter::Result +ACE_UTF32_Encoding_Converter::to_utf8 (const void* source, + size_t source_size, + ACE_Byte* target, + size_t target_size, + bool strict) +{ + static const ACE_UINT32 byteMask = 0xBF; + static const ACE_UINT32 byteMark = 0x80; + static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START (); + static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END (); + static const ACE_Byte* firstByteMark = get_first_byte_mark (); + + Result result = CONVERSION_OK; + ACE_Byte* targetEnd = target + target_size; + const ACE_UINT32* sourceStart = static_cast<const ACE_UINT32*> (source); + const ACE_UINT32* sourceEnd = sourceStart + (source_size / sizeof (ACE_UINT32)); + + while (sourceStart < sourceEnd) + { + ACE_UINT32 nw = *sourceStart++; + ACE_UINT32 ch = (this->swap_ ? ACE_SWAP_LONG (nw) : nw); + unsigned short bytesToWrite = 0; + + if (strict) + { + // UTF-16 surrogate values are illegal in UTF-32 + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) + { + result = SOURCE_ILLEGAL; + break; + } + } + + // Figure out how many bytes the result will require. Turn any + // illegally large ACE_UINT32 things (> Plane 17) into replacement + // chars. + if (ch < 0x80) + { + bytesToWrite = 1; + } + else if (ch < 0x800) + { + bytesToWrite = 2; + } + else if (ch < 0x10000) + { + bytesToWrite = 3; + } + else if (ch <= UNI_MAX_LEGAL_UTF32) + { + bytesToWrite = 4; + } + else + { + result = SOURCE_ILLEGAL; + break; + } + + target += bytesToWrite; + if (target > targetEnd) + { + result = TARGET_EXHAUSTED; + break; + } + + // NOTE: everything falls through. + switch (bytesToWrite) + { + case 4: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 3: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 2: + *--target = (ACE_Byte)((ch | byteMark) & byteMask); + ch >>= 6; + case 1: + *--target = (ACE_Byte) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + + return result; +} + +ACE_UTF32_Encoding_Converter::Result +ACE_UTF32_Encoding_Converter::from_utf8 (const ACE_Byte* source, + size_t source_size, + void* target, + size_t target_size, + bool strict) +{ + static const ACE_UINT32 UNI_SUR_HIGH_START = get_UNI_SUR_HIGH_START (); + static const ACE_UINT32 UNI_SUR_LOW_END = get_UNI_SUR_LOW_END (); + static const ACE_UINT32 UNI_REPLACEMENT_CHAR = get_UNI_REPLACEMENT_CHAR (); + static const ACE_Byte* trailingBytesForUTF8 = get_trailing_bytes_for_utf8 (); + static const ACE_UINT32* offsetsFromUTF8 = get_offsets_from_utf8 (); + + Result result = CONVERSION_OK; + const ACE_Byte* sourceEnd = source + source_size; + ACE_UINT32* targetStart = static_cast<ACE_UINT32*> (target); + ACE_UINT32* targetEnd = targetStart + target_size; + + while (source < sourceEnd) + { + ACE_UINT32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) + { + result = SOURCE_EXHAUSTED; + break; + } + + // Do this check whether lenient or strict + if (!this->is_legal_utf8 (source, extraBytesToRead + 1)) + { + result = SOURCE_ILLEGAL; + break; + } + + // The cases all fall through. See "Note A" below. + switch (extraBytesToRead) + { + case 5: + ch += *source++; + ch <<= 6; + case 4: + ch += *source++; + ch <<= 6; + case 3: + ch += *source++; + ch <<= 6; + case 2: + ch += *source++; + ch <<= 6; + case 1: + ch += *source++; + ch <<= 6; + case 0: + ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (targetStart >= targetEnd) + { + result = TARGET_EXHAUSTED; + break; + } + + if (ch <= UNI_MAX_LEGAL_UTF32) + { + // UTF-16 surrogate values are illegal in UTF-32, and anything + // over Plane 17 (> 0x10FFFF) is illegal. + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) + { + if (strict) + { + result = SOURCE_ILLEGAL; + break; + } + else + { + *targetStart++ = UNI_REPLACEMENT_CHAR; + } + } + else + { + *targetStart++ = ch; + } + } + else + { + result = SOURCE_ILLEGAL; + break; + } + } + + return result; +} + +ACE_UTF32_Encoding_Converter* +ACE_UTF32_Encoding_Converter::encoded (const ACE_Byte* source, + size_t source_size) +{ + static const size_t begin = 16; + static const size_t converted = begin * 4; + + ACE_Byte target[converted]; + ACE_UTF32_Encoding_Converter* converter = 0; + ACE_NEW_RETURN (converter, + ACE_UTF32_Encoding_Converter (false), + 0); + + if (converter->to_utf8 (source, + ACE_MIN (begin, source_size), + target, + converted) == CONVERSION_OK) + { + return converter; + } + else + { + delete converter; + } + + return 0; +} + +ACE_END_VERSIONED_NAMESPACE_DECL +#endif /* ACE_USES_WCHAR */ |