diff options
Diffstat (limited to 'TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp')
-rw-r--r-- | TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp | 587 |
1 files changed, 0 insertions, 587 deletions
diff --git a/TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp b/TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp deleted file mode 100644 index bbd19acada2..00000000000 --- a/TAO/tests/CodeSets/libs/UCS4_UTF16/WUCS4_UTF16.cpp +++ /dev/null @@ -1,587 +0,0 @@ -// -*- C++ -*- -// $Id$ - -// ============================================================================ -// -// = LIBRARY -// ace -// -// = FILENAME -// WUCS4_UTF16.cpp -// -// = DESCRIPTION -// Defines the arrays required to convert between UCS-4 a 4 byte wide char -// codeset, and UCS-16, aka unicode, a 2-byte codeset. -// -// = AUTHOR -// Phil Mesnier <mesnier_p@ociweb.com> -// -// ============================================================================ - -#include "WUCS4_UTF16.h" -#include "ace/OS.h" - -ACE_RCSID(UCS4_UTF16, WUCS4_UTF16, "$Id$") - -// **************************************************************** - - -// @@ TODO: Find a better home for these definition -// Note: unlike the UNICODE standard we define these as -// half-closed ranges i.e. -// *BEGIN is the first value in the range -// *END is the first value beyond the range (END is not included -// in the range) -// Note the use of unsigned short for UTF-16 codepoints. wchar_t may -// by four bytes -typedef ACE_CDR::UShort ACE_UTF16_T; -static const size_t ACE_UTF16_CODEPOINT_SIZE = sizeof(ACE_UTF16_T); - -// surrogate high 1101.10HH.HHHH.HHHH -// surrogate low 1101.11LL.LLLL.LLLL -// 4 byte result: 0000.0000.0000.HHHH.HHHH.HHLL.LLLL.LLLL -// add offset 0000.0000.0000.0000.0001.0000.0000.0000 - -// range of surrogate values for high-order bits -static const unsigned short ACE_UTF16_SURROGATE_HIGH_BEGIN = 0xD800U; -static const unsigned short ACE_UTF16_SURROGATE_HIGH_END = 0xDC00U; - -static const unsigned short ACE_UTF16_SURROGATE_LOW_BEGIN = 0xDC00U; -static const unsigned short ACE_UTF16_SURROGATE_LOW_END = 0xE000U; - -// offset to UTF16 values encoded with surrogates start at 2^16 -static const unsigned long ACE_UTF16_SURROGATE_OFFSET = 0x000010000UL; - -// shift high order bits from surrogate into correct postion -static const int ACE_UTF16_SURROGATE_HIGH_SHIFT = 10; -static const unsigned short ACE_UTF16_SURROGATE_LOW_MASK = 0x3FF; - -// largest value that can be represented in UTF16 without using surrogates + 1 -static const unsigned long ACE_UTF16_RAW_END = 0x00010000LU; - -// largest value that can be represented in UTF16 + 1 -static const unsigned long ACE_UTF16_END = 0x00110000LU; - -// largest value that can be represented in UTF-32 + 1 -static const unsigned long ACE_UTF32_END = 0x80000000LU; - -static const unsigned short ACE_UNICODE_SUBSTITUTE_CHARACTER = 0xFFFDU; -static const unsigned short ACE_UNICODE_BOM_CORRECT = 0xFEFFU; -static const unsigned short ACE_UNICODE_BOM_SWAPPED = 0xFFFEU; - -///////////////////////////////////////////////////// -// Static inline routines to simplify conversion code -// @@ should be in anonymous namespace when ACE allows it -// or better yet, there should be a UTF-16 support thingie(technical term) -// that provides these methods. -// Performance: depends on compiler inlining + optimization for performance - -/// load next two bytes from buffer into a short. Byte swapping as necessary -static -//ACE_INLINE -ACE_UTF16_T -load_raw_wchar (const char * buffer, size_t & pos, int do_byte_swap) -{ - // need a two byte object to load the UTF16 2 byte codepoint - ACE_UTF16_T utf16_char = * reinterpret_cast<ACE_UTF16_T const *> (&buffer[pos*ACE_UTF16_CODEPOINT_SIZE]); -#if ! defined (ACE_DISABLE_SWAP_ON_READ) - if (do_byte_swap) - { - ACE_CDR::swap_2 ( - &buffer[pos*ACE_UTF16_CODEPOINT_SIZE], - reinterpret_cast<char *> (&utf16_char)); - } -#endif - pos ++; - return utf16_char; -} - -/// convert UTF-16 surrogate pair to wchar_t -static -//ACE_INLINE -ACE_CDR::WChar -convert_surrogate_pair (ACE_UTF16_T high, ACE_UTF16_T low) -{ - return static_cast<ACE_CDR::WChar> (((high - ACE_UTF16_SURROGATE_HIGH_BEGIN) << ACE_UTF16_SURROGATE_HIGH_SHIFT) - + (low - ACE_UTF16_SURROGATE_LOW_BEGIN) - + ACE_UTF16_SURROGATE_OFFSET - ); -} - -/// load wchar from utf16 buffer -/// converts surrogate pairs -/// substitutes SUBSTITUTE_CHAR for bad encoding -static -//ACE_INLINE -ACE_CDR::WChar -load_wchar (const char * buffer, size_t & pos, size_t length, int do_byte_swap) -{ - ACE_CDR::WChar rc = ACE_UNICODE_SUBSTITUTE_CHARACTER; - if (pos < length) - { - rc = static_cast<ACE_CDR::WChar> (load_raw_wchar (buffer, pos, do_byte_swap)); - // Is this a UTF16 surrogate? - // note assumpton that SURROGATE_HIGH_END == SURROGATE_LOW_BEGIN - if (rc >= ACE_UTF16_SURROGATE_HIGH_BEGIN && rc < ACE_UTF16_SURROGATE_LOW_END) - { - // if we still have two bytes available - if (pos < length) - { - // expecting high surrogate - if (rc < ACE_UTF16_SURROGATE_HIGH_END) - { - ACE_UTF16_T low = load_raw_wchar (buffer, pos, do_byte_swap); - if (low >= ACE_UTF16_SURROGATE_LOW_BEGIN - && low < ACE_UTF16_SURROGATE_LOW_END) - { - rc = convert_surrogate_pair ( - static_cast<ACE_UTF16_T> (rc), low); - } - else - { - rc = ACE_UNICODE_SUBSTITUTE_CHARACTER; - } - } - else - { - rc = ACE_UNICODE_SUBSTITUTE_CHARACTER; - } - } - else - { - rc = ACE_UNICODE_SUBSTITUTE_CHARACTER; - } - } - } - return rc; -} - -static -//ACE_INLINE -size_t encode_utf16 (ACE_UTF16_T * buffer, ACE_CDR::WChar value) -{ - buffer[0] = static_cast<ACE_UTF16_T> (value); - size_t length = 1; - if (value >= ACE_UTF16_SURROGATE_HIGH_BEGIN) - { - if (value < ACE_UTF16_SURROGATE_LOW_END) - { - buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER; - } - else if ((unsigned long)value >= ACE_UTF16_RAW_END) - { - if ((unsigned long)value >= ACE_UTF16_END) - { - buffer[0] = ACE_UNICODE_SUBSTITUTE_CHARACTER; - } - else - { - ACE_CDR::WChar offset = static_cast<ACE_CDR::WChar> (value - ACE_UTF16_SURROGATE_OFFSET); - buffer[0] = (offset >> ACE_UTF16_SURROGATE_HIGH_SHIFT) - + ACE_UTF16_SURROGATE_HIGH_BEGIN; - buffer[1] = (offset & ACE_UTF16_SURROGATE_LOW_MASK) - + ACE_UTF16_SURROGATE_LOW_BEGIN; - length = 2; - } - } - } - return length; -} - -/// count number of characters in native WString that will be converted -/// to UTF-16 surrogate pairs -static -size_t count_potential_surrogates ( - const ACE_CDR::WChar *buffer, - ACE_CDR::ULong len) -{ - size_t count = 0; - for (size_t i = 0; i < len; ++i) - { - ACE_CDR::WChar value = buffer[i]; - if ((unsigned long)value >= ACE_UTF16_RAW_END && - (unsigned long)value < ACE_UTF16_END) - { - count += 1; - } - } - return count; -} - - -///////////////////////////// -// WUCS4_UTF16 implementation - -WUCS4_UTF16::WUCS4_UTF16 (void) -{ - -} - -WUCS4_UTF16::~WUCS4_UTF16 (void) -{ - -} - -// = Documented in $ACE_ROOT/ace/CDR_Stream.h -ACE_CDR::Boolean -WUCS4_UTF16::read_wchar (ACE_InputCDR &cdr, ACE_CDR::WChar &x) -{ - if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1 - && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1) - { - ACE_CDR::Octet len; - if (! this->read_1 (cdr, &len)) - { - return 0; - } - - int old_bo = cdr.byte_order(); - - ACE_UTF16_T sx = 0; - if (! this->read_2 (cdr,&sx)) - { - return 0; - } - - // Check for byte order mark, if found, consume and honor it. - if (sx == ACE_UNICODE_BOM_CORRECT || sx == ACE_UNICODE_BOM_SWAPPED) - { - // if we found it, but it came in in the wrong order - // invert the byte order flag for the duration of this method - if (sx == ACE_UNICODE_BOM_SWAPPED) - { - cdr.reset_byte_order (! old_bo); - } - this->read_2 (cdr,&sx); - } - - // check for UTF-16 surrogate pair, and if found interpret it - if (sx >= ACE_UTF16_SURROGATE_HIGH_BEGIN - && sx < ACE_UTF16_SURROGATE_LOW_END) - { - if (sx >= ACE_UTF16_SURROGATE_HIGH_END) - { - cdr.reset_byte_order (old_bo); - return 0; - } - - ACE_UTF16_T low; - if (! this->read_2 (cdr, &low)) - { - cdr.reset_byte_order (old_bo); - return 0;; - } - if (low < ACE_UTF16_SURROGATE_LOW_BEGIN - || low >= ACE_UTF16_SURROGATE_LOW_END) - { - cdr.reset_byte_order (old_bo); - return 0; - } - x = convert_surrogate_pair (sx, low); - } - else - { - x = static_cast<ACE_CDR::WChar> (sx); - } - - cdr.reset_byte_order (old_bo); - } - else - { - ACE_UTF16_T sx = 0; - if (!this->read_2 (cdr, &sx)) - { - return 0; - } - x = static_cast<ACE_CDR::WChar> (sx); - } - return 1; -} - -ACE_CDR::Boolean -WUCS4_UTF16::read_wstring (ACE_InputCDR &cdr, - ACE_CDR::WChar *&x) -{ - ACE_CDR::ULong len; - if (!this->read_4 (cdr, &len)) - return 0; - - // A check for the length being too great is done later in the - // call to read_char_array but we want to have it done before - // the memory is allocated. - if (len > 0 && len <= cdr.length()) - { - if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1 - && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1) - { - len /= ACE_UTF16_CODEPOINT_SIZE; - - //allocating one extra for the null character needed by applications - ACE_NEW_RETURN (x, - ACE_CDR::WChar [len + 1], - 0); - x[len] = L'\x00'; - if (this->read_wchar_array_i (cdr, x, len,1)) - { - // Since reading the array may have adjusted the length, - // we simply rewrite the null terminator - x[len] = L'\x00'; - return 1; - } - } - else - { - ACE_NEW_RETURN (x, - ACE_CDR::WChar [len], - 0); - if (this->read_wchar_array (cdr, x, len)) - return 1; - } - delete [] x; - } - else if (len == 0) - { - // Convert any null strings to empty strings since empty - // strings can cause crashes. (See bug 58.) - ACE_NEW_RETURN (x, - ACE_CDR::WChar[1], - 0); - x[0] = '\x00'; - return 1; - } - x = 0; - return 0; -} - -ACE_CDR::Boolean -WUCS4_UTF16::read_wchar_array_i (ACE_InputCDR & cdr, - ACE_CDR::WChar *x, - ACE_CDR::ULong &length, - int adjust_len) -{ - if (length == 0) - return 1; - char* buf; - size_t align = ACE_CDR::SHORT_ALIGN; - if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * length, align, buf) == 0) - { - int byte_swap = cdr.do_byte_swap(); - size_t pos = 0; - - // check for byte order mark. If found, honor it then discard it - ACE_UTF16_T bom = load_raw_wchar (buf, pos, byte_swap); - if (bom == ACE_UNICODE_BOM_CORRECT || bom == ACE_UNICODE_BOM_SWAPPED) - { - if (bom == ACE_UNICODE_BOM_SWAPPED) - { - byte_swap = !byte_swap; - } - buf += ACE_UTF16_CODEPOINT_SIZE; - if (adjust_len) - length -= 1; - } - size_t bpos = 0; - for (size_t xpos = 0; xpos < length; ++xpos) - { - x[xpos] = load_wchar (buf, bpos, length, byte_swap); - } - - return 1; - } - return 0; - -} - - -ACE_CDR::Boolean -WUCS4_UTF16::read_wchar_array (ACE_InputCDR & cdr, - ACE_CDR::WChar *x, - ACE_CDR::ULong length) -{ - if (length == 0) - return 1; - - if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1 - && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1) - { - for (size_t i = 0; i < length; i++) - if (!this->read_wchar(cdr,x[i])) - return 0; - return 1; - } - else - return this->read_wchar_array_i(cdr,x,length); -} - -ACE_CDR::Boolean -WUCS4_UTF16::write_wchar (ACE_OutputCDR &cdr, - ACE_CDR::WChar x) -{ - int encode_len = 1; - if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0) - { // wchar is not allowed with GIOP 1.0 - errno = EINVAL; - return 0; - } - else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1) - encode_len = 0; - - return write_wchar_i(cdr,x,1,encode_len); -} - -ACE_CDR::Boolean -WUCS4_UTF16::write_wchar_i (ACE_OutputCDR &cdr, - ACE_CDR::WChar x, - int use_BOM, - int encode_len) -{ - // If the desired char cannot be translated into a single unicode char, - // we must raise a marshal exception. - if ((unsigned long)x >= ACE_UTF16_RAW_END && - (unsigned long)x < ACE_UTF16_END) - return 0; - - int len = 0; - ACE_CDR::UShort buffer[2]; - if (use_BOM) - { - len = 2; - buffer[0] = ACE_UNICODE_BOM_CORRECT; - buffer[1] = static_cast<ACE_CDR::Short> (x); - } - else - { - len = 1; - if (cdr.byte_order()) - ACE_CDR::swap_2 (reinterpret_cast<const char *> (&x), - reinterpret_cast<char *> (buffer)); - else - buffer[0] = static_cast<ACE_CDR::Short> (x); - } - - if (encode_len) - { - unsigned char tcsize = static_cast<unsigned char> (len * ACE_UTF16_CODEPOINT_SIZE); - if (this->write_1 (cdr, &tcsize)) - return this->write_array(cdr, &buffer, tcsize, 1, 1); - else - return 0; - } - if (this->write_2 (cdr, buffer) == 0) - return 0; - if (len == 2) - return this->write_2 (cdr,buffer+1); - return 1; -} - -ACE_CDR::Boolean -WUCS4_UTF16::write_wstring (ACE_OutputCDR & cdr, - ACE_CDR::ULong len, - const ACE_CDR::WChar *x) -{ - if (static_cast<ACE_CDR::Short> (this->major_version(cdr)) == 1 - && static_cast<ACE_CDR::Short> (this->minor_version(cdr)) > 1) - { - // count characters that will require surrogates to - // determine transmission length - len++; // make room for BOM - ACE_UTF16_T bom = ACE_UNICODE_BOM_CORRECT; - ACE_CDR::ULong length = len + count_potential_surrogates (x, len); - ACE_CDR::ULong l = length * ACE_UTF16_CODEPOINT_SIZE; - if (this->write_4 (cdr, &l) && x != 0) - { - this->write_2 (cdr,&bom); - return this->write_measured_wchar_array (cdr, x, len, length); - } - } - else - { - ACE_CDR::ULong l = len + 1; - if (this->write_4 (cdr, &l)) - if (x != 0) - return this->write_wchar_array (cdr, x, len + 1); - else - { - ACE_UTF16_T s = 0; - return this->write_2 (cdr,&s); - } - } - return 0; -} - -ACE_CDR::Boolean -WUCS4_UTF16::write_wchar_array (ACE_OutputCDR & cdr, - const ACE_CDR::WChar *x, - ACE_CDR::ULong length) -{ -#if 0 - // I do not believe this is correct, because this could yield an array - // with an incorrect number of elements for the space allotted. - return this->write_measured_wchar_array ( - cdr, - x, - length, - length + count_potential_surrogates (x, length)); -#endif - - int encode_len = 1; - if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 0) - { // wchar is not allowed with GIOP 1.0 - errno = EINVAL; - return 0; - } - else if (static_cast<ACE_CDR::Short> (this->minor_version(cdr)) == 1) - encode_len = 0; - - for (size_t i = 0; i < length; i++) - if (this->write_wchar_i (cdr,x[i],0,encode_len) == 0) - return 0; - return 1; -} - -ACE_CDR::Boolean -WUCS4_UTF16::write_measured_wchar_array (ACE_OutputCDR & cdr, - const ACE_CDR::WChar *x, - ACE_CDR::ULong length, - ACE_CDR::ULong transmission_length) -{ - if (length == 0) - return 1; - char* buf; - size_t align = ACE_CDR::SHORT_ALIGN; - if (cdr.adjust (ACE_UTF16_CODEPOINT_SIZE * transmission_length, align, buf) - != 0) - { - return 0; - } - - ACE_UTF16_T *sb = reinterpret_cast<ACE_UTF16_T *> (buf); - size_t sbpos = 0; - - for (size_t i = 0; i < length; i++) - { - sbpos += encode_utf16 (& sb[sbpos], x[i]); - } -#if defined (ACE_ENABLE_SWAP_ON_WRITE) - // NOTE this will rarely be enabled. See the comments in ace/OS.h - if (cdr.do_byte_swap()) - { - // note can't use swap_2_array because in-place swaps are not safe :-< - // and we don't want to allocate a new array - for (size_t i = 0; i < sbpos; i++) - { - char * pchar = static_cast<char *> (&sb[i]); - // ACE_CDR::swap_2 (pchar, pchar); - // can't use swap_2 because inplace swaps are not safe - // and work-arounds like copying to another buffer lose - // any performance improvement from - // that fancy asm code, so we might as well just: - char temp = pchar[0]; - pchar[0] = pchar[1]; - pchar[1] = temp; - //@@TODO write swap_2(char * inplace_buffer); - } - } -#endif /* ACE_ENABLE_SWAP_ON_WRITE */ - return 1; -} |