diff options
Diffstat (limited to 'ext/mbstring/libmbfl/filters/mbfilter_utf16.c')
-rw-r--r-- | ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 230 |
1 files changed, 120 insertions, 110 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index 620e8a76f7..c2c30973db 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -30,15 +30,17 @@ #include "mbfilter.h" #include "mbfilter_utf16.h" +static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter); + static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; const mbfl_encoding mbfl_encoding_utf16 = { mbfl_no_encoding_utf16, "UTF-16", "UTF-16", - (const char *(*)[])&mbfl_encoding_utf16_aliases, + mbfl_encoding_utf16_aliases, NULL, - MBFL_ENCTYPE_MWC2BE, + MBFL_ENCTYPE_MWC2, &vtbl_utf16_wchar, &vtbl_wchar_utf16 }; @@ -49,7 +51,7 @@ const mbfl_encoding mbfl_encoding_utf16be = { "UTF-16BE", NULL, NULL, - MBFL_ENCTYPE_MWC2BE, + MBFL_ENCTYPE_MWC2, &vtbl_utf16be_wchar, &vtbl_wchar_utf16be }; @@ -60,7 +62,7 @@ const mbfl_encoding mbfl_encoding_utf16le = { "UTF-16LE", NULL, NULL, - MBFL_ENCTYPE_MWC2LE, + MBFL_ENCTYPE_MWC2, &vtbl_utf16le_wchar, &vtbl_wchar_utf16le }; @@ -71,7 +73,7 @@ const struct mbfl_convert_vtbl vtbl_utf16_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -91,7 +93,7 @@ const struct mbfl_convert_vtbl vtbl_utf16be_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16be_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -111,7 +113,7 @@ const struct mbfl_convert_vtbl vtbl_utf16le_wchar = { mbfl_filt_conv_common_ctor, NULL, mbfl_filt_conv_utf16le_wchar, - mbfl_filt_conv_common_flush, + mbfl_filt_conv_utf16_wchar_flush, NULL, }; @@ -127,111 +129,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = { #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) -/* - * UTF-16 => wchar - */ int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter) { - int n, endian; - - endian = filter->status & 0xff00; - switch (filter->status & 0x0f) { - case 0: - if (endian) { - n = c & 0xff; - } else { - n = (c & 0xff) << 8; - } - filter->cache |= n; - filter->status++; - break; - default: - if (endian) { - n = (c & 0xff) << 8; + /* Start with the assumption that the string is big-endian; + * If we find a little-endian BOM, then we will change that assumption */ + if (filter->status == 0) { + filter->cache = c & 0xFF; + filter->status = 1; + } else { + int n = (filter->cache << 8) | (c & 0xFF); + if (n == 0xFFFE) { + /* Switch to little-endian mode */ + filter->filter_function = mbfl_filt_conv_utf16le_wchar; + filter->cache = filter->status = 0; } else { - n = c & 0xff; - } - n |= filter->cache & 0xffff; - filter->status &= ~0x0f; - if (n >= 0xd800 && n < 0xdc00) { - filter->cache = ((n & 0x3ff) << 16) + 0x400000; - } else if (n >= 0xdc00 && n < 0xe000) { - n &= 0x3ff; - n |= (filter->cache & 0xfff0000) >> 6; - filter->cache = 0; - if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) { - CK((*filter->output_function)(n, filter->data)); - } else { /* illegal character */ - n &= MBFL_WCSGROUP_MASK; - n |= MBFL_WCSGROUP_THROUGH; + filter->filter_function = mbfl_filt_conv_utf16be_wchar; + if (n >= 0xD800 && n <= 0xDBFF) { + filter->cache = n & 0x3FF; /* Pick out 10 data bits */ + filter->status = 2; + return c; + } else if (n >= 0xDC00 && n <= 0xDFFF) { + /* This is wrong; second part of surrogate pair has come first */ + CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data)); + } else if (n != 0xFEFF) { CK((*filter->output_function)(n, filter->data)); } - } else { - int is_first = filter->status & 0x10; - filter->cache = 0; - filter->status |= 0x10; - if (!is_first) { - if (n == 0xfffe) { - if (endian) { - filter->status &= ~0x100; /* big-endian */ - } else { - filter->status |= 0x100; /* little-endian */ - } - break; - } else if (n == 0xfeff) { - break; - } - } - CK((*filter->output_function)(n, filter->data)); + filter->cache = filter->status = 0; } - break; } return c; } -/* - * UTF-16BE => wchar - */ int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter) { int n; switch (filter->status) { - case 0: + case 0: /* First byte */ + filter->cache = c & 0xFF; filter->status = 1; - n = (c & 0xff) << 8; - filter->cache |= n; break; - default: - filter->status = 0; - n = (filter->cache & 0xff00) | (c & 0xff); - if (n >= 0xd800 && n < 0xdc00) { - filter->cache = ((n & 0x3ff) << 16) + 0x400000; - } else if (n >= 0xdc00 && n < 0xe000) { - n &= 0x3ff; - n |= (filter->cache & 0xfff0000) >> 6; - filter->cache = 0; - if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) { - CK((*filter->output_function)(n, filter->data)); - } else { /* illegal character */ - n &= MBFL_WCSGROUP_MASK; - n |= MBFL_WCSGROUP_THROUGH; - CK((*filter->output_function)(n, filter->data)); - } + + case 1: /* Second byte */ + n = (filter->cache << 8) | (c & 0xFF); + if (n >= 0xD800 && n <= 0xDBFF) { + filter->cache = n & 0x3FF; /* Pick out 10 data bits */ + filter->status = 2; + } else if (n >= 0xDC00 && n <= 0xDFFF) { + /* This is wrong; second part of surrogate pair has come first */ + CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data)); + filter->status = 0; } else { - filter->cache = 0; CK((*filter->output_function)(n, filter->data)); + filter->status = 0; } break; + + case 2: /* Second part of surrogate, first byte */ + filter->cache = (filter->cache << 8) | (c & 0xFF); + filter->status = 3; + break; + + case 3: /* Second part of surrogate, second byte */ + n = ((filter->cache & 0xFF) << 8) | (c & 0xFF); + if (n >= 0xD800 && n <= 0xDBFF) { + /* Wrong; that's the first half of a surrogate pair, not the second */ + CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data)); + filter->cache = n & 0x3FF; + filter->status = 2; + } else if (n >= 0xDC00 && n <= 0xDFFF) { + n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000; + CK((*filter->output_function)(n, filter->data)); + filter->status = 0; + } else { + CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data)); + CK((*filter->output_function)(n, filter->data)); + filter->status = 0; + } } return c; } -/* - * wchar => UTF-16BE - */ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter) { int n; @@ -253,38 +233,53 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter) return c; } -/* - * UTF-16LE => wchar - */ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter) { int n; switch (filter->status) { case 0: + filter->cache = c & 0xff; filter->status = 1; - n = c & 0xff; - filter->cache |= n; break; - default: - filter->status = 0; - n = (filter->cache & 0xff) | ((c & 0xff) << 8); - if (n >= 0xd800 && n < 0xdc00) { - filter->cache = ((n & 0x3ff) << 16) + 0x400000; - } else if (n >= 0xdc00 && n < 0xe000) { - n &= 0x3ff; - n |= (filter->cache & 0xfff0000) >> 6; - filter->cache = 0; - if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) { - CK((*filter->output_function)(n, filter->data)); - } else { /* illegal character */ - n &= MBFL_WCSGROUP_MASK; - n |= MBFL_WCSGROUP_THROUGH; - CK((*filter->output_function)(n, filter->data)); - } + + case 1: + if ((c & 0xfc) == 0xd8) { + /* Looks like we have a surrogate pair here */ + filter->cache += ((c & 0x3) << 8); + filter->status = 2; + } else if ((c & 0xfc) == 0xdc) { + /* This is wrong; the second part of the surrogate pair has come first + * Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle + * the error */ + n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH; + CK((*filter->output_function)(n, filter->data)); + filter->status = 0; } else { - filter->cache = 0; + CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data)); + filter->status = 0; + } + break; + + case 2: + filter->cache = (filter->cache << 10) + (c & 0xff); + filter->status = 3; + break; + + case 3: + n = (filter->cache & 0xFF) | ((c & 0xFF) << 8); + if (n >= 0xD800 && n <= 0xDBFF) { + CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data)); + filter->cache = n & 0x3FF; + filter->status = 2; + } else if (n >= 0xDC00 && n <= 0xDFFF) { + n = filter->cache + ((c & 0x3) << 8) + 0x10000; + CK((*filter->output_function)(n, filter->data)); + filter->status = 0; + } else { + CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data)); CK((*filter->output_function)(n, filter->data)); + filter->status = 0; } break; } @@ -292,9 +287,6 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter) return c; } -/* - * wchar => UTF-16LE - */ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter) { int n; @@ -315,3 +307,21 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter) return c; } + +static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter) +{ + int status = filter->status; + int cache = filter->cache; + filter->status = filter->cache = 0; + + if (status) { + /* Input string was truncated */ + CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data)); + } + + if (filter->flush_function) { + (*filter->flush_function)(filter->data); + } + + return 0; +} |