diff options
Diffstat (limited to 'ext/mbstring/tests/cp932_encoding.phpt')
-rw-r--r-- | ext/mbstring/tests/cp932_encoding.phpt | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt
new file mode 100644
index 0000000000..b426281f24
--- /dev/null
+++ b/ext/mbstring/tests/cp932_encoding.phpt
@@ -0,0 +1,108 @@
+--TEST--
+Exhaustive test of CP932 encoding verification and conversion
+--SKIPIF--
+<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
+--FILE--
+<?php
+srand(4321); /* Seed the RNG so that results are consistent between runs */
+include 'encoding_tests.inc';
+mb_substitute_character(0x25); // 0x25 is '%'
+
+/* Load every character listed in the CP932 conversion table */
+readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);
+
+/* Besides the characters in that table, a 'user' area at 0xF040-0xF9FC is
+ * supported; it maps sequentially onto the Unicode 'private' codepoints
+ * 0xE000-0xE757. Trail byte 0x7F is skipped and consumes no codepoint. */
+$privateUse = 0xE000;
+foreach (range(0xF0, 0xF9) as $lead) {
+    foreach (range(0x40, 0xFC) as $trail) {
+        if ($trail !== 0x7F) {
+            $utf16be = pack('n', $privateUse++);
+            $sjis = chr($lead) . chr($trail);
+            $validChars[$sjis] = $utf16be;
+            $fromUnicode[$utf16be] = $sjis;
+        }
+    }
+}
+
+/* Unicode codepoints (as UTF-16BE) which are also accepted when converting to CP932 */
+$encodeOnly = [
+    "\x00\xA2" => "\x81\x91", // U+00A2 CENT SIGN -> FULLWIDTH CENT SIGN
+    "\x00\xA3" => "\x81\x92", // U+00A3 POUND SIGN -> FULLWIDTH POUND SIGN
+    "\x00\xA5" => "\x81\x8F", // U+00A5 YEN SIGN -> FULLWIDTH YEN SIGN
+    /* JIS X 0208 FULLWIDTH TILDE is mapped to U+FF5E (FULLWIDTH TILDE),
+     * but U+301C (WAVE DASH) is accepted as well */
+    "\x30\x1C" => "\x81\x60",
+    /* JIS X 0208 MINUS SIGN is mapped to U+FF0D (FULLWIDTH HYPHEN-MINUS),
+     * but U+2212 (MINUS SIGN) is accepted as well */
+    "\x22\x12" => "\x81\x7C",
+    /* JIS X 0208 PARALLEL TO is mapped to U+2225 (PARALLEL TO),
+     * but U+2016 (DOUBLE VERTICAL LINE) is accepted as well */
+    "\x20\x16" => "\x81\x61",
+    /* JIS X 0208 NOT SIGN is mapped to U+FFE2 (FULLWIDTH NOT SIGN),
+     * but U+00AC (NOT SIGN) is accepted as well */
+    "\x00\xAC" => "\x81\xCA",
+    "\x20\x3E" => "\x81\x50", // U+203E OVERLINE -> JIS X 0208 FULLWIDTH MACRON
+    "\x00\xAF" => "\x81\x50", // U+00AF MACRON -> FULLWIDTH MACRON, too
+];
+foreach ($encodeOnly as $utf16be => $sjis) {
+    $fromUnicode[$utf16be] = $sjis;
+}
+
+/* The last argument pairs each CP932 byte in 0x81-0x9F and 0xE0-0xFC (or, for
+ * UTF-16BE, every byte 0x00-0xFF) with the value 2; presumably the length of
+ * a sequence starting with that byte -- confirm against the helper */
+$cp932Starts = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2);
+findInvalidChars($validChars, $invalidChars, $truncated, $cp932Starts);
+findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
+
+/* There are 396 Unicode codepoints which are non-invertible in CP932
+ * (multiple CP932 byte sequences map to the same codepoint).
+ * Some of these are 3-way pile-ups. I wonder what the fine folks at MS
+ * were thinking when they designed this text encoding. */
+
+/* Everything from 0xED00-0xEEFF falls in this unfortunate category.
+ * (Other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
+ * converting from Unicode back to CP932, we favor the F's over the E's.) */
+$nonInvertible = [];
+foreach (range(0xED00, 0xEEFF) as $code) {
+    $sjis = pack('n', $code);
+    if (!isset($validChars[$sjis])) {
+        continue; // nothing is registered for this value
+    }
+    $unicode = $validChars[$sjis];
+    unset($fromUnicode[$unicode], $validChars[$sjis]); // will test these separately
+    $nonInvertible[$sjis] = $unicode;
+}
+
+/* There are 23 other collisions between 2-byte sequences which variously
+ * start with 0x81, 0x87, or 0xFA.
+ * We _love_ 0x81 and use it when possible. 0x87 is a second favorite. */
+$otherCollisions = array_merge(range(0xFA4A, 0xFA53), [
+    0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C,
+    0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B,
+]);
+foreach ($otherCollisions as $code) {
+    $sjis = pack('n', $code);
+    unset($fromUnicode[$validChars[$sjis]]);
+    $nonInvertible[$sjis] = $validChars[$sjis];
+    unset($validChars[$sjis]); // will test these separately
+}
+
+testAllValidChars($validChars, 'CP932', 'UTF-16BE');
+foreach ($nonInvertible as $sjis => $unicode) {
+    testValidString($sjis, $unicode, 'CP932', 'UTF-16BE', false);
+}
+echo "CP932 verification and conversion works on all valid characters\n";
+
+testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
+echo "CP932 verification and conversion works on all invalid characters\n";
+
+convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
+echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
+?>
+--EXPECT--
+CP932 verification and conversion works on all valid characters
+CP932 verification and conversion works on all invalid characters
+Unicode -> CP932 conversion works on all invalid codepoints