ext/mbstring/tests/cp932_encoding.phpt


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108

--TEST--
Exhaustive test of CP932 encoding verification and conversion
--SKIPIF--
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php
srand(4321); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in CP932 */
readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);

/* Aside from the characters in that table, we also support a 'user' area
 * from 0xF040-0xF9FC, which map to Unicode 'private' codepoints 0xE000-E757 */
$codepoint = 0xE000;
for ($i = 0xF0; $i <= 0xF9; $i++) {
	for ($j = 0x40; $j <= 0xFC; $j++) {
		if ($j == 0x7F)
			continue;
		$utf16 = pack('n', $codepoint);
		$cp932 = chr($i) . chr($j);
		$validChars[$cp932] = $utf16;
		$fromUnicode[$utf16] = $cp932;
		$codepoint++;
	}
}

/* U+00A2 is CENT SIGN; convert to FULLWIDTH CENT SIGN */
$fromUnicode["\x00\xA2"] = "\x81\x91";
/* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
$fromUnicode["\x00\xA3"] = "\x81\x92";
/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\x81\x8F";

/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)
 * But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */
$fromUnicode["\x30\x1C"] = "\x81\x60";
/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),
 * but when converting Unicode to CP932, we also accept U+2212 (MINUS SIGN) */
$fromUnicode["\x22\x12"] = "\x81\x7C";
/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
 * but when converting Unicode to CP932, we also accept U+2016
 * (DOUBLE VERTICAL LINE) */
$fromUnicode["\x20\x16"] = "\x81\x61";
/* We map the JIS X 0208 NOT SIGN to U+FFE2 (FULLWIDTH NOT SIGN),
 * but when converting Unicode to CP932, we also accept U+00AC (NOT SIGN) */
$fromUnicode["\x00\xAC"] = "\x81\xCA";

/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\x81\x50";

/* U+00AF is MACRON; it can also go to FULLWIDTH MACRON */
$fromUnicode["\x00\xAF"] = "\x81\x50";

findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));

/* There are 396 Unicode codepoints which are non-invertible in CP932
 * (multiple CP932 byte sequences map to the same codepoint)
 * Some of these are 3-way pile-ups. I wonder what the fine folks at MS
 * were thinking when they designed this text encoding. */

/* Everything from 0xED00-0xEEFF falls in this unfortunate category
 * (Other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
 * converting from Unicode back to CP932, we favor the F's rather than the E's) */
$nonInvertible = array();
for ($i = 0xED00; $i <= 0xEEFF; $i++) {
	$bytes = pack('n', $i);
	if (isset($validChars[$bytes])) {
		unset($fromUnicode[$validChars[$bytes]]);
		$nonInvertible[$bytes] = $validChars[$bytes];
		unset($validChars[$bytes]); // will test these separately
	}
}

/* There are 23 other collisions between 2-byte sequences which variously
 * start with 0x81, 0x87, or 0xFA
 * We _love_ 0x81 and use it when possible. 0x87 is a second favorite */
for ($i = 0xFA4A; $i <= 0xFA53; $i++) {
	$bytes = pack('n', $i);
	unset($fromUnicode[$validChars[$bytes]]);
	$nonInvertible[$bytes] = $validChars[$bytes];
	unset($validChars[$bytes]); // will test these separately
}
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B] as $i) {
	$bytes = pack('n', $i);
	unset($fromUnicode[$validChars[$bytes]]);
	$nonInvertible[$bytes] = $validChars[$bytes];
	unset($validChars[$bytes]); // will test these separately
}

testAllValidChars($validChars, 'CP932', 'UTF-16BE');
foreach ($nonInvertible as $cp932 => $unicode)
	testValidString($cp932, $unicode, 'CP932', 'UTF-16BE', false);
echo "CP932 verification and conversion works on all valid characters\n";

testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
echo "CP932 verification and conversion works on all invalid characters\n";

convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";
?>
--EXPECT--
CP932 verification and conversion works on all valid characters
CP932 verification and conversion works on all invalid characters
Unicode -> CP932 conversion works on all invalid codepoints