summaryrefslogtreecommitdiff
path: root/ext/mbstring/tests/sjis_mobile_encodings.phpt
blob: 49b5e9c093971afd2eaa18ef718e9bfbbbdff25f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
--TEST--
Exhaustive test of Shift-JIS DoCoMo, KDDI, SoftBank encoding verification and conversion
--SKIPIF--
<?php
/* This test needs the mbstring extension */
if (!extension_loaded('mbstring')) {
  die('skip mbstring not available');
}
/* It is also slow, so bow out when SKIP_SLOW_TESTS is set to a truthy value */
if (getenv("SKIP_SLOW_TESTS")) {
  die("skip slow test");
}
?>
--FILE--
<?php
/* Fixed seed, so that results are consistent from run to run */
srand(818);
include 'encoding_tests.inc';
/* Substitution character is '%' */
mb_substitute_character(0x25);

/* Read in the table of all characters in Windows-932
 * (The SJIS-Mobile encodings all use MS extensions) */
readConversionTable(__DIR__ . '/data/CP932.txt', $sjisChars, $fromUnicode, true);

/* Additional Unicode -> SJIS conversions, applied on top of what was read
 * from the table. Keys are UTF-32BE bytes, values are SJIS bytes. */
$extraFromUnicode = [
  /* U+301C (WAVE DASH) converts to SJIS 0x8160 (WAVE DASH) */
  "\x00\x00\x30\x1C" => "\x81\x60",
  /* U+2212 (MINUS SIGN) converts to SJIS 0x817C (FULLWIDTH HYPHEN-MINUS) */
  "\x00\x00\x22\x12" => "\x81\x7C",
  /* U+203E (OVERLINE) converts to SJIS 0x8150 (FULLWIDTH MACRON) */
  "\x00\x00\x20\x3E" => "\x81\x50",
  /* U+2016 (DOUBLE VERTICAL LINE) converts to SJIS 0x8161 (PARALLEL TO) */
  "\x00\x00\x20\x16" => "\x81\x61",
  /* U+00AF (MACRON) converts to SJIS 0x8150 (FULLWIDTH MACRON) */
  "\x00\x00\x00\xAF" => "\x81\x50",
  /* U+00AC (NOT SIGN) converts to SJIS 0x81CA (FULLWIDTH NOT SIGN) */
  "\x00\x00\x00\xAC" => "\x81\xCA",
  /* U+00A5 (YEN SIGN) converts to SJIS 0x818F (FULLWIDTH YEN SIGN) */
  "\x00\x00\x00\xA5" => "\x81\x8F",
  /* U+00A3 (POUND SIGN) converts to SJIS 0x8192 (FULLWIDTH POUND SIGN) */
  "\x00\x00\x00\xA3" => "\x81\x92",
  /* U+00A2 (CENT SIGN) converts to SJIS 0x8191 (FULLWIDTH CENT SIGN) */
  "\x00\x00\x00\xA2" => "\x81\x91",
];
foreach ($extraFromUnicode as $utf32Bytes => $sjisBytes) {
  $fromUnicode[$utf32Bytes] = $sjisBytes;
}

/* Besides the characters in that table, we also support a 'user' area
 * spanning 0xF040-0xF9FC, which maps onto Unicode 'private' codepoints
 * 0xE000-0xE757. Trail byte 0x7F is skipped and consumes no codepoint. */
$codepoint = 0xE000;
foreach (range(0xF0, 0xF9) as $leadByte) {
  foreach (range(0x40, 0xFC) as $trailByte) {
    if ($trailByte != 0x7F) {
      $cp932 = chr($leadByte) . chr($trailByte);
      $utf32 = pack('N', $codepoint++);
      /* Register the pair in both directions */
      $sjisChars[$cp932] = $utf32;
      $fromUnicode[$utf32] = $cp932;
    }
  }
}

/* Collect every codepoint in U+0000-U+FFFF which has no entry in $fromUnicode
 * at this point. Keys are UTF-32BE bytes; the value is always true. */
$invalidCodepoints = [];
foreach (range(0, 0xFFFF) as $scalarValue) {
  $utf32Bytes = pack('N', $scalarValue);
  if (isset($fromUnicode[$utf32Bytes])) {
    continue;
  }
  $invalidCodepoints[$utf32Bytes] = true;
}

/* Windows-932 has many cases where two different kuten codes map to the same
 * Unicode codepoint
 *
 * Everything from 0xED00-0xEEFF falls in this unfortunate category
 * (Other sequences in 0xFA00-0xFC4B map to the same codepoints.)
 * Our implementation of CP932 prefers the F's, but for SJIS-Mobile,
 * we prefer the E's
 *
 * So every sequence in 0xFA00-0xFC4B which is present in $sjisChars is
 * recorded in $nonInvertible, and its codepoint is removed from $fromUnicode */
$nonInvertible = [];
foreach (range(0xFA00, 0xFC4B) as $kuten) {
  $bytes = pack('n', $kuten);
  if (!isset($sjisChars[$bytes])) {
    continue;
  }
  $unicode = $sjisChars[$bytes];
  $nonInvertible[$bytes] = $unicode;
  unset($fromUnicode[$unicode]);
}

/* Other "collisions": these sequences are treated the same way as the
 * 0xFA00-0xFC4B ones (recorded as non-invertible, codepoint removed from
 * $fromUnicode) */
$otherCollisions = [
  0x8790, 0x8791, 0x8792,
  0x8795, 0x8796, 0x8797,
  0x879A, 0x879B, 0x879C,
  0xEEF9,
];
foreach ($otherCollisions as $kuten) {
  $bytes = pack('n', $kuten);
  $unicode = $sjisChars[$bytes];
  $nonInvertible[$bytes] = $unicode;
  unset($fromUnicode[$unicode]);
}

/* DoCoMo and SoftBank each get their own copy to adjust further;
 * KDDI is tested with the shared set as it stands */
$nonInvertibleDocomo   = $nonInvertible;
$nonInvertibleSoftbank = $nonInvertible;

/* Now read table of vendor-specific emoji encodings
 *
 * Lines starting with '#' are comments. Data lines hold at least four
 * ';'-separated fields: one or two Unicode codepoints in hex, followed by the
 * DoCoMo, KDDI and SoftBank codes in hex (a vendor field may be empty).
 * Each vendor starts out with the complete Windows-932 table. */
$docomo = $sjisChars;
$kddi = $sjisChars;
$softbank = $sjisChars;
$sbEmoji = array();
/* Open read-only: the table is never written, and 'r+' fails when the test
 * data is not writable */
$emojiSourcesPath = __DIR__ . '/data/EmojiSources.txt';
$fp = fopen($emojiSourcesPath, 'r');
if ($fp === false)
  die("Could not open $emojiSourcesPath\n");
/* Compare against false so that only EOF ends the loop, never a falsy line */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] == '#')
    continue;
  $fields = explode(';', rtrim($line));
  if (count($fields) >= 4) {
    if (sscanf($fields[0], "%x %x", $cp1, $cp2) == 2) {
      /* Emoji represented by a sequence of two codepoints */
      $utf32 = pack('N', $cp1) . pack('N', $cp2);
    } else {
      /* Single codepoint; it now has a mapping, so it is no longer invalid */
      $utf32 = pack('N', hexdec($fields[0]));
      unset($invalidCodepoints[$utf32]);
    }

    if ($fields[1])
      $docomo[pack('n', hexdec($fields[1]))] = $utf32;
    if ($fields[2])
      $kddi[pack('n', hexdec($fields[2]))] = $utf32;
    if ($fields[3]) {
      /* SoftBank emoji are gathered separately in $sbEmoji, and taken out of
       * SoftBank's non-invertible set */
      $bytes = pack('n', hexdec($fields[3]));
      $sbEmoji[$bytes] = $utf32;
      unset($nonInvertibleSoftbank[$bytes]);
    }
  }
}
fclose($fp);

/* Other, vendor-specific emoji which do not appear in EmojiSources.txt
 * Most of these don't exist in Unicode and have been mapped to 'private
 * area' codepoints
 * Keys are DoCoMo SJIS bytes, values are UTF-32BE bytes */
$docomoOnly = [
  "\xF9\x4A" => "\x00\x0F\xEE\x16", // PIAS PI
  "\xF9\x4B" => "\x00\x0F\xEE\x17", // PIAS A
  "\xF9\x4C" => "\x00\x0F\xEE\x18", // INVERSE TICKET
  "\xF9\x4D" => "\x00\x0F\xEE\x19", // KATAKANA ABBREVIATION FOR TICKET ("chi ke")
  "\xF9\x4E" => "\x00\x0F\xEE\x1A", // RESERVE BY PHONE
  "\xF9\x4F" => "\x00\x0F\xEE\x1B", // P CODE
  "\xF9\x53" => "\x00\x0F\xEE\x1C", // MOVIES 2
  "\xF9\x54" => "\x00\x0F\xEE\x1D", // PIAS PI INVERSE
  "\xF9\x58" => "\x00\x0F\xEE\x1E", // PIAS PI CIRCLE
  "\xF9\x59" => "\x00\x0F\xEE\x1F", // PIAS PI SQUARE
  "\xF9\x5A" => "\x00\x0F\xEE\x20", // CHECK
  "\xF9\x5F" => "\x00\x0F\xEE\x21", // F
  "\xF9\x60" => "\x00\x0F\xEE\x22", // D
  "\xF9\x61" => "\x00\x0F\xEE\x23", // S
  "\xF9\x62" => "\x00\x0F\xEE\x24", // C
  "\xF9\x63" => "\x00\x0F\xEE\x25", // R
  "\xF9\x64" => "\x00\x00\x25\xEA", // SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK
  "\xF9\x65" => "\x00\x00\x25\xA0", // BLACK SQUARE
  "\xF9\x66" => "\x00\x00\x25\xBF", // DOWNWARD TRIANGLE
  /* TODO: test that FEE28 converts to F966, for backwards compatibility */
  "\xF9\x67" => "\x00\x0F\xEE\x29", // QUADRUPLE DAGGER
  "\xF9\x68" => "\x00\x0F\xEE\x2A", // TRIPLE DAGGER
  "\xF9\x69" => "\x00\x0F\xEE\x2B", // DOUBLE DAGGER
  "\xF9\x6A" => "\x00\x00\x20\x20", // DAGGER
  /* TODO: test that FEE2C converts to F96A, for backwards compatibility */
  "\xF9\x6B" => "\x00\x0F\xEE\x2D", // I (meaning "inexpensive")
  "\xF9\x6C" => "\x00\x0F\xEE\x2E", // M (meaning "moderate")
  "\xF9\x6D" => "\x00\x0F\xEE\x2F", // E (meaning "expensive")
  "\xF9\x6E" => "\x00\x0F\xEE\x30", // VE (meaning "very expensive")
  "\xF9\x6F" => "\x00\x0F\xEE\x31", // SPHERE
  "\xF9\x70" => "\x00\x0F\xEE\x32", // CREDIT CARDS NOT ACCEPTED
  "\xF9\x71" => "\x00\x0F\xEE\x33", // CHECKBOX
  "\xF9\x75" => "\x00\x0F\xEE\x10", // I-MODE
  "\xF9\x76" => "\x00\x0F\xEE\x11", // I-MODE WITH FRAME
  "\xF9\x78" => "\x00\x0F\xEE\x12", // PROVIDED BY DOCOMO
  "\xF9\x79" => "\x00\x0F\xEE\x13", // DOCOMO POINT
  "\xF9\x84" => "\x00\x00\x27\xBF", // FREE DIAL; mapped to DOUBLE CURLY LOOP
  "\xF9\x86" => "\x00\x0F\xE8\x2D", // MOBILE Q
  "\xF9\xB1" => "\x00\x0F\xEE\x14", // I-APPLI
  "\xF9\xB2" => "\x00\x0F\xEE\x15", // I-APPLI WITH BORDER
];
foreach ($docomoOnly as $sjisBytes => $utf32Bytes) {
  $docomo[$sjisBytes] = $utf32Bytes;
}

/* These four are additionally registered as non-invertible for DoCoMo */
foreach (["\xF9\x64", "\xF9\x65", "\xF9\x66", "\xF9\x6A"] as $sjisBytes) {
  $nonInvertibleDocomo[$sjisBytes] = $docomoOnly[$sjisBytes];
}

/* U+27BF is the target of FREE DIAL above, so it is removed from the set of
 * invalid codepoints */
unset($invalidCodepoints["\x00\x00\x27\xBF"]);

/* KDDI-specific emoji which are added by hand
 * Keys are KDDI SJIS bytes, values are UTF-32BE bytes */
$kddiOnly = [
  "\xF7\x94" => "\x00\x0F\xEE\x40", // EZ WEB
  "\xF7\xCF" => "\x00\x0F\xEE\x41", // EZ PLUS
  "\xF3\x70" => "\x00\x0F\xEE\x42", // EZ NAVIGATION
  "\xF4\x78" => "\x00\x0F\xEE\x43", // EZ MOVIE
  "\xF4\x86" => "\x00\x0F\xEE\x44", // CMAIL
  "\xF4\x8E" => "\x00\x0F\xEE\x45", // JAVA (TM)
  "\xF4\x8F" => "\x00\x0F\xEE\x46", // BREW
  "\xF4\x90" => "\x00\x0F\xEE\x47", // EZ RING MUSIC
  "\xF4\x91" => "\x00\x0F\xEE\x48", // EZ NAVI
  "\xF4\x92" => "\x00\x0F\xEE\x49", // WIN
  "\xF4\x93" => "\x00\x0F\xEE\x4A", // PREMIUM SIGN
  "\xF7\x48" => "\x00\x0F\xE8\x2D", // MOBILE Q
  "\xF7\xA3" => "\x00\x0F\xE8\x3C", // PDC ("personal digital cellular")
  "\xF7\xD2" => "\x00\x0F\xEB\x89", // OPENWAVE
];
foreach ($kddiOnly as $sjisBytes => $utf32Bytes) {
  $kddi[$sjisBytes] = $utf32Bytes;
}

/* SoftBank-specific emoji which are added by hand; these go into $sbEmoji
 * Keys are SoftBank SJIS bytes, values are UTF-32BE bytes */
$softbankOnly = [
  "\xF7\xB1" => "\x00\x00\x27\xBF", // FREE DIAL; mapped to DOUBLE CURLY LOOP
  "\xF7\xF4" => "\x00\x0F\xEE\x77", // J-PHONE SHOP
  "\xF7\xF5" => "\x00\x0F\xEE\x78", // SKY WEB
  "\xF7\xF6" => "\x00\x0F\xEE\x79", // SKY WALKER
  "\xF7\xF7" => "\x00\x0F\xEE\x7A", // SKY MELODY
  "\xF7\xF8" => "\x00\x0F\xEE\x7B", // J-PHONE 1
  "\xF7\xF9" => "\x00\x0F\xEE\x7C", // J-PHONE 2
  "\xF7\xFA" => "\x00\x0F\xEE\x7D", // J-PHONE 3
];
foreach ($softbankOnly as $sjisBytes => $utf32Bytes) {
  $sbEmoji[$sjisBytes] = $utf32Bytes;
}

/* SoftBank-specific 'JSky1', 'JSky2', 'VODAFONE1', 'VODAFONE2', etc. emoji,
 * which are not supported by Unicode
 * SJIS 0xFBD8-0xFBDE are mapped to consecutive codepoints starting at
 * U+FEE70, and taken out of SoftBank's non-invertible set */
for ($i = 0xFBD8; $i <= 0xFBDE; $i++) {
  $bytes = pack('n', $i);
  $sbEmoji[$bytes] = pack('N', 0xFEE70 + $i - 0xFBD8);
  unset($nonInvertibleSoftbank[$bytes]);
}
/* SoftBank-specific emoji for Shibuya department store */
$sbEmoji["\xFB\xAA"] = "\x00\x0F\xE4\xC5";
unset($nonInvertibleSoftbank["\xFB\xAA"]);

/* Overlay the SoftBank emoji on top of the base table.
 * array_replace is used instead of array_merge because array_merge renumbers
 * integer keys, and PHP stores byte strings such as "5" as integer keys;
 * array_replace keeps every key exactly as it is. */
$softbank = array_replace($softbank, $sbEmoji);

/* SoftBank also has an alternative representation for emoji, built from
 * sequences which start with ESC. Apparently this was used in older versions
 * of Softbank's phones.
 * The sequence is ESC, '$', one of 6 ASCII letters and then one byte per
 * emoji; each letter stands for a different ku code.
 * $escCodeToKu gives the ku code for each letter; $escCodeMaxTen gives the
 * highest ten value which is tried for that letter. */
$escCodeToKu = [
  'G' => 0x91,
  'E' => 0x8D,
  'F' => 0x8E,
  'O' => 0x92,
  'P' => 0x95,
  'Q' => 0x96,
];
$escCodeMaxTen = [
  'G' => 0x7A,
  'E' => 0x7A,
  'F' => 0x7A,
  'O' => 0x6D,
  'P' => 0x6C,
  'Q' => 0x5E,
];

/* Turn a ku/ten pair (each counted from 0x21) into a 2-byte Shift-JIS string
 *
 * Two consecutive ku values share one lead byte; lead bytes run upwards from
 * 0x81 for the first 31 pairs and from 0xE0 after that. For the first ku of a
 * pair the trail byte starts at 0x40 and jumps over 0x7F; for the second ku
 * it starts at 0x9F. */
function shiftJISEncode($ku, $ten) {
  $row  = $ku - 0x21;
  $cell = $ten - 0x21;

  $pair = $row >> 1;
  $lead = chr($pair < 31 ? $pair + 0x81 : $pair - 31 + 0xE0);

  if ($row % 2 != 0) {
    /* Second ku of the pair */
    return $lead . chr($cell + 0x9F);
  }
  /* First ku of the pair */
  return $lead . chr($cell < 63 ? $cell + 0x40 : $cell - 63 + 0x80);
}

/* For each ESC letter, walk its ten values. Wherever the resulting SJIS
 * sequence is a SoftBank emoji, the sequence ESC '$' letter ten is registered
 * as another encoding of the same character, in both $softbank and
 * $nonInvertibleSoftbank. */
foreach ($escCodeToKu as $char => $ku) {
  $maxTen = $escCodeMaxTen[$char];
  for ($ten = 0x21; $ten <= $maxTen; $ten++) {
    $sjis = shiftJISEncode($ku, $ten);
    if (!isset($sbEmoji[$sjis])) {
      continue;
    }
    $bytes = "\x1B\$" . $char . chr($ten);
    $unicode = $softbank[$sjis];
    $softbank[$bytes] = $unicode;
    $nonInvertibleSoftbank[$bytes] = $unicode;
  }
}

/* A bare ESC is not valid for Softbank, since it is used for escape sequences
 * which represent emoji */
unset($softbank["\x1B"]);

/* Run all the checks for one SJIS-Mobile variant and echo a progress line
 * after each group of checks
 *
 * $validChars    - valid byte sequences for $encoding => UTF-32BE bytes
 * $nonInvertible - byte sequences => UTF-32BE bytes which are removed from
 *                  $validChars and passed to testAllValidChars separately
 *                  with an extra `false` argument (presumably: do not check
 *                  the reverse conversion -- confirm in encoding_tests.inc)
 * $encoding      - encoding name handed to the helpers and echoed */
function testSJISVariant($validChars, $nonInvertible, $encoding) {
  global $fromUnicode, $invalidCodepoints, $escCodeToKu;

  /* Bytes 0x81-0x9F and 0xE0-0xFC are each given a length of 2 */
  $lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
  findInvalidChars($validChars, $invalidChars, $truncated, $lenTable);

  /* ESC '$' letter (optionally followed by 0xF) is not treated as invalid or
   * truncated input */
  foreach (array_keys($escCodeToKu) as $char) {
    $escPrefix = "\x1B\$" . $char;
    unset($invalidChars[$escPrefix . "\x0F"], $truncated[$escPrefix]);
  }

  /* Non-invertible sequences are tested on their own below; remember which of
   * them are ESC sequences */
  $escapes = [];
  foreach ($nonInvertible as $bytes => $unicode) {
    unset($validChars[$bytes]);
    if (substr($bytes, 0, 1) === "\x1B") {
      $escapes[] = $bytes;
    }
  }
  /* 0xF is used to terminate a run of emoji encoded using ESC sequence
   * We couldn't do this earlier or `findInvalidChars` wouldn't have worked
   * as desired */
  foreach ($escapes as $bytes) {
    $nonInvertible[$bytes . "\x0F"] = $nonInvertible[$bytes];
    unset($nonInvertible[$bytes]);
  }

  $unicodeEncoding = 'UTF-32BE';
  /* '%' in UTF-32BE; '%' was set as the substitution character at the top of
   * the test */
  $substitute = "\x00\x00\x00%";

  testAllValidChars($validChars, $encoding, $unicodeEncoding);
  testAllValidChars($nonInvertible, $encoding, $unicodeEncoding, false);
  echo "$encoding verification and conversion works on all valid characters\n";

  testAllInvalidChars($invalidChars, $validChars, $encoding, $unicodeEncoding, $substitute);
  testTruncatedChars($truncated, $encoding, $unicodeEncoding, $substitute);
  echo "$encoding verification and conversion works on all invalid characters\n";

  convertAllInvalidChars($invalidCodepoints, $fromUnicode, $unicodeEncoding, $encoding, '%');
  echo "Unicode -> $encoding conversion works on all invalid codepoints\n";
}

/* Run the checks once per carrier, in the order expected by --EXPECT--:
 * encoding name, table of valid characters, non-invertible set */
$variants = [
  ['SJIS-Mobile#DOCOMO',   $docomo,   $nonInvertibleDocomo],
  ['SJIS-Mobile#KDDI',     $kddi,     $nonInvertible],
  ['SJIS-Mobile#SOFTBANK', $softbank, $nonInvertibleSoftbank],
];
foreach ($variants as $variant) {
  testSJISVariant($variant[1], $variant[2], $variant[0]);
}

?>
--EXPECT--
SJIS-Mobile#DOCOMO verification and conversion works on all valid characters
SJIS-Mobile#DOCOMO verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#DOCOMO conversion works on all invalid codepoints
SJIS-Mobile#KDDI verification and conversion works on all valid characters
SJIS-Mobile#KDDI verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#KDDI conversion works on all invalid codepoints
SJIS-Mobile#SOFTBANK verification and conversion works on all valid characters
SJIS-Mobile#SOFTBANK verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#SOFTBANK conversion works on all invalid codepoints