diff options
author | Alex Dowad <alexinbeijing@gmail.com> | 2020-11-14 21:15:11 +0200 |
---|---|---|
committer | Alex Dowad <alexinbeijing@gmail.com> | 2020-11-25 20:51:45 +0200 |
commit | e4ee97911132c6ad4dee372369472316a33b4eee (patch) | |
tree | 0a220096aaf7b90475d553ceb084b9fbcf279752 /ext/mbstring | |
parent | 315d48b4340f79731882b7f87422801a065475b8 (diff) | |
download | php-git-e4ee97911132c6ad4dee372369472316a33b4eee.tar.gz |
0x5C is not a Yen sign in CP932 (or CP51932)
When Microsoft created CP932 (their version of Shift-JIS), they explicitly
used bytes 0x00-0x7F to represent ASCII characters rather than JIS X 0201
characters. In JIS X 0201, byte 0x5C is the yen sign; in ASCII (and thus in
CP932) it is a backslash.
So when converting Unicode to CP932, it is not correct to convert U+00A5
to CP932 0x5C. Fortunately, CP932 does have a multi-byte FULLWIDTH YEN SIGN
character which we can use instead.
CP51932 uses the same extended character set as CP932; while CP932 is
Microsoft's extended version of Shift-JIS, CP51932 is their extended version
of EUC-JP. So the same reasoning applies to CP51932.
Diffstat (limited to 'ext/mbstring')
-rw-r--r-- | ext/mbstring/libmbfl/filters/mbfilter_cp51932.c | 4 | ||||
-rw-r--r-- | ext/mbstring/libmbfl/filters/mbfilter_cp932.c | 4 | ||||
-rw-r--r-- | ext/mbstring/tests/cp51932_encoding.phpt | 3 | ||||
-rw-r--r-- | ext/mbstring/tests/cp932_encoding.phpt | 2 |
4 files changed, 9 insertions, 4 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index d9fecc9d4d..aa52d05481 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -214,8 +214,8 @@ mbfl_filt_conv_wchar_cp51932(int c, mbfl_convert_filter *filter) } if (s1 >= 0x8080) s1 = -1; /* we don't support JIS X0213 */ if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x005c; /* YEN SIGN */ + if (c == 0xa5) { /* YEN SIGN */ + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ } else if (c == 0x203e) { /* OVER LINE */ s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index 6246600de8..ec192faa2b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -251,8 +251,8 @@ mbfl_filt_conv_wchar_cp932(int c, mbfl_convert_filter *filter) s2 = 1; } if (s1 <= 0) { - if (c == 0xa5) { /* YEN SIGN */ - s1 = 0x005c; /* YEN SIGN */ + if (c == 0xa5) { /* YEN SIGN */ + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ } else if (c == 0x203e) { /* OVER LINE */ s1 = 0x007e; /* FULLWIDTH MACRON */ } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */ diff --git a/ext/mbstring/tests/cp51932_encoding.phpt b/ext/mbstring/tests/cp51932_encoding.phpt index 8dbbeb85a0..2fc25fd3c8 100644 --- a/ext/mbstring/tests/cp51932_encoding.phpt +++ b/ext/mbstring/tests/cp51932_encoding.phpt @@ -84,6 +84,9 @@ unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu for ($i = 0; $i <= 0x7F; $i++) $validChars[chr($i)] = "\x00" . 
chr($i); +/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */ +$fromUnicode["\x00\xA5"] = "\xA1\xEF"; + testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false); testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false); echo "CP51932 verification and conversion works on all valid characters\n"; diff --git a/ext/mbstring/tests/cp932_encoding.phpt b/ext/mbstring/tests/cp932_encoding.phpt index ddcf8a449b..ec9e76f1f6 100644 --- a/ext/mbstring/tests/cp932_encoding.phpt +++ b/ext/mbstring/tests/cp932_encoding.phpt @@ -30,6 +30,8 @@ for ($i = 0xF0; $i <= 0xF9; $i++) { $fromUnicode["\x00\xA2"] = "\x81\x91"; /* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */ $fromUnicode["\x00\xA3"] = "\x81\x92"; +/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */ +$fromUnicode["\x00\xA5"] = "\x81\x8F"; /* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE) * But when converting Unicode to CP932, we also accept U+301C (WAVE DASH) */ |