summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alex Dowad <alexinbeijing@gmail.com> 2021-01-13 21:28:50 +0200
committer Alex Dowad <alexinbeijing@gmail.com> 2021-01-14 22:26:24 +0200
commit 4d65c2a992e0301c4b0ec9b3c73b44ee2d7802df (patch)
tree c63c09154c7f3535966307f4b8c52d765560f9f5
parent b4292284201eacad83b807dc1ac89e87d799badf (diff)
download php-git-4d65c2a992e0301c4b0ec9b3c73b44ee2d7802df.tar.gz
ISO-2022-JP-2004 conversion: represent backslash and tilde as ASCII
This issue dates back to some commits I merged recently, which made encodings like Shift-JIS-2004 use appropriate JIS X 0208 characters to represent backslashes and tildes, rather than single-byte characters which are used in those encodings with a different meaning (for example, in these encodings, 0x5C is used for a halfwidth Yen sign, rather than a backslash). There was an unintended side effect: ISO-2022-JP-2004 was also made to represent backslashes and tildes using JIS X 0208 characters. However, ISO-2022-JP explicitly includes ASCII as one of its selectable character sets, and ISO-2022-JP-2004 is just an extension of ISO-2022-JP. So when converting text to ISO-2022-JP-2004, we can convert Unicode backslashes and tildes to ASCII rather than using the corresponding JIS X 0208 characters.
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c 10
1 files changed, 8 insertions, 2 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
index 1e549fdebb..061ba49861 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c
@@ -482,8 +482,8 @@ int mbfl_filt_conv_jis2004_wchar_flush(mbfl_convert_filter *filter)
return 0;
}
-int
-mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter) {
+int mbfl_filt_conv_wchar_jis2004(int c, mbfl_convert_filter *filter)
+{
int k;
int c1, c2, s1, s2;
@@ -548,6 +548,12 @@ retry:
}
}
+ if (s1 <= 0 && filter->to->no_encoding == mbfl_no_encoding_2022jp_2004 && (c == 0x5C || c == 0x7E)) {
+ /* ISO-2022-JP-2004 can represent ASCII characters directly, so there is no need
+ * to use the JIS X 0208 REVERSE SOLIDUS for ASCII backslash, or WAVE DASH for tilde */
+ s1 = c;
+ }
+
/* check for major japanese chars: U+4E00 - U+9FFF */
if (s1 <= 0) {
for (k=0; k < uni2jis_tbl_len ;k++) {