summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Alex Dowad <alexinbeijing@gmail.com> 2020-10-17 21:29:47 +0200
committer Alex Dowad <alexinbeijing@gmail.com> 2021-01-15 21:55:41 +0200
commit 34ece408728a13991fdecdbf31e9bf12ca8c1902 (patch)
tree 8aec6b693a31ec6edd57ed03cdc775e8d75c97b2
parent fcbe45de1042c06c00cc9f957c7654d4835dafa3 (diff)
download php-git-34ece408728a13991fdecdbf31e9bf12ca8c1902.tar.gz
Remove useless mbstring encoding 'JIS-ms'
Microsoft invented three encodings very similar to ISO-2022-JP/JIS7/JIS8, called CP50220, CP50221, and CP50222. All three are supported by mbstring. Since these encodings are very similar, some code can be shared. Actually, conversion of CP50220/1/2 to Unicode is exactly the same operation; it's when converting from Unicode to CP50220/1/2 that some small differences arise in how certain katakana are handled. The most important common code was a function called `mbfl_filt_wchar_jis_ms`. The `jis_ms` part doubtless refers to the fact that these encodings are modified versions of 'JIS' invented by 'MS'. mbstring also went a step further and exported 'JIS-ms' to userland as a separate encoding from CP50220/1/2. If users requested 'JIS-ms' conversion, they got something like CP50220/1/2, minus their special ways of handling half-width katakana when converting from Unicode. But... that 'encoding' is not something which actually exists in the world outside of mbstring. CP50220/1/2 do exist in Microsoft software, but not 'JIS-ms'. For a text encoding conversion library, inventing new variant encodings and implementing them is not very productive. Our interest is in handling text encodings which real people actually use for... you know, storing actual text and things like that.
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c 203
-rw-r--r-- ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h 6
-rw-r--r-- ext/mbstring/libmbfl/mbfl/mbfl_encoding.c 1
-rw-r--r-- ext/mbstring/libmbfl/mbfl/mbfl_encoding.h 1
4 files changed, 13 insertions, 198 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
index d98366d1a6..da4b08c892 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
@@ -45,19 +45,12 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter);
* This was just CP50220, but the implementation was less strict regarding
* invalid characters; it would silently pass some through
* This 'encoding' only existed in mbstring. In case some poor, lost soul is
- * still using it, retain minimal support by aliasing it to CP50220 */
-static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", NULL};
-
-const mbfl_encoding mbfl_encoding_jis_ms = {
- mbfl_no_encoding_jis_ms,
- "JIS-ms",
- "ISO-2022-JP",
- NULL,
- NULL,
- MBFL_ENCTYPE_MBCS | MBFL_ENCTYPE_GL_UNSAFE,
- &vtbl_jis_ms_wchar,
- &vtbl_wchar_jis_ms
-};
+ * still using it, retain minimal support by aliasing it to CP50220
+ *
+ * Further, mbstring also had a made-up encoding called "JIS-ms"
+ * This was the same as CP5022{0,1,2}, but without their special ways of
+ * handling conversion of Unicode half-width katakana */
+static const char *cp50220_aliases[] = {"cp50220raw", "cp50220-raw", "JIS-ms", NULL};
const mbfl_encoding mbfl_encoding_cp50220 = {
mbfl_no_encoding_cp50220,
@@ -92,32 +85,12 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
&vtbl_wchar_cp50222
};
-const struct mbfl_convert_vtbl vtbl_jis_ms_wchar = {
- mbfl_no_encoding_jis_ms,
- mbfl_no_encoding_wchar,
- mbfl_filt_conv_common_ctor,
- NULL,
- mbfl_filt_conv_jis_ms_wchar,
- mbfl_filt_conv_common_flush,
- NULL,
-};
-
-const struct mbfl_convert_vtbl vtbl_wchar_jis_ms = {
- mbfl_no_encoding_wchar,
- mbfl_no_encoding_jis_ms,
- mbfl_filt_conv_common_ctor,
- NULL,
- mbfl_filt_conv_wchar_jis_ms,
- mbfl_filt_conv_any_jis_flush,
- NULL,
-};
-
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
mbfl_no_encoding_cp50220,
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
- mbfl_filt_conv_jis_ms_wchar,
+ mbfl_filt_conv_cp5022x_wchar,
mbfl_filt_conv_cp5022x_wchar_flush,
NULL,
};
@@ -137,7 +110,7 @@ const struct mbfl_convert_vtbl vtbl_cp50221_wchar = {
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
- mbfl_filt_conv_jis_ms_wchar,
+ mbfl_filt_conv_cp5022x_wchar,
mbfl_filt_conv_cp5022x_wchar_flush,
NULL,
};
@@ -157,7 +130,7 @@ const struct mbfl_convert_vtbl vtbl_cp50222_wchar = {
mbfl_no_encoding_wchar,
mbfl_filt_conv_common_ctor,
NULL,
- mbfl_filt_conv_jis_ms_wchar,
+ mbfl_filt_conv_cp5022x_wchar,
mbfl_filt_conv_cp5022x_wchar_flush,
NULL,
};
@@ -174,11 +147,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_cp50222 = {
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
-/*
- * JIS-ms => wchar
- */
-int
-mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter)
+int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
{
int c1, s, w;
@@ -356,154 +325,6 @@ static int mbfl_filt_conv_cp5022x_wchar_flush(mbfl_convert_filter *filter)
}
/*
- * wchar => JIS
- */
-int
-mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter)
-{
- int s = 0;
-
- if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
- s = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
- } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
- s = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
- } else if (c >= ucs_i_jis_table_min && c < ucs_i_jis_table_max) {
- s = ucs_i_jis_table[c - ucs_i_jis_table_min];
- } else if (c >= ucs_r_jis_table_min && c < ucs_r_jis_table_max) {
- s = ucs_r_jis_table[c - ucs_r_jis_table_min];
- } else if (c >= 0xe000 && c < (0xe000 + 10 * 94)) {
- /* PUE => Microsoft extended (pseudo 95ku - 114ku) */
- /* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
- s = c - 0xe000;
- s = ((s / 94) + 0x7F) << 8 | ((s % 94) + 0x21);
- } else if (c >= (0xe000 + 10 * 94) && c <= (0xe000 + 20 * 94)) {
- /* PUE => JISX0212 user-defined (G3 85ku - 94ku) */
- /* See http://www.opengroup.or.jp/jvc/cde/ucs-conv.html#ch4_2 */
- s = c - (0xe000 + 10 * 94);
- s = (s / 94 + 0xf5) << 8 | (s % 94 + 0xa1);
- }
-
- /* do some transliteration */
- if (s <= 0) {
- if (c == 0xa5) { /* YEN SIGN */
- s = 0x1005c;
- } else if (c == 0x203e) { /* OVER LINE */
- s = 0x1007e;
- } else if (c == 0xff3c) { /* FULLWIDTH REVERSE SOLIDUS */
- s = 0x2140;
- } else if (c == 0x2225) { /* PARALLEL TO */
- s = 0x2142;
- } else if (c == 0xff0d) { /* FULLWIDTH HYPHEN-MINUS */
- s = 0x215d;
- } else if (c == 0xffe0) { /* FULLWIDTH CENT SIGN */
- s = 0x2171;
- } else if (c == 0xffe1) { /* FULLWIDTH POUND SIGN */
- s = 0x2172;
- } else if (c == 0xffe2) { /* FULLWIDTH NOT SIGN */
- s = 0x224c;
- }
- }
- if (s <= 0 || (s >= 0x8080 && s < 0x10000)) {
- int i;
- s = -1;
-
- for (i = 0;
- i < cp932ext1_ucs_table_max - cp932ext1_ucs_table_min; i++) {
- const int oh = cp932ext1_ucs_table_min / 94;
-
- if (c == cp932ext1_ucs_table[i]) {
- s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
- break;
- }
- }
-
- if (s < 0) {
- const int oh = cp932ext2_ucs_table_min / 94;
- const int cp932ext2_ucs_table_size =
- cp932ext2_ucs_table_max - cp932ext2_ucs_table_min;
- for (i = 0; i < cp932ext2_ucs_table_size; i++) {
- if (c == cp932ext2_ucs_table[i]) {
- s = ((i / 94 + oh + 0x21) << 8) + (i % 94 + 0x21);
- break;
- }
- }
- }
-
- if (s < 0) {
- const int cp932ext3_ucs_table_size =
- cp932ext3_ucs_table_max - cp932ext3_ucs_table_min;
- const int limit = cp932ext3_ucs_table_size >
- cp932ext3_eucjp_table_size ?
- cp932ext3_eucjp_table_size:
- cp932ext3_ucs_table_size;
- for (i = 0; i < limit; i++) {
- if (c == cp932ext3_ucs_table[i]) {
- s = cp932ext3_eucjp_table[i];
- break;
- }
- }
- }
-
- if (c == 0) {
- s = 0;
- } else if (s <= 0) {
- s = -1;
- }
- }
-
- if (s >= 0) {
- if (s < 0x80) { /* ASCII */
- if ((filter->status & 0xff00) != 0) {
- CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
- CK((*filter->output_function)(0x28, filter->data)); /* '(' */
- CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
- }
- filter->status = 0;
- CK((*filter->output_function)(s, filter->data));
- } else if (s < 0x100) { /* kana */
- if ((filter->status & 0xff00) != 0x100) {
- CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
- CK((*filter->output_function)(0x28, filter->data)); /* '(' */
- CK((*filter->output_function)(0x49, filter->data)); /* 'I' */
- }
- filter->status = 0x100;
- CK((*filter->output_function)(s & 0x7f, filter->data));
- } else if (s < 0x8080) { /* X 0208 */
- if ((filter->status & 0xff00) != 0x200) {
- CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
- CK((*filter->output_function)(0x24, filter->data)); /* '$' */
- CK((*filter->output_function)(0x42, filter->data)); /* 'B' */
- }
- filter->status = 0x200;
- CK((*filter->output_function)((s >> 8) & 0xff, filter->data));
- CK((*filter->output_function)(s & 0xff, filter->data));
- } else if (s < 0x10000) { /* X 0212 */
- if ((filter->status & 0xff00) != 0x300) {
- CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
- CK((*filter->output_function)(0x24, filter->data)); /* '$' */
- CK((*filter->output_function)(0x28, filter->data)); /* '(' */
- CK((*filter->output_function)(0x44, filter->data)); /* 'D' */
- }
- filter->status = 0x300;
- CK((*filter->output_function)((s >> 8) & 0x7f, filter->data));
- CK((*filter->output_function)(s & 0x7f, filter->data));
- } else { /* X 0201 latin */
- if ((filter->status & 0xff00) != 0x400) {
- CK((*filter->output_function)(0x1b, filter->data)); /* ESC */
- CK((*filter->output_function)(0x28, filter->data)); /* '(' */
- CK((*filter->output_function)(0x4a, filter->data)); /* 'J' */
- }
- filter->status = 0x400;
- CK((*filter->output_function)(s & 0x7f, filter->data));
- }
- } else {
- CK(mbfl_filt_conv_illegal_output(c, filter));
- }
-
- return c;
-}
-
-/*
* wchar => CP50220
*/
static void
@@ -843,8 +664,8 @@ mbfl_filt_conv_wchar_cp50222_flush(mbfl_convert_filter *filter)
}
filter->status &= 0xff;
- if (filter->flush_function != NULL) {
- return (*filter->flush_function)(filter->data);
+ if (filter->flush_function) {
+ (*filter->flush_function)(filter->data);
}
return 0;
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h
index 12ab19d497..fdbaad6775 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.h
@@ -32,13 +32,10 @@
#include "mbfilter.h"
-extern const mbfl_encoding mbfl_encoding_jis_ms;
extern const mbfl_encoding mbfl_encoding_cp50220;
extern const mbfl_encoding mbfl_encoding_cp50221;
extern const mbfl_encoding mbfl_encoding_cp50222;
-extern const struct mbfl_convert_vtbl vtbl_jis_ms_wchar;
-extern const struct mbfl_convert_vtbl vtbl_wchar_jis_ms;
extern const struct mbfl_convert_vtbl vtbl_cp50220_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50220;
extern const struct mbfl_convert_vtbl vtbl_cp50221_wchar;
@@ -46,8 +43,7 @@ extern const struct mbfl_convert_vtbl vtbl_wchar_cp50221;
extern const struct mbfl_convert_vtbl vtbl_cp50222_wchar;
extern const struct mbfl_convert_vtbl vtbl_wchar_cp50222;
-int mbfl_filt_conv_jis_ms_wchar(int c, mbfl_convert_filter *filter);
-int mbfl_filt_conv_wchar_jis_ms(int c, mbfl_convert_filter *filter);
+int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50220(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50221(int c, mbfl_convert_filter *filter);
int mbfl_filt_conv_wchar_cp50222(int c, mbfl_convert_filter *filter);
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c
index 12239b96ea..1a0e65d95d 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c
@@ -161,7 +161,6 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
&mbfl_encoding_koi8u,
&mbfl_encoding_armscii8,
&mbfl_encoding_cp850,
- &mbfl_encoding_jis_ms,
&mbfl_encoding_2022jp_2004,
&mbfl_encoding_2022jp_kddi,
&mbfl_encoding_cp50220,
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
index 9f926d035c..dc8ae1d5e1 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
+++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
@@ -113,7 +113,6 @@ enum mbfl_no_encoding {
mbfl_no_encoding_8859_16,
mbfl_no_encoding_armscii8,
mbfl_no_encoding_cp850,
- mbfl_no_encoding_jis_ms,
mbfl_no_encoding_cp50220,
mbfl_no_encoding_cp50221,
mbfl_no_encoding_cp50222,