diff options
author | Nikita Popov <nikita.ppv@gmail.com> | 2017-08-03 21:53:21 +0200 |
---|---|---|
committer | Nikita Popov <nikita.ppv@gmail.com> | 2017-08-03 21:53:59 +0200 |
commit | fb9bf5b64b6c09b9d93bbd1dadd64884e0af66f3 (patch) | |
tree | d11785d671b4d6a35452bcfc5e05adeabfa18fdb | |
parent | a8a9e93e9a902ffd4099e3ba2a7a269da09120c5 (diff) | |
download | php-git-fb9bf5b64b6c09b9d93bbd1dadd64884e0af66f3.tar.gz |
Revert/fix substitution character fallback
The introduced checks were not correct in two respects:
* It was checked whether the source encoding of the string matches
the internal encoding, while the actually relevant encoding is
the *target* encoding.
* Even if the correct encoding is used, the checks are still too
conservative. Just because something is not a "Unicode-encoding"
does not mean that it does not map any non-ASCII characters.
I've reverted the added checks and instead adjusted mbfl_convert
to first try to use the provided substitution character and if
that fails, perform the fallback to '?' at that point. This means
that any codepoint mapped in the target encoding should now be
correctly supported and anything else should fall back to '?'.
-rw-r--r-- | ext/mbstring/libmbfl/mbfl/mbfl_convert.c | 22 | ||||
-rw-r--r-- | ext/mbstring/mbstring.c | 37 | ||||
-rw-r--r-- | ext/mbstring/tests/bug69086.phpt | 6 | ||||
-rw-r--r-- | ext/mbstring/tests/mb_chr.phpt | 2 | ||||
-rw-r--r-- | ext/mbstring/tests/mb_substitute_character_variation2.phpt | 2 |
5 files changed, 30 insertions, 39 deletions
diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c index b553ad5d13..a73a0c80e5 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c @@ -467,14 +467,26 @@ int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char int mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter) { - int mode_backup, ret, n, m, r; + int mode_backup, substchar_backup, ret, n, m, r; ret = 0; + mode_backup = filter->illegal_mode; - filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + substchar_backup = filter->illegal_substchar; + + /* The used substitution character may not be supported by the target character encoding. + * If that happens, first try to use "?" instead and if that also fails, silently drop the + * character. */ + if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR + && filter->illegal_substchar != 0x3f) { + filter->illegal_substchar = 0x3f; + } else { + filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE; + } + switch (mode_backup) { case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR: - ret = (*filter->filter_function)(filter->illegal_substchar, filter); + ret = (*filter->filter_function)(substchar_backup, filter); break; case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG: if (c >= 0) { @@ -560,14 +572,16 @@ mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter) } ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";"); } else { - ret = (*filter->filter_function)(filter->illegal_substchar, filter); + ret = (*filter->filter_function)(substchar_backup, filter); } } break; default: break; } + filter->illegal_mode = mode_backup; + filter->illegal_substchar = substchar_backup; filter->num_illegalchar++; return ret; diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 732b5f65ba..9b9458afc4 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -937,6 +937,7 @@ static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_leng if (convd == NULL) { return -1; } + mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); @@ -3254,29 +3255,9 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter"); return NULL; } - mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); - - if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - - if (php_mb_is_no_encoding_utf8(string.no_encoding)) { - - if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff && - 0xe000 > MBSTRG(current_filter_illegal_substchar) - ) { - mbfl_buffer_converter_illegal_substchar(convd, 0x3f); - } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } - - } else { - mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); - } - } else { - mbfl_buffer_converter_illegal_substchar(convd, 0x3f); - } + mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode)); + mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar)); /* do it */ ret = mbfl_buffer_converter_feed_result(convd, &string, &result); @@ -5199,17 +5180,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len if (php_mb_is_no_encoding_utf8(no_enc)) { if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) { - if (php_mb_is_no_encoding_utf8(MBSTRG(current_internal_encoding)->no_encoding)) { - cp = MBSTRG(current_filter_illegal_substchar); - } else if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) { - if (0xd800 > MBSTRG(current_filter_illegal_substchar) || MBSTRG(current_filter_illegal_substchar) > 0xdfff) { - cp = MBSTRG(current_filter_illegal_substchar); - } else { - cp = 0x3f; - } - } else { - cp = 0x3f; - } + cp = MBSTRG(current_filter_illegal_substchar); } if (cp < 0x80) { diff --git a/ext/mbstring/tests/bug69086.phpt b/ext/mbstring/tests/bug69086.phpt index 921d61cca4..9566e10968 100644 --- a/ext/mbstring/tests/bug69086.phpt +++ b/ext/mbstring/tests/bug69086.phpt @@ -8,7 +8,13 @@ mb_substitute_character(0xfffd); var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP")); mb_internal_encoding("UCS-4BE"); var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8")); + +mb_internal_encoding("UTF-8"); +mb_substitute_character(0xfffd); +var_dump("\u{fffd}" === mb_convert_encoding("\x80", "UTF-8", "EUC-JP-2004")); + ?> --EXPECT-- bool(true) bool(true) +bool(true) diff --git a/ext/mbstring/tests/mb_chr.phpt b/ext/mbstring/tests/mb_chr.phpt index 19e1a704ec..8ec35920c3 100644 --- a/ext/mbstring/tests/mb_chr.phpt +++ b/ext/mbstring/tests/mb_chr.phpt @@ -22,7 +22,7 @@ var_dump( mb_internal_encoding("EUC-JP"); mb_substitute_character(0xa4a2); var_dump( - "?" === mb_chr(0xd800, "UTF-8") + "\u{a4a2}" === mb_chr(0xd800, "UTF-8") ); // Invalid diff --git a/ext/mbstring/tests/mb_substitute_character_variation2.phpt b/ext/mbstring/tests/mb_substitute_character_variation2.phpt index 202561afc7..6248174aa6 100644 --- a/ext/mbstring/tests/mb_substitute_character_variation2.phpt +++ b/ext/mbstring/tests/mb_substitute_character_variation2.phpt @@ -35,5 +35,5 @@ var_dump(bin2hex(mb_convert_encoding($string_mb, "ISO-8859-1", "UTF-8"))); string(14) "3f3f3f3f3f3f3f" string(14) "42424242424242" string(0) "" -string(0) "" +string(14) "3f3f3f3f3f3f3f" ===DONE=== |