diff options
author | Karl Williamson <khw@cpan.org> | 2016-07-26 13:35:18 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2016-08-02 19:31:07 -0600 |
commit | 94762aa0738da6a58c332bc089ca8e8102fe3a77 (patch) | |
tree | 8c31680c6775618a297d1ef0d3d2a93309aa51ac /locale.c | |
parent | 81dd1fd4957b9e8b6a5d7b6415e45a2da26423cc (diff) | |
download | perl-94762aa0738da6a58c332bc089ca8e8102fe3a77.tar.gz |
locale.c: Improve embedded NUL collation handling
When collating a Perl string that contains NUL characters, these are
replaced by the lowest collating non-NUL control. However, if something
goes wrong with the calculation, until this commit \001 is arbitrarily
used. But \001 is likely to be one of the characters that didn't work,
so we shouldn't arbitrarily choose it. Instead, what this commit does
is to repeat the calculation a second time, allowing any character, not
just controls. If that fails, something is seriously wrong, and an
error is returned. This will cause the comparison that called this
function to fall back to using a raw string compare, instead of locale
collation.
Also, previously an empty transformed string was considered an error,
but that actually should be allowable. This commit also changes a
similar finding of the highest collating character to not error on an
empty string, and to return an error if no such character is found.
Also, some comments were deleted that suggested that all this might not
be necessary. But since those were written, real world experience shows
that it is.
And finally, debug outputting the transformation of the found
replacements is deleted, as these come from a recursive call, and the
transformation will have already been output in that call.
Diffstat (limited to 'locale.c')
-rw-r--r-- | locale.c | 87 |
1 files changed, 39 insertions, 48 deletions
@@ -1462,13 +1462,6 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, * less-than-perfect results with that character and NUL. This is * unavoidable unless we replace strxfrm with our own implementation. * - * XXX This code may be overkill. khw wrote it before realizing that if - * you change a NUL into some other character, that that may change the - * strxfrm results if that character is part of a sequence with other - * characters for weight calculations. To minimize the chances of this, - * now the replacement is restricted to another control (likely to be - * \001). But the full generality has been retained. - * * This is one of the few places in the perl core, where we can use * standard functions like strlen() and strcat(). It's because we're * looking for NULs. */ @@ -1476,6 +1469,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, char * e = s + len; char * sans_nuls; STRLEN cur_min_char_len; + int try_non_controls; /* If we don't know what control character sorts lowest for this * locale, find it */ @@ -1489,6 +1483,13 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, prefixed. */ DEBUG_Lv(PerlIO_printf(Perl_debug_log, "Looking to replace NUL\n")); + + /* Unlikely, but it may be that no control will work to replace + * NUL, in which case we instead look for any character */ + for (try_non_controls = 0; + try_non_controls < 2; + try_non_controls++) + { /* Look through all legal code points (NUL isn't) */ for (j = 1; j < 256; j++) { char * x; /* j's xfrm plus collation index */ @@ -1500,7 +1501,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, char cur_source[] = { (char) j, '\0' , '\0' }; if (PL_in_utf8_COLLATE_locale) { - if (! isCNTRL_L1(j)) { + if (! try_non_controls && ! isCNTRL_L1(j)) { continue; } @@ -1511,7 +1512,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, trial_len = 2; } } - else if (! isCNTRL_LC(j)) { + else if (! try_non_controls && ! isCNTRL_LC(j)) { continue; } @@ -1519,11 +1520,8 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, x = _mem_collxfrm(cur_source, trial_len, &x_len, PL_in_utf8_COLLATE_locale); - /* If something went wrong (which it shouldn't), just - * ignore this code point */ - if ( x_len == 0 - || strlen(x + COLLXFRM_HDR_LEN) < x_len) - { + /* Ignore any character that didn't successfully transform */ + if (! x) { continue; } @@ -1546,30 +1544,28 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } } /* end of loop through all bytes */ - /* Unlikely, but possible, if there aren't any controls in the - * locale, arbitrarily use \001 */ - if (cur_min_x == NULL) { - STRLEN x_len; /* temporary */ - cur_min_x = _mem_collxfrm("\001", 1, &x_len, - PL_in_utf8_COLLATE_locale); - /* cur_min_cp was already initialized to 1 */ + if (cur_min_x) { + break; + } + + /* Unlikely, but possible, if there aren't any controls that + * work in the locale, repeat the loop, looking for any + * character that works */ + DEBUG_L(PerlIO_printf(Perl_debug_log, + "_mem_collxfrm: No control worked. Trying non-controls\n")); } - DEBUG_L(PerlIO_printf(Perl_debug_log, - "_mem_collxfrm: lowest collating non-NUL control in the " - "0-255 range in locale %s is 0x%02X\n", - PL_collation_name, - cur_min_cp)); - if (DEBUG_Lv_TEST) { - unsigned i; - PerlIO_printf(Perl_debug_log, "Its xfrm is"); - for (i = 0; i < strlen(cur_min_x + COLLXFRM_HDR_LEN); i ++) { - PerlIO_printf(Perl_debug_log, " %02x", - (U8) *(cur_min_x + COLLXFRM_HDR_LEN + i)); - } - PerlIO_printf(Perl_debug_log, "\n"); + if (! cur_min_x) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "_mem_collxfrm: Couldn't find any character to replace" + " embedded NULs in locale %s with", PL_collation_name)); + goto bad; } + DEBUG_L(PerlIO_printf(Perl_debug_log, + "_mem_collxfrm: Replacing embedded NULs in locale %s with " + "0x%02X\n", PL_collation_name, cur_min_cp)); + Safefree(cur_min_x); } @@ -1668,8 +1664,7 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, /* If something went wrong (which it shouldn't), just * ignore this code point */ - if (x_len == 0) { - Safefree(x); + if (! x) { continue; } @@ -1687,23 +1682,19 @@ Perl__mem_collxfrm(pTHX_ const char *input_string, } } + if (! cur_max_x) { + DEBUG_L(PerlIO_printf(Perl_debug_log, + "_mem_collxfrm: Couldn't find any character to" + " replace above-Latin1 chars in locale %s with", + PL_collation_name)); + goto bad; + } + DEBUG_L(PerlIO_printf(Perl_debug_log, "_mem_collxfrm: highest 1-byte collating character" " in locale %s is 0x%02X\n", PL_collation_name, PL_strxfrm_max_cp)); - if (DEBUG_Lv_TEST) { - unsigned i; - PerlIO_printf(Perl_debug_log, "Its xfrm is "); - for (i = 0; - i < strlen(cur_max_x + COLLXFRM_HDR_LEN); - i++) - { - PerlIO_printf(Perl_debug_log, " %02x", - (U8) cur_max_x[i + COLLXFRM_HDR_LEN]); - } - PerlIO_printf(Perl_debug_log, "\n"); - } Safefree(cur_max_x); } |