diff options
-rw-r--r-- | embed.fnc | 2 | ||||
-rw-r--r-- | embedvar.h | 1 | ||||
-rw-r--r-- | intrpvar.h | 1 | ||||
-rw-r--r-- | lib/locale.t | 15 | ||||
-rw-r--r-- | locale.c | 151 | ||||
-rw-r--r-- | pod/perldelta.pod | 8 | ||||
-rw-r--r-- | pod/perllocale.pod | 15 | ||||
-rw-r--r-- | proto.h | 4 | ||||
-rw-r--r-- | t/porting/libperl.t | 7 |
9 files changed, 180 insertions, 24 deletions
@@ -909,7 +909,7 @@ Ap |I32 * |markstack_grow #if defined(USE_LOCALE_COLLATE) p |int |magic_setcollxfrm|NN SV* sv|NN MAGIC* mg : Defined in locale.c, used only in sv.c -p |char* |mem_collxfrm |NN const char* s|STRLEN len|NN STRLEN* xlen +p |char* |mem_collxfrm |NN const char* input_string|STRLEN len|NN STRLEN* xlen #endif Afpd |SV* |mess |NN const char* pat|... Apd |SV* |mess_sv |NN SV* basemsg|bool consume diff --git a/embedvar.h b/embedvar.h index bd151934fd..794ed9a31e 100644 --- a/embedvar.h +++ b/embedvar.h @@ -309,6 +309,7 @@ #define PL_stderrgv (vTHX->Istderrgv) #define PL_stdingv (vTHX->Istdingv) #define PL_strtab (vTHX->Istrtab) +#define PL_strxfrm_min_char (vTHX->Istrxfrm_min_char) #define PL_sub_generation (vTHX->Isub_generation) #define PL_subline (vTHX->Isubline) #define PL_subname (vTHX->Isubname) diff --git a/intrpvar.h b/intrpvar.h index 9366383221..42872e5ba2 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -564,6 +564,7 @@ PERLVAR(I, collation_name, char *) /* Name of current collation */ PERLVAR(I, collxfrm_base, Size_t) /* Basic overhead in *xfrm() */ PERLVARI(I, collxfrm_mult,Size_t, 2) /* Expansion factor in *xfrm() */ PERLVARI(I, collation_ix, U32, 0) /* Collation generation index */ +PERLVARA(I, strxfrm_min_char, 3, char) PERLVARI(I, collation_standard, bool, TRUE) /* Assume simple collation */ #endif /* USE_LOCALE_COLLATE */ diff --git a/lib/locale.t b/lib/locale.t index ddb5d795d9..ce0c9879d3 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -1740,7 +1740,7 @@ foreach my $Locale (@Locale) { ++$locales_test_number; $test_names{$locales_test_number} - = 'TODO Skip in locales where \001 has primary sorting weight; ' + = 'Skip in locales where \001 has primary sorting weight; ' . 'otherwise verify that \0 doesn\'t have primary sorting weight'; if ("a\001c" lt "ab") { report_result($Locale, $locales_test_number, 1); @@ -1749,6 +1749,19 @@ foreach my $Locale (@Locale) { my $ok = "ab" lt "a\0c"; report_result($Locale, $locales_test_number, $ok); } + + ++$locales_test_number; + $test_names{$locales_test_number} + = 'TODO Verify that strings with embedded NUL collate'; + my $ok = "a\0a\0a" lt "a\001a\001a"; + report_result($Locale, $locales_test_number, $ok); + + ++$locales_test_number; + $test_names{$locales_test_number} + = 'TODO Verify that strings with embedded NUL and ' + . 'extra trailing NUL collate'; + $ok = "a\0a\0" lt "a\001a\001"; + report_result($Locale, $locales_test_number, $ok); } my $ok1; @@ -486,6 +486,7 @@ Perl_new_collate(pTHX_ const char *newcoll) PL_collxfrm_base = 0; PL_collxfrm_mult = 2; PL_in_utf8_COLLATE_locale = FALSE; + *PL_strxfrm_min_char = '\0'; return; } @@ -500,6 +501,7 @@ Perl_new_collate(pTHX_ const char *newcoll) } PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE); + *PL_strxfrm_min_char = '\0'; /* A locale collation definition includes primary, secondary, tertiary, * etc. weights for each character. To sort, the primary weights are @@ -1295,13 +1297,136 @@ Perl_init_i18nl10n(pTHX_ int printwarn) */ char * -Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen) +Perl_mem_collxfrm(pTHX_ const char *input_string, + STRLEN len, + STRLEN *xlen + ) { + char * s = (char *) input_string; + STRLEN s_strlen = strlen(input_string); char *xbuf; - STRLEN xAlloc, xin, xout; /* xalloc is a reserved word in VC */ + STRLEN xAlloc, xout; /* xalloc is a reserved word in VC */ PERL_ARGS_ASSERT_MEM_COLLXFRM; + /* Replace any embedded NULs with the control that sorts before any others. + * This will give as good as possible results on strings that don't + * otherwise contain that character, but otherwise there may be + * less-than-perfect results with that character and NUL. This is + * unavoidable unless we replace strxfrm with our own implementation. + * + * XXX This code may be overkill. khw wrote it before realizing that if + * you change a NUL into some other character, that that may change the + * strxfrm results if that character is part of a sequence with other + * characters for weight calculations. To minimize the chances of this, + * now the replacement is restricted to another control (likely to be + * \001). But the full generality has been retained. + * + * This is one of the few places in the perl core, where we can use + * standard functions like strlen() and strcat(). It's because we're + * looking for NULs. */ + if (s_strlen < len) { + char * e = s + len; + char * sans_nuls; + STRLEN cur_min_char_len; + + /* If we don't know what control character sorts lowest for this + * locale, find it */ + if (*PL_strxfrm_min_char == '\0') { + int j; + char * cur_min_x = NULL; /* Cur cp's xfrm, (except it also + includes the collation index + prefixed. */ + + /* Look through all legal code points (NUL isn't) */ + for (j = 1; j < 256; j++) { + char * x; /* j's xfrm plus collation index */ + STRLEN x_len; /* length of 'x' */ + STRLEN trial_len = 1; + + /* Create a 1 byte string of the current code point, but with + * room to be 2 bytes */ + char cur_source[] = { (char) j, '\0' , '\0' }; + + if (PL_in_utf8_COLLATE_locale) { + if (! isCNTRL_L1(j)) { + continue; + } + + /* If needs to be 2 bytes, find them */ + if (! UVCHR_IS_INVARIANT(j)) { + continue; /* Can't handle variants yet */ + } + } + else if (! isCNTRL_LC(j)) { + continue; + } + + /* Then transform it */ + x = mem_collxfrm(cur_source, trial_len, &x_len); + + /* If something went wrong (which it shouldn't), just + * ignore this code point */ + if ( x_len == 0 + || strlen(x + sizeof(PL_collation_ix)) < x_len) + { + continue; + } + + /* If this character's transformation is lower than + * the current lowest, this one becomes the lowest */ + if ( cur_min_x == NULL + || strLT(x + sizeof(PL_collation_ix), + cur_min_x + sizeof(PL_collation_ix))) + { + strcpy(PL_strxfrm_min_char, cur_source); + cur_min_x = x; + } + else { + Safefree(x); + } + } /* end of loop through all bytes */ + + /* Unlikely, but possible, if there aren't any controls in the + * locale, arbitrarily use \001 */ + if (cur_min_x == NULL) { + STRLEN x_len; /* temporary */ + cur_min_x = mem_collxfrm("\001", 1, &x_len); + /* cur_min_cp was already initialized to 1 */ + } + + Safefree(cur_min_x); + } + + /* The worst case length for the replaced string would be if every + * character in it is NUL. Multiply that by the length of each + * replacement, and allow for a trailing NUL */ + cur_min_char_len = strlen(PL_strxfrm_min_char); + Newx(sans_nuls, (len * cur_min_char_len) + 1, char); + *sans_nuls = '\0'; + + + /* Replace each NUL with the lowest collating control. Loop until have + * exhausted all the NULs */ + while (s + s_strlen < e) { + strcat(sans_nuls, s); + + /* Do the actual replacement */ + strcat(sans_nuls, PL_strxfrm_min_char); + + /* Move past the input NUL */ + s += s_strlen + 1; + s_strlen = strlen(s); + } + + /* And add anything that trails the final NUL */ + strcat(sans_nuls, s); + + /* Switch so below we transform this modified string */ + s = sans_nuls; + len = strlen(s); + } + /* The first element in the output is the collation id, used by * sv_collxfrm(); then comes the space for the transformed string. The * equation should give us a good estimate as to how much is needed */ @@ -1316,17 +1441,16 @@ Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen) /* Then the transformation of the input. We loop until successful, or we * give up */ - for (xin = 0; xin < len; ) { - Size_t xused; - for (;;) { - xused = strxfrm(xbuf + xout, s + xin, xAlloc - xout); + STRLEN xused = strxfrm(xbuf + xout, s, xAlloc - xout); /* If the transformed string occupies less space than we told * strxfrm() was available, it means it successfully transformed * the whole string. */ - if ((STRLEN)xused < xAlloc - xout) + if (xused < xAlloc - xout) { + xout += xused; break; + } if (UNLIKELY(xused >= PERL_INT_MAX)) goto bad; @@ -1340,19 +1464,20 @@ Perl_mem_collxfrm(pTHX_ const char *s, STRLEN len, STRLEN *xlen) goto bad; } - xin += strlen(s + xin) + 1; - xout += xused; + *xlen = xout - sizeof(PL_collation_ix); + - /* Embedded NULs are understood but silently skipped - * because they make no sense in locale collation. */ + if (s != input_string) { + Safefree(s); } - xbuf[xout] = '\0'; - *xlen = xout - sizeof(PL_collation_ix); return xbuf; bad: Safefree(xbuf); + if (s != input_string) { + Safefree(s); + } *xlen = 0; return NULL; } diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 76f4972c08..11a089b989 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -27,6 +27,14 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 Better locale collation of strings containing embedded C<NUL> +characters + +In locales that have multi-level character weights, these are now +ignored at the higher priority ones. There are still some gotchas in +some strings, though. See +L<perllocale/Collation of strings containing embedded C<NUL> characters>. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perllocale.pod b/pod/perllocale.pod index 0c7e769111..d842a0781a 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -1567,13 +1567,14 @@ called, and whatever it does is what you get. =head2 Collation of strings containing embedded C<NUL> characters -Perl handles C<NUL> characters in the middle of strings. In many -locales, control characters are ignored unless the strings otherwise -compare equal. Unlike other control characters, C<NUL> characters are -never ignored. For example, if given that C<"b"> sorts after -C<"\001">, and C<"c"> sorts after C<"b">, C<"a\0c"> always sorts before -C<"ab">. This is true even in locales in which C<"ab"> sorts before -C<"a\001c">. +C<NUL> characters will sort the same as the lowest collating control +character does, or to C<"\001"> in the unlikely event that there are no +control characters at all in the locale. In cases where the strings +don't contain this non-C<NUL> control, the results will be correct, and +in many locales, this control, whatever it might be, will rarely be +encountered. But there are cases where a C<NUL> should sort before this +control, but doesn't. If two strings do collate identically, the one +containing the C<NUL> will sort to earlier. =head2 Broken systems @@ -5785,9 +5785,9 @@ STATIC char* S_stdize_locale(pTHX_ char* locs); PERL_CALLCONV int Perl_magic_setcollxfrm(pTHX_ SV* sv, MAGIC* mg); #define PERL_ARGS_ASSERT_MAGIC_SETCOLLXFRM \ assert(sv); assert(mg) -PERL_CALLCONV char* Perl_mem_collxfrm(pTHX_ const char* s, STRLEN len, STRLEN* xlen); +PERL_CALLCONV char* Perl_mem_collxfrm(pTHX_ const char* input_string, STRLEN len, STRLEN* xlen); #define PERL_ARGS_ASSERT_MEM_COLLXFRM \ - assert(s); assert(xlen) + assert(input_string); assert(xlen) /* PERL_CALLCONV char* sv_collxfrm(pTHX_ SV *const sv, STRLEN *const nxp); */ PERL_CALLCONV char* Perl_sv_collxfrm_flags(pTHX_ SV *const sv, STRLEN *const nxp, I32 const flags); #define PERL_ARGS_ASSERT_SV_COLLXFRM_FLAGS \ diff --git a/t/porting/libperl.t b/t/porting/libperl.t index 00f2606738..161f7162d3 100644 --- a/t/porting/libperl.t +++ b/t/porting/libperl.t @@ -527,6 +527,13 @@ for my $symbol (sort keys %unexpected) { SKIP: { skip("uses sprintf for Gconvert in sv.o"); } + } + elsif ( $symbol eq 'strcat' + && @o == 1 && $o[0] eq 'locale.o') + { + SKIP: { + skip("locale.o legitimately uses strcat"); + } } else { is(@o, 0, "uses no $symbol (@o)"); } |