-rw-r--r--  embed.fnc          |   6
-rw-r--r--  embed.h            |   5
-rw-r--r--  embedvar.h         |   1
-rw-r--r--  intrpvar.h         |   1
-rw-r--r--  lib/locale.t       |  54
-rw-r--r--  locale.c           | 207
-rw-r--r--  pod/perldelta.pod  |  10
-rw-r--r--  pod/perllocale.pod |  54
-rw-r--r--  proto.h            |   7
-rw-r--r--  sv.c               |   3
10 files changed, 307 insertions(+), 41 deletions(-)
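
As the perldelta and perllocale hunks below describe, this change lets the comparison operators and sort under "use locale" rely on the platform's strcoll()/strxfrm() for UTF-8 locales. A minimal sketch of the user-visible effect, assuming an en_US.UTF-8 locale is installed and the C library collates it sensibly (the locale name and sample words are illustrative, not part of the commit):

    # Illustrative only: assumes en_US.UTF-8 is installed and that the
    # platform's strxfrm()/strcoll() handle it reasonably.
    use strict;
    use warnings;
    use POSIX qw(setlocale LC_COLLATE);
    use locale;                    # lt/gt/cmp/sort now obey LC_COLLATE

    setlocale(LC_COLLATE, 'en_US.UTF-8')
        or die "en_US.UTF-8 locale not available on this system\n";

    my @words = ("zebra", "\x{e9}clair", "apple");
    print join(" ", sort @words), "\n";
    # With a reasonable locale definition this should print
    # "apple \x{e9}clair zebra"; plain code point comparison would
    # instead push "\x{e9}clair" to the end.

For full control over every code point, the pod changes below still recommend Unicode::Collate.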
@@ -910,6 +910,12 @@ Ap |I32 * |markstack_grow p |int |magic_setcollxfrm|NN SV* sv|NN MAGIC* mg : Defined in locale.c, used only in sv.c p |char* |mem_collxfrm |NN const char* input_string|STRLEN len|NN STRLEN* xlen +# if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C) +pM |char* |_mem_collxfrm |NN const char* input_string \ + |STRLEN len \ + |NN STRLEN* xlen \ + |bool utf8 +# endif #endif Afpd |SV* |mess |NN const char* pat|... Apd |SV* |mess_sv |NN SV* basemsg|bool consume @@ -1559,6 +1559,11 @@ #define share_hek_flags(a,b,c,d) S_share_hek_flags(aTHX_ a,b,c,d) #define unshare_hek_or_pvn(a,b,c,d) S_unshare_hek_or_pvn(aTHX_ a,b,c,d) # endif +# if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C) +# if defined(USE_LOCALE_COLLATE) +#define _mem_collxfrm(a,b,c,d) Perl__mem_collxfrm(aTHX_ a,b,c,d) +# endif +# endif # if defined(PERL_IN_MALLOC_C) #define adjust_size_and_find_bucket S_adjust_size_and_find_bucket # endif diff --git a/embedvar.h b/embedvar.h index 67383680f5..c2831d642a 100644 --- a/embedvar.h +++ b/embedvar.h @@ -310,6 +310,7 @@ #define PL_stdingv (vTHX->Istdingv) #define PL_strtab (vTHX->Istrtab) #define PL_strxfrm_is_behaved (vTHX->Istrxfrm_is_behaved) +#define PL_strxfrm_max_cp (vTHX->Istrxfrm_max_cp) #define PL_strxfrm_min_char (vTHX->Istrxfrm_min_char) #define PL_sub_generation (vTHX->Isub_generation) #define PL_subline (vTHX->Isubline) diff --git a/intrpvar.h b/intrpvar.h index f540a9d924..ca1bb718b9 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -567,6 +567,7 @@ PERLVARI(I, collation_ix, U32, 0) /* Collation generation index */ PERLVARA(I, strxfrm_min_char, 3, char) PERLVARI(I, strxfrm_is_behaved, bool, TRUE) /* Assume until proven otherwise that it works */ +PERLVARI(I, strxfrm_max_cp, U8, 0) /* Highest collating cp in locale */ PERLVARI(I, collation_standard, bool, TRUE) /* Assume simple collation */ #endif /* USE_LOCALE_COLLATE */ diff --git a/lib/locale.t b/lib/locale.t index ce0c9879d3..9afa9a40d1 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -1752,16 +1752,66 @@ foreach my $Locale (@Locale) { ++$locales_test_number; $test_names{$locales_test_number} - = 'TODO Verify that strings with embedded NUL collate'; + = 'Verify that strings with embedded NUL collate'; my $ok = "a\0a\0a" lt "a\001a\001a"; report_result($Locale, $locales_test_number, $ok); ++$locales_test_number; $test_names{$locales_test_number} - = 'TODO Verify that strings with embedded NUL and ' + = 'Verify that strings with embedded NUL and ' . 'extra trailing NUL collate'; $ok = "a\0a\0" lt "a\001a\001"; report_result($Locale, $locales_test_number, $ok); + + ++$locales_test_number; + $test_names{$locales_test_number} + = "Skip in non-UTF-8 locales; otherwise verify that UTF8ness " + . "doesn't matter with collation"; + if (! $is_utf8_locale) { + report_result($Locale, $locales_test_number, 1); + } + else { + + # khw can't think of anything better. Start with a string that is + # higher than its UTF-8 representation in both EBCDIC and ASCII + my $string = chr utf8::unicode_to_native(0xff); + my $utf8_string = $string; + utf8::upgrade($utf8_string); + + # 8 should be lt 9 in all locales (except ones that aren't + # ASCII-based, which might fail this) + $ok = ("a${string}8") lt ("a${utf8_string}9"); + report_result($Locale, $locales_test_number, $ok); + } + + ++$locales_test_number; + $test_names{$locales_test_number} + = "Skip in UTF-8 locales; otherwise verify that single byte " + . 
"collates before 0x100 and above"; + if ($is_utf8_locale) { + report_result($Locale, $locales_test_number, 1); + } + else { + my $max_collating = chr 0; # Find byte that collates highest + for my $i (0 .. 255) { + my $char = chr $i; + $max_collating = $char if $char gt $max_collating; + } + $ok = $max_collating lt chr 0x100; + report_result($Locale, $locales_test_number, $ok); + } + + ++$locales_test_number; + $test_names{$locales_test_number} + = "Skip in UTF-8 locales; otherwise verify that 0x100 and " + . "above collate in code point order"; + if ($is_utf8_locale) { + report_result($Locale, $locales_test_number, 1); + } + else { + $ok = chr 0x100 lt chr 0x101; + report_result($Locale, $locales_test_number, $ok); + } } my $ok1; @@ -487,6 +487,7 @@ Perl_new_collate(pTHX_ const char *newcoll) PL_collxfrm_mult = 2; PL_in_utf8_COLLATE_locale = FALSE; *PL_strxfrm_min_char = '\0'; + PL_strxfrm_max_cp = 0; return; } @@ -502,6 +503,7 @@ Perl_new_collate(pTHX_ const char *newcoll) PL_in_utf8_COLLATE_locale = _is_cur_LC_category_utf8(LC_COLLATE); *PL_strxfrm_min_char = '\0'; + PL_strxfrm_max_cp = 0; /* A locale collation definition includes primary, secondary, tertiary, * etc. weights for each character. To sort, the primary weights are @@ -564,7 +566,7 @@ Perl_new_collate(pTHX_ const char *newcoll) char * x_shorter; /* We also transform a substring of 'longer' */ Size_t x_len_shorter; - /* mem_collxfrm() is used get the transformation (though here we + /* _mem_collxfrm() is used get the transformation (though here we * are interested only in its length). It is used because it has * the intelligence to handle all cases, but to work, it needs some * values of 'm' and 'b' to get it started. For the purposes of @@ -576,9 +578,18 @@ Perl_new_collate(pTHX_ const char *newcoll) PL_collxfrm_mult = 5 * sizeof(UV); /* Find out how long the transformation really is */ - x_longer = mem_collxfrm(longer, - sizeof(longer) - 1, - &x_len_longer); + x_longer = _mem_collxfrm(longer, + sizeof(longer) - 1, + &x_len_longer, + + /* We avoid converting to UTF-8 in the + * called function by telling it the + * string is in UTF-8 if the locale is a + * UTF-8 one. Since the string passed + * here is invariant under UTF-8, we can + * claim it's UTF-8 even though it isn't. + * */ + PL_in_utf8_COLLATE_locale); Safefree(x_longer); /* Find out how long the transformation of a substring of 'longer' @@ -586,9 +597,10 @@ Perl_new_collate(pTHX_ const char *newcoll) * sufficient to calculate 'm' and 'b'. The substring is all of * 'longer' except the first character. This minimizes the chances * of being swayed by outliers */ - x_shorter = mem_collxfrm(longer + 1, + x_shorter = _mem_collxfrm(longer + 1, sizeof(longer) - 2, - &x_len_shorter); + &x_len_shorter, + PL_in_utf8_COLLATE_locale); Safefree(x_shorter); /* If the results are nonsensical for this simple test, the whole @@ -1364,29 +1376,44 @@ Perl_init_i18nl10n(pTHX_ int printwarn) #ifdef USE_LOCALE_COLLATE -/* - * mem_collxfrm() is a bit like strxfrm() but with two important - * differences. First, it handles embedded NULs. Second, it allocates - * a bit more memory than needed for the transformed data itself. - * The real transformed data begins at offset sizeof(collationix). - * *xlen is set to the length of that, and doesn't include the collation index - * size. - * Please see sv_collxfrm() to see how this is used. 
- */ +char * +Perl_mem_collxfrm(pTHX_ const char *input_string, STRLEN len, STRLEN *xlen) +{ + /* This function is retained for compatibility in case someone outside core + * is using this (but it is undocumented) */ + + PERL_ARGS_ASSERT_MEM_COLLXFRM; + + return _mem_collxfrm(input_string, len, xlen, FALSE); +} char * -Perl_mem_collxfrm(pTHX_ const char *input_string, - STRLEN len, - STRLEN *xlen +Perl__mem_collxfrm(pTHX_ const char *input_string, + STRLEN len, /* Length of 'input_string' */ + STRLEN *xlen, /* Set to length of returned string + (not including the collation index + prefix) */ + bool utf8 /* Is the input in UTF-8? */ ) { + + /* _mem_collxfrm() is a bit like strxfrm() but with two important + * differences. First, it handles embedded NULs. Second, it allocates a bit + * more memory than needed for the transformed data itself. The real + * transformed data begins at offset sizeof(collationix). *xlen is set to + * the length of that, and doesn't include the collation index size. + * Please see sv_collxfrm() to see how this is used. */ + char * s = (char *) input_string; STRLEN s_strlen = strlen(input_string); char *xbuf = NULL; STRLEN xAlloc, xout; /* xalloc is a reserved word in VC */ bool first_time = TRUE; /* Cleared after first loop iteration */ - PERL_ARGS_ASSERT_MEM_COLLXFRM; + PERL_ARGS_ASSERT__MEM_COLLXFRM; + + /* Must be NUL-terminated */ + assert(*(input_string + len) == '\0'); /* If this locale has defective collation, skip */ if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0) { @@ -1439,7 +1466,9 @@ Perl_mem_collxfrm(pTHX_ const char *input_string, /* If needs to be 2 bytes, find them */ if (! UVCHR_IS_INVARIANT(j)) { - continue; /* Can't handle variants yet */ + char * d = cur_source; + append_utf8_from_native_byte((U8) j, (U8 **) &d); + trial_len = 2; } } else if (! isCNTRL_LC(j)) { @@ -1447,7 +1476,8 @@ Perl_mem_collxfrm(pTHX_ const char *input_string, } /* Then transform it */ - x = mem_collxfrm(cur_source, trial_len, &x_len); + x = _mem_collxfrm(cur_source, trial_len, &x_len, + PL_in_utf8_COLLATE_locale); /* If something went wrong (which it shouldn't), just * ignore this code point */ @@ -1475,7 +1505,8 @@ Perl_mem_collxfrm(pTHX_ const char *input_string, * locale, arbitrarily use \001 */ if (cur_min_x == NULL) { STRLEN x_len; /* temporary */ - cur_min_x = mem_collxfrm("\001", 1, &x_len); + cur_min_x = _mem_collxfrm("\001", 1, &x_len, + PL_in_utf8_COLLATE_locale); /* cur_min_cp was already initialized to 1 */ } @@ -1511,10 +1542,140 @@ Perl_mem_collxfrm(pTHX_ const char *input_string, len = strlen(s); } + /* Make sure the UTF8ness of the string and locale match */ + if (utf8 != PL_in_utf8_COLLATE_locale) { + const char * const t = s; /* Temporary so we can later find where the + input was */ + + /* Here they don't match. Change the string's to be what the locale is + * expecting */ + + if (! utf8) { /* locale is UTF-8, but input isn't; upgrade the input */ + s = (char *) bytes_to_utf8((const U8 *) s, &len); + utf8 = TRUE; + } + else { /* locale is not UTF-8; but input is; downgrade the input */ + + s = (char *) bytes_from_utf8((const U8 *) s, &len, &utf8); + + /* If the downgrade was successful we are done, but if the input + * contains things that require UTF-8 to represent, have to do + * damage control ... 
*/ + if (UNLIKELY(utf8)) { + + /* What we do is construct a non-UTF-8 string with + * 1) the characters representable by a single byte converted + * to be so (if necessary); + * 2) and the rest converted to collate the same as the + * highest collating representable character. That makes + * them collate at the end. This is similar to how we + * handle embedded NULs, but we use the highest collating + * code point instead of the smallest. Like the NUL case, + * this isn't perfect, but is the best we can reasonably + * do. Every above-255 code point will sort the same as + * the highest-sorting 0-255 code point. If that code + * point can combine in a sequence with some other code + * points for weight calculations, us changing something to + * be it can adversely affect the results. But in most + * cases, it should work reasonably. And note that this is + * really an illegal situation: using code points above 255 + * on a locale where only 0-255 are valid. If two strings + * sort entirely equal, then the sort order for the + * above-255 code points will be in code point order. */ + + utf8 = FALSE; + + /* If we haven't calculated the code point with the maximum + * collating order for this locale, do so now */ + if (! PL_strxfrm_max_cp) { + int j; + + /* The current transformed string that collates the + * highest (except it also includes the prefixed collation + * index. */ + char * cur_max_x = NULL; + + /* Look through all legal code points (NUL isn't) */ + for (j = 1; j < 256; j++) { + char * x; + STRLEN x_len; + + /* Create a 1-char string of the current code point. */ + char cur_source[] = { (char) j, '\0' }; + + /* Then transform it */ + x = _mem_collxfrm(cur_source, 1, &x_len, FALSE); + + /* If something went wrong (which it shouldn't), just + * ignore this code point */ + if (x_len == 0) { + Safefree(x); + continue; + } + + /* If this character's transformation is higher than + * the current highest, this one becomes the highest */ + if ( cur_max_x == NULL + || strGT(x + sizeof(PL_collation_ix), + cur_max_x + sizeof(PL_collation_ix))) + { + PL_strxfrm_max_cp = j; + cur_max_x = x; + } + else { + Safefree(x); + } + } + + Safefree(cur_max_x); + } + + /* Here we know which legal code point collates the highest. + * We are ready to construct the non-UTF-8 string. The length + * will be at least 1 byte smaller than the input string + * (because we changed at least one 2-byte character into a + * single byte), but that is eaten up by the trailing NUL */ + Newx(s, len, char); + + { + STRLEN i; + STRLEN d= 0; + + for (i = 0; i < len; i+= UTF8SKIP(t + i)) { + U8 cur_char = t[i]; + if (UTF8_IS_INVARIANT(cur_char)) { + s[d++] = cur_char; + } + else if (UTF8_IS_DOWNGRADEABLE_START(cur_char)) { + s[d++] = EIGHT_BIT_UTF8_TO_NATIVE(cur_char, t[i+1]); + } + else { /* Replace illegal cp with highest collating + one */ + s[d++] = PL_strxfrm_max_cp; + } + } + s[d++] = '\0'; + Renew(s, d, char); /* Free up unused space */ + } + } + } + + /* Here, we have constructed a modified version of the input. It could + * be that we already had a modified copy before we did this version. + * If so, that copy is no longer needed */ + if (t != input_string) { + Safefree(t); + } + } + /* The first element in the output is the collation id, used by * sv_collxfrm(); then comes the space for the transformed string. 
The * equation should give us a good estimate as to how much is needed */ - xAlloc = sizeof(PL_collation_ix) + PL_collxfrm_base + (PL_collxfrm_mult * len) + 1; + xAlloc = sizeof(PL_collation_ix) + + PL_collxfrm_base + + (PL_collxfrm_mult * ((utf8) + ? utf8_length((U8 *) s, (U8 *) s + len) + : len)); Newx(xbuf, xAlloc, char); if (UNLIKELY(! xbuf)) goto bad; diff --git a/pod/perldelta.pod b/pod/perldelta.pod index d334bb8c7b..69a3d53390 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -27,6 +27,16 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 Perl can now do default collation in UTF-8 locales on platforms +that support it + +Some platforms natively do a reasonable job of collating and sorting in +UTF-8 locales. Perl now works with those. For portability and full +control, L<Unicode::Collate> is still recommended, but now you may +not need to do anything special to get good-enough results, depending on +your application. See +L<perllocale/Category C<LC_COLLATE>: Collation: Text Comparisons and Sorting> + =head2 Better locale collation of strings containing embedded C<NUL> characters diff --git a/pod/perllocale.pod b/pod/perllocale.pod index b6778604e3..369f8dc1d1 100644 --- a/pod/perllocale.pod +++ b/pod/perllocale.pod @@ -33,9 +33,11 @@ design deficiencies, and nowadays, there is a series of "UTF-8 locales", based on Unicode. These are locales whose character set is Unicode, encoded in UTF-8. Starting in v5.20, Perl fully supports UTF-8 locales, except for sorting and string comparisons like C<lt> and -C<ge>. (Use L<Unicode::Collate> for these.) Perl continues to support -the old non UTF-8 locales as well. There are currently no UTF-8 locales -for EBCDIC platforms. +C<ge>. Starting in v5.26, Perl can handle these reasonably as well, +depending on the platform's implementation. However, for earlier +releases or for better control, use L<Unicode::Collate> . Perl continues to +support the old non UTF-8 locales as well. There are currently no UTF-8 +locales for EBCDIC platforms. (Unicode is also creating C<CLDR>, the "Common Locale Data Repository", L<http://cldr.unicode.org/> which includes more types of information than @@ -815,10 +817,31 @@ C<$equal_in_locale> will be true if the collation locale specifies a dictionary-like ordering that ignores space characters completely and which folds case. -Perl currently only supports single-byte locales for C<LC_COLLATE>. This means -that a UTF-8 locale likely will just give you machine-native ordering. -Use L<Unicode::Collate> for the full implementation of the Unicode -Collation Algorithm. +Perl uses the platform's C library collation functions C<strcoll()> and +C<strxfrm()>. That means you get whatever they give. On some +platforms, these functions work well on UTF-8 locales, giving +a reasonable default collation for the code points that are important in +that locale. (And if they aren't working well, the problem may only be +that the locale definition is deficient, so can be fixed by using a +better definition file. Unicode's definitions (see L</Freely available +locale definitions>) provide reasonable UTF-8 locale collation +definitions.) Starting in Perl v5.26, Perl's use of these functions has +been made more seamless. This may be sufficient for your needs. For +more control, and to make sure strings containing any code point (not +just the ones important in the locale) collate properly, the +L<Unicode::Collate> module is suggested. 
+ +In non-UTF-8 locales (hence single byte), code points above 0xFF are +technically invalid. But if present, again starting in v5.26, they will +collate to the same position as the highest valid code point does. This +generally gives good results, but the collation order may be skewed if +the valid code point gets special treatment when it forms particular +sequences with other characters as defined by the locale. +When two strings collate identically, the code point order is used as a +tie breaker. + +If Perl detects that there are problems with the locale collation order, +it reverts to using non-locale collation rules for that locale. If Perl detects that there are problems with the locale collation order, it reverts to using non-locale collation rules for that locale. @@ -1417,9 +1440,12 @@ into bankers, bikers, gamers, and so on. The support of Unicode is new starting from Perl version v5.6, and more fully implemented in versions v5.8 and later. See L<perluniintro>. -Starting in Perl v5.20, UTF-8 locales are supported in Perl, except for -C<LC_COLLATE> (use L<Unicode::Collate> instead). If you have Perl v5.16 -or v5.18 and can't upgrade, you can use +Starting in Perl v5.20, UTF-8 locales are supported in Perl, except +C<LC_COLLATE> is only partially supported; collation support is improved +in Perl v5.26 to a level that may be sufficient for your needs +(see L</Category C<LC_COLLATE>: Collation: Text Comparisons and Sorting>). + +If you have Perl v5.16 or v5.18 and can't upgrade, you can use use locale ':not_characters'; @@ -1445,10 +1471,7 @@ command line switch. This form of the pragma allows essentially seamless handling of locales with Unicode. The collation order will be by Unicode code point order. -It is strongly -recommended that when you need to order and sort strings that you use -the standard module L<Unicode::Collate> which gives much better results -in many instances than you can get with the old-style locale handling. +L<Unicode::Collate> can be used to get Unicode rules collation. All the modules and switches just described can be used in v5.20 with just plain C<use locale>, and, should the input locales not be UTF-8, @@ -1564,7 +1587,8 @@ consistently to regular expression matching except for bracketed character classes; in v5.14 it was extended to all regex matches; and in v5.16 to the casing operations such as C<\L> and C<uc()>. For collation, in all releases so far, the system's C<strxfrm()> function is -called, and whatever it does is what you get. +called, and whatever it does is what you get. Starting in v5.26, various +bugs are fixed with the way perl uses this function. =head1 BUGS @@ -4430,6 +4430,13 @@ PERL_CALLCONV SV* Perl_hfree_next_entry(pTHX_ HV *hv, STRLEN *indexp); #define PERL_ARGS_ASSERT_HFREE_NEXT_ENTRY \ assert(hv); assert(indexp) #endif +#if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C) +# if defined(USE_LOCALE_COLLATE) +PERL_CALLCONV char* Perl__mem_collxfrm(pTHX_ const char* input_string, STRLEN len, STRLEN* xlen, bool utf8); +#define PERL_ARGS_ASSERT__MEM_COLLXFRM \ + assert(input_string); assert(xlen) +# endif +#endif #if defined(PERL_IN_MALLOC_C) STATIC int S_adjust_size_and_find_bucket(size_t *nbytes_p); #define PERL_ARGS_ASSERT_ADJUST_SIZE_AND_FIND_BUCKET \ @@ -8152,7 +8152,7 @@ Perl_sv_collxfrm_flags(pTHX_ SV *const sv, STRLEN *const nxp, const I32 flags) Safefree(mg->mg_ptr); s = SvPV_flags_const(sv, len, flags); - if ((xf = mem_collxfrm(s, len, &xlen))) { + if ((xf = _mem_collxfrm(s, len, &xlen, cBOOL(SvUTF8(sv))))) { if (! 
mg) { mg = sv_magicext(sv, 0, PERL_MAGIC_collxfrm, &PL_vtbl_collxfrm, 0, 0); @@ -14779,6 +14779,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_collation_standard = proto_perl->Icollation_standard; PL_collxfrm_base = proto_perl->Icollxfrm_base; PL_collxfrm_mult = proto_perl->Icollxfrm_mult; + PL_strxfrm_max_cp = proto_perl->Istrxfrm_max_cp; #endif /* USE_LOCALE_COLLATE */ #ifdef USE_LOCALE_NUMERIC |
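
The perllocale hunk above also documents the fallback for single-byte locales: code points above 0xFF, while not really valid there, now collate like the highest-collating 0..255 character, with code point order as the tie breaker. A small sketch of that behavior, mirroring the new lib/locale.t tests; the en_US.ISO8859-1 locale name is an assumption and must exist on the system for the sketch to run:

    # Illustrative only: assumes an en_US.ISO8859-1 (single-byte) locale
    # exists.  Mirrors the new tests added to lib/locale.t.
    use strict;
    use warnings;
    no warnings 'locale';          # later perls may warn about the wide chars
    use POSIX qw(setlocale LC_COLLATE);
    use locale;

    setlocale(LC_COLLATE, 'en_US.ISO8859-1')
        or die "en_US.ISO8859-1 locale not available on this system\n";

    # Every code point above 0xFF collates like the highest-collating
    # single-byte character, so it sorts at or after all of them ...
    print "above-255 sorts last\n" if "\x{100}" gt "z";

    # ... and when two strings otherwise collate identically, code point
    # order breaks the tie.
    print "tie broken by code point order\n" if "\x{100}" lt "\x{101}";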