diff options
author | Karl Williamson <khw@cpan.org> | 2021-03-12 10:30:53 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2022-11-29 12:53:39 -0700 |
commit | f101e19ae74be4d1d912e4d176a6e7a305bee770 (patch) | |
tree | 8d7bf1779484d1cb8202789d431f653407cb7a80 /ext | |
parent | 4d597d86ca65c369680ced9b942edbde4a6ec0a7 (diff) | |
download | perl-f101e19ae74be4d1d912e4d176a6e7a305bee770.tar.gz |
Fix POSIX::strxfrm()
This commit does two things.
Most simply it extends strxfrm() to handle strings containing NUL
characters. Previously the transformation stopped at the first NUL
encountered.
Second, it combines the implementation of this with the existing
implementation used for the 'cmp' operator, eliminating existing
discrepancies and preventing future ones.
This function takes an SV containing a PV. The encoding of that
PV is based on the current LC_CTYPE locale. It really doesn't
make sense to collate based off of the sequencing of a different locale,
which prior to this commit it would do (but not for 'cmp') if the
LC_COLLATE locale were different.
As an example, consider the string:
my $string = quotemeta join "", map { chr } (1..255);
and with LC_CTYPE=8859-1 (Latin-1, used for several Western European
languages), LC_COLLATE set to ja_JP.utf8. This doesn't make much sense,
outside of specialty uses such as a lazy implementation of a
Japanese/French dictionary, or for quoting snippets in one language in a
document written in the other. ('lazy' because such text should really
be changing locales to the language of the snippet currently being
worked on.) Nevertheless Perl should do something as sensible as
possible, and this commit changes POSIX::strxfrm() to use the method
already in use by the code implementing 'cmp'. Prior to this commit,
POSIX::strxfrm($string) yielded on glibc 12.1:
^\3^\4^\5^\6^\a^\b^\t^\n^\13^\f^\r^\16^\17^\20^\21^\22^\23^\24^\25^\26^\27^\30^\31^\32^\e^\34^\35^\36^\37^ ^!^\"^#^\$^%^&^'^(^)^*^+^,^-^.^/^0^123456789:;^<^=^>^?^\@^A^BCDEFGHIJKLMNOPQRSTUVWXYZ[\\^]^^^_^`a^bcdefghijklmnopqrstuvwxyz{|^}^~^\177^\302\200^\302\201^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3^\3
This is effectively a sorting order, and it is not meant to be human
understandable. But it is clear that most of the characters had the
same weight of 3, so a libc sort would mark them as ties in sorting
order.
And after,
^\3^\4^\5^\6^\a^\b^\t^\n^\13^\f^\r^\16^\17^\20^\21^\22^\23^\24^\25^\26^\27^\30^\31^\32^\e^\34^\35^\36^\37^ ^!^\"^#^\$^%^&^'^(^)^*^+^,^-^.^/^0^123456789:;^<^=^>^?^\@^A^BCDEFGHIJKLMNOPQRSTUVWXYZ[\\^]^^^_^`a^bcdefghijklmnopqrstuvwxyz{|^}^~^\177^\302\200^\302\201^\302\202^\302\203^\302\204^\302\205^\302\206^\302\207^\302\210^\302\211^\302\212^\302\213^\302\214^\302\215^\302\216^\302\217^\3\3^\3\3^\302\220^\302\221^\302\222^\302\223^\302\224^\302\225^\302\226^\302\227^\302\230^\302\231^\302\232^\302\233^\302\234^\302\235^\302\236^\302\237^\3\3^\341\257\211^\304\257^\304\260^\341\257\221^\3\3^\341\257\212^\304\266^\303\255^\341\257\216^\341\257\215^\3\3^\305\225^\3\3^\341\257\217^\341\257\203^\304\251^\304\234^\3\3^\3\3^\303\253^\3\3^\305\260^\3\3^\341\257\200^\3\3^\341\257\214^\3\3^\3\3^\3\3^\3\3^\341\257\213^\341\260\236^\341\260\235^\341\260\240^\341\260\246^\341\260\237^\341\260\245^\341\260\202^\341\260\252^\341\260\256^\341\260\255^\341\260\260^\341\260\257^\341\260\273^\341\260\272^\341\260\275^\341\260\274^\3\3^\341\261\213^\341\261\215^\341\261\214^\341\261\217^\341\261\223^\341\261\216^\304\235^\341\260\211^\341\261\236^\341\261\235^\341\261\240^\341\261\237^\341\261\255^\341\260\214^\341\260\232^\341\261\264^\341\261\263^\341\261\266^\341\261\274^\341\261\265^\341\261\273^\341\260\215^\341\262\200^\341\262\204^\341\262\203^\341\262\206^\341\262\205^\341\262\221^\341\262\220^\341\262\223^\341\262\222^\341\260\217^\341\262\240^\341\262\242^\341\262\241^\341\262\244^\341\262\250^\341\262\243^\304\236^\341\260\230^\341\262\263^\341\262\262^\341\262\265^\341\262\264^\341\263\202^\341\260\234^\341\263\203
which shows that most of the ties have been resolved, and hence the
results are more sensible.
Diffstat (limited to 'ext')
-rw-r--r-- | ext/POSIX/POSIX.xs | 23 | ||||
-rw-r--r-- | ext/POSIX/lib/POSIX.pm | 2 | ||||
-rw-r--r-- | ext/POSIX/lib/POSIX.pod | 11 |
3 files changed, 14 insertions, 22 deletions
diff --git a/ext/POSIX/POSIX.xs b/ext/POSIX/POSIX.xs index cd8447e825..ec409c06fd 100644 --- a/ext/POSIX/POSIX.xs +++ b/ext/POSIX/POSIX.xs @@ -3437,24 +3437,11 @@ void strxfrm(src) SV * src CODE: - { - STRLEN srclen; - STRLEN dstlen; - STRLEN buflen; - char *p = SvPV(src,srclen); - srclen++; - buflen = srclen * 4 + 1; - ST(0) = sv_2mortal(newSV(buflen)); - dstlen = strxfrm(SvPVX(ST(0)), p, (size_t)buflen); - if (dstlen >= buflen) { - dstlen++; - SvGROW(ST(0), dstlen); - strxfrm(SvPVX(ST(0)), p, (size_t)dstlen); - dstlen--; - } - SvCUR_set(ST(0), dstlen); - SvPOK_only(ST(0)); - } +#ifdef USE_LOCALE_COLLATE + ST(0) = Perl_strxfrm(aTHX_ src); +#else + ST(0) = src; +#endif SysRet mkfifo(filename, mode) diff --git a/ext/POSIX/lib/POSIX.pm b/ext/POSIX/lib/POSIX.pm index aabb03cd2e..c99bb35f11 100644 --- a/ext/POSIX/lib/POSIX.pm +++ b/ext/POSIX/lib/POSIX.pm @@ -4,7 +4,7 @@ use warnings; our ($AUTOLOAD, %SIGRT); -our $VERSION = '2.08'; +our $VERSION = '2.09'; require XSLoader; diff --git a/ext/POSIX/lib/POSIX.pod b/ext/POSIX/lib/POSIX.pod index d14f53247e..6ef6dc66d5 100644 --- a/ext/POSIX/lib/POSIX.pod +++ b/ext/POSIX/lib/POSIX.pod @@ -1972,9 +1972,14 @@ Used with C<eq> or C<cmp> as an alternative to C<L</strcoll>>. Not really needed since Perl can do this transparently, see L<perllocale>. -Beware that in a UTF-8 locale, anything you pass to this function must -be in UTF-8; and when not in a UTF-8 locale, anything passed must not be -UTF-8 encoded. +Unlike the libc C<strxfrm>, this allows NUL characters in the input +C<$src>. + +It doesn't make sense for a string to be encoded in one locale (say, +ISO-8859-6, Arabic) and to collate it based on another (like ISO-8859-7, +Greek). Perl assumes that the current C<LC_CTYPE> locale correctly +represents the encoding of C<$src>, and ignores the value of +C<LC_COLLATE>. =item C<sysconf> |