diff options
-rw-r--r-- | pod/perldelta.pod | 9 | ||||
-rw-r--r-- | pod/perldiag.pod | 11 | ||||
-rw-r--r-- | pod/perlrecharclass.pod | 8 | ||||
-rw-r--r-- | regcomp.c | 47 | ||||
-rw-r--r-- | regcomp.h | 8 | ||||
-rw-r--r-- | regexec.c | 19 | ||||
-rw-r--r-- | t/lib/warnings/regexec | 47 | ||||
-rw-r--r-- | t/re/reg_mesg.t | 2 | ||||
-rw-r--r-- | t/re/regex_sets.t | 41 |
9 files changed, 174 insertions, 18 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod index d08581a281..aafbd1cfda 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -27,6 +27,15 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 C<qr/(?[ ])/> now works in UTF-8 locales + +L<Extended Bracketed Character Classes|perlrecharclass/Extended Bracketed Character Classes> +now will successfully compile when S<C<use locale>> is in effect. The compiled +pattern will use standard Unicode rules. If the runtime locale is not a +UTF-8 one, a warning is raised and standard Unicode rules are used +anyway. No tainting is done since the outcome does not actually depend +on the locale. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 2effeeb875..918d35c7a9 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -6610,14 +6610,13 @@ is deprecated. See L<perlvar/"$[">. form if you wish to use an empty line as the terminator of the here-document. -=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale +=item Use of %s for non-UTF-8 locale is wrong. Assuming a UTF-8 locale (W locale) You are matching a regular expression using locale rules, -and a Unicode boundary is being matched, but the locale is not a Unicode -one. This doesn't make sense. Perl will continue, assuming a Unicode -(UTF-8) locale, but the results could well be wrong except if the locale -happens to be ISO-8859-1 (Latin1) where this message is spurious and can -be ignored. +and the specified construct was encountered. This construct is only +valid for UTF-8 locales, which the current locale isn't. This doesn't +make sense. Perl will continue, assuming a Unicode (UTF-8) locale, but +the results are likely to be wrong. =item Use of /c modifier is meaningless in s/// diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index ce287710d2..f46de4c801 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -1106,8 +1106,12 @@ just three limitations: =item 1 -This construct cannot be used within the scope of -C<use locale> (or the C<E<sol>l> regex modifier). +When compiled within the scope of C<use locale> (or the C<E<sol>l> regex +modifier), this construct assumes that the execution-time locale will be +a UTF-8 one, and the generated pattern always uses Unicode rules. What +gets matched or not thus isn't dependent on the actual runtime locale, so +tainting is not enabled. But a C<locale> category warning is raised +if the runtime locale turns out to not be UTF-8. =item 2 @@ -13349,14 +13349,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, this function */ const bool save_fold = FOLD; /* Temporary */ char *save_end, *save_parse; /* Temporaries */ + const bool in_locale = LOC; /* we turn off /l during processing */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { /* XXX could make valid in UTF-8 locales */ - vFAIL("(?[...]) not valid in locale"); + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } + RExC_uni_semantics = 1; /* The use of this operator implies /u. This is required so that the compile time values are valid in all runtime cases */ @@ -13439,6 +13441,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + } + return node; } goto no_close; @@ -14001,9 +14007,36 @@ redo_curchar: if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); + + /* Fix up the node type if we are in locale. (We have pretended we are + * under /u for the purposes of regclass(), as this construct will only + * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so + * as to cause any warnings about bad locales to be output in regexec.c), + * and add the flag that indicates to check if not in a UTF-8 locale. The + * reason we above forbid optimization into something other than an ANYOF + * node is simply to minimize the number of code changes in regexec.c. + * Otherwise we would have to create new EXACTish node types and deal with + * them. This decision could be revisited should this construct become + * popular. + * + * (One might think we could look at the resulting ANYOF node and suppress + * the flag if everything is above 255, as those would be UTF-8 only, + * but this isn't true, as the components that led to that result could + * have been locale-affected, and just happen to cancel each other out + * under UTF-8 locales.) */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + + assert(OP(node) == ANYOF); + + OP(node) = ANYOFL; + ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8; + } + if (save_fold) { RExC_flags |= RXf_PMf_FOLD; } + RExC_parse = save_parse + 1; RExC_end = save_end; SvREFCNT_dec_NN(final); @@ -17044,8 +17077,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV* bitmap_invlist; /* Will hold what the bit map contains */ - if (OP(o) == ANYOFL) - sv_catpvs(sv, "{loc}"); + if (OP(o) == ANYOFL) { + if (flags & ANYOF_LOC_REQ_UTF8) { + sv_catpvs(sv, "{utf8-loc}"); + } + else { + sv_catpvs(sv, "{loc}"); + } + } if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); @@ -378,7 +378,7 @@ struct regnode_ssc { * reach this high). */ #define ANYOF_ONLY_HAS_BITMAP ((U32) -1) -/* Flags for node->flags of ANYOF. These are in short supply, with one +/* Flags for node->flags of ANYOF. These are in short supply, with none * currently available. The ABOVE_BITMAP_ALL bit could be freed up * by resorting to creating a swash containing everything above 255. This * introduces a performance penalty. An option that wouldn't slow things down @@ -426,6 +426,9 @@ struct regnode_ssc { * at compile-time */ #define ANYOF_MATCHES_POSIXL 0x08 +/* Only under /l. If set, none of INVERT, LOC_FOLD, POSIXL, + * HAS_NONBITMAP_NON_UTF8_MATCHES can be set */ +#define ANYOF_LOC_REQ_UTF8 0x10 /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES 0x20 @@ -452,7 +455,8 @@ struct regnode_ssc { /* These are the flags that apply to both regular ANYOF nodes and synthetic * start class nodes during construction of the SSC. During finalization of * the SSC, other of the flags could be added to it */ -#define ANYOF_COMMON_FLAGS (ANYOF_HAS_UTF8_NONBITMAP_MATCHES) +#define ANYOF_COMMON_FLAGS ( ANYOF_HAS_UTF8_NONBITMAP_MATCHES \ + |ANYOF_LOC_REQ_UTF8) /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ @@ -86,6 +86,9 @@ #include "invlist_inline.h" #include "unicode_constants.h" +static const char utf8_locale_required[] = + "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; + #ifdef DEBUGGING /* At least one required character in the target string is expressible only in * UTF-8. */ @@ -1822,6 +1825,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, switch (OP(c)) { case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } + /* FALLTHROUGH */ case ANYOFD: case ANYOF: @@ -5730,6 +5738,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFL: /* /[abc]/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) + { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } /* FALLTHROUGH */ case ANYOFD: /* /[abc]/d */ case ANYOF: /* /[abc]/ */ @@ -8245,6 +8258,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } /* FALLTHROUGH */ case ANYOFD: case ANYOF: @@ -8589,7 +8606,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); - if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) { + if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); } } diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 750880e4db..1f3b65b167 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -212,3 +212,50 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. +######## +# NAME (?[ ]) in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +no warnings 'locale'; +EXPECT +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +######## +# NAME (?[ ]) in UTF-8 locale +require '../loc_tools.pl'; +unless (locales_enabled()) { + print("SKIPPED\n# locales not available\n"),exit; +} +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +my $utf8_locale = find_utf8_ctype_locale(); +unless ($utf8_locale) { + print("SKIPPED\n# No UTF-8 locale available\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, $utf8_locale); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +EXPECT diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index a0588241f3..d9d9d7437b 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -205,7 +205,6 @@ my @death = '/(?[[[:w:]]])/' => "POSIX class [:w:] unknown {#} m/(?[[[:w:]{#}]])/", '/(?[[:w:]])/' => "POSIX class [:w:] unknown {#} m/(?[[:w:]{#}])/", '/(?[a])/' => 'Unexpected character {#} m/(?[a{#}])/', - '/(?[\t])/l' => '(?[...]) not valid in locale {#} m/(?[{#}\t])/', '/(?[ + \t ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ +{#} \t ])/', '/(?[ \cK - ( + \t ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ \cK - ( +{#} \t ) ])/', '/(?[ \cK ( \t ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/(?[ \cK ({#} \t ) ])/', @@ -410,7 +409,6 @@ my @death_utf8 = mark_as_utf8( '/ネ(?[[[:ネ:]]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[[:ネ:]{#}]])ネ/", '/ネ(?[[:ネ:]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[:ネ:]{#}])ネ/", '/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/', - '/ネ(?[ネ])/l' => '(?[...]) not valid in locale {#} m/ネ(?[{#}ネ])/', '/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/', '/ネ(?[ \cK - ( + [ネ] ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ \cK - ( +{#} [ネ] ) ])/', '/ネ(?[ \cK ( [ネ] ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/ネ(?[ \cK ({#} [ネ] ) ])/', diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index 48a4f00b8e..c85fde6f2d 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -9,7 +9,8 @@ BEGIN { chdir 't' if -d 't'; @INC = ('../lib','.','../ext/re'); require './test.pl'; - require './test.pl'; require './charset_tools.pl'; + require './charset_tools.pl'; + require './loc_tools.pl'; skip_all_without_unicode_tables(); } @@ -96,6 +97,44 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without / eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/'; is($@, "", 'qr/(?[ [a] ])/ can be interpolated'); +if (! is_miniperl() && locales_enabled('LC_CTYPE')) { + my $utf8_locale = find_utf8_ctype_locale; + SKIP: { + skip("No utf8 locale available on this platform", 8) unless $utf8_locale; + + setlocale(&POSIX::LC_ALL, "C"); + use locale; + + $kelvin_fold = qr/(?[ \N{KELVIN SIGN} ])/i; + my $single_char_class = qr/(?[ \: ])/; + + setlocale(&POSIX::LC_ALL, $utf8_locale); + + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in UTF8-locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in UTF8-locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in UTF8-locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in UTF8-locale (a single character class)'); + + setlocale(&POSIX::LC_ALL, "C"); + + # These should generate warnings (the above 4 shouldn't), but like() + # suppresses them, so the warnings tests are in t/lib/warnings/regexec + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in C locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in C locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in C locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in C locale (a single character class)'); + } +} + + done_testing(); 1; |