summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pod/perldelta.pod9
-rw-r--r--pod/perldiag.pod11
-rw-r--r--pod/perlrecharclass.pod8
-rw-r--r--regcomp.c47
-rw-r--r--regcomp.h8
-rw-r--r--regexec.c19
-rw-r--r--t/lib/warnings/regexec47
-rw-r--r--t/re/reg_mesg.t2
-rw-r--r--t/re/regex_sets.t41
9 files changed, 174 insertions, 18 deletions
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index d08581a281..aafbd1cfda 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -27,6 +27,15 @@ here, but most should go in the L</Performance Enhancements> section.
[ List each enhancement as a =head2 entry ]
+=head2 C<qr/(?[ ])/> now works in UTF-8 locales
+
+L<Extended Bracketed Character Classes|perlrecharclass/Extended Bracketed Character Classes>
+now will successfully compile when S<C<use locale>> is in effect. The compiled
+pattern will use standard Unicode rules. If the runtime locale is not a
+UTF-8 one, a warning is raised and standard Unicode rules are used
+anyway. No tainting is done since the outcome does not actually depend
+on the locale.
+
=head1 Security
XXX Any security-related notices go here. In particular, any security
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 2effeeb875..918d35c7a9 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -6610,14 +6610,13 @@ is deprecated. See L<perlvar/"$[">.
form if you wish to use an empty line as the terminator of the
here-document.
-=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale
+=item Use of %s for non-UTF-8 locale is wrong. Assuming a UTF-8 locale
(W locale) You are matching a regular expression using locale rules,
-and a Unicode boundary is being matched, but the locale is not a Unicode
-one. This doesn't make sense. Perl will continue, assuming a Unicode
-(UTF-8) locale, but the results could well be wrong except if the locale
-happens to be ISO-8859-1 (Latin1) where this message is spurious and can
-be ignored.
+and the specified construct was encountered. This construct is only
+valid for UTF-8 locales, which the current locale isn't. This doesn't
+make sense. Perl will continue, assuming a Unicode (UTF-8) locale, but
+the results are likely to be wrong.
=item Use of /c modifier is meaningless in s///
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index ce287710d2..f46de4c801 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -1106,8 +1106,12 @@ just three limitations:
=item 1
-This construct cannot be used within the scope of
-C<use locale> (or the C<E<sol>l> regex modifier).
+When compiled within the scope of C<use locale> (or the C<E<sol>l> regex
+modifier), this construct assumes that the execution-time locale will be
+a UTF-8 one, and the generated pattern always uses Unicode rules. What
+gets matched or not thus isn't dependent on the actual runtime locale, so
+tainting is not enabled. But a C<locale> category warning is raised
+if the runtime locale turns out to not be UTF-8.
=item 2
diff --git a/regcomp.c b/regcomp.c
index 7820315429..91e16031c9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13349,14 +13349,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
this function */
const bool save_fold = FOLD; /* Temporary */
char *save_end, *save_parse; /* Temporaries */
+ const bool in_locale = LOC; /* we turn off /l during processing */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
- if (LOC) { /* XXX could make valid in UTF-8 locales */
- vFAIL("(?[...]) not valid in locale");
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
+
RExC_uni_semantics = 1; /* The use of this operator implies /u. This
is required so that the compile time values
are valid in all runtime cases */
@@ -13439,6 +13441,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
nextchar(pRExC_state);
Set_Node_Length(node,
RExC_parse - oregcomp_parse + 1); /* MJD */
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+ }
+
return node;
}
goto no_close;
@@ -14001,9 +14007,36 @@ redo_curchar:
if (!node)
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
PTR2UV(flagp));
+
+ /* Fix up the node type if we are in locale. (We have pretended we are
+ * under /u for the purposes of regclass(), as this construct will only
+ * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
+ * as to cause any warnings about bad locales to be output in regexec.c),
+ * and add the flag that indicates to check if not in a UTF-8 locale. The
+ * reason we above forbid optimization into something other than an ANYOF
+ * node is simply to minimize the number of code changes in regexec.c.
+ * Otherwise we would have to create new EXACTish node types and deal with
+ * them. This decision could be revisited should this construct become
+ * popular.
+ *
+ * (One might think we could look at the resulting ANYOF node and suppress
+ * the flag if everything is above 255, as those would be UTF-8 only,
+ * but this isn't true, as the components that led to that result could
+ * have been locale-affected, and just happen to cancel each other out
+ * under UTF-8 locales.) */
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+
+ assert(OP(node) == ANYOF);
+
+ OP(node) = ANYOFL;
+ ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+ }
+
if (save_fold) {
RExC_flags |= RXf_PMf_FOLD;
}
+
RExC_parse = save_parse + 1;
RExC_end = save_end;
SvREFCNT_dec_NN(final);
@@ -17044,8 +17077,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_
SV* bitmap_invlist; /* Will hold what the bit map contains */
- if (OP(o) == ANYOFL)
- sv_catpvs(sv, "{loc}");
+ if (OP(o) == ANYOFL) {
+ if (flags & ANYOF_LOC_REQ_UTF8) {
+ sv_catpvs(sv, "{utf8-loc}");
+ }
+ else {
+ sv_catpvs(sv, "{loc}");
+ }
+ }
if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
diff --git a/regcomp.h b/regcomp.h
index 7e43908e30..0f2617b022 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -378,7 +378,7 @@ struct regnode_ssc {
* reach this high). */
#define ANYOF_ONLY_HAS_BITMAP ((U32) -1)
-/* Flags for node->flags of ANYOF. These are in short supply, with one
+/* Flags for node->flags of ANYOF. These are in short supply, with none
* currently available. The ABOVE_BITMAP_ALL bit could be freed up
* by resorting to creating a swash containing everything above 255. This
* introduces a performance penalty. An option that wouldn't slow things down
@@ -426,6 +426,9 @@ struct regnode_ssc {
* at compile-time */
#define ANYOF_MATCHES_POSIXL 0x08
+/* Only under /l. If set, none of INVERT, LOC_FOLD, POSIXL,
+ * HAS_NONBITMAP_NON_UTF8_MATCHES can be set */
+#define ANYOF_LOC_REQ_UTF8 0x10
/* Can match something outside the bitmap that isn't in utf8 */
#define ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES 0x20
@@ -452,7 +455,8 @@ struct regnode_ssc {
/* These are the flags that apply to both regular ANYOF nodes and synthetic
* start class nodes during construction of the SSC. During finalization of
* the SSC, other of the flags could be added to it */
-#define ANYOF_COMMON_FLAGS (ANYOF_HAS_UTF8_NONBITMAP_MATCHES)
+#define ANYOF_COMMON_FLAGS ( ANYOF_HAS_UTF8_NONBITMAP_MATCHES \
+ |ANYOF_LOC_REQ_UTF8)
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
diff --git a/regexec.c b/regexec.c
index 78ad2bcada..781bc6bce6 100644
--- a/regexec.c
+++ b/regexec.c
@@ -86,6 +86,9 @@
#include "invlist_inline.h"
#include "unicode_constants.h"
+static const char utf8_locale_required[] =
+ "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
@@ -1822,6 +1825,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
switch (OP(c)) {
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+
/* FALLTHROUGH */
case ANYOFD:
case ANYOF:
@@ -5730,6 +5738,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
case ANYOFL: /* /[abc]/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
+ {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
case ANYOFD: /* /[abc]/d */
case ANYOF: /* /[abc]/ */
@@ -8245,6 +8258,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
}
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
case ANYOFD:
case ANYOF:
@@ -8589,7 +8606,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
- if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
_CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
}
}
diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec
index 750880e4db..1f3b65b167 100644
--- a/t/lib/warnings/regexec
+++ b/t/lib/warnings/regexec
@@ -212,3 +212,50 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at -
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16.
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17.
Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17.
+########
+# NAME (?[ ]) in non-UTF-8 locale
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+no warnings 'experimental::regex_sets';
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, "C");
+"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"K" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"k" =~ /(?[ \N{KELVIN SIGN} ])/i;
+":" =~ /(?[ \: ])/;
+no warnings 'locale';
+EXPECT
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12.
+Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12.
+########
+# NAME (?[ ]) in UTF-8 locale
+require '../loc_tools.pl';
+unless (locales_enabled()) {
+ print("SKIPPED\n# locales not available\n"),exit;
+}
+eval { require POSIX; POSIX->import("locale_h") };
+if ($@) {
+ print("SKIPPED\n# no POSIX\n"),exit;
+}
+my $utf8_locale = find_utf8_ctype_locale();
+unless ($utf8_locale) {
+ print("SKIPPED\n# No UTF-8 locale available\n"),exit;
+}
+no warnings 'experimental::regex_sets';
+use warnings 'locale';
+use locale;
+setlocale(&POSIX::LC_CTYPE, $utf8_locale);
+"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"K" =~ /(?[ \N{KELVIN SIGN} ])/i;
+"k" =~ /(?[ \N{KELVIN SIGN} ])/i;
+":" =~ /(?[ \: ])/;
+EXPECT
diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t
index a0588241f3..d9d9d7437b 100644
--- a/t/re/reg_mesg.t
+++ b/t/re/reg_mesg.t
@@ -205,7 +205,6 @@ my @death =
'/(?[[[:w:]]])/' => "POSIX class [:w:] unknown {#} m/(?[[[:w:]{#}]])/",
'/(?[[:w:]])/' => "POSIX class [:w:] unknown {#} m/(?[[:w:]{#}])/",
'/(?[a])/' => 'Unexpected character {#} m/(?[a{#}])/',
- '/(?[\t])/l' => '(?[...]) not valid in locale {#} m/(?[{#}\t])/',
'/(?[ + \t ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ +{#} \t ])/',
'/(?[ \cK - ( + \t ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ \cK - ( +{#} \t ) ])/',
'/(?[ \cK ( \t ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/(?[ \cK ({#} \t ) ])/',
@@ -410,7 +409,6 @@ my @death_utf8 = mark_as_utf8(
'/ネ(?[[[:ネ:]]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[[:ネ:]{#}]])ネ/",
'/ネ(?[[:ネ:]])ネ/' => "POSIX class [:ネ:] unknown {#} m/ネ(?[[:ネ:]{#}])ネ/",
'/ネ(?[ネ])ネ/' => 'Unexpected character {#} m/ネ(?[ネ{#}])ネ/',
- '/ネ(?[ネ])/l' => '(?[...]) not valid in locale {#} m/ネ(?[{#}ネ])/',
'/ネ(?[ + [ネ] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ +{#} [ネ] ])/',
'/ネ(?[ \cK - ( + [ネ] ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ネ(?[ \cK - ( +{#} [ネ] ) ])/',
'/ネ(?[ \cK ( [ネ] ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/ネ(?[ \cK ({#} [ネ] ) ])/',
diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t
index 48a4f00b8e..c85fde6f2d 100644
--- a/t/re/regex_sets.t
+++ b/t/re/regex_sets.t
@@ -9,7 +9,8 @@ BEGIN {
chdir 't' if -d 't';
@INC = ('../lib','.','../ext/re');
require './test.pl';
- require './test.pl'; require './charset_tools.pl';
+ require './charset_tools.pl';
+ require './loc_tools.pl';
skip_all_without_unicode_tables();
}
@@ -96,6 +97,44 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without /
eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/';
is($@, "", 'qr/(?[ [a] ])/ can be interpolated');
+if (! is_miniperl() && locales_enabled('LC_CTYPE')) {
+ my $utf8_locale = find_utf8_ctype_locale;
+ SKIP: {
+ skip("No utf8 locale available on this platform", 8) unless $utf8_locale;
+
+ setlocale(&POSIX::LC_ALL, "C");
+ use locale;
+
+ $kelvin_fold = qr/(?[ \N{KELVIN SIGN} ])/i;
+ my $single_char_class = qr/(?[ \: ])/;
+
+ setlocale(&POSIX::LC_ALL, $utf8_locale);
+
+ like("\N{KELVIN SIGN}", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches itself under /i in UTF8-locale');
+ like("K", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in UTF8-locale');
+ like("k", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in UTF8-locale');
+ like(":", $single_char_class,
+ '(?[ : ]) matches itself in UTF8-locale (a single character class)');
+
+ setlocale(&POSIX::LC_ALL, "C");
+
+ # These should generate warnings (the above 4 shouldn't), but like()
+ # suppresses them, so the warnings tests are in t/lib/warnings/regexec
+ like("\N{KELVIN SIGN}", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches itself under /i in C locale');
+ like("K", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in C locale');
+ like("k", $kelvin_fold,
+ '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in C locale');
+ like(":", $single_char_class,
+ '(?[ : ]) matches itself in C locale (a single character class)');
+ }
+}
+
+
done_testing();
1;