summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/unicore/mktables3
-rw-r--r--lib/unicore/uni_keywords.pl2
-rw-r--r--pod/perldelta.pod14
-rw-r--r--pod/perlunicode.pod4
-rw-r--r--regcharclass.h2
-rw-r--r--regcomp.c86
-rw-r--r--t/re/regexp_unicode_prop.t10
-rw-r--r--uni_keywords.h2
9 files changed, 104 insertions, 21 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 87cd5938fc..e81fae56ca 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -395174,7 +395174,7 @@ static const U8 WB_table[23][23] = {
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
+ * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 645365628d..6be1f41ee4 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -17075,6 +17075,9 @@ other two shortcuts, and Unicode continues to define new properties that begin
with C<"In">, so it's quite possible that a conflict will occur in the future.
The compound form is guaranteed to not become obsolete, and its meaning is
clearer anyway. See L<perlunicode/"Blocks"> for more information about this.
+
+User-defined properties must begin with "In" or "Is". These override any
+Unicode property of the same name.
END
}
my $text = $Is_flags_text;
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index afa9f6496f..a665a9cc9f 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1260,7 +1260,7 @@
# 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
# 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
# 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
-# 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
+# 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
# a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
# e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 7ce4cf8f01..23d3fe7656 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -45,6 +45,20 @@ XXX For a release on a stable branch, this section aspires to be:
[ List each incompatible change as a =head2 entry ]
+=head2 C<\p{I<user-defined>}> properties now always override official
+Unicode ones
+
+Previously, if and only if a user-defined property was declared prior to
+the compilation of the regular expression pattern containing it, its
+definition was used instead of any official Unicode property with the
+same name. Now, it always overrides the official property. This
+change could break existing code that relied (likely unwittingly) on the
+previous behavior. Without this fix, if Unicode released a new version
+with a new property that happens to have the same name as the one you
+had long been using, your program would break when you upgraded to a
+perl that used that new Unicode version. See L<perlunicode/User-Defined
+Character Properties>. [GH #17205]
+
=head1 Deprecations
XXX Any deprecated features, syntax, modules etc. should be listed here.
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index e048df6df6..59058225b7 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1066,7 +1066,9 @@ You can define your own binary character properties by defining subroutines
whose names begin with C<"In"> or C<"Is">. (The experimental feature
L<perlre/(?[ ])> provides an alternative which allows more complex
definitions.) The subroutines can be defined in any
-package. The user-defined properties can be used in the regular expression
+package. They override any Unicode properties that have the same
+names. The user-defined properties can be used in the regular
+expression
C<\p{}> and C<\P{}> constructs; if you are using a user-defined property from a
package other than the one you are in, you must specify its package in the
C<\p{}> or C<\P{}> construct.
diff --git a/regcharclass.h b/regcharclass.h
index cb259247ea..1bd317c28d 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -2245,7 +2245,7 @@
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
+ * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
diff --git a/regcomp.c b/regcomp.c
index 32023239e2..028fd061b1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -426,6 +426,14 @@ struct RExC_state_t {
#define _invlist_intersection_complement_2nd(a, b, output) \
_invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
+/* We add a marker if we are deferring expansion of a potential user-defined
+ * property until it is needed at runtime the first time it is encountered in a
+ * pattern match. This marker, which shouldn't conflict with any character
+ * that could be in a legal name, is appended to its name to indicate this.
+ * There are both a string form and a character form */
+#define DEFERRED_PROP_EXPANSION_MARKERs "~"
+#define DEFERRED_PROP_EXPANSION_MARKERc '~'
+
/* About scan_data_t.
During optimisation we recurse through the regexp program performing
@@ -19845,11 +19853,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
continue;
}
- /* Here, didn't find a legal hex number. Just add it from
- * here to the next \n */
+ /* Here, didn't find a legal hex number. Just add the text
+ * from here up to the next \n, omitting any trailing
+ * markers. */
remaining -= len;
- len = strcspn(si_string, "\n");
+ len = strcspn(si_string,
+ DEFERRED_PROP_EXPANSION_MARKERs "\n");
remaining -= len;
if (matches_string) {
sv_catpvn(matches_string, si_string, len);
@@ -19860,6 +19870,13 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
sv_catpvs(matches_string, " ");
si_string += len;
+ if ( remaining
+ && UCHARAT(si_string)
+ == DEFERRED_PROP_EXPANSION_MARKERc)
+ {
+ si_string++;
+ remaining--;
+ }
if (remaining && UCHARAT(si_string) == '\n') {
si_string++;
remaining--;
@@ -23099,7 +23116,7 @@ Perl_parse_uniprop_string(pTHX_
* Other parameters will be set on return as described below */
const char * const name, /* The first non-blank in the \p{}, \P{} */
- const Size_t name_len, /* Its length in bytes, not including any
+ Size_t name_len, /* Its length in bytes, not including any
trailing space */
const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
const bool to_fold, /* ? Is this under /i */
@@ -23147,6 +23164,9 @@ Perl_parse_uniprop_string(pTHX_
qualified name */
bool invert_return = FALSE; /* ? Do we need to complement the result before
returning it */
+ bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
+ explicit utf8:: package that we strip
+ off */
PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
@@ -23205,6 +23225,17 @@ Perl_parse_uniprop_string(pTHX_
break;
}
+ /* If this looks like it is a marker we inserted at compile time,
+ * ignore it; otherwise keep it as it would have been user input. */
+ if ( UNLIKELY(cur == DEFERRED_PROP_EXPANSION_MARKERc)
+ && ! deferrable
+ && could_be_user_defined
+ && i == name_len - 1)
+ {
+ name_len--;
+ continue;
+ }
+
/* Otherwise, this character is part of the name. */
lookup_name[j++] = cur;
@@ -23238,6 +23269,7 @@ Perl_parse_uniprop_string(pTHX_
lookup_name += STRLENs("utf8::");
j -= STRLENs("utf8::");
equals_pos -= STRLENs("utf8::");
+ stripped_utf8_pkg = TRUE;
}
/* Here, we are either done with the whole property name, if it was simple;
@@ -23634,7 +23666,29 @@ Perl_parse_uniprop_string(pTHX_
/* Here, the name could be for a user defined property, which are
* implemented as subs. */
user_sub = get_cvn_flags(name, name_len, 0);
- if (user_sub) {
+ if (! user_sub) {
+
+ /* Here, the property name could be a user-defined one, but there
+ * is no subroutine to handle it (as of now). Defer handling it
+ * until runtime. Otherwise, a block defined by Unicode in a later
+ * release would get the synonym InFoo added for it, and existing
+ * code that used that name would suddenly break if it referred to
+ * the property before the sub was declared. See [perl #134146] */
+ if (deferrable) {
+ goto definition_deferred;
+ }
+
+ /* If we haven't already stripped the package name (if one), do so
+ * now so can look for an official property with the stripped name.
+ * */
+ if (! stripped_utf8_pkg) {
+ lookup_name += non_pkg_begin;
+ j -= non_pkg_begin;
+ }
+
+ /* Drop down to look up in the official properties */
+ }
+ else {
const char insecure[] = "Insecure user-defined property";
/* Here, there is a sub by the correct name. Normally we call it
@@ -24270,18 +24324,34 @@ Perl_parse_uniprop_string(pTHX_
definition_deferred:
+ {
+ bool is_qualified = non_pkg_begin != 0; /* If has "::" */
+
/* Here, it could yet be defined, so defer evaluation of this
* until it's needed at runtime. We need the fully qualified property name
- * to avoid ambiguity, and a trailing newline */
+ * to avoid ambiguity */
if (! fq_name) {
fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
- non_pkg_begin != 0 /* If has "::" */
- );
+ is_qualified);
}
+
+ /* If it didn't come with a package, or the package is utf8::, this
+ * actually could be an official Unicode property whose inclusion we
+ * are deferring until runtime to make sure that it isn't overridden by
+ * a user-defined property of the same name (which we haven't
+ * encountered yet). Add a marker to indicate this possibility, for
+ * use at such time when we first need the definition during pattern
+ * matching execution */
+ if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
+ sv_catpvs(fq_name, DEFERRED_PROP_EXPANSION_MARKERs);
+ }
+
+ /* We also need a trailing newline */
sv_catpvs(fq_name, "\n");
*user_defined_ptr = TRUE;
return fq_name;
+ }
}
#endif
diff --git a/t/re/regexp_unicode_prop.t b/t/re/regexp_unicode_prop.t
index 6df2968735..5c5a1d75b7 100644
--- a/t/re/regexp_unicode_prop.t
+++ b/t/re/regexp_unicode_prop.t
@@ -143,6 +143,7 @@ BEGIN {
Dash => ['-'],
ASCII_Hex_Digit => ['!-', 'A'],
IsAsciiHexAndDash => ['-', 'A'],
+ InLatin1 => ['\x{0100}', '!\x{00FF}'],
);
@USER_CASELESS_PROPERTIES = (
@@ -194,12 +195,6 @@ BEGIN {
}
}
-# These override the official ones, so if found before defined, the official
-# ones prevail, so can't test deferred definition
-my @OVERRIDING_USER_DEFINED_PROPERTIES = (
- InLatin1 => ['\x{0100}', '!\x{00FF}'],
-);
-
#
# From the short properties we populate POSIX-like classes.
#
@@ -249,8 +244,7 @@ while (my ($class, $chars) = each %SHORT_PROPERTIES) {
push @CLASSES => "# Short properties" => %SHORT_PROPERTIES,
"# POSIX like properties" => %d,
- "# User defined properties" => @USER_DEFINED_PROPERTIES,
- "# Overriding user defined properties" => @OVERRIDING_USER_DEFINED_PROPERTIES;
+ "# User defined properties" => @USER_DEFINED_PROPERTIES;
#
diff --git a/uni_keywords.h b/uni_keywords.h
index c3bf4bfe31..5e0e630ab6 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7283,7 +7283,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
* 78e2600e24fa7d5ab62117de50b382f8b31b08401c37a0782c38dacb340b64e7 lib/unicore/extracted/DLineBreak.txt
* 1bde4ad73e271c6349fbd1972e54f38bba5cc1900c28f678e79b9e8909b31793 lib/unicore/extracted/DNumType.txt
* 6278722699123f3890e4b1cc42011e96d8960e4958a3b93484361530983d2611 lib/unicore/extracted/DNumValues.txt
- * 08071cd168b1ac72bf01f13a82c4d0470a391e2bdd0b706e9fe20ab17cc861c8 lib/unicore/mktables
+ * 498da0b9ef6a52bfd71bda5771005bbe4cfc37b456d9d350cd840991eb80c8b1 lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* e9283c761c5a95e3379384ca47c13a284f08d743c2be6e5091f1152b1b6b7a37 regen/mk_PL_charclass.pl