summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MANIFEST1
-rw-r--r--pod/perldiag.pod5
-rw-r--r--regcomp.c220
-rw-r--r--t/lib/croak/regcomp56
-rw-r--r--utf8.c2
5 files changed, 185 insertions, 99 deletions
diff --git a/MANIFEST b/MANIFEST
index b41aa29c96..c11c8f556b 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -5485,6 +5485,7 @@ t/lib/croak/pp Test croak calls from pp.c
t/lib/croak/pp_ctl Test croak calls from pp_ctl.c
t/lib/croak/pp_hot Test croak calls from pp_hot.c
t/lib/croak/pp_sys Test croak calls from pp_sys.c
+t/lib/croak/regcomp Test croak calls from regcomp.c
t/lib/croak/toke Test croak calls from toke.c
t/lib/croak/toke_l1 Test croak calls from toke.c; file is not UTF-8 encoded
t/lib/cygwin.t Builtin cygwin function tests
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 891989c423..72b36321ec 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -7151,7 +7151,10 @@ modifier is not presently meaningful in substitutions.
use the /g modifier. Currently, /c is meaningful only when /g is
used. (This may change in the future.)
-=item Use of code point 0x%s is not allowed; the permissible max is 0x%s.
+=item Use of code point 0x%s is not allowed; the permissible max is 0x%x
+
+=item Use of code point 0x%s is not allowed; the permissible max is 0x%x
+in regex; marked by <-- HERE in m/%s/
(F) You used a code point that is not allowed, because it is too large.
Unicode only allows code points up to 0x10FFFF, but Perl allows much
diff --git a/regcomp.c b/regcomp.c
index f8d9771a57..92c8fe416d 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -12307,15 +12307,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
*/
char * endbrace; /* points to '}' following the name */
- char * endchar; /* Points to '.' or '}' ending cur char in the input
- stream */
char* p = RExC_parse; /* Temporary */
SV * substitute_parse;
- STRLEN len;
char *orig_end;
char *save_start;
I32 flags;
+ Size_t count = 0; /* code point count kept internally by this function */
GET_RE_DEBUG_FLAGS_DECL;
@@ -12400,139 +12398,165 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
vFAIL("\\N{NAME} must be resolved by the lexer");
}
+ /* This code purposely indented below because of future changes coming */
+
+ /* We can get to here when the input is \N{U+...} or when toke.c has
+ * converted a name to the \N{U+...} form. This include changing a
+ * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
+
RExC_parse += 2; /* Skip past the 'U+' */
- /* Because toke.c has generated a special construct for us guaranteed
- * not to have NULs, we can use a str function */
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ /* Code points are separated by dots. The '}' terminates the whole
+ * thing. */
+
+ do { /* Loop until the ending brace */
+ UV cp = 0;
+ char * start_digit; /* The first of the current code point */
+ if (! isXDIGIT(*RExC_parse)) {
+ RExC_parse++;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ }
+
+ start_digit = RExC_parse;
+ count++;
- /* Code points are separated by dots. If none, there is only one code
- * point, and is terminated by the brace */
+ /* Loop through the hex digits of the current code point */
+ do {
+ /* Adding this digit will shift the result 4 bits. If that
+ * result would be above IV_MAX, it's overflow */
+ if (cp > IV_MAX >> 4) {
+
+ /* Find the end of the code point */
+ do {
+ RExC_parse ++;
+ } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
+
+ /* Be sure to synchronize this message with the similar one
+ * in utf8.c */
+ vFAIL4("Use of code point 0x%.*s is not allowed; the"
+ " permissible max is 0x%" UVxf,
+ (int) (RExC_parse - start_digit), start_digit, IV_MAX);
+ }
- if (endchar >= endbrace) {
- STRLEN length_of_hex;
- I32 grok_hex_flags;
+ /* Accumulate this (valid) digit into the running total */
+ cp = (cp << 4) + READ_XDIGIT(RExC_parse);
- /* Here, exactly one code point. If that isn't what is wanted,
- * fail */
- if (! code_point_p) {
- RExC_parse = p;
- return FALSE;
+ /* READ_XDIGIT advanced the input pointer. Ignore a single
+ * underscore separator */
+ if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
+ RExC_parse++;
+ }
+ } while (isXDIGIT(*RExC_parse));
+
+ /* Here, have accumulated the next code point */
+ if (RExC_parse >= endbrace) { /* If done ... */
+ if (count != 1) {
+ goto do_concat;
+ }
+
+ /* Here, is a single code point; fail if doesn't want that */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* A single code point is easy to handle; just return it */
+ *code_point_p = UNI_TO_NATIVE(cp);
+ RExC_parse = endbrace;
+ nextchar(pRExC_state);
+ return TRUE;
}
- /* Convert code point from hex */
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
-
- /* No errors in the first pass (See [perl
- * #122671].) We let the code below find the
- * errors when there are multiple chars. */
- | ((SIZE_ONLY)
- ? PERL_SCAN_SILENT_ILLDIGIT
- : 0);
-
- /* This routine is the one place where both single- and
- * double-quotish \N{U+xxxx} are evaluated. The value is a Unicode
- * code point which must be converted to native. */
- *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
- &length_of_hex,
- &grok_hex_flags,
- NULL));
-
- /* The tokenizer should have guaranteed validity, but it's possible
- * to bypass it by using single quoting, so check. Don't do the
- * check here when there are multiple chars; we do it below anyway.
- * */
- if (length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
+ /* Here, the only legal thing would be a multiple character
+ * sequence (of the form "\N{U+c1.c2. ... }". So the next
+ * character must be a dot (and the one after that can't be the
+ * endbrace, or we'd have something like \N{U+100.} ) */
+ if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
? UTF8SKIP(RExC_parse)
: 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) {
- RExC_parse = endchar;
+ if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
+ RExC_parse = endbrace;
}
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- RExC_parse = endbrace + 1;
- return TRUE;
- }
-
- /* Here, is a multiple character sequence */
-
- /* Count the code points, if desired, in the sequence */
- if (cp_count) {
- *cp_count = 0;
- while (RExC_parse < endbrace) {
- /* Point to the beginning of the next character in the sequence. */
- RExC_parse = endchar + 1;
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
- (*cp_count)++;
- }
- }
+ /* Here, looks like its really a multiple character sequence. Fail
+ * if that's not what the caller wants. */
+ if (! node_p) {
+
+ /* But even if failing, we count the code points if requested, and
+ * don't back up up the pointer as the caller is expected to
+ * handle this situation */
+ if (cp_count) {
+ char * dot = RExC_parse + 1;
+ do {
+ dot = (char *) memchr(dot, '.', endbrace - dot);
+ if (! dot) {
+ break;
+ }
+ count++;
+ dot++;
+ } while (dot < endbrace);
+ count++;
- /* Fail if caller doesn't want to handle a multi-code-point sequence.
- * But don't backup up the pointer if the caller wants to know how many
- * code points there are (they can then handle things) */
- if (! node_p) {
- if (! cp_count) {
- RExC_parse = p;
+ *cp_count = count;
+ RExC_parse = endbrace;
+ nextchar(pRExC_state);
+ }
+ else { /* Back up the pointer. */
+ RExC_parse = p;
+ }
+ return FALSE;
}
- return FALSE;
- }
- /* What is done here is to convert this to a sub-pattern of the form
- * \x{char1}\x{char2}... and then call reg recursively to parse it
- * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
- * while not having to worry about special handling that some code
- * points may have. */
+ /* What is done here is to convert this to a sub-pattern of the
+ * form \x{char1}\x{char2}... and then call reg recursively to
+ * parse it (enclosing in "(?: ... )" ). That way, it retains its
+ * atomicness, while not having to worry about special handling
+ * that some code points may have. */
- substitute_parse = newSVpvs("?:");
+ if (count == 1) {
+ substitute_parse = newSVpvs("?:");
+ }
- while (RExC_parse < endbrace) {
+ do_concat:
/* Convert to notation the rest of the code understands */
sv_catpv(substitute_parse, "\\x{");
- sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+ sv_catpvn(substitute_parse, start_digit, RExC_parse - start_digit);
sv_catpv(substitute_parse, "}");
- /* Point to the beginning of the next character in the sequence. */
- RExC_parse = endchar + 1;
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
- }
- sv_catpv(substitute_parse, ")");
+ /* Move to after the dot (or ending brace the final time through.)
+ * */
+ RExC_parse++;
- len = SvCUR(substitute_parse);
+ } while (RExC_parse < endbrace);
- /* Don't allow empty number */
- if (len < (STRLEN) 8) {
- RExC_parse = endbrace;
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
+ sv_catpv(substitute_parse, ")");
- /* The values are Unicode, and therefore not subject to recoding, but
- * have to be converted to native on a non-Unicode (meaning non-ASCII)
- * platform. */
#ifdef EBCDIC
+ /* The values are Unicode, and therefore have to be converted to native
+ * on a non-Unicode (meaning non-ASCII) platform. */
RExC_recode_x_to_native = 1;
#endif
+ /* Here, we have the string the name evaluates to, ready to be parsed,
+ * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
+ * constructs. This can be called from within a substitute parse already.
+ * The error reporting mechanism doesn't work for 2 levels of this, but the
+ * code above has validated this new construct, so there should be no
+ * errors generated by the below.*/
save_start = RExC_start;
orig_end = RExC_end;
- RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
- len);
- RExC_end = RExC_parse + len;
+ RExC_parse = RExC_start = SvPVX(substitute_parse);
+ RExC_end = RExC_parse + SvCUR(substitute_parse);
*node_p = reg(pRExC_state, 1, &flags, depth+1);
/* Restore the saved values */
- RExC_start = RExC_adjusted_start = save_start;
+ RExC_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
#ifdef EBCDIC
diff --git a/t/lib/croak/regcomp b/t/lib/croak/regcomp
new file mode 100644
index 0000000000..19586d53ee
--- /dev/null
+++ b/t/lib/croak/regcomp
@@ -0,0 +1,56 @@
+__END__
+# NAME \N{U+too large} on 64-bit machine
+# SKIP ? use Config; $Config{uvsize} < 8 && "Not 64 bit"
+qr/\N{U+7FFFFFFFFFFFFFFF}/;
+qr/\N{U+1_0000_0000_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000_0000_0000 is not allowed; the permissible max is 0x7fffffffffffffff in regex; marked by <-- HERE in m/\N{U+1_0000_0000_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+too large} on 32-bit machine
+# SKIP ? use Config; $Config{uvsize} > 4 && "Not 32 bit"
+qr/\N{U+7FFFFFFF}/;
+qr/\N{U+1_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000 is not allowed; the permissible max is 0x7fffffff in regex; marked by <-- HERE in m/\N{U+1_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+100.too large} on 64-bit machine
+# SKIP ? use Config; $Config{uvsize} < 8 && "Not 64 bit"
+qr/\N{U+100.7FFFFFFFFFFFFFFF}/;
+qr/\N{U+100.1_0000_0000_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000_0000_0000 is not allowed; the permissible max is 0x7fffffffffffffff in regex; marked by <-- HERE in m/\N{U+100.1_0000_0000_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+100.too large} on 32-bit machine
+# SKIP ? use Config; $Config{uvsize} > 4 && "Not 32 bit"
+qr/\N{U+100.7FFFFFFF}/;
+qr/\N{U+100.1_0000_0000}/;
+EXPECT
+Use of code point 0x1_0000_0000 is not allowed; the permissible max is 0x7fffffff in regex; marked by <-- HERE in m/\N{U+100.1_0000_0000 <-- HERE }/ at - line 2.
+########
+# NAME \N{U+.}
+my $p00="\\N{U+.}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+. <-- HERE }/ at - line 1.
+########
+# NAME \N{U+100.}
+my $p00="\\N{U+100.}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+100. <-- HERE }/ at - line 1.
+########
+# NAME \N{U+_100}
+my $p00="\\N{U+_100}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+_ <-- HERE 100}/ at - line 1.
+########
+# NAME \N{U+100_}
+my $p00="\\N{U+100_}"; qr/$p00/;
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/\N{U+100_ <-- HERE }/ at - line 1.
+########
+# NAME [ß\N{U+.}]
+my $p00="[ß\\N{U+.}]"; qr/$p00/ui;
+# The sharp s under /i recodes the parse, and this was causing a segfault when
+# the error message referred to the original pattern
+EXPECT
+Invalid hexadecimal number in \N{U+...} in regex; marked by <-- HERE in m/[ß\N{U+. <-- HERE }]/ at - line 1.
+########
diff --git a/utf8.c b/utf8.c
index eede51e13b..aacfc575d6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -36,6 +36,8 @@
static const char malformed_text[] = "Malformed UTF-8 character";
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
+
+/* Be sure to synchronize this message with the similar one in regcomp.c */
static const char cp_above_legal_max[] =
"Use of code point 0x%" UVXf " is not allowed; the"
" permissible max is 0x%" UVXf;