summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-03-20 10:25:17 -0600
committerKarl Williamson <public@khwilliamson.com>2011-03-20 12:16:12 -0600
commite2a7e16564e5652c046ec138d11bfa77e7c86836 (patch)
treee0d2fb7425a57217663e09e894580464d92d9ea9
parent9d64099bb0a1ca98620e6124baa4038dd20cf89e (diff)
downloadperl-e2a7e16564e5652c046ec138d11bfa77e7c86836.tar.gz
reg_namedseq: Restructure so doesn't duplicate code
This routine now calls reg() recursively after converting the parse to something the rest of the code understands. This eliminates duplicated code, and allows for uniform treatment of code points, as things were getting out of sync. It also eliminates the restrction on how many characters a named sequence can expand to. toke now converts its input (which is in Unicode terms) to native on EBCDIC platforms, so the rest of the code can can continue to ignore that. The restriction on the length of the number of characters a named sequence is hereby removed, because reg() handles that.
-rw-r--r--pod/perldiag.pod8
-rw-r--r--regcomp.c184
-rw-r--r--t/re/pat_advanced.t13
-rw-r--r--t/re/re_tests4
-rw-r--r--toke.c26
5 files changed, 59 insertions, 176 deletions
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index a461d7bc82..235c39c69f 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -5296,14 +5296,6 @@ removed in a future version.
Currently all but the first one are discarded when used in a regular
expression pattern bracketed character class.
-=item Using just the first characters returned by \N{}
-
-(W) A charnames handler may return a sequence of characters. There is a
-finite limit as to the number of characters that can be used, which this
-sequence exceeded. In the message, the characters in the sequence are
-separated by dots, and each is shown by its ordinal in hex. Anything to
-the left of the C<HERE> was retained; anything to the right was discarded.
-
=item Using !~ with %s doesn't make sense
(F) Using the C<!~> operator with C<s///r>, C<tr///r> or C<y///r> is
diff --git a/regcomp.c b/regcomp.c
index 796cefa75f..cf0f3db648 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -4613,6 +4613,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_seen_evals = 0;
RExC_extralen = 0;
+ RExC_override_recoding = 0;
/* First pass: determine size, legality. */
RExC_parse = exp;
@@ -7794,168 +7795,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept
ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
else { /* Not a char class */
- char *s; /* String to put in generated EXACT node */
- STRLEN len = 0; /* Its current byte length */
+
+ /* What is done here is to convert this to a sub-pattern of the form
+ * (?:\x{char1}\x{char2}...)
+ * and then call reg recursively. That way, it retains its atomicness,
+ * while not having to worry about special handling that some code
+ * points may have. toke.c has converted the original Unicode values
+ * to native, so that we can just pass on the hex values unchanged. We
+ * do have to set a flag to keep recoding from happening in the
+ * recursion */
+
+ SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+ STRLEN len;
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- ret = reg_node(pRExC_state,
- (U8) ((! FOLD) ? EXACT
- : (LOC)
- ? EXACTFL
- : (MORE_ASCII_RESTRICTED)
- ? EXACTFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? EXACTFU
- : EXACTF));
- s= STRING(ret);
-
- /* Exact nodes can hold only a U8 length's of text = 255. Loop through
- * the input which is of the form now 'c1.c2.c3...}' until find the
- * ending brace or exceed length 255. The characters that exceed this
- * limit are dropped. The limit could be relaxed should it become
- * desirable by reparsing this as (?:\N{NAME}), so could generate
- * multiple EXACT nodes, as is done for just regular input. But this
- * is primarily a named character, and not intended to be a huge long
- * string, so 255 bytes should be good enough */
- while (1) {
- STRLEN length_of_hex;
- I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
- | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- UV cp; /* Ord of current character */
- bool use_this_char_fold = FOLD;
+ char *orig_end = RExC_end;
+
+ while (RExC_parse < endbrace) {
/* Code points are separated by dots. If none, there is only one
* code point, and is terminated by the brace */
endchar = RExC_parse + strcspn(RExC_parse, ".}");
- /* The values are Unicode even on EBCDIC machines */
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
- if ( length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) RExC_parse = endchar;
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
-
- /* XXX ? Change to ANYOF node
- if (FOLD
- && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))
- && is_TRICKYFOLD_cp(cp))
- {
- }
- */
-
- /* Under /aa, we can't mix ASCII with non- in a fold. If we are
- * folding, and the source isn't ASCII, look through all the
- * characters it folds to. If any one of them is ASCII, forbid
- * this fold. (cp is uni, so the 127 below is correct even for
- * EBCDIC). Similarly under locale rules, we don't mix under 256
- * with above 255. XXX It really doesn't make sense to have \N{}
- * which means a Unicode rules under locale. I (khw) think this
- * should be warned about, but the counter argument is that people
- * who have programmed around Perl's earlier lack of specifying the
- * rules and used \N{} to force Unicode things in a local
- * environment shouldn't get suddenly a warning */
- if (use_this_char_fold) {
- if (LOC && cp < 256) { /* Fold not known until run-time */
- use_this_char_fold = FALSE;
- }
- else if ((cp > 127 && MORE_ASCII_RESTRICTED)
- || (cp > 255 && LOC))
- {
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- U8* s = tmpbuf;
- U8* e;
- STRLEN foldlen;
-
- (void) toFOLD_uni(cp, tmpbuf, &foldlen);
- e = s + foldlen;
-
- while (s < e) {
- if (isASCII(*s)
- || (LOC && (UTF8_IS_INVARIANT(*s)
- || UTF8_IS_DOWNGRADEABLE_START(*s))))
- {
- use_this_char_fold = FALSE;
- break;
- }
- s += UTF8SKIP(s);
- }
- }
- }
-
- if (! use_this_char_fold) { /* Not folding, just append to the
- string */
- STRLEN unilen;
-
- /* Quit before adding this character if would exceed limit */
- if (len + UNISKIP(cp) > U8_MAX) break;
-
- unilen = reguni(pRExC_state, cp, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- } else { /* Folding, output the folded equivalent */
- STRLEN foldlen,numlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- cp = toFOLD_uni(cp, tmpbuf, &foldlen);
-
- /* Quit before exceeding size limit */
- if (len + foldlen > U8_MAX) break;
-
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen)
- {
- cp = utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, cp, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- }
+ /* Convert to notation the rest of the code understands */
+ sv_catpv(substitute_parse, "\\x{");
+ sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+ sv_catpv(substitute_parse, "}");
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
-
- /* Quit if no more characters */
- if (RExC_parse >= endbrace) break;
}
+ sv_catpv(substitute_parse, ")");
+ RExC_parse = SvPV(substitute_parse, len);
- if (SIZE_ONLY) {
- if (RExC_parse < endbrace) {
- ckWARNreg(RExC_parse - 1,
- "Using just the first characters returned by \\N{}");
- }
-
- RExC_size += STR_SZ(len);
- } else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
+ /* Don't allow empty number */
+ if (len < 8) {
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
+ RExC_end = RExC_parse + len;
- RExC_parse = endbrace + 1;
+ /* The values are Unicode, and therefore not subject to recoding */
+ RExC_override_recoding = 1;
+
+ ret = reg(pRExC_state, 1, flagp, depth+1);
+
+ RExC_parse = endbrace;
+ RExC_end = orig_end;
+ RExC_override_recoding = 0;
- *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
- with malformed in t/re/pat_advanced.t */
- RExC_parse --;
- Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
}
@@ -8775,7 +8663,7 @@ tryagain:
goto recode_encoding;
break;
recode_encoding:
- {
+ if (! RExC_override_recoding) {
SV* enc = PL_encoding;
ender = reg_recode((const char)(U8)ender, &enc);
if (!enc && SIZE_ONLY)
@@ -9765,7 +9653,7 @@ parseit:
break;
}
recode_encoding:
- {
+ if (! RExC_override_recoding) {
SV* enc = PL_encoding;
value = reg_recode((const char)(U8)value, &enc);
if (!enc && SIZE_ONLY)
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index b102188f55..6d7624d4fb 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -1007,18 +1007,7 @@ sub run_tests {
# If remove the limitation in regcomp code these should work
# differently
undef $w;
- eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string fails gracefully'];
- ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
- undef $w;
- eval q [ok "\N{LONG-STR}" =~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string fails gracefully'];
- ok $w && $w =~ /Using just the first characters returned/, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
- undef $w;
- eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/, 'Verify that too long a string doesnt work'];
- ok $w && $w =~ /Using just the first characters returned/, 'Verify that got too-long string warning in \N{} that exceeds the limit';
- undef $w;
- eval q [ok "\N{TOO-LONG-STR}" !~ /^\N{TOO-LONG-STR}$/i, 'Verify under folding that too long a string doesnt work'];
- ok $w && $w =~ /Using just the first characters returned/i, 'Verify under folding that got too-long string warning in \N{} that exceeds the limit';
- undef $w;
+ eval q [ok "\N{TOO-LONG-STR}" =~ /^\N{TOO-LONG-STR}$/, 'Verify that what once was too long a string works'];
eval 'q(syntax error) =~ /\N{MALFORMED}/';
ok $@ && $@ =~ /Malformed/, 'Verify that malformed utf8 gives an error';
undef $w;
diff --git a/t/re/re_tests b/t/re/re_tests
index b3815298bb..af246775e2 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1439,7 +1439,7 @@ abc\N abc\n n
# figures it out.
\N{U+} - c - Invalid hexadecimal number
[\N{U+}] - c - Invalid hexadecimal number
-\N{U+4AG3} - c - Invalid hexadecimal number
+\N{U+4AG3} - c - Illegal hexadecimal digit
[\N{U+4AG3}] - c - Invalid hexadecimal number
abc\N{def - c - \\N{NAME} must be resolved by the lexer
@@ -1453,7 +1453,7 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer
# Verify works in single quotish context; regex compiler delivers slightly different msg
# \N{U+BEEF.BEAD} succeeds here, because can't completely hide it from the outside.
-\N{U+0xBEEF} - c - Invalid hexadecimal number
+\N{U+0xBEEF} - c - Illegal hexadecimal digit
\c` - c - \"\\c`\" is more clearly written simply as \"\\ \"
\c1 - c - \"\\c1\" is more clearly written simply as \"q\"
\cA \001 y $& \1
diff --git a/toke.c b/toke.c
index 2dbe7f7d07..6933e62af4 100644
--- a/toke.c
+++ b/toke.c
@@ -3140,12 +3140,22 @@ S_scan_const(pTHX_ char *start)
if (PL_lex_inpat) {
- /* Pass through to the regex compiler unchanged. The
- * reason we evaluated the number above is to make sure
- * there wasn't a syntax error. */
+ /* On non-EBCDIC platforms, pass through to the regex
+ * compiler unchanged. The reason we evaluated the
+ * number above is to make sure there wasn't a syntax
+ * error. But on EBCDIC we convert to native so
+ * downstream code can continue to assume it's native
+ */
s -= 5; /* Include the '\N{U+' */
+#ifdef EBCDIC
+ d += my_snprintf(d, e - s + 1 + 1, /* includes the }
+ and the \0 */
+ "\\N{U+%X}",
+ (unsigned int) UNI_TO_NATIVE(uv));
+#else
Copy(s, d, e - s + 1, char); /* 1 = include the } */
d += e - s + 1;
+#endif
}
else { /* Not a pattern: convert the hex to string */
@@ -3239,10 +3249,13 @@ S_scan_const(pTHX_ char *start)
}
/* Convert first code point to hex, including the
- * boiler plate before it */
+ * boiler plate before it. For all these, we
+ * convert to native format so that downstream code
+ * can continue to assume the input is native */
output_length =
my_snprintf(hex_string, sizeof(hex_string),
- "\\N{U+%X", (unsigned int) uv);
+ "\\N{U+%X",
+ (unsigned int) UNI_TO_NATIVE(uv));
/* Make sure there is enough space to hold it */
d = off + SvGROW(sv, off
@@ -3267,7 +3280,8 @@ S_scan_const(pTHX_ char *start)
output_length =
my_snprintf(hex_string, sizeof(hex_string),
- ".%X", (unsigned int) uv);
+ ".%X",
+ (unsigned int) UNI_TO_NATIVE(uv));
d = off + SvGROW(sv, off
+ output_length