summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorFather Chrysostomos <sprout@cpan.org>2014-12-14 06:55:30 -0800
committerFather Chrysostomos <sprout@cpan.org>2014-12-17 17:22:55 -0800
commit4cbd7e223e673d0984095465ae7480b0eb3dbb42 (patch)
treede9b1572839a2f54e25b823cb924d4095b012682 /toke.c
parentb045b8b598844ddce261c725f3596c9336b42b79 (diff)
downloadperl-4cbd7e223e673d0984095465ae7480b0eb3dbb42.tar.gz
[perl #123417] Allow lexer to parse \N{U+dotted.hex}
This ‘unofficial’ notation is used in stringifying regular expressions that contain named sequences, so that qr/\N{foo}/ stringified can be incorporated into another regular expression in a different scope and still mean the same thing. This also needs to work with eval "/$that_qr/". It didn’t because the lexer rejected this syntax.
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c46
1 files changed, 40 insertions, 6 deletions
diff --git a/toke.c b/toke.c
index 4003ab1703..c47e2c2dae 100644
--- a/toke.c
+++ b/toke.c
@@ -3283,34 +3283,68 @@ S_scan_const(pTHX_ char *start)
if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_SILENT_ILLDIGIT
| PERL_SCAN_DISALLOW_PREFIX;
STRLEN len;
s += 2; /* Skip to next char after the 'U+' */
len = e - s;
uv = grok_hex(s, &len, &flags, NULL);
- if (len == 0 || len != (STRLEN)(e - s)) {
+ if (len == 0
+ || ( len != (STRLEN)(e - s) && s[len] != '.'
+ && PL_lex_inpat))
+ {
+ bad_NU:
yyerror("Invalid hexadecimal number in \\N{U+...}");
s = e + 1;
continue;
}
if (PL_lex_inpat) {
- s -= 5; /* Include the '\N{U+' */
#ifdef EBCDIC
+ s -= 5; /* Include the '\N{U+' */
/* On EBCDIC platforms, in \N{U+...}, the '...' is a
* Unicode value, so convert to native so downstream
* code can continue to assume it's native */
+ /* XXX This should be in the regexp parser,
+ because doing it here makes /\N{U+41}/ and
+ =~ '\N{U+41}' do different things. */
d += my_snprintf(d, e - s + 1 + 1, /* includes the '}'
and the \0 */
- "\\N{U+%X}",
+ "\\N{U+%X",
(unsigned int) UNI_TO_NATIVE(uv));
+ s += 5 + len;
+ while (*s == '.') {
+ s++;
+ len = e - s;
+ uv = grok_hex(s, &len, &flags, NULL);
+ if (!len
+ || (len != (STRLEN)(e - s) && s[len] != '.'))
+ goto bad_NU;
+ s--;
+ d += my_snprintf(
+ d, e - s + 1 + 1, ".%X",
+ (unsigned int)UNI_TO_NATIVE(uv)
+ );
+ s += len + 1;
+ }
+ *(d++) = '}';
#else
/* On non-EBCDIC platforms, pass it through unchanged.
- * The reason we evaluated the number above is to make
+ * The reason we evaluate the numbers is to make
* sure there wasn't a syntax error. */
- Copy(s, d, e - s + 1, char); /* +1 is for the '}' */
- d += e - s + 1;
+ const char * const orig_s = s - 5;
+ while (*s == '.') {
+ s++;
+ len = e - s;
+ uv = grok_hex(s, &len, &flags, NULL);
+ if (!len
+ || (len != (STRLEN)(e - s) && s[len] != '.'))
+ goto bad_NU;
+ }
+ /* +1 is for the '}' */
+ Copy(orig_s, d, e - orig_s + 1, char);
+ d += e - orig_s + 1;
#endif
}
else { /* Not a pattern: convert the hex to string */