diff options
author | Father Chrysostomos <sprout@cpan.org> | 2014-12-14 06:55:30 -0800 |
---|---|---|
committer | Father Chrysostomos <sprout@cpan.org> | 2014-12-17 17:22:55 -0800 |
commit | 4cbd7e223e673d0984095465ae7480b0eb3dbb42 (patch) | |
tree | de9b1572839a2f54e25b823cb924d4095b012682 /toke.c | |
parent | b045b8b598844ddce261c725f3596c9336b42b79 (diff) | |
download | perl-4cbd7e223e673d0984095465ae7480b0eb3dbb42.tar.gz |
[perl #123417] Allow lexer to parse \N{U+dotted.hex}
This ‘unofficial’ notation is used in stringifying regular expressions
that contain named sequences, so that qr/\N{foo}/ stringified can be
incorporated into another regular expression in a different scope and
still mean the same thing.
This also needs to work with eval "/$that_qr/". It didn’t, because the
lexer rejected this syntax.
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 46 |
1 files changed, 40 insertions, 6 deletions
@@ -3283,34 +3283,68 @@ S_scan_const(pTHX_ char *start) if (*s == 'U' && s[1] == '+') { /* \N{U+...} */ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; STRLEN len; s += 2; /* Skip to next char after the 'U+' */ len = e - s; uv = grok_hex(s, &len, &flags, NULL); - if (len == 0 || len != (STRLEN)(e - s)) { + if (len == 0 + || ( len != (STRLEN)(e - s) && s[len] != '.' + && PL_lex_inpat)) + { + bad_NU: yyerror("Invalid hexadecimal number in \\N{U+...}"); s = e + 1; continue; } if (PL_lex_inpat) { - s -= 5; /* Include the '\N{U+' */ #ifdef EBCDIC + s -= 5; /* Include the '\N{U+' */ /* On EBCDIC platforms, in \N{U+...}, the '...' is a * Unicode value, so convert to native so downstream * code can continue to assume it's native */ + /* XXX This should be in the regexp parser, + because doing it here makes /\N{U+41}/ and + =~ '\N{U+41}' do different things. */ d += my_snprintf(d, e - s + 1 + 1, /* includes the '}' and the \0 */ - "\\N{U+%X}", + "\\N{U+%X", (unsigned int) UNI_TO_NATIVE(uv)); + s += 5 + len; + while (*s == '.') { + s++; + len = e - s; + uv = grok_hex(s, &len, &flags, NULL); + if (!len + || (len != (STRLEN)(e - s) && s[len] != '.')) + goto bad_NU; + s--; + d += my_snprintf( + d, e - s + 1 + 1, ".%X", + (unsigned int)UNI_TO_NATIVE(uv) + ); + s += len + 1; + } + *(d++) = '}'; #else /* On non-EBCDIC platforms, pass it through unchanged. - * The reason we evaluated the number above is to make + * The reason we evaluate the numbers is to make * sure there wasn't a syntax error. */ - Copy(s, d, e - s + 1, char); /* +1 is for the '}' */ - d += e - s + 1; + const char * const orig_s = s - 5; + while (*s == '.') { + s++; + len = e - s; + uv = grok_hex(s, &len, &flags, NULL); + if (!len + || (len != (STRLEN)(e - s) && s[len] != '.')) + goto bad_NU; + } + /* +1 is for the '}' */ + Copy(orig_s, d, e - orig_s + 1, char); + d += e - orig_s + 1; #endif } else { /* Not a pattern: convert the hex to string */ |