[perl #123417] Allow lexer to parse \N{U+dotted.hex}

This ‘unofficial’ notation is used in stringifying regular expressions that contain named sequences, so that qr/\N{foo}/ stringified can be incorporated into another regular expression in a different scope and still mean the same thing. This also needs to work with eval "/$that_qr/". It didn’t, because the lexer rejected this syntax.
author: Father Chrysostomos <sprout@cpan.org> 2014-12-14 06:55:30 -0800
committer: Father Chrysostomos <sprout@cpan.org> 2014-12-17 17:22:55 -0800
commit: 4cbd7e223e673d0984095465ae7480b0eb3dbb42 (patch)
tree: de9b1572839a2f54e25b823cb924d4095b012682 /toke.c
parent: b045b8b598844ddce261c725f3596c9336b42b79 (diff)
download: perl-4cbd7e223e673d0984095465ae7480b0eb3dbb42.tar.gz
1 files changed, 40 insertions, 6 deletions
diff --git a/toke.c b/toke.c
index 4003ab1703..c47e2c2dae 100644
--- a/toke.c
+++ b/toke.c
@@ -3283,34 +3283,68 @@ S_scan_const(pTHX_ char *start)
 
 		if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
 		    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+				| PERL_SCAN_SILENT_ILLDIGIT
 				| PERL_SCAN_DISALLOW_PREFIX;
 		    STRLEN len;
 
 		    s += 2;	    /* Skip to next char after the 'U+' */
 		    len = e - s;
 		    uv = grok_hex(s, &len, &flags, NULL);
-		    if (len == 0 || len != (STRLEN)(e - s)) {
+		    if (len == 0
+		     || (  len != (STRLEN)(e - s) && s[len] != '.'
+			&& PL_lex_inpat))
+		    {
+		      bad_NU:
 			yyerror("Invalid hexadecimal number in \\N{U+...}");
 			s = e + 1;
 			continue;
 		    }
 
 		    if (PL_lex_inpat) {
-			s -= 5;	    /* Include the '\N{U+' */
 #ifdef EBCDIC
+			s -= 5;	    /* Include the '\N{U+' */
                         /* On EBCDIC platforms, in \N{U+...}, the '...' is a
                          * Unicode value, so convert to native so downstream
                          * code can continue to assume it's native */
+                        /* XXX This should be in the regexp parser,
+                               because doing it here makes /\N{U+41}/ and
+                               =~ '\N{U+41}' do different things.  */
 			d += my_snprintf(d, e - s + 1 + 1,  /* includes the '}'
 							       and the \0 */
-                                         "\\N{U+%X}",
+                                         "\\N{U+%X",
                                          (unsigned int) UNI_TO_NATIVE(uv));
+                        s += 5 + len;
+                        while (*s == '.') {
+                            s++;
+                            len = e - s;
+                            uv = grok_hex(s, &len, &flags, NULL);
+                            if (!len
+                             || (len != (STRLEN)(e - s) && s[len] != '.'))
+                                goto bad_NU;
+                            s--;
+                            d += my_snprintf(
+                                     d, e - s + 1 + 1, ".%X",
+                                     (unsigned int)UNI_TO_NATIVE(uv)
+                                 );
+                            s += len + 1;
+                        }
+                        *(d++) = '}';
 #else
                         /* On non-EBCDIC platforms, pass it through unchanged.
-                         * The reason we evaluated the number above is to make
+                         * The reason we evaluate the numbers is to make
                          * sure there wasn't a syntax error. */
-			Copy(s, d, e - s + 1, char);	/* +1 is for the '}' */
-			d += e - s + 1;
+                        const char * const orig_s = s - 5;
+                        while (*s == '.') {
+                            s++;
+                            len = e - s;
+                            uv = grok_hex(s, &len, &flags, NULL);
+                            if (!len
+                             || (len != (STRLEN)(e - s) && s[len] != '.'))
+                                goto bad_NU;
+                        }
+                        /* +1 is for the '}' */
+                        Copy(orig_s, d, e - orig_s + 1, char);
+                        d += e - orig_s + 1;
 #endif
 		    }
 		    else {  /* Not a pattern: convert the hex to string */
author	Father Chrysostomos <sprout@cpan.org>	2014-12-14 06:55:30 -0800
committer	Father Chrysostomos <sprout@cpan.org>	2014-12-17 17:22:55 -0800
commit	4cbd7e223e673d0984095465ae7480b0eb3dbb42 (patch)
tree	de9b1572839a2f54e25b823cb924d4095b012682 /toke.c
parent	b045b8b598844ddce261c725f3596c9336b42b79 (diff)
download	perl-4cbd7e223e673d0984095465ae7480b0eb3dbb42.tar.gz