Fix qr'\N{U+41}' on EBCDIC platforms

Prior to this commit, the regex compiler was relying on the lexer to do the translation from Unicode to native for \N{...} constructs, where it was simpler to do. However, when the pattern is a single-quoted string, it is passed unchanged to the regex compiler, and did not work. Fixing it required some refactoring, though it led to a clean API in a static function. This was spotted by Father Chrysostomos.
author: Karl Williamson <khw@cpan.org> 2015-03-16 22:38:20 -0600
committer: Karl Williamson <khw@cpan.org> 2015-03-18 16:14:37 -0600
commit: b6d67071cc036ae5056dfe9b570ba76942fc08f4 (patch)
tree: fa13bf0bff4cbc76df8df602b33174470e3ab31a /toke.c
parent: 7a4ca5b4c6cbf0022494a8f350fe000abb4b3034 (diff)
download: perl-b6d67071cc036ae5056dfe9b570ba76942fc08f4.tar.gz
1 files changed, 18 insertions, 17 deletions
diff --git a/toke.c b/toke.c
index bfcb060abe..414a03aa83 100644
--- a/toke.c
+++ b/toke.c
@@ -3276,12 +3276,7 @@ S_scan_const(pTHX_ char *start)
                  *  Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
                  *      if a pattern; otherwise convert to utf8
                  *
-                 * If the regex compiler should ever need to differentiate
-                 * between the \N{U+...} and \N{name} forms, that could easily
-                 * be done here by stripping any leading zeros from the
-                 * \N{U+...} case, and adding them to the other one. */
-
-                /* Here, 's' points to the 'N'; the test below is guaranteed to
+                 * Here, 's' points to the 'N'; the test below is guaranteed to
 		 * succeed if we are being called on a pattern, as we already
                  * know from a test above that the next character is a '{'.  A
                  * non-pattern \N must mean 'named character', which requires
@@ -3413,9 +3408,15 @@ S_scan_const(pTHX_ char *start)
                                     char hex_string[4];
                                     int len =
                                         my_snprintf(hex_string,
-                                                    sizeof(hex_string),
-                                                    "%02X.", (U8) *str);
-                                    PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
+                                                  sizeof(hex_string),
+                                                  "%02X.",
+
+                                                  /* The regex compiler is
+                                                   * expecting Unicode, not
+                                                   * native */
+                                                  (U8) NATIVE_TO_LATIN1(*str));
+                                    PERL_MY_SNPRINTF_POST_GUARD(len,
+                                                           sizeof(hex_string));
                                     Copy(hex_string, d, 3, char);
                                     d += 3;
                                     str++;
@@ -3439,12 +3440,12 @@ S_scan_const(pTHX_ char *start)
                                                         len,
                                                         &char_length,
                                                         UTF8_ALLOW_ANYUV);
-                                /* Convert first code point to hex, including
-                                 * the boiler plate before it. */
+                                /* Convert first code point to Unicode hex,
+                                 * including the boiler plate before it. */
                                 output_length =
                                     my_snprintf(hex_string, sizeof(hex_string),
-                                                "\\N{U+%X",
-                                                (unsigned int) uv);
+                                             "\\N{U+%X",
+                                             (unsigned int) NATIVE_TO_UNI(uv));
 
                                 /* Make sure there is enough space to hold it */
                                 d = off + SvGROW(sv, off
@@ -3456,7 +3457,7 @@ S_scan_const(pTHX_ char *start)
                                 d += output_length;
 
                                 /* For each subsequent character, append dot and
-                                * its ordinal in hex */
+                                * its Unicode code point in hex */
                                 while ((str += char_length) < str_end) {
                                     const STRLEN off = d - SvPVX_const(sv);
                                     U32 uv = utf8n_to_uvchr((U8 *) str,
@@ -3465,9 +3466,9 @@ S_scan_const(pTHX_ char *start)
                                                             UTF8_ALLOW_ANYUV);
                                     output_length =
                                         my_snprintf(hex_string,
-                                                    sizeof(hex_string),
-                                                    ".%X",
-                                                    (unsigned int) uv);
+                                             sizeof(hex_string),
+                                             ".%X",
+                                             (unsigned int) NATIVE_TO_UNI(uv));
 
                                     d = off + SvGROW(sv, off
                                                         + output_length
author	Karl Williamson <khw@cpan.org>	2015-03-16 22:38:20 -0600
committer	Karl Williamson <khw@cpan.org>	2015-03-18 16:14:37 -0600
commit	b6d67071cc036ae5056dfe9b570ba76942fc08f4 (patch)
tree	fa13bf0bff4cbc76df8df602b33174470e3ab31a /toke.c
parent	7a4ca5b4c6cbf0022494a8f350fe000abb4b3034 (diff)
download	perl-b6d67071cc036ae5056dfe9b570ba76942fc08f4.tar.gz