summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2015-03-16 22:38:20 -0600
committerKarl Williamson <khw@cpan.org>2015-03-18 16:14:37 -0600
commitb6d67071cc036ae5056dfe9b570ba76942fc08f4 (patch)
treefa13bf0bff4cbc76df8df602b33174470e3ab31a /toke.c
parent7a4ca5b4c6cbf0022494a8f350fe000abb4b3034 (diff)
downloadperl-b6d67071cc036ae5056dfe9b570ba76942fc08f4.tar.gz
Fix qr'\N{U+41}' on EBCDIC platforms
Prior to this commit, the regex compiler was relying on the lexer to do the translation from Unicode to native for \N{...} constructs, where it was simpler to do. However, when the pattern is a single-quoted string, it is passed unchanged to the regex compiler, and did not work. Fixing it required some refactoring, though it led to a clean API in a static function. This was spotted by Father Chrysostomos.
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c35
1 files changed, 18 insertions, 17 deletions
diff --git a/toke.c b/toke.c
index bfcb060abe..414a03aa83 100644
--- a/toke.c
+++ b/toke.c
@@ -3276,12 +3276,7 @@ S_scan_const(pTHX_ char *start)
* Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
* if a pattern; otherwise convert to utf8
*
- * If the regex compiler should ever need to differentiate
- * between the \N{U+...} and \N{name} forms, that could easily
- * be done here by stripping any leading zeros from the
- * \N{U+...} case, and adding them to the other one. */
-
- /* Here, 's' points to the 'N'; the test below is guaranteed to
+ * Here, 's' points to the 'N'; the test below is guaranteed to
* succeed if we are being called on a pattern, as we already
* know from a test above that the next character is a '{'. A
* non-pattern \N must mean 'named character', which requires
@@ -3413,9 +3408,15 @@ S_scan_const(pTHX_ char *start)
char hex_string[4];
int len =
my_snprintf(hex_string,
- sizeof(hex_string),
- "%02X.", (U8) *str);
- PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
+ sizeof(hex_string),
+ "%02X.",
+
+ /* The regex compiler is
+ * expecting Unicode, not
+ * native */
+ (U8) NATIVE_TO_LATIN1(*str));
+ PERL_MY_SNPRINTF_POST_GUARD(len,
+ sizeof(hex_string));
Copy(hex_string, d, 3, char);
d += 3;
str++;
@@ -3439,12 +3440,12 @@ S_scan_const(pTHX_ char *start)
len,
&char_length,
UTF8_ALLOW_ANYUV);
- /* Convert first code point to hex, including
- * the boiler plate before it. */
+ /* Convert first code point to Unicode hex,
+ * including the boiler plate before it. */
output_length =
my_snprintf(hex_string, sizeof(hex_string),
- "\\N{U+%X",
- (unsigned int) uv);
+ "\\N{U+%X",
+ (unsigned int) NATIVE_TO_UNI(uv));
/* Make sure there is enough space to hold it */
d = off + SvGROW(sv, off
@@ -3456,7 +3457,7 @@ S_scan_const(pTHX_ char *start)
d += output_length;
/* For each subsequent character, append dot and
- * its ordinal in hex */
+ * its Unicode code point in hex */
while ((str += char_length) < str_end) {
const STRLEN off = d - SvPVX_const(sv);
U32 uv = utf8n_to_uvchr((U8 *) str,
@@ -3465,9 +3466,9 @@ S_scan_const(pTHX_ char *start)
UTF8_ALLOW_ANYUV);
output_length =
my_snprintf(hex_string,
- sizeof(hex_string),
- ".%X",
- (unsigned int) uv);
+ sizeof(hex_string),
+ ".%X",
+ (unsigned int) NATIVE_TO_UNI(uv));
d = off + SvGROW(sv, off
+ output_length