diff options
author | Karl Williamson <khw@cpan.org> | 2015-03-16 22:38:20 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2015-03-18 16:14:37 -0600 |
commit | b6d67071cc036ae5056dfe9b570ba76942fc08f4 (patch) | |
tree | fa13bf0bff4cbc76df8df602b33174470e3ab31a /toke.c | |
parent | 7a4ca5b4c6cbf0022494a8f350fe000abb4b3034 (diff) | |
download | perl-b6d67071cc036ae5056dfe9b570ba76942fc08f4.tar.gz |
Fix qr'\N{U+41}' on EBCDIC platforms
Prior to this commit, the regex compiler was relying on the lexer to do
the translation from Unicode to native for \N{...} constructs, where it
was simpler to do. However, when the pattern is a single-quoted string,
it is passed unchanged to the regex compiler, and did not work. Fixing
it required some refactoring, though it led to a clean API in a static
function.
This was spotted by Father Chrysostomos.
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 35 |
1 files changed, 18 insertions, 17 deletions
@@ -3276,12 +3276,7 @@ S_scan_const(pTHX_ char *start) * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...} * if a pattern; otherwise convert to utf8 * - * If the regex compiler should ever need to differentiate - * between the \N{U+...} and \N{name} forms, that could easily - * be done here by stripping any leading zeros from the - * \N{U+...} case, and adding them to the other one. */ - - /* Here, 's' points to the 'N'; the test below is guaranteed to + * Here, 's' points to the 'N'; the test below is guaranteed to * succeed if we are being called on a pattern, as we already * know from a test above that the next character is a '{'. A * non-pattern \N must mean 'named character', which requires @@ -3413,9 +3408,15 @@ S_scan_const(pTHX_ char *start) char hex_string[4]; int len = my_snprintf(hex_string, - sizeof(hex_string), - "%02X.", (U8) *str); - PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string)); + sizeof(hex_string), + "%02X.", + + /* The regex compiler is + * expecting Unicode, not + * native */ + (U8) NATIVE_TO_LATIN1(*str)); + PERL_MY_SNPRINTF_POST_GUARD(len, + sizeof(hex_string)); Copy(hex_string, d, 3, char); d += 3; str++; @@ -3439,12 +3440,12 @@ S_scan_const(pTHX_ char *start) len, &char_length, UTF8_ALLOW_ANYUV); - /* Convert first code point to hex, including - * the boiler plate before it. */ + /* Convert first code point to Unicode hex, + * including the boiler plate before it. */ output_length = my_snprintf(hex_string, sizeof(hex_string), - "\\N{U+%X", - (unsigned int) uv); + "\\N{U+%X", + (unsigned int) NATIVE_TO_UNI(uv)); /* Make sure there is enough space to hold it */ d = off + SvGROW(sv, off @@ -3456,7 +3457,7 @@ S_scan_const(pTHX_ char *start) d += output_length; /* For each subsequent character, append dot and - * its ordinal in hex */ + * its Unicode code point in hex */ while ((str += char_length) < str_end) { const STRLEN off = d - SvPVX_const(sv); U32 uv = utf8n_to_uvchr((U8 *) str, @@ -3465,9 +3466,9 @@ S_scan_const(pTHX_ char *start) UTF8_ALLOW_ANYUV); output_length = my_snprintf(hex_string, - sizeof(hex_string), - ".%X", - (unsigned int) uv); + sizeof(hex_string), + ".%X", + (unsigned int) NATIVE_TO_UNI(uv)); d = off + SvGROW(sv, off + output_length |