summary refs log tree commit diff
path: root/toke.c
diff options
context:
space:
mode:
Diffstat (limited to 'toke.c')
-rw-r--r-- toke.c 62
1 files changed, 28 insertions, 34 deletions
diff --git a/toke.c b/toke.c
index 4b2937a7ec..2118baf417 100644
--- a/toke.c
+++ b/toke.c
@@ -2777,14 +2777,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
}
}
- /* A custom translator can leave res not in UTF-8, so make sure. XXX This
- * can be revisited to not use utf8 for characters that don't need it when
- * regexes don't have to be in utf8 for Unicode semantics. If doing so,
- * remember EBCDIC */
- if (! SvUTF8(res)) {
- sv_utf8_upgrade(res);
- }
- else { /* Don't accept malformed input */
+ if (SvUTF8(res)) { /* Don't accept malformed input */
const U8* first_bad_char_loc;
STRLEN len;
const char* const str = SvPV_const(res, len);
@@ -3398,31 +3391,6 @@ S_scan_const(pTHX_ char *start)
/* Here it looks like a named character */
- if (PL_lex_inpat) {
-
- /* XXX This block is temporary code. \N{} implies that the
- * pattern is to have Unicode semantics, and therefore
- * currently has to be encoded in utf8. By putting it in
- * utf8 now, we save a whole pass in the regular expression
- * compiler. Once that code is changed so Unicode
- * semantics doesn't necessarily have to be in utf8, this
- * block should be removed. However, the code that parses
- * the output of this would have to be changed to not
- * necessarily expect utf8 */
- if (!has_utf8) {
- SvCUR_set(sv, d - SvPVX_const(sv));
- SvPOK_on(sv);
- *d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- /* 5 = '\N{' + cur char + NUL */
- (STRLEN)(send - s) + 5);
- d = SvPVX(sv) + SvCUR(sv);
- has_utf8 = TRUE;
- }
- }
-
if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX;
@@ -3504,11 +3472,36 @@ S_scan_const(pTHX_ char *start)
const char *str_end = str + len;
const STRLEN off = d - SvPVX_const(sv);
+
+ if (! SvUTF8(res)) {
+ /* For the non-UTF-8 case, we can determine the
+ * exact length needed without having to parse
+ * through the string. Each character takes up
+ * 2 hex digits plus either a trailing dot or
+ * the "}" */
+ d = off + SvGROW(sv, off
+ + 3 * len
+ + 6 /* For the "\N{U+", and
+ trailing NUL */
+ + (STRLEN)(send - e));
+ Copy("\\N{U+", d, 5, char);
+ d += 5;
+ while (str < str_end) {
+ char hex_string[4];
+ my_snprintf(hex_string, sizeof(hex_string),
+ "%02X.", (U8) *str);
+ Copy(hex_string, d, 3, char);
+ d += 3;
+ str++;
+ }
+ d--; /* We will overwrite below the final
+ dot with a right brace */
+ }
+ else {
STRLEN char_length; /* cur char's byte length */
STRLEN output_length; /* and the number of bytes
after this is translated
into hex digits */
-
/* 2 hex per byte; 2 chars for '\N'; 2 chars for
* max('U+', '.'); and 1 for NUL */
char hex_string[2 * UTF8_MAXBYTES + 5];
@@ -3556,6 +3549,7 @@ S_scan_const(pTHX_ char *start)
Copy(hex_string, d, output_length, char);
d += output_length;
}
+ }
*d++ = '}'; /* Done. Add the trailing brace */
}