diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-11-11 14:47:53 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-11-11 15:34:03 -0700 |
commit | 94ca1619c5a35b493b5e6374422aac5b5fce4678 (patch) | |
tree | 30876fd538e85f9c816f78dc9bbdae7124692de3 /toke.c | |
parent | 55025e0373118aae3b91192b1eb95b1aad0d6c9d (diff) | |
download | perl-94ca1619c5a35b493b5e6374422aac5b5fce4678.tar.gz |
toke.c: Don't force \N{} into utf8 unnecessarily
regcomp.c no longer requires everything containing \N{} to be in UTF-8.
I'm not sure of the exact commit that lifted this requirement; it might
even have been in 5.16. But it had been lifted by the time commit
86e88272fdabc40e3b168a3cc43af5e86284d01b was made.
Therefore we can remove the temporary code that forced utf8, and replace
it with code that handles the non-utf8 case.
Note that outside patterns, \N{} still forces utf8. This is so that
Unicode semantics will be imposed on the string it resides in, no matter
how it is used. Patterns have a flag that indicates Unicode semantics,
so they don't need to be in utf8.
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 62 |
1 files changed, 28 insertions, 34 deletions
@@ -2777,14 +2777,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } } - /* A custom translator can leave res not in UTF-8, so make sure. XXX This - * can be revisited to not use utf8 for characters that don't need it when - * regexes don't have to be in utf8 for Unicode semantics. If doing so, - * remember EBCDIC */ - if (! SvUTF8(res)) { - sv_utf8_upgrade(res); - } - else { /* Don't accept malformed input */ + if (SvUTF8(res)) { /* Don't accept malformed input */ const U8* first_bad_char_loc; STRLEN len; const char* const str = SvPV_const(res, len); @@ -3398,31 +3391,6 @@ S_scan_const(pTHX_ char *start) /* Here it looks like a named character */ - if (PL_lex_inpat) { - - /* XXX This block is temporary code. \N{} implies that the - * pattern is to have Unicode semantics, and therefore - * currently has to be encoded in utf8. By putting it in - * utf8 now, we save a whole pass in the regular expression - * compiler. Once that code is changed so Unicode - * semantics doesn't necessarily have to be in utf8, this - * block should be removed. However, the code that parses - * the output of this would have to be changed to not - * necessarily expect utf8 */ - if (!has_utf8) { - SvCUR_set(sv, d - SvPVX_const(sv)); - SvPOK_on(sv); - *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - /* 5 = '\N{' + cur char + NUL */ - (STRLEN)(send - s) + 5); - d = SvPVX(sv) + SvCUR(sv); - has_utf8 = TRUE; - } - } - if (*s == 'U' && s[1] == '+') { /* \N{U+...} */ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX; @@ -3504,11 +3472,36 @@ S_scan_const(pTHX_ char *start) const char *str_end = str + len; const STRLEN off = d - SvPVX_const(sv); + + if (! SvUTF8(res)) { + /* For the non-UTF-8 case, we can determine the + * exact length needed without having to parse + * through the string. 
Each character takes up + * 2 hex digits plus either a trailing dot or + * the "}" */ + d = off + SvGROW(sv, off + + 3 * len + + 6 /* For the "\N{U+", and + trailing NUL */ + + (STRLEN)(send - e)); + Copy("\\N{U+", d, 5, char); + d += 5; + while (str < str_end) { + char hex_string[4]; + my_snprintf(hex_string, sizeof(hex_string), + "%02X.", (U8) *str); + Copy(hex_string, d, 3, char); + d += 3; + str++; + } + d--; /* We will overwrite below the final + dot with a right brace */ + } + else { STRLEN char_length; /* cur char's byte length */ STRLEN output_length; /* and the number of bytes after this is translated into hex digits */ - /* 2 hex per byte; 2 chars for '\N'; 2 chars for * max('U+', '.'); and 1 for NUL */ char hex_string[2 * UTF8_MAXBYTES + 5]; @@ -3556,6 +3549,7 @@ S_scan_const(pTHX_ char *start) Copy(hex_string, d, output_length, char); d += output_length; } + } *d++ = '}'; /* Done. Add the trailing brace */ } |