toke.c: Don't force \N{} into utf8 unnecessarily

regcomp.c no longer requires everything containing \N{} to be in UTF-8. I'm not sure of the exact commit that changed this; it might even have been in 5.16, but it had been done by the time 86e88272fdabc40e3b168a3cc43af5e86284d01b was committed. Therefore we can remove the temporary code that forced utf8, and replace it with code that handles the non-utf8 case. Note that outside patterns, \N{} still forces utf8. This is so that Unicode semantics will be imposed on the string it resides in, no matter how it is used. Patterns have a flag that indicates Unicode semantics, so they don't need to be in utf8.
author: Karl Williamson <public@khwilliamson.com> 2012-11-11 14:47:53 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-11-11 15:34:03 -0700
commit: 94ca1619c5a35b493b5e6374422aac5b5fce4678 (patch)
tree: 30876fd538e85f9c816f78dc9bbdae7124692de3 /toke.c
parent: 55025e0373118aae3b91192b1eb95b1aad0d6c9d (diff)
download: perl-94ca1619c5a35b493b5e6374422aac5b5fce4678.tar.gz
1 files changed, 28 insertions, 34 deletions
diff --git a/toke.c b/toke.c
index 4b2937a7ec..2118baf417 100644
--- a/toke.c
+++ b/toke.c
@@ -2777,14 +2777,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e)
         }
     }
 
-    /* A custom translator can leave res not in UTF-8, so make sure.  XXX This
-     * can be revisited to not use utf8 for characters that don't need it when
-     * regexes don't have to be in utf8 for Unicode semantics.  If doing so,
-     * remember EBCDIC */
-    if (! SvUTF8(res)) {
-        sv_utf8_upgrade(res);
-    }
-    else { /* Don't accept malformed input */
+    if (SvUTF8(res)) { /* Don't accept malformed input */
         const U8* first_bad_char_loc;
         STRLEN len;
         const char* const str = SvPV_const(res, len);
@@ -3398,31 +3391,6 @@ S_scan_const(pTHX_ char *start)
 
 		/* Here it looks like a named character */
 
-		if (PL_lex_inpat) {
-
-		    /* XXX This block is temporary code.  \N{} implies that the
-		     * pattern is to have Unicode semantics, and therefore
-		     * currently has to be encoded in utf8.  By putting it in
-		     * utf8 now, we save a whole pass in the regular expression
-		     * compiler.  Once that code is changed so Unicode
-		     * semantics doesn't necessarily have to be in utf8, this
-		     * block should be removed.  However, the code that parses
-		     * the output of this would have to be changed to not
-		     * necessarily expect utf8 */
-		    if (!has_utf8) {
-			SvCUR_set(sv, d - SvPVX_const(sv));
-			SvPOK_on(sv);
-			*d = '\0';
-			/* See Note on sizing above.  */
-			sv_utf8_upgrade_flags_grow(sv,
-					SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-					/* 5 = '\N{' + cur char + NUL */
-					(STRLEN)(send - s) + 5);
-			d = SvPVX(sv) + SvCUR(sv);
-			has_utf8 = TRUE;
-		    }
-		}
-
 		if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
 		    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
 				| PERL_SCAN_DISALLOW_PREFIX;
@@ -3504,11 +3472,36 @@ S_scan_const(pTHX_ char *start)
 
 			    const char *str_end = str + len;
 			    const STRLEN off = d - SvPVX_const(sv);
+
+                            if (! SvUTF8(res)) {
+                                /* For the non-UTF-8 case, we can determine the
+                                 * exact length needed without having to parse
+                                 * through the string.  Each character takes up
+                                 * 2 hex digits plus either a trailing dot or
+                                 * the "}" */
+                                d = off + SvGROW(sv, off
+                                                    + 3 * len
+                                                    + 6 /* For the "\N{U+", and
+                                                           trailing NUL */
+                                                    + (STRLEN)(send - e));
+                                Copy("\\N{U+", d, 5, char);
+                                d += 5;
+                                while (str < str_end) {
+                                    char hex_string[4];
+                                    my_snprintf(hex_string, sizeof(hex_string),
+                                                "%02X.", (U8) *str);
+                                    Copy(hex_string, d, 3, char);
+                                    d += 3;
+                                    str++;
+                                }
+                                d--;    /* We will overwrite below the final
+                                           dot with a right brace */
+                            }
+                            else {
 			    STRLEN char_length;	    /* cur char's byte length */
 			    STRLEN output_length;   /* and the number of bytes
 						       after this is translated
 						       into hex digits */
-
 			    /* 2 hex per byte; 2 chars for '\N'; 2 chars for
 			     * max('U+', '.'); and 1 for NUL */
 			    char hex_string[2 * UTF8_MAXBYTES + 5];
@@ -3556,6 +3549,7 @@ S_scan_const(pTHX_ char *start)
 				Copy(hex_string, d, output_length, char);
 				d += output_length;
 			    }
+			    }
 
 			    *d++ = '}';	/* Done.  Add the trailing brace */
 			}
author	Karl Williamson <public@khwilliamson.com>	2012-11-11 14:47:53 -0700
committer	Karl Williamson <public@khwilliamson.com>	2012-11-11 15:34:03 -0700
commit	94ca1619c5a35b493b5e6374422aac5b5fce4678 (patch)
tree	30876fd538e85f9c816f78dc9bbdae7124692de3 /toke.c
parent	55025e0373118aae3b91192b1eb95b1aad0d6c9d (diff)
download	perl-94ca1619c5a35b493b5e6374422aac5b5fce4678.tar.gz