PATCH: [perl #130656] tr// failure with UTF-8 across lines

This bug happened under things like tr/\x{101}-\x{200}/ \x{201}-\x{301}/ when the two parts were on separate lines; the newline in the middle was crucial. As a result, the second line got parsed already knowing that the result was UTF-8, and so the setting of a variable got skipped, because that setting happens only when we discover we need to flip into UTF-8. The solution adopted here is to set the variable under other conditions as well, which leads to it getting set multiple times. But this extra branch and setting is confined to somewhat rare circumstances, leaving the mainline code untouched.
author: Karl Williamson <khw@cpan.org> 2017-01-31 11:15:08 -0700
committer: Karl Williamson <khw@cpan.org> 2017-01-31 11:30:15 -0700
commit: fe2ba0a2de216bca4582bfb493b196d2eb4c94ae (patch)
tree: 0f1e0f16b4191f5108a8e5c921ebaae9f54ad574 /toke.c
parent: cbf40e71df30fba4761230a8b62a34d7bb247495 (diff)
download: perl-fe2ba0a2de216bca4582bfb493b196d2eb4c94ae.tar.gz
1 files changed, 16 insertions, 3 deletions
diff --git a/toke.c b/toke.c
index 7dcdd5afa1..9972b97418 100644
--- a/toke.c
+++ b/toke.c
@@ -2866,8 +2866,6 @@ S_scan_const(pTHX_ char *start)
     bool didrange = FALSE;              /* did we just finish a range? */
     bool in_charclass = FALSE;          /* within /[...]/ */
     bool has_utf8 = FALSE;              /* Output constant is UTF8 */
-    bool has_above_latin1 = FALSE;      /* does something require special
-                                           handling in tr/// ? */
     bool  this_utf8 = cBOOL(UTF);       /* Is the source string assumed to be
                                            UTF8?  But, this can show as true
                                            when the source isn't utf8, as for
@@ -2882,6 +2880,14 @@ S_scan_const(pTHX_ char *start)
     STRLEN offset_to_max;   /* The offset in the output to where the range
                                high-end character is temporarily placed */
 
+    /* Does something require special handling in tr/// ?  This avoids extra
+     * work in a less likely case.  As such, khw didn't feel it was worth
+     * adding any branches to the more mainline code to handle this, which
+     * means that this doesn't get set in some circumstances when things like
+     * \x{100} get expanded out.  As a result there needs to be extra testing
+     * done in the tr code */
+    bool has_above_latin1 = FALSE;
+
     /* Note on sizing:  The scanned constant is placed into sv, which is
      * initialized by newSV() assuming one byte of output for every byte of
      * input.  This routine expects newSV() to allocate an extra byte for a
@@ -2962,7 +2968,7 @@ S_scan_const(pTHX_ char *start)
                     /* The tests here for being above Latin1 and similar ones
                      * in the following 'else' suffice to find all such
                      * occurences in the constant, except those added by a
-                     * backslash escape sequence, like \x{100}.  And all those
+                     * backslash escape sequence, like \x{100}.  Mostly, those
                      * set 'has_above_latin1' as appropriate */
                     if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
                         has_above_latin1 = TRUE;
@@ -3026,6 +3032,13 @@ S_scan_const(pTHX_ char *start)
                     min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
                     range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
                     range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
+
+                    /* This compensates for not all code setting
+                     * 'has_above_latin1', so that we don't skip stuff that
+                     * should be executed */
+                    if (range_max > 255) {
+                        has_above_latin1 = TRUE;
+                    }
                 }
                 else {
                     min_ptr = max_ptr - 1;
author	Karl Williamson <khw@cpan.org>	2017-01-31 11:15:08 -0700
committer	Karl Williamson <khw@cpan.org>	2017-01-31 11:30:15 -0700
commit	fe2ba0a2de216bca4582bfb493b196d2eb4c94ae (patch)
tree	0f1e0f16b4191f5108a8e5c921ebaae9f54ad574 /toke.c
parent	cbf40e71df30fba4761230a8b62a34d7bb247495 (diff)
download	perl-fe2ba0a2de216bca4582bfb493b196d2eb4c94ae.tar.gz