diff options
author | Karl Williamson <khw@cpan.org> | 2017-01-31 11:15:08 -0700 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2017-01-31 11:30:15 -0700 |
commit | fe2ba0a2de216bca4582bfb493b196d2eb4c94ae (patch) | |
tree | 0f1e0f16b4191f5108a8e5c921ebaae9f54ad574 /toke.c | |
parent | cbf40e71df30fba4761230a8b62a34d7bb247495 (diff) | |
download | perl-fe2ba0a2de216bca4582bfb493b196d2eb4c94ae.tar.gz |
PATCH: [perl #130656] tr// failue with UTF-8 across lines
This bug happend under things like
tr/\x{101}-\x{200}/
\x{201}-\x{301}/
The newline in the middle was crucial. As a result the second line got
parsed already knowing that the result was UTF-8, and as a result
setting a variable got skipped which happens only when we discover we
need to flip into UTF-8.
The solution adopted here is to set the variable under other conditions,
which leads to it getting set multiple times. But this extra branch and
setting is confined to somehwat rare circumstances, leaving the mainline
code untouched.
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 19 |
1 files changed, 16 insertions, 3 deletions
@@ -2866,8 +2866,6 @@ S_scan_const(pTHX_ char *start) bool didrange = FALSE; /* did we just finish a range? */ bool in_charclass = FALSE; /* within /[...]/ */ bool has_utf8 = FALSE; /* Output constant is UTF8 */ - bool has_above_latin1 = FALSE; /* does something require special - handling in tr/// ? */ bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be UTF8? But, this can show as true when the source isn't utf8, as for @@ -2882,6 +2880,14 @@ S_scan_const(pTHX_ char *start) STRLEN offset_to_max; /* The offset in the output to where the range high-end character is temporarily placed */ + /* Does something require special handling in tr/// ? This avoids extra + * work in a less likely case. As such, khw didn't feel it was worth + * adding any branches to the more mainline code to handle this, which + * means that this doesn't get set in some circumstances when things like + * \x{100} get expanded out. As a result there needs to be extra testing + * done in the tr code */ + bool has_above_latin1 = FALSE; + /* Note on sizing: The scanned constant is placed into sv, which is * initialized by newSV() assuming one byte of output for every byte of * input. This routine expects newSV() to allocate an extra byte for a @@ -2962,7 +2968,7 @@ S_scan_const(pTHX_ char *start) /* The tests here for being above Latin1 and similar ones * in the following 'else' suffice to find all such * occurences in the constant, except those added by a - * backslash escape sequence, like \x{100}. And all those + * backslash escape sequence, like \x{100}. Mostly, those * set 'has_above_latin1' as appropriate */ if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { has_above_latin1 = TRUE; @@ -3026,6 +3032,13 @@ S_scan_const(pTHX_ char *start) min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1); range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL); range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL); + + /* This compensates for not all code setting + * 'has_above_latin1', so that we don't skip stuff that + * should be executed */ + if (range_max > 255) { + has_above_latin1 = TRUE; + } } else { min_ptr = max_ptr - 1; |