summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2017-01-31 11:15:08 -0700
committerKarl Williamson <khw@cpan.org>2017-01-31 11:30:15 -0700
commitfe2ba0a2de216bca4582bfb493b196d2eb4c94ae (patch)
tree0f1e0f16b4191f5108a8e5c921ebaae9f54ad574 /toke.c
parentcbf40e71df30fba4761230a8b62a34d7bb247495 (diff)
downloadperl-fe2ba0a2de216bca4582bfb493b196d2eb4c94ae.tar.gz
PATCH: [perl #130656] tr// failue with UTF-8 across lines
This bug happend under things like tr/\x{101}-\x{200}/ \x{201}-\x{301}/ The newline in the middle was crucial. As a result the second line got parsed already knowing that the result was UTF-8, and as a result setting a variable got skipped which happens only when we discover we need to flip into UTF-8. The solution adopted here is to set the variable under other conditions, which leads to it getting set multiple times. But this extra branch and setting is confined to somehwat rare circumstances, leaving the mainline code untouched.
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c19
1 files changed, 16 insertions, 3 deletions
diff --git a/toke.c b/toke.c
index 7dcdd5afa1..9972b97418 100644
--- a/toke.c
+++ b/toke.c
@@ -2866,8 +2866,6 @@ S_scan_const(pTHX_ char *start)
bool didrange = FALSE; /* did we just finish a range? */
bool in_charclass = FALSE; /* within /[...]/ */
bool has_utf8 = FALSE; /* Output constant is UTF8 */
- bool has_above_latin1 = FALSE; /* does something require special
- handling in tr/// ? */
bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be
UTF8? But, this can show as true
when the source isn't utf8, as for
@@ -2882,6 +2880,14 @@ S_scan_const(pTHX_ char *start)
STRLEN offset_to_max; /* The offset in the output to where the range
high-end character is temporarily placed */
+ /* Does something require special handling in tr/// ? This avoids extra
+ * work in a less likely case. As such, khw didn't feel it was worth
+ * adding any branches to the more mainline code to handle this, which
+ * means that this doesn't get set in some circumstances when things like
+ * \x{100} get expanded out. As a result there needs to be extra testing
+ * done in the tr code */
+ bool has_above_latin1 = FALSE;
+
/* Note on sizing: The scanned constant is placed into sv, which is
* initialized by newSV() assuming one byte of output for every byte of
* input. This routine expects newSV() to allocate an extra byte for a
@@ -2962,7 +2968,7 @@ S_scan_const(pTHX_ char *start)
/* The tests here for being above Latin1 and similar ones
* in the following 'else' suffice to find all such
* occurences in the constant, except those added by a
- * backslash escape sequence, like \x{100}. And all those
+ * backslash escape sequence, like \x{100}. Mostly, those
* set 'has_above_latin1' as appropriate */
if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
has_above_latin1 = TRUE;
@@ -3026,6 +3032,13 @@ S_scan_const(pTHX_ char *start)
min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
+
+ /* This compensates for not all code setting
+ * 'has_above_latin1', so that we don't skip stuff that
+ * should be executed */
+ if (range_max > 255) {
+ has_above_latin1 = TRUE;
+ }
}
else {
min_ptr = max_ptr - 1;