diff options
author | Karl Williamson <public@khwilliamson.com> | 2012-02-28 10:11:35 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2012-02-28 10:24:52 -0700 |
commit | b4069bca6054692e4fffa8e9e04572511e910fbd (patch) | |
tree | 1f840d06fa883007d35290ad1f411960c583b596 | |
parent | 5ef3c22d461004ed95fd0cee11e2926f8b87bc7c (diff) | |
download | perl-b4069bca6054692e4fffa8e9e04572511e910fbd.tar.gz |
Patch [perl #111400] [:upper:] broken for above Latin1
This was an off-by-one error caused by my failing to realize that things
had to be done differently at the 255/256 boundary depending on whether
U+00FF matched or did not match the property.
Two properties were affected, [:upper:] and [:punct:]. The bug was that
every code point above the first one > 255 that legitimately matches the
property also matched, whether or not it should have. In the case of
[:upper:], this meant that effectively anything from 256..infinity
matched. For [:punct:], it was anything above U+037D.
-rw-r--r-- | charclass_invlists.h | 10 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 30 | ||||
-rw-r--r-- | t/re/re_tests | 22 |
3 files changed, 51 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 152793a2e6..368410fd37 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -359,7 +359,7 @@ UV PosixPunct_invlist[] = { }; UV L1PosixPunct_invlist[] = { - 21, /* Number of elements */ + 20, /* Number of elements */ 0, /* Current iteration position */ 1064334010, /* Version and data structure type */ 1, /* 0 if this is the first element of the list proper; @@ -383,8 +383,7 @@ UV L1PosixPunct_invlist[] = { 187, 188, 191, - 192, - 894 + 192 }; UV PosixSpace_invlist[] = { @@ -440,7 +439,7 @@ UV PosixUpper_invlist[] = { }; UV L1PosixUpper_invlist[] = { - 7, /* Number of elements */ + 6, /* Number of elements */ 0, /* Current iteration position */ 1064334010, /* Version and data structure type */ 1, /* 0 if this is the first element of the list proper; @@ -450,8 +449,7 @@ UV L1PosixUpper_invlist[] = { 192, 215, 216, - 223, - 256 + 223 }; UV PosixWord_invlist[] = { diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 8102c29cd3..64e3d435c1 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -127,12 +127,17 @@ for my $prop (qw( # For the Latin1 properties, we change to use the eXtended version of the # base property, then go through the result and get rid of everything not - # in Latin1 (above 255). Actually, we retain the element that crosses the - # 255/256 boundary. For example, in the Word property, there is a range - # of code points that start at U+00F8 and goes through U+02C1. Instead of + # in Latin1 (above 255). Actually, we retain the element for the range + # that crosses the 255/256 boundary if it is one that matches the + # property. For example, in the Word property, there is a range of code + # points that start at U+00F8 and goes through U+02C1. Instead of # artifically cutting that off at 256 because 256 is the first code point # above Latin1, we let the range go to its natural ending. That gives us - # extra information with no added space taken. 
+ # extra information with no added space taken. But if the range that + # crosses the boundary is one that doesn't match the property, we don't + # start a new range above 255, as that could be construed as going to + # infinity. For example, the Upper property doesn't include the character + # at 255, but does include the one at 256. We don't include the 256 one. my $lookup_prop = $prop; $lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//; my @invlist = prop_invlist($lookup_prop); @@ -140,7 +145,22 @@ for my $prop (qw( if ($lookup_prop ne $prop) { for my $i (0 .. @invlist - 1 - 1) { if ($invlist[$i] > 255) { - splice @invlist, $i+1; + + # In an inversion list, even-numbered elements give the code + # points that begin ranges that match the property; + # odd-numbered give ones that begin ranges that don't match. + # If $i is odd, we are at the first code point above 255 that + # doesn't match, which means the range it is ending does + # match, and crosses the 255/256 boundary. We want to include + # this ending point, so increment $i, so the splice below + # includes it. Conversely, if $i is even, it is the first + # code point above 255 that matches, which means there was no + # matching range that crossed the boundary, and we don't want + # to include this code point, so splice before it. + $i++ if $i % 2 != 0; + + # Remove everything past this. + splice @invlist, $i; last; } } diff --git a/t/re/re_tests b/t/re/re_tests index e7680c9c23..4d78a6aed8 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -1575,4 +1575,26 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer # [perl #110648] [^\p{Alphabetic}] \x{100} n - - +# [perl #111400]. Tests the first Y/N boundary above 255 for each of these. 
+/[[:alnum:]]/ \x{2c1} y - - +/[[:alnum:]]/ \x{2c2} n - - +/[[:alpha:]]/ \x{2c1} y - - +/[[:alpha:]]/ \x{2c2} n - - +/[[:graph:]]/ \x{377} y - - +/[[:graph:]]/ \x{378} n - - +/[[:lower:]]/ \x{100} n - - +/[[:lower:]]/ \x{101} y - - +/[[:lower:]]/ \x{102} n - - +/[[:print:]]/ \x{377} y - - +/[[:print:]]/ \x{378} n - - +/[[:punct:]]/ \x{37D} n - - +/[[:punct:]]/ \x{37E} y - - +/[[:punct:]]/ \x{388} n - - +/[[:upper:]]/ \x{100} y - - +/[[:upper:]]/ \x{101} n - - +/[[:word:]]/ \x{2c1} y - - +/[[:word:]]/ \x{2c2} n - - + + + # vim: softtabstop=0 noexpandtab |