summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-02-28 10:11:35 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-02-28 10:24:52 -0700
commit: b4069bca6054692e4fffa8e9e04572511e910fbd (patch)
tree: 1f840d06fa883007d35290ad1f411960c583b596
parent: 5ef3c22d461004ed95fd0cee11e2926f8b87bc7c (diff)
download: perl-b4069bca6054692e4fffa8e9e04572511e910fbd.tar.gz
Patch [perl #111400] [:upper:] broken for above Latin1
This was an off-by-one error caused by my failing to realize that things had to be done differently at the 255/256 boundary depending on whether U+00FF matched or did not match the property. Two properties were affected, [:upper:] and [:punct:]. The bug was that every code point above the first one greater than 255 that legitimately matches the property would match, whether or not it should. In the case of [:upper:], this meant that effectively anything from 256..infinity matched. For [:punct:], it was anything above U+037D.
-rw-r--r-- charclass_invlists.h 10
-rw-r--r-- regen/mk_invlists.pl 30
-rw-r--r-- t/re/re_tests 22
3 files changed, 51 insertions, 11 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 152793a2e6..368410fd37 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -359,7 +359,7 @@ UV PosixPunct_invlist[] = {
};
UV L1PosixPunct_invlist[] = {
- 21, /* Number of elements */
+ 20, /* Number of elements */
0, /* Current iteration position */
1064334010, /* Version and data structure type */
1, /* 0 if this is the first element of the list proper;
@@ -383,8 +383,7 @@ UV L1PosixPunct_invlist[] = {
187,
188,
191,
- 192,
- 894
+ 192
};
UV PosixSpace_invlist[] = {
@@ -440,7 +439,7 @@ UV PosixUpper_invlist[] = {
};
UV L1PosixUpper_invlist[] = {
- 7, /* Number of elements */
+ 6, /* Number of elements */
0, /* Current iteration position */
1064334010, /* Version and data structure type */
1, /* 0 if this is the first element of the list proper;
@@ -450,8 +449,7 @@ UV L1PosixUpper_invlist[] = {
192,
215,
216,
- 223,
- 256
+ 223
};
UV PosixWord_invlist[] = {
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index 8102c29cd3..64e3d435c1 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -127,12 +127,17 @@ for my $prop (qw(
# For the Latin1 properties, we change to use the eXtended version of the
# base property, then go through the result and get rid of everything not
- # in Latin1 (above 255). Actually, we retain the element that crosses the
- # 255/256 boundary. For example, in the Word property, there is a range
- # of code points that start at U+00F8 and goes through U+02C1. Instead of
+ # in Latin1 (above 255). Actually, we retain the element for the range
+ # that crosses the 255/256 boundary if it is one that matches the
+ # property. For example, in the Word property, there is a range of code
+ # points that start at U+00F8 and goes through U+02C1. Instead of
# artifically cutting that off at 256 because 256 is the first code point
# above Latin1, we let the range go to its natural ending. That gives us
- # extra information with no added space taken.
+ # extra information with no added space taken. But if the range that
+ # crosses the boundary is one that doesn't match the property, we don't
+ # start a new range above 255, as that could be construed as going to
+ # infinity. For example, the Upper property doesn't include the character
+ # at 255, but does include the one at 256. We don't include the 256 one.
my $lookup_prop = $prop;
$lookup_prop =~ s/^L1Posix/XPosix/ or $lookup_prop =~ s/^L1//;
my @invlist = prop_invlist($lookup_prop);
@@ -140,7 +145,22 @@ for my $prop (qw(
if ($lookup_prop ne $prop) {
for my $i (0 .. @invlist - 1 - 1) {
if ($invlist[$i] > 255) {
- splice @invlist, $i+1;
+
+ # In an inversion list, even-numbered elements give the code
+ # points that begin ranges that match the property;
+ # odd-numbered give ones that begin ranges that don't match.
+ # If $i is odd, we are at the first code point above 255 that
+ # doesn't match, which means the range it is ending does
+ # match, and crosses the 255/256 boundary. We want to include
+ # this ending point, so increment $i, so the splice below
+ # includes it. Conversely, if $i is even, it is the first
+ # code point above 255 that matches, which means there was no
+ # matching range that crossed the boundary, and we don't want
+ # to include this code point, so splice before it.
+ $i++ if $i % 2 != 0;
+
+ # Remove everything past this.
+ splice @invlist, $i;
last;
}
}
diff --git a/t/re/re_tests b/t/re/re_tests
index e7680c9c23..4d78a6aed8 100644
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1575,4 +1575,26 @@ abc\N{def - c - \\N{NAME} must be resolved by the lexer
# [perl #110648]
[^\p{Alphabetic}] \x{100} n - -
+# [perl #111400]. Tests the first Y/N boundary above 255 for each of these.
+/[[:alnum:]]/ \x{2c1} y - -
+/[[:alnum:]]/ \x{2c2} n - -
+/[[:alpha:]]/ \x{2c1} y - -
+/[[:alpha:]]/ \x{2c2} n - -
+/[[:graph:]]/ \x{377} y - -
+/[[:graph:]]/ \x{378} n - -
+/[[:lower:]]/ \x{100} n - -
+/[[:lower:]]/ \x{101} y - -
+/[[:lower:]]/ \x{102} n - -
+/[[:print:]]/ \x{377} y - -
+/[[:print:]]/ \x{378} n - -
+/[[:punct:]]/ \x{37D} n - -
+/[[:punct:]]/ \x{37E} y - -
+/[[:punct:]]/ \x{388} n - -
+/[[:upper:]]/ \x{100} y - -
+/[[:upper:]]/ \x{101} n - -
+/[[:word:]]/ \x{2c1} y - -
+/[[:word:]]/ \x{2c2} n - -
+
+
+
# vim: softtabstop=0 noexpandtab