summaryrefslogtreecommitdiff
path: root/regen
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-09-05 15:18:09 -0600
committerKarl Williamson <public@khwilliamson.com>2012-09-13 21:14:04 -0600
commit6e130234c25b195bf5141bd859d947ec051416ec (patch)
treefff306f4fa3d85a1ba186659a3613d6f685b63be /regen
parent1f063c5738967dbc1ef4271ba8b58fbed5ac5a8c (diff)
downloadperl-6e130234c25b195bf5141bd859d947ec051416ec.tar.gz
regen/regcharclass.pl: Add optimization
On UTF-8 input known to be valid, continuation bytes must be in the range 0x80 .. 0xBF. Therefore, any tests for being within those bounds will always be true, and may be omitted.
Diffstat (limited to 'regen')
-rwxr-xr-xregen/regcharclass.pl47
1 files changed, 42 insertions, 5 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index e4133fd7b4..70f46b03b4 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -710,12 +710,16 @@ sub _cond_as_str {
return 1 if @$cond == 256; # If all bytes match, is trivially true
+ if (@ranges > 1) {
# See if the entire set shares optimizable characterstics, and if
- # so, return the optimization.
+ # so, return the optimization. We delay checking for this on sets
+ # with just a single range, as there may be better optimizations
+ # available in that case.
my ($mask, $base) = calculate_mask(@$cond);
if (defined $mask && defined $base) {
return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
}
+ }
# Here, there was no entire-class optimization. Look at each range.
for (my $i = 0; $i < @ranges; $i++) {
@@ -729,10 +733,43 @@ sub _cond_as_str {
else {
my $output = "";
- # See if the number of elements is a power of 2 (only a single
- # bit in the representation of its count will be set) and if
- # so, it may be that a mask/compare optimization is possible.
- if (pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1) {
+ # Well-formed UTF-8 continuation bytes on ascii platforms must
+ # be in the range 0x80 .. 0xBF. If we know that the input is
+ # well-formed (indicated by not trying to be 'safe'), we can
+ # omit tests that verify that the input is within either of
+ # these bounds. (No legal UTF-8 character can begin with
+ # anything in this range, so we don't have to worry about this
+ # being a continuation byte or not.)
+ if (ASCII_PLATFORM
+ && ! $opts_ref->{safe}
+ && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
+ {
+ my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
+ my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
+
+ # If the range is the entire legal range, it matches any
+ # legal byte, so we can omit both tests. (This should
+ # happen only if the number of ranges is 1.)
+ if ($lower_limit_is_80 && $upper_limit_is_BF) {
+ return 1;
+ }
+ elsif ($lower_limit_is_80) { # Just use the upper limit test
+ $output = sprintf("( $test <= $self->{val_fmt} )",
+ $ranges[$i]->[1]);
+ }
+ elsif ($upper_limit_is_BF) { # Just use the lower limit test
+ $output = sprintf("( $test >= $self->{val_fmt} )",
+ $ranges[$i]->[0]);
+ }
+ }
+
+ # If we didn't change to omit a test above, see if the number
+ # of elements is a power of 2 (only a single bit in the
+ # representation of its count will be set) and if so, it may
+ # be that a mask/compare optimization is possible.
+ if ($output eq ""
+ && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
+ {
my @list;
push @list, $_ for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
my ($mask, $base) = calculate_mask(@list);