regen/regcharclass.pl: Add optimization

On UTF-8 input known to be valid, continuation bytes (on ASCII platforms) must be in the range 0x80 .. 0xBF. Therefore, any test that a continuation byte is within either of those bounds will always be true, and may be omitted.
author: Karl Williamson <public@khwilliamson.com> 2012-09-05 15:18:09 -0600
committer: Karl Williamson <public@khwilliamson.com> 2012-09-13 21:14:04 -0600
commit: 6e130234c25b195bf5141bd859d947ec051416ec (patch)
tree: fff306f4fa3d85a1ba186659a3613d6f685b63be /regen
parent: 1f063c5738967dbc1ef4271ba8b58fbed5ac5a8c (diff)
download: perl-6e130234c25b195bf5141bd859d947ec051416ec.tar.gz
1 files changed, 42 insertions, 5 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index e4133fd7b4..70f46b03b4 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -710,12 +710,16 @@ sub _cond_as_str {
 
         return 1 if @$cond == 256;  # If all bytes match, is trivially true
 
+        if (@ranges > 1) {
             # See if the entire set shares optimizable characterstics, and if
-            # so, return the optimization.
+            # so, return the optimization.  We delay checking for this on sets
+            # with just a single range, as there may be better optimizations
+            # available in that case.
             my ($mask, $base) = calculate_mask(@$cond);
             if (defined $mask && defined $base) {
                 return sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask, $base;
             }
+        }
 
         # Here, there was no entire-class optimization.  Look at each range.
         for (my $i = 0; $i < @ranges; $i++) {
@@ -729,10 +733,43 @@ sub _cond_as_str {
             else {
                 my $output = "";
 
-                # See if the number of elements is a power of 2 (only a single
-                # bit in the representation of its count will be set) and if
-                # so, it may be that a mask/compare optimization is possible.
-                if (pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1) {
+                # Well-formed UTF-8 continuation bytes on ascii platforms must
+                # be in the range 0x80 .. 0xBF.  If we know that the input is
+                # well-formed (indicated by not trying to be 'safe'), we can
+                # omit tests that verify that the input is within either of
+                # these bounds.  (No legal UTF-8 character can begin with
+                # anything in this range, so we don't have to worry about this
+                # being a continuation byte or not.)
+                if (ASCII_PLATFORM
+                    && ! $opts_ref->{safe}
+                    && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
+                {
+                    my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
+                    my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
+
+                    # If the range is the entire legal range, it matches any
+                    # legal byte, so we can omit both tests.  (This should
+                    # happen only if the number of ranges is 1.)
+                    if ($lower_limit_is_80 && $upper_limit_is_BF) {
+                        return 1;
+                    }
+                    elsif ($lower_limit_is_80) { # Just use the upper limit test
+                        $output = sprintf("( $test <= $self->{val_fmt} )",
+                                            $ranges[$i]->[1]);
+                    }
+                    elsif ($upper_limit_is_BF) { # Just use the lower limit test
+                        $output = sprintf("( $test >= $self->{val_fmt} )",
+                                        $ranges[$i]->[0]);
+                    }
+                }
+
+                # If we didn't change to omit a test above, see if the number
+                # of elements is a power of 2 (only a single bit in the
+                # representation of its count will be set) and if so, it may
+                # be that a mask/compare optimization is possible.
+                if ($output eq ""
+                    && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
+                {
                     my @list;
                     push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
                     my ($mask, $base) = calculate_mask(@list);
author	Karl Williamson <public@khwilliamson.com>	2012-09-05 15:18:09 -0600
committer	Karl Williamson <public@khwilliamson.com>	2012-09-13 21:14:04 -0600
commit	6e130234c25b195bf5141bd859d947ec051416ec (patch)
tree	fff306f4fa3d85a1ba186659a3613d6f685b63be /regen
parent	1f063c5738967dbc1ef4271ba8b58fbed5ac5a8c (diff)
download	perl-6e130234c25b195bf5141bd859d947ec051416ec.tar.gz