regcharclass.pl: Improve generated code for EBCDIC

UTF-8 has some desirable characteristics not shared by UTF-EBCDIC. One example is all the continuation bytes are in a single range. By transforming a UTF-EBCDIC byte into I8 (similar to UTF-8), we gain those characteristics, and may be able to save a conditional or three. This commit creates a 2nd pass over the bytes that are to be matched, transforming them into I8. If that pass results in fewer conditionals than the traditional, native, generated code, use the fewer result. This saves quite a bit in some of the generated code, enabling the quotemeta macro to be represented in a single part; previously it had to be split to avoid compiler macro size limits.
author: Karl Williamson <khw@cpan.org> 2021-06-30 12:54:19 -0600
committer: Karl Williamson <khw@cpan.org> 2021-08-07 04:46:45 -0600
commit: 769444cd0f1c5de5c1dcb9dc4b2b38dd682c85ae (patch)
tree: 8c52dcc91bacc24d748076fa78057c2abd620017 /regen
parent: 8787aefa956b0b3132aee309414bee4cc28895be (diff)
download: perl-769444cd0f1c5de5c1dcb9dc4b2b38dd682c85ae.tar.gz
1 files changed, 101 insertions, 32 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 63423e18d3..1fec6f92c6 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1072,14 +1072,17 @@ sub _cond_as_str {
     @cond = $op->{vals}->@* if defined $op->{vals};
     my $test= $op->{test};
     my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
+    my $charset = $opts_ref->{charset};
     return "( $test )" unless @cond;
 
+    my (@ranges, @native_ranges);
+    my @native_conds;
+
     # rangify the list.  As we encounter a new value, it is placed in a new
     # subarray by itself.  If the next value is adjacent to it, the end point
     # of the subarray is merely incremented; and so on.  When the next value
     # that isn't adjacent to the previous one is encountered, Update() is
     # called to hoist any single-element subarray to be a scalar.
-    my @ranges;
     my $Update= sub {
         # We skip this if there are optimizations that
         # we can apply (below) to the individual ranges
@@ -1088,6 +1091,51 @@ sub _cond_as_str {
         }
     };
 
+    # Parse things twice, using different approaches for representing things,
+    # afterwards choosing the alternative with the fewest branches
+    for my $i (0, 1) {
+
+        # Should we avoid using mnemonics for code points?
+        my $always_hex = 0;
+
+        if ($i) {   # 2nd pass
+            # The second pass is only for non-ascii character sets, to see if
+            # a transform to Unicode/ASCII saves anything.
+            last if $charset =~ /ascii/i;
+
+            # If the first pass came up with a single range, we won't be able
+            # to do better than that, so don't try.
+            last if @ranges == 1;
+
+            # We calculated the native values the first iteration
+            @native_ranges = @ranges;
+            @native_conds = @cond;
+
+            # Start fresh
+            undef @ranges;
+            undef @cond;
+
+            # Determine the translation function, to/from UTF-8 or Latin1, and
+            # the corresponding transform of the condition to match
+            my $lookup;
+            if ($opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) {
+                $lookup = $utf_2_I8{$charset};
+                $test = "NATIVE_UTF8_TO_I8($test)";
+            }
+            else {
+                $lookup = $n2a{$charset};
+                $test = "NATIVE_TO_LATIN1($test)";
+            }
+
+            # Translate the native conditions (bytes) into the Unicode ones
+            for my $condition (@native_conds) {
+                push @cond, $lookup->[$condition];
+            }
+
+            # 'f' won't be the expected 'f' on this box
+            $always_hex = 1;
+        }
+
         # Go through the code points (@cond) and collapse them as much as
         # possible into ranges
         for my $condition ( @cond ) {
@@ -1102,24 +1150,31 @@ sub _cond_as_str {
         }
         $Update->();
 
-        # _combine is used for cp type matching.
+        # _combine is used for cp type matching.  By having it here return, no
+        # second pass is done.  It could conceivably be restructured to have a
+        # second pass, but no current uses of script would actually gain any
+        # advantage by doing so, so the work hasn't been further considered.
         return $self->_combine( $test, @ranges ) if $combine;
 
         # If the input set has certain characteristics, we can optimize tests
         # for it.
 
-        # Return if all bytes match, hence is trivially true
+        # If all bytes match, is trivially true; we don't need a 2nd pass
         return 1 if @cond == 256;
 
         # If this is a single UTF-8 range which includes all possible
         # continuation bytes, and we aren't checking for well-formedness, this
         # is trivially true.
+        #
+        # (In EBCDIC, this won't happen until the 2nd pass transforms the
+        # disjoint continuation byte ranges into a single I8 one.)
         if (     @ranges == 1
             && ! $opts_ref->{safe}
             && ! $opts_ref->{no_length_checks}
             &&   $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi
             &&   $ranges[0]->[1] == 0xBF
-            &&   $ranges[0]->[0] == 0x80)
+            &&   $ranges[0]->[0] == (($charset =~ /ascii/i)
+                                        ? 0x80 : 0xA0))
         {
             return 1;
         }
@@ -1134,11 +1189,11 @@ sub _cond_as_str {
                                                                 if @ranges > 1;
                 return 1;
             }
-                                                    # this case
+
             # Here, the first range starts at 0, but doesn't match everything.
             # But the condition doesn't have to worry about being < 0
             $ranges[0] = "( $test <= "
-                        . $self->val_fmt($ranges[0]->[1]) . " )";
+                        . $self->val_fmt($ranges[0]->[1], $always_hex) . " )";
             $loop_start++;
         }
 
@@ -1151,34 +1206,41 @@ sub _cond_as_str {
             # If the final range consists of more than one byte ending with
             # the highest possible one, the condition doesn't have to worry
             # about being > FF
-            $ranges[-1] = "( $test >= " . $self->val_fmt($ranges[-1]->[0]) . " )";
+            $ranges[-1] = "( $test >= "
+                        . $self->val_fmt($ranges[-1]->[0], $always_hex) . " )";
             $loop_end--;
         }
 
-        # Look at each range to see if there any optimizations.
+        # Look at each range to see if there any optimizations.  The
+        # formatting may be thrown away, so might be wasted effort; and khw
+        # supposes this could be restructured to delay that until the final
+        # method is chosen.  But that would be more coding work than
+        # warranted, as this is executed not that many times during a
+        # development cycle.
         for (my $i = $loop_start; $i < $loop_end; $i++) {
             if (! ref $ranges[$i]) {    # Trivial case: no range
-                $ranges[$i] = $self->val_fmt($ranges[$i]) . " == $test";
+                $ranges[$i] = $self->val_fmt($ranges[$i], $always_hex)
+                            . " == $test";
             }
             elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
                 $ranges[$i] =           # Trivial case: single element range
-                        $self->val_fmt($ranges[$i]->[0]) . " == $test";
+                              $self->val_fmt($ranges[$i]->[0], $always_hex)
+                            . " == $test";
             }
             else {
                 $ranges[$i] = "inRANGE_helper_(U8, $test, "
-                        . $self->val_fmt($ranges[$i]->[0]) .", "
-                        . $self->val_fmt($ranges[$i]->[1]) . ")";
+                        . $self->val_fmt($ranges[$i]->[0], $always_hex) .", "
+                        . $self->val_fmt($ranges[$i]->[1], $always_hex) . ")";
             }
         }
 
-    my @masks;
-    if (@ranges > 1) {
+        # Here, have collapsed the matched code points into ranges.  This code
+        # also sees if some of those different ranges have bit patterns which
+        # causes them to be combinable by ANDing with a mask.  There's no need
+        # to do this if we are already down to a single range.
+        next unless @ranges > 1;
 
-        # See if the entire set shares optimizable characteristics, and if so,
-        # return the optimization.  There is no need to do this on sets with
-        # just a single range, as that can be expressed with a single
-        # conditional.
-        @masks = calculate_mask(@cond);
+        my @masks = calculate_mask(@cond);
 
         # Stringify the output of calculate_mask()
         if (@masks) {
@@ -1186,11 +1248,12 @@ sub _cond_as_str {
             foreach my $mask_ref (@masks) {
                 if (defined $mask_ref->[1]) {
                     push @masked, "( ( $test & "
-                                . $self->val_fmt($mask_ref->[1]) . " ) == "
-                                . $self->val_fmt($mask_ref->[0]) . " )";
+                        . $self->val_fmt($mask_ref->[1], $always_hex) . " ) == "
+                        . $self->val_fmt($mask_ref->[0], $always_hex) . " )";
                 }
                 else {  # An undefined mask means to use the value as-is
-                    push @masked, "$test == " . $self->val_fmt($mask_ref->[0]);
+                    push @masked, "$test == "
+                                 . $self->val_fmt($mask_ref->[0], $always_hex);
                 }
             }
 
@@ -1199,22 +1262,28 @@ sub _cond_as_str {
             # results, there is no sense trying something that is bound to be
             # worse.
             if (@masked < @ranges) {
-                return "( " . join( " || ", @masked ) . " )";
+                @ranges = @masked;
+                next;
             }
 
             @masks = @masked;
         }
+
+        # If we found some mask possibilities, and they have fewer
+        # conditionals in them than the plain range method, convert to use the
+        # masks.
+        @ranges = @masks if @masks && @masks < @ranges;
+    }  # End of both passes
+
+    # If the two passes came up with two sets, use the one with the fewest
+    # conditionals (the number of ranges is a proxy for that).  If both have
+    # the same number, prefer the native, as that omits transformations.
+    if (@native_ranges && @native_ranges <= @ranges) {
+        @ranges = @native_ranges;
+        @cond = @native_conds;
     }
 
-    # We have generated the list of bytes in two ways; one trying to use masks
-    # to cut the number of branches down, and the other to look at individual
-    # ranges (some of which could be cut down by using a mask for just it).
-    # We return whichever method uses the fewest branches.
-    return "( "
-           . join( " || ", (@masks && @masks < @ranges)
-                            ? @masks
-                            : @ranges)
-           . " )";
+    return "( " . join( " || ", @ranges) . " )";
 }
 
 # _combine
author	Karl Williamson <khw@cpan.org>	2021-06-30 12:54:19 -0600
committer	Karl Williamson <khw@cpan.org>	2021-08-07 04:46:45 -0600
commit	769444cd0f1c5de5c1dcb9dc4b2b38dd682c85ae (patch)
tree	8c52dcc91bacc24d748076fa78057c2abd620017 /regen
parent	8787aefa956b0b3132aee309414bee4cc28895be (diff)
download	perl-769444cd0f1c5de5c1dcb9dc4b2b38dd682c85ae.tar.gz