diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-29 18:58:00 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 04:46:45 -0600 |
commit | bc5a92d8a7e80ce9c119eea0195a6626c5138615 (patch) | |
tree | 7b0dd7b5c80270de7a03437ba0465adbbeff0381 /regen | |
parent | 741c1a9d64abda23036c5dd76fd7169acf45159d (diff) | |
download | perl-bc5a92d8a7e80ce9c119eea0195a6626c5138615.tar.gz |
regcharclass.pl: Move some code earlier
We can short-circuit some work by moving the test earlier. This does
not change the generated file.
Diffstat (limited to 'regen')
-rwxr-xr-x | regen/regcharclass.pl | 80 |
1 files changed, 41 insertions, 39 deletions
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 9f25658fb4..39de579127 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1096,6 +1096,46 @@ sub _cond_as_str { return 1 if @cond == 256; # If all bytes match, is trivially true + # If this is a single UTF-8 range which includes all possible + # continuation bytes, and we aren't checking for well-formedness, this + # is trivially true. + if ( @ranges == 1 + && ! $opts_ref->{safe} + && ! $opts_ref->{no_length_checks} + && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi + && $ranges[0]->[1] == 0xBF + && $ranges[0]->[0] == 0x80) + { + return 1; + } + + my $loop_start = 0; + if (ref $ranges[0] && $ranges[0]->[0] == 0) { + + # If the first range matches all 256 possible bytes, it is + # trivially true. + return 1 if $ranges[0]->[1] == 0xFF; + # this case + # Here, the first range starts at 0, but doesn't match everything. + # But the condition doesn't have to worry about being < 0 + $ranges[0] = "( $test <= " + . $self->val_fmt($ranges[0]->[1]) . " )"; + $loop_start++; + } + + my $loop_end = @ranges; + if ( @ranges + && ref $ranges[-1] + && $ranges[-1]->[1] == 0xFF + && $ranges[-1]->[0] != 0xFF) + { + # If the final range consists of more than one byte ending with + # the highest possible one, the condition doesn't have to worry + # about being > FF + $ranges[-1] = "( $test >= " . $self->val_fmt($ranges[-1]->[0]) . " )"; + $loop_end--; + } + my @masks; if (@ranges > 1) { @@ -1134,7 +1174,7 @@ sub _cond_as_str { # Here, there was no entire-class optimization that was clearly better # than doing things by ranges. Look at each range. my $range_count_extra = 0; - for (my $i = 0; $i < @ranges; $i++) { + for (my $i = $loop_start; $i < $loop_end; $i++) { if (! ref $ranges[$i]) { # Trivial case: no range $ranges[$i] = $self->val_fmt($ranges[$i]) . " == $test"; } @@ -1142,45 +1182,7 @@ sub _cond_as_str { $ranges[$i] = # Trivial case: single element range $self->val_fmt($ranges[$i]->[0]) . 
" == $test"; } - elsif ($ranges[$i]->[0] == 0) { - # If the range matches all 256 possible bytes, it is trivially - # true. - return 1 if $ranges[0]->[1] == 0xFF; # @ranges must be 1 in - # this case - $ranges[$i] = "( $test <= " - . $self->val_fmt($ranges[$i]->[1]) . " )"; - } - elsif ($ranges[$i]->[1] == 255) { - - # Similarly the max possible is 255, so can omit an upper bound - # test if the calculated max is the max possible one. - $ranges[$i] = "( $test >= " . $self->val_fmt($ranges[0]->[0]) . " )"; - } else { - # Well-formed UTF-8 continuation bytes on ascii platforms must be - # in the range 0x80 .. 0xBF. If we know that the input is - # well-formed (indicated by not trying to be 'safe'), we can omit - # tests that verify that the input is within either of these - # bounds. (No legal UTF-8 character can begin with anything in - # this range, so we don't have to worry about this being a - # continuation byte or not.) - if ($opts_ref->{charset} =~ /ascii/i - && (! $opts_ref->{safe} && ! $opts_ref->{no_length_checks}) - && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi) - { - # If the range is the entire legal range, it matches any legal - # byte, so we can omit both tests. (This should happen only - # if the number of ranges is 1.) - if ($ranges[$i]->[0] == 0x80 && $ranges[$i]->[1] == 0xBF) { - return 1; - } - } - - # Here, it isn't the full range of legal continuation bytes. We - # could just assume that there's nothing outside of the legal - # bounds. But inRANGE() allows us to have a single conditional, - # so the only cost of making sure it's a legal UTF-8 continuation - # byte is an extra subtraction instruction, a trivial expense. $ranges[$i] = "inRANGE_helper_(U8, $test, " . $self->val_fmt($ranges[$i]->[0]) .", " . $self->val_fmt($ranges[$i]->[1]) . ")"; |