summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2002-02-20 22:13:05 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-02-20 21:13:50 +0000
commiteb5603165ab61a683806a34f59885b99cd506b12 (patch)
treec165aa5e31e9ef7730560c02b22538ca4892c4c2
parentdf3477ffc5601d9159ebc5706fa20bb2af9f5d5d (diff)
downloadperl-eb5603165ab61a683806a34f59885b99cd506b12.tar.gz
Re: [PATCH] go faster for Encode's compile
Message-ID: <20020220221304.GE371@Bagpuss.unfortu.net> p4raw-id: //depot/perl@14798
-rwxr-xr-xext/Encode/compile233
1 files changed, 165 insertions, 68 deletions
diff --git a/ext/Encode/compile b/ext/Encode/compile
index 532f410e05..f622641232 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -7,6 +7,71 @@ use strict;
use Getopt::Std;
my @orig_ARGV = @ARGV;
+# These may get re-ordered.
+# RAW is a do_now as inserted by &enter
+# AGG is an aggreagated do_now, as built up by &process
+use constant {
+ RAW_NEXT => 0,
+ RAW_IN_LEN => 1,
+ RAW_OUT_BYTES => 2,
+ RAW_FALLBACK => 3,
+
+ AGG_MIN_IN => 0,
+ AGG_MAX_IN => 1,
+ AGG_OUT_BYTES => 2,
+ AGG_NEXT => 3,
+ AGG_IN_LEN => 4,
+ AGG_OUT_LEN => 5,
+ AGG_FALLBACK => 6,
+};
+# (See the algorithm in encengine.c - we're building structures for it)
+
+# There are two sorts of structures.
+# "do_now" (an array, two variants of what needs storing) is whatever we need
+# to do now we've read an input byte.
+# It's housed in a "do_next" (which is how we got to it), and in turn points
+# to a "do_next" which contains all the "do_now"s for the next input byte.
+
+# There will be a "do_next" which is the start state.
+# For a single byte encoding it's the only "do_next" - each "do_now" points
+# back to it, and each "do_now" will cause bytes. There is no state.
+
+# For a multi-byte encoding where all characters in the input are the same
+# length, then there will be a tree of "do_now"->"do_next"->"do_now"
+# branching out from the start state, one step for each input byte.
+# The leaf "do_now"s will all be at the same distance from the start state,
+# only the leaf "do_now"s cause output bytes, and they in turn point back to
+# the start state.
+
+# For an encoding where there are varaible length input byte sequences, you
+# will encounter a leaf "do_now" sooner for the shorter input sequences, but
+# as before the leaves will point back to the start state.
+
+# The system will cope with escape encodings (imagine them as a mostly
+# self-contained tree for each escape state, and cross links between trees
+# at the state-switching characters) but so far no input format defines these.
+
+# The system will also cope with having output "leaves" in the middle of
+# the bifurcating branches, not just at the extremities, but again no
+# input format does this yet.
+
+# There are two variants of the "do_now" structure. The first, smaller variant
+# is generated by &enter as the input file is read. There is one structure
+# for each input byte. Say we are mapping a single byte encoding to a
+# single byte encoding, with "ABCD" going "abcd". There will be
+# 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
+
+# &process then walks the tree, building aggregate "do_now" structres for
+# adjacent bytes where possible. The aggregate is for a contiguous range of
+# bytes which each produce the same length of output, each move to the
+# same next state, and each have the same fallback flag.
+# So our 4 RAW "do_now"s above become replaced by a single structure
+# containing:
+# ["A", "D", "abcd", 1, ...]
+# ie, for an input byte $_ in "A".."D", output 1 byte, found as
+# substr ("abcd", (ord $_ - ord "A") * 1, 1)
+# which maps very nicely into pointer arithmetic in C for encengine.c
+
sub encode_U
{
# UTF-8 encode long hand - only covers part of perl's range
@@ -392,8 +457,10 @@ sub compile_enc
{
$seen{$uch} = [$page,$ch];
}
- enter($e2u,$ech,$uch,$e2u,0);
- enter($u2e,$uch,$ech,$u2e,0);
+ # Passing 2 extra args each time is 3.6% slower!
+ # Even with having to add $fallback ||= 0 in &process
+ enter($e2u,$ech,$uch);
+ enter($u2e,$uch,$ech);
}
else
{
@@ -409,26 +476,45 @@ sub compile_enc
$encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
}
-sub enter
-{
- my ($a,$s,$d,$t,$fb) = @_;
- $t = $a if @_ < 4;
-
- while (1) {
- $s =~ s/(.)//s;
- my $b = $1;
- my $e = $a->{$b};
- # 0 1 2 3 4 5
- $a->{$b} = $e = [$b,$b,'',{},1+length($s),0,$fb] unless $e;
- unless (length($s)) {
- $e->[2] = $d;
- $e->[3] = $t;
- $e->[5] = length($d);
- return;
+# my ($a,$s,$d,$t,$fb) = @_;
+sub enter {
+ my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
+ # state we shift to after this (multibyte) input character defaults to same
+ # as current state.
+ $next ||= $current;
+ # Making sure it is defined seems to be faster than {no warnings;} in
+ # &process, or passing it in as 0 explicity.
+ $fallback ||= 0;
+
+ # Start at the beginning and work forwards through the string to zero.
+ # effectively we are removing 1 character from the front each time
+ # but we don't actually edit the string. [this alone seems to be 14% speedup]
+ # Hence -$pos is the length of the remaining string.
+ my $pos = -length $inbytes;
+ while (1) {
+ my $byte = substr $inbytes, $pos, 1;
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ # to unicode an array would seem to be better, because the pages are dense.
+ # from unicode can be very sparse, favouring a hash.
+ # hash using the bytes (all length 1) as keys rather than ord value,
+ # as it's easier to sort these in &process.
+
+ # It's faster to always add $fallback even if it's undef, rather than
+ # choosing between 3 and 4 element array. (hence why we set it defined
+ # above)
+ my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
+ # When $pos was -1 we were at the last input character.
+ unless (++$pos) {
+ $do_now->[RAW_OUT_BYTES] = $outbytes;
+ $do_now->[RAW_NEXT] = $next;
+ return;
+ }
+ # Tail recursion. The intermdiate state may not have a name yet.
+ $current = $do_now->[RAW_NEXT];
}
- # Tail recursion.
- $a = $e->[3];
- }
}
@@ -471,49 +557,60 @@ sub outstring
sub process
{
- my ($name,$a) = @_;
- $name =~ s/\W+/_/g;
- $a->{Cname} = $name;
- my @keys = sort grep(ref($a->{$_}),keys %$a);
- my $l;
- my @ent;
- foreach my $b (@keys)
- {
- my ($s,undef,undef,$t,undef) = @{$a->{$b}};
- if (defined($l) &&
- ord($b) == ord($a->{$l}[1])+1 &&
- $a->{$l}[3] == $a->{$b}[3] &&
- $a->{$l}[4] == $a->{$b}[4] &&
- $a->{$l}[5] == $a->{$b}[5] &&
- $a->{$l}[6] == $a->{$b}[6]
- # && length($a->{$l}[2]) < 16
- )
- {
- my $i = ord($b)-ord($a->{$l}[0]);
- $a->{$l}[1] = $b;
- $a->{$l}[2] .= $a->{$b}[2];
+ my ($name,$a) = @_;
+ $name =~ s/\W+/_/g;
+ $a->{Cname} = $name;
+ my @raw = sort keys %{$a->{Raw}};
+ my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
+ my @ent;
+ foreach my $key (@raw) {
+ # RAW_NEXT => 0,
+ # RAW_IN_LEN => 1,
+ # RAW_OUT_BYTES => 2,
+ # RAW_FALLBACK => 3,
+ my ($next, $in_len, $out_bytes, $fallback) = @{$a->{Raw}{$key}};
+ # Now we are converting from raw to aggregate, switch from 1 byte strings
+ # to numbers
+ my $b = ord $key;
+ if ($l &&
+ # If this == fails, we're going to reset $agg_max_in below anyway.
+ $b == ++$agg_max_in &&
+ # References in numeric context give the pointer as an int.
+ $agg_next == $next &&
+ $agg_in_len == $in_len &&
+ $agg_out_len == length $out_bytes &&
+ $agg_fallback == $fallback
+ # && length($l->[AGG_OUT_BYTES]) < 16
+ ) {
+ # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
+ # we can aggregate this byte onto the end.
+ $l->[AGG_MAX_IN] = $b;
+ $l->[AGG_OUT_BYTES] .= $out_bytes;
+ } else {
+ # AGG_MIN_IN => 0,
+ # AGG_MAX_IN => 1,
+ # AGG_OUT_BYTES => 2,
+ # AGG_NEXT => 3,
+ # AGG_IN_LEN => 4,
+ # AGG_OUT_LEN => 5,
+ # AGG_FALLBACK => 6,
+ # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
+ # (only gains .6% on euc-jp -- is it worth it?)
+ push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
+ $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
+ $agg_fallback = $fallback];
}
- else
- {
- $l = $b;
- push(@ent,$b);
- }
- if (exists $t->{Cname})
- {
- $t->{'Forward'} = 1 if $t != $a;
- }
- else
- {
- process(sprintf("%s_%02x",$name,ord($s)),$t);
+ if (exists $next->{Cname}) {
+ $next->{'Forward'} = 1 if $next != $a;
+ } else {
+ process(sprintf("%s_%02x",$name,$b),$next);
}
}
- if (ord($keys[-1]) < 255)
- {
- my $t = chr(ord($keys[-1])+1);
- $a->{$t} = [$t,chr(255),undef,$a,0,0];
- push(@ent,$t);
+ # encengine.c rules say that last entry must be for 255
+ if (ord $raw[-1] < 255) {
+ push @ent, [1+ord $raw[-1], 255,undef,$a,0,0];
}
- $a->{'Entries'} = \@ent;
+ $a->{'Entries'} = \@ent;
}
sub outtable
@@ -523,10 +620,10 @@ sub outtable
# String tables
foreach my $b (@{$a->{'Entries'}})
{
- next unless $a->{$b}[5];
- my $s = ord($a->{$b}[0]);
- my $e = ord($a->{$b}[1]);
- outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]);
+ next unless $b->[AGG_OUT_LEN];
+ my $s = $b->[AGG_MIN_IN];
+ my $e = $b->[AGG_MAX_IN];
+ outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
}
if ($a->{'Forward'})
{
@@ -535,15 +632,13 @@ sub outtable
$a->{'Done'} = 1;
foreach my $b (@{$a->{'Entries'}})
{
- my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
+ my ($s,$e,$out,$t,$end,$l) = @$b;
outtable($fh,$t) unless $t->{'Done'};
}
print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
foreach my $b (@{$a->{'Entries'}})
{
- my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
- my $sc = ord($s);
- my $ec = ord($e);
+ my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
$end |= 0x80 if $fb;
print $fh "{";
if ($l)
@@ -571,6 +666,7 @@ sub output
sub output_enc
{
my ($fh,$name,$a) = @_;
+ die "Changed - fix me for new structure";
foreach my $b (sort keys %$a)
{
my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
@@ -608,6 +704,7 @@ sub output_ucm_page
# warn sprintf("Page %x\n",$pre);
foreach my $b (sort keys %$t)
{
+ die "Changed - fix me for new structure";
my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
die "oops $s $e" unless $s eq $e;
my $u = ord($s);