diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-01-30 14:05:13 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-01-30 14:05:13 +0000 |
commit | d70b7d477c148def9a2f86b1e58a9a6eab8eb345 (patch) | |
tree | 04de143e95dec1235669f1e35ebd3cea22e6087b /ext | |
parent | cb551bf945dbea50db7071001928adfecb0d1bfc (diff) | |
parent | b1e7e56f1e0264e9b11883bf65d50cc9648125ec (diff) | |
download | perl-d70b7d477c148def9a2f86b1e58a9a6eab8eb345.tar.gz |
Integrate perlio:
[ 14499]
Collect some stats during compile process.
Experiment with effect of bundling all EUC-JP, EUC-CN, EUC-KR
as one XS - inconclusive - marginal win?
Add some comments to encode.h
[ 14490]
Basics of a compiled Encode XS extension
p4raw-link: @14499 on //depot/perlio: b1e7e56f1e0264e9b11883bf65d50cc9648125ec
p4raw-link: @14490 on //depot/perlio: d811239cc9c7c7e9aec8cc641d1ae1aadd03c900
p4raw-id: //depot/perl@14500
Diffstat (limited to 'ext')
-rw-r--r-- | ext/Encode/EUC_JP/Makefile.PL | 4 | ||||
-rwxr-xr-x | ext/Encode/compile | 20 | ||||
-rw-r--r-- | ext/Encode/encode.h | 65 |
3 files changed, 73 insertions, 16 deletions
diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL index d9ff3bc6e9..2b51469615 100644 --- a/ext/Encode/EUC_JP/Makefile.PL +++ b/ext/Encode/EUC_JP/Makefile.PL @@ -2,7 +2,9 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; -my %tables = (EUC_JP => ['euc-jp.ucm' ], +my %tables = (EUC_JP => ['euc-jp.ucm', + # 'euc-kr.ucm', 'euc-cn.ucm' + ], ); diff --git a/ext/Encode/compile b/ext/Encode/compile index ee2cd400e8..26442a00a7 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -116,6 +116,9 @@ else my %encoding; my %strings; +my $saved = 0; +my $subsave = 0; +my $strings = 0; sub cmp_name { @@ -155,6 +158,7 @@ foreach my $enc (sort cmp_name @encfiles) if ($doC) { + print STDERR "Writing compiled form\n"; foreach my $name (sort cmp_name keys %encoding) { my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}}; @@ -213,6 +217,9 @@ END } close(D); close(H); + printf STDERR "%d bytes in string tables\n",$strings; + printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved; + printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave; } elsif ($doEnc) { @@ -234,6 +241,7 @@ elsif ($doUcm) close(C); + sub compile_ucm { my ($fh,$name) = @_; @@ -269,7 +277,7 @@ sub compile_ucm push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg; $erep = join('',map(chr(hex($_)),@byte)); } - print "Scanning $name ($cs)\n"; + print "Reading $name ($cs)\n"; my $nfb = 0; my $hfb = 0; while (<$fh>) @@ -413,11 +421,17 @@ sub enter } } + + sub outstring { my ($fh,$name,$s) = @_; my $sym = $strings{$s}; - unless ($sym) + if ($sym) + { + $saved += length($s); + } + else { foreach my $o (keys %strings) { @@ -426,10 +440,12 @@ sub outstring { $sym = $strings{$o}; $sym .= sprintf("+0x%02x",$i) if ($i); + $subsave += length($s); return $sym; } } $strings{$s} = $sym = $name; + $strings += length($s); printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s); # Do in chunks of 16 
chars to constrain line length # Assumes ANSI C adjacent string literal concatenation diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h index aecc66eafe..f19cdc271d 100644 --- a/ext/Encode/encode.h +++ b/ext/Encode/encode.h @@ -1,38 +1,77 @@ #ifndef ENCODE_H #define ENCODE_H + #ifndef U8 +/* A tad devious this: + perl normally has a #define for U8 - if that isn't present + then we typedef it - leaving it #ifndef so we can do data parts without + getting extern references to the code parts + */ typedef unsigned char U8; #endif typedef struct encpage_s encpage_t; + struct encpage_s { - const U8 *seq; - encpage_t *next; - U8 min; - U8 max; - U8 dlen; - U8 slen; + /* fields ordered to pack nicely on 32-bit machines */ + const U8 *seq; /* Packed output sequences we generate if we match */ + encpage_t *next; /* Page to go to if we match */ + U8 min; /* Min value of octet to match this entry */ + U8 max; /* Max value of octet to match this entry */ + U8 dlen; /* destination length - size of entries in seq */ + U8 slen; /* source length - number of source octets needed */ }; +/* + At any point in a translation there is a page pointer which points at an array + of the above structures. + + Basic operation : + get octet from source stream. + if (octet >= min && octet < max) { + if slen is 0 then we cannot represent this character. + if we have less than slen octets (including this one) then we have a partial character. + otherwise + copy dlen octets from seq + dlen*(octet-min) to output + (dlen may be zero if we don't know yet.) + load page pointer with next to continue. + (if slen is one this is end of a character) + get next octet. + } + else { + increment the page pointer to look at next slot in the array + } + + arrays SHALL be constructed so there is an entry which matches ..0xFF at the end, + and either maps it or indicates no representation. + + if MSB of slen is set then mapping is an approximate "FALLBACK" entry. 
+ +*/ + + typedef struct encode_s encode_t; struct encode_s { - encpage_t *t_utf8; - encpage_t *f_utf8; - const U8 *rep; - int replen; - U8 min_el; - U8 max_el; - const char *name[2]; + encpage_t *t_utf8; /* Starting table for translation from the encoding to UTF-8 form */ + encpage_t *f_utf8; /* Starting table for translation from UTF-8 to the encoding */ + const U8 *rep; /* Replacement character in this encoding e.g. "?" */ + int replen; /* Number of octets to represent replacement character */ + U8 min_el; /* Minimum octets to represent a character */ + U8 max_el; /* Maximum octets to represent a character */ + const char *name[2]; /* name(s) of this encoding */ }; #ifdef U8 +/* See comment at top of file for deviousness */ + extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx); extern void Encode_DefineEncoding(encode_t *enc); + #endif #define ENCODE_NOSPACE 1 |