summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-01-30 14:05:13 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-01-30 14:05:13 +0000
commitd70b7d477c148def9a2f86b1e58a9a6eab8eb345 (patch)
tree04de143e95dec1235669f1e35ebd3cea22e6087b /ext
parentcb551bf945dbea50db7071001928adfecb0d1bfc (diff)
parentb1e7e56f1e0264e9b11883bf65d50cc9648125ec (diff)
downloadperl-d70b7d477c148def9a2f86b1e58a9a6eab8eb345.tar.gz
Integrate perlio:
[ 14499] Collect some stats during compile process. Experiment with effect of bundling all EUC-JP, EUC-CN, EUC-KR as one XS - inconclusive - marginal win? Add some comments to encode.h [ 14490] Basics of a compiled Encode XS extension p4raw-link: @14499 on //depot/perlio: b1e7e56f1e0264e9b11883bf65d50cc9648125ec p4raw-link: @14490 on //depot/perlio: d811239cc9c7c7e9aec8cc641d1ae1aadd03c900 p4raw-id: //depot/perl@14500
Diffstat (limited to 'ext')
-rw-r--r--ext/Encode/EUC_JP/Makefile.PL4
-rwxr-xr-xext/Encode/compile20
-rw-r--r--ext/Encode/encode.h65
3 files changed, 73 insertions, 16 deletions
diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL
index d9ff3bc6e9..2b51469615 100644
--- a/ext/Encode/EUC_JP/Makefile.PL
+++ b/ext/Encode/EUC_JP/Makefile.PL
@@ -2,7 +2,9 @@ use 5.7.2;
use strict;
use ExtUtils::MakeMaker;
-my %tables = (EUC_JP => ['euc-jp.ucm' ],
+my %tables = (EUC_JP => ['euc-jp.ucm',
+ # 'euc-kr.ucm', 'euc-cn.ucm'
+ ],
);
diff --git a/ext/Encode/compile b/ext/Encode/compile
index ee2cd400e8..26442a00a7 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -116,6 +116,9 @@ else
my %encoding;
my %strings;
+my $saved = 0;
+my $subsave = 0;
+my $strings = 0;
sub cmp_name
{
@@ -155,6 +158,7 @@ foreach my $enc (sort cmp_name @encfiles)
if ($doC)
{
+ print STDERR "Writing compiled form\n";
foreach my $name (sort cmp_name keys %encoding)
{
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
@@ -213,6 +217,9 @@ END
}
close(D);
close(H);
+ printf STDERR "%d bytes in string tables\n",$strings;
+ printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
+ printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
}
elsif ($doEnc)
{
@@ -234,6 +241,7 @@ elsif ($doUcm)
close(C);
+
sub compile_ucm
{
my ($fh,$name) = @_;
@@ -269,7 +277,7 @@ sub compile_ucm
push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
$erep = join('',map(chr(hex($_)),@byte));
}
- print "Scanning $name ($cs)\n";
+ print "Reading $name ($cs)\n";
my $nfb = 0;
my $hfb = 0;
while (<$fh>)
@@ -413,11 +421,17 @@ sub enter
}
}
+
+
sub outstring
{
my ($fh,$name,$s) = @_;
my $sym = $strings{$s};
- unless ($sym)
+ if ($sym)
+ {
+ $saved += length($s);
+ }
+ else
{
foreach my $o (keys %strings)
{
@@ -426,10 +440,12 @@ sub outstring
{
$sym = $strings{$o};
$sym .= sprintf("+0x%02x",$i) if ($i);
+ $subsave += length($s);
return $sym;
}
}
$strings{$s} = $sym = $name;
+ $strings += length($s);
printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
# Do in chunks of 16 chars to constrain line length
# Assumes ANSI C adjacent string literal concatenation
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h
index aecc66eafe..f19cdc271d 100644
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -1,38 +1,77 @@
#ifndef ENCODE_H
#define ENCODE_H
+
#ifndef U8
+/* A tad devious this:
+ perl normally has a #define for U8 - if that isn't present
+ then we typedef it - leaving it #ifndef so we can do data parts without
+ getting extern references to the code parts
+ */
typedef unsigned char U8;
#endif
typedef struct encpage_s encpage_t;
+
struct encpage_s
{
- const U8 *seq;
- encpage_t *next;
- U8 min;
- U8 max;
- U8 dlen;
- U8 slen;
+ /* fields ordered to pack nicely on 32-bit machines */
+ const U8 *seq; /* Packed output sequences we generate if we match */
+ encpage_t *next; /* Page to go to if we match */
+ U8 min; /* Min value of octet to match this entry */
+ U8 max; /* Max value of octet to match this entry */
+ U8 dlen; /* destination length - size of entries in seq */
+ U8 slen; /* source length - number of source octets needed */
};
+/*
+ At any point in a translation there is a page pointer which points at an array
+ of the above structures.
+
+ Basic operation :
+ get octet from source stream.
+ if (octet >= min && octet <= max) {
+ if slen is 0 then we cannot represent this character.
+ if we have less than slen octets (including this one) then we have a partial character.
+ otherwise
+ copy dlen octets from seq + dlen*(octet-min) to output
+ (dlen may be zero if we don't know yet.)
+ load page pointer with next to continue.
+ (if slen is one this is end of a character)
+ get next octet.
+ }
+ else {
+ increment the page pointer to look at next slot in the array
+ }
+
+ arrays SHALL be constructed so there is an entry which matches ..0xFF at the end,
+ and either maps it or indicates no representation.
+
+ if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
+
+*/
+
+
typedef struct encode_s encode_t;
struct encode_s
{
- encpage_t *t_utf8;
- encpage_t *f_utf8;
- const U8 *rep;
- int replen;
- U8 min_el;
- U8 max_el;
- const char *name[2];
+ encpage_t *t_utf8; /* Starting table for translation from the encoding to UTF-8 form */
+ encpage_t *f_utf8; /* Starting table for translation from UTF-8 to the encoding */
+ const U8 *rep; /* Replacement character in this encoding e.g. "?" */
+ int replen; /* Number of octets to represent replacement character */
+ U8 min_el; /* Minimum octets to represent a character */
+ U8 max_el; /* Maximum octets to represent a character */
+ const char *name[2]; /* name(s) of this encoding */
};
#ifdef U8
+/* See comment at top of file for deviousness */
+
extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
extern void Encode_DefineEncoding(encode_t *enc);
+
#endif
#define ENCODE_NOSPACE 1