Integrate perlio:

[ 14499]
Collect some stats during compile process.
Experiment with effect of bundling all EUC-JP, EUC-CN, EUC-KR
as one XS - inconclusive - marginal win?
Add some comments to encode.h

[ 14490]
Basics of a compiled Encode XS extension

p4raw-link: @14499 on //depot/perlio: b1e7e56f1e0264e9b11883bf65d50cc9648125ec
p4raw-link: @14490 on //depot/perlio: d811239cc9c7c7e9aec8cc641d1ae1aadd03c900

p4raw-id: //depot/perl@14500
author: Jarkko Hietaniemi <jhi@iki.fi> 2002-01-30 14:05:13 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2002-01-30 14:05:13 +0000
commit: d70b7d477c148def9a2f86b1e58a9a6eab8eb345 (patch)
tree: 04de143e95dec1235669f1e35ebd3cea22e6087b /ext
parent: cb551bf945dbea50db7071001928adfecb0d1bfc (diff)
parent: b1e7e56f1e0264e9b11883bf65d50cc9648125ec (diff)
download: perl-d70b7d477c148def9a2f86b1e58a9a6eab8eb345.tar.gz
3 files changed, 73 insertions, 16 deletions
diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL
index d9ff3bc6e9..2b51469615 100644
--- a/ext/Encode/EUC_JP/Makefile.PL
+++ b/ext/Encode/EUC_JP/Makefile.PL
@@ -2,7 +2,9 @@ use 5.7.2;
 use strict;
 use ExtUtils::MakeMaker;
 
-my %tables = (EUC_JP  => ['euc-jp.ucm' ],
+my %tables = (EUC_JP  => ['euc-jp.ucm',
+              # 'euc-kr.ucm', 'euc-cn.ucm'
+              ],
 	      );
 
 
diff --git a/ext/Encode/compile b/ext/Encode/compile
index ee2cd400e8..26442a00a7 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -116,6 +116,9 @@ else
 
 my %encoding;
 my %strings;
+my $saved = 0;
+my $subsave = 0;
+my $strings = 0;
 
 sub cmp_name
 {
@@ -155,6 +158,7 @@ foreach my $enc (sort cmp_name @encfiles)
 
 if ($doC)
  {
+  print STDERR "Writing compiled form\n";
   foreach my $name (sort cmp_name keys %encoding)
    {
     my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
@@ -213,6 +217,9 @@ END
    }
   close(D);
   close(H);
+  printf STDERR "%d bytes in string tables\n",$strings;
+  printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",$saved,100*$saved/$strings if $saved;
+  printf STDERR "%d bytes (%.3g%%) saved using substrings\n",$subsave,100*$subsave/$strings if $subsave;
  }
 elsif ($doEnc)
  {
@@ -234,6 +241,7 @@ elsif ($doUcm)
 close(C);
 
 
+
 sub compile_ucm
 {
  my ($fh,$name) = @_;
@@ -269,7 +277,7 @@ sub compile_ucm
    push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
    $erep = join('',map(chr(hex($_)),@byte));
   }
- print "Scanning $name ($cs)\n";
+ print "Reading $name ($cs)\n";
  my $nfb = 0;
  my $hfb = 0;
  while (<$fh>)
@@ -413,11 +421,17 @@ sub enter
   }
 }
 
+
+
 sub outstring
 {
  my ($fh,$name,$s) = @_;
  my $sym = $strings{$s};
- unless ($sym)
+ if ($sym)
+  {
+   $saved += length($s);
+  }
+ else
   {
    foreach my $o (keys %strings)
     {
@@ -426,10 +440,12 @@ sub outstring
       {
        $sym = $strings{$o};
        $sym .= sprintf("+0x%02x",$i) if ($i);
+       $subsave += length($s);
        return $sym;
       }
     }
    $strings{$s} = $sym = $name;
+   $strings += length($s);
    printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s);
    # Do in chunks of 16 chars to constrain line length
    # Assumes ANSI C adjacent string literal concatenation
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h
index aecc66eafe..f19cdc271d 100644
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -1,38 +1,77 @@
 #ifndef ENCODE_H
 #define ENCODE_H
+
 #ifndef U8
+/* A tad devious this:
+   perl normally has a #define for U8 - if that isn't present
+   then we typedef it - leaving it #ifndef so we can do data parts without
+   getting extern references to the code parts
+ */
 typedef unsigned char U8;
 #endif
 
 typedef struct encpage_s encpage_t;
 
+
 struct encpage_s
 {
- const U8   *seq;
- encpage_t  *next;
- U8         min;
- U8         max;
- U8         dlen;
- U8         slen;
+ /* fields ordered to pack nicely on 32-bit machines */
+ const U8   *seq;       /* Packed output sequences we generate if we match */
+ encpage_t  *next;      /* Page to go to if we match */
+ U8         min;        /* Min value of octet to match this entry */
+ U8         max;        /* Max value of octet to match this entry */
+ U8         dlen;       /* destination length - size of entries in seq */
+ U8         slen;       /* source length - number of source octets needed */
 };
 
+/*
+   At any point in a translation there is a page pointer which points at an array
+   of the above structures.
+
+   Basic operation :
+   get octet from source stream.
+   if (octet >= min && octet <= max) {
+      if slen is 0 then we cannot represent this character.
+      if we have less than slen octets (including this one) then we have a partial character.
+      otherwise
+       copy dlen octets from seq + dlen*(octet-min) to output
+       (dlen may be zero if we don't know yet.)
+       load page pointer with next to continue.
+       (if slen is one this is the end of a character)
+       get next octet.
+   }
+   else {
+      increment the page pointer to look at next slot in the array
+   }
+
+   arrays SHALL be constructed so there is an entry which matches ..0xFF at the end,
+   and either maps it or indicates no representation.
+
+   if MSB of slen is set then mapping is an approximate "FALLBACK" entry.
+
+*/
+
+
 typedef struct encode_s encode_t;
 struct encode_s
 {
- encpage_t  *t_utf8;
- encpage_t  *f_utf8;
- const U8   *rep;
- int        replen;
- U8         min_el;
- U8         max_el;
- const char *name[2];
+ encpage_t  *t_utf8;    /* Starting table for translation from the encoding to UTF-8 form */
+ encpage_t  *f_utf8;    /* Starting table for translation from UTF-8 to the encoding */
+ const U8   *rep;       /* Replacement character in this encoding e.g. "?" */
+ int        replen;     /* Number of octets to represent replacement character */
+ U8         min_el;     /* Minimum octets to represent a character */
+ U8         max_el;     /* Maximum octets to represent a character */
+ const char *name[2];   /* name(s) of this encoding */
 };
 
 #ifdef U8
+/* See comment at top of file for deviousness */
+
 extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
                      U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
 
 extern void Encode_DefineEncoding(encode_t *enc);
+
 #endif
 
 #define ENCODE_NOSPACE  1
author	Jarkko Hietaniemi <jhi@iki.fi>	2002-01-30 14:05:13 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2002-01-30 14:05:13 +0000
commit	d70b7d477c148def9a2f86b1e58a9a6eab8eb345 (patch)
tree	04de143e95dec1235669f1e35ebd3cea22e6087b /ext
parent	cb551bf945dbea50db7071001928adfecb0d1bfc (diff)
parent	b1e7e56f1e0264e9b11883bf65d50cc9648125ec (diff)
download	perl-d70b7d477c148def9a2f86b1e58a9a6eab8eb345.tar.gz