Integrate perlio:

[ 8290] Loose the "Loading..." warning. Another bug in fallback support [ 8288] Fix .ucm reading - forgot a chr() Start of .ucm write (for .enc to .ucm) [ 8285] Start of support of ICU-style .ucm files: - teach compile how to read a .ucm file - first guess at how to represent fallbacks in "tries". - use fallbacks if check == 0 - new return code to indicate we used one. p4raw-link: @8290 on //depot/perlio: c8991b40a4a3c9ec8efc70a2420a12ff1ce50eb0 p4raw-link: @8288 on //depot/perlio: e0c49a6baed38305536b5d74ba7486451327612c p4raw-link: @8285 on //depot/perlio: 9b37254de3a0e643859aebea34267588f789f15f p4raw-id: //depot/perl@8292
author: Jarkko Hietaniemi <jhi@iki.fi> 2001-01-01 18:11:44 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2001-01-01 18:11:44 +0000
commit: 062365604b359a1490884f0ddc7e02f237691b4d (patch)
tree: efb04757523e81d0430d8ddf3f05680f146169e8 /ext/Encode
parent: ab648d5eb0a7286231b7140c37b59641898641f5 (diff)
parent: c8991b40a4a3c9ec8efc70a2420a12ff1ce50eb0 (diff)
download: perl-062365604b359a1490884f0ddc7e02f237691b4d.tar.gz
5 files changed, 263 insertions, 66 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 0486a68c85..42c9e8471a 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -342,7 +342,7 @@ sub from_to
 
 # The global hash is declared in XS code
 $encoding{Unicode}    = bless({},'Encode::Unicode');
-$encoding{iso10646-1} = bless({},'Encode::iso10646_1');
+$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1');
 
 sub encodings
 {
@@ -378,7 +378,7 @@ sub loadEncoding
      last unless $type eq '#';
     }
    $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table'));
-   # warn "Loading $file";
+   #warn "Loading $file";
    return $class->read($fh,$name,$type);
   }
  else
@@ -408,7 +408,8 @@ sub getEncoding
 
 package Encode::Unicode;
 
-# Dummy package that provides the encode interface
+# Dummy package that provides the encode interface but leaves data
+# as UTF-8 encoded. It is here so that from_to()
 
 sub name { 'Unicode' }
 
@@ -533,7 +534,9 @@ sub fromUnicode
  return $str;
 }
 
-package Encode::iso10646_1;#
+package Encode::iso10646_1;
+# Encoding is 16-bit network order Unicode
+# Used for X font encodings
 
 sub name { 'iso10646-1' }
 
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cca1ddcd7b..b61d89bd96 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -340,10 +340,14 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
    U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
    STRLEN dlen = SvLEN(dst);
    int code;
-   while ((code = do_encode(dir,s,&slen,d,dlen,&dlen)))
+   while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check)))
     {
      SvCUR_set(dst,dlen);
      SvPOK_on(dst);
+
+     if (code == ENCODE_FALLBACK)
+      break;
+
      switch(code)
       {
        case ENCODE_NOSPACE:
diff --git a/ext/Encode/compile b/ext/Encode/compile
index fbb08cd2e9..b890a04d81 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -4,7 +4,7 @@ use strict;
 
 sub encode_U
 {
- # UTF-8 encocde long hand - only covers part of perl's range
+ # UTF-8 encode long hand - only covers part of perl's range
  my $uv = shift;
  if ($uv < 0x80)
   {
@@ -50,32 +50,47 @@ chmod(0666,$cname) if -f $cname && !-w $cname;
 open(C,">$cname") || die "Cannot open $cname:$!";
 my $dname = $cname;
 $dname =~ s/(\.[^\.]*)?$/.def/;
-chmod(0666,$dname) if -f $cname && !-w $dname;
-open(D,">$dname") || die "Cannot open $dname:$!";
-my $hname = $cname;
-$hname =~ s/(\.[^\.]*)?$/.h/;
-chmod(0666,$hname) if -f $cname && !-w $hname;
-open(H,">$hname") || die "Cannot open $hname:$!";
-
-foreach my $fh (\*C,\*D,\*H)
-{
- print $fh <<"END";
+
+my ($doC,$doEnc,$doUcm);
+
+if ($cname =~ /\.(c|xs)$/)
+ {
+  $doC = 1;
+  chmod(0666,$dname) if -f $cname && !-w $dname;
+  open(D,">$dname") || die "Cannot open $dname:$!";
+  my $hname = $cname;
+  $hname =~ s/(\.[^\.]*)?$/.h/;
+  chmod(0666,$hname) if -f $cname && !-w $hname;
+  open(H,">$hname") || die "Cannot open $hname:$!";
+
+  foreach my $fh (\*C,\*D,\*H)
+  {
+   print $fh <<"END";
 /*
  !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
  This file was autogenerated by:
  $^X $0 $cname @ARGV
 */
 END
-}
+  }
 
-if ($cname =~ /(\w+)\.xs$/)
+  if ($cname =~ /(\w+)\.xs$/)
+   {
+    print C "#include <EXTERN.h>\n";
+    print C "#include <perl.h>\n";
+    print C "#include <XSUB.h>\n";
+    print C "#define U8 U8\n";
+   }
+  print C "#include \"encode.h\"\n";
+ }
+elsif ($cname =~ /\.enc$/)
+ {
+  $doEnc = 1;
+ }
+elsif ($cname =~ /\.ucm$/)
  {
-  print C "#include <EXTERN.h>\n";
-  print C "#include <perl.h>\n";
-  print C "#include <XSUB.h>\n";
-  print C "#define U8 U8\n";
+  $doUcm = 1;
  }
-print C "#include \"encode.h\"\n";
 
 my %encoding;
 my %strings;
@@ -96,10 +111,17 @@ sub cmp_name
 
 foreach my $enc (sort cmp_name @ARGV)
  {
-  my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/;
+  my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
   if (open(E,$enc))
    {
-    compile_enc(\*E,lc($name),\*C);
+    if ($sfx eq 'enc')
+     {
+      compile_enc(\*E,lc($name),\*C);
+     }
+    else
+     {
+      compile_ucm(\*E,lc($name),\*C);
+     }
    }
   else
    {
@@ -107,33 +129,127 @@ foreach my $enc (sort cmp_name @ARGV)
    }
  }
 
-foreach my $enc (sort cmp_name keys %encoding)
+if ($doC)
  {
-  my $sym = "${enc}_encoding";
-  $sym =~ s/\W+/_/g;
-  print C "encode_t $sym = \n";
-  print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n";
- }
+  foreach my $enc (sort cmp_name keys %encoding)
+   {
+    my $sym = "${enc}_encoding";
+    $sym =~ s/\W+/_/g;
+    print C "encode_t $sym = \n";
+    print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n";
+   }
 
-foreach my $enc (sort cmp_name keys %encoding)
- {
-  my $sym = "${enc}_encoding";
-  $sym =~ s/\W+/_/g;
-  print H "extern encode_t $sym;\n";
-  print D " Encode_Define(aTHX_ &$sym);\n";
- }
+  foreach my $enc (sort cmp_name keys %encoding)
+   {
+    my $sym = "${enc}_encoding";
+    $sym =~ s/\W+/_/g;
+    print H "extern encode_t $sym;\n";
+    print D " Encode_Define(aTHX_ &$sym);\n";
+   }
 
-if ($cname =~ /(\w+)\.xs$/)
- {
-  my $mod = $1;
-  print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
-  print C "BOOT:\n{\n";
-  print C "#include \"$dname\"\n";
-  print C "}\n";
+  if ($cname =~ /(\w+)\.xs$/)
+   {
+    my $mod = $1;
+    print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
+    print C "BOOT:\n{\n";
+    print C "#include \"$dname\"\n";
+    print C "}\n";
+   }
+  close(D);
+  close(H);
  }
 close(C);
-close(D);
-close(H);
+
+
+sub compile_ucm
+{
+ my ($fh,$name,$ch) = @_;
+ my $e2u = {};
+ my $u2e = {};
+ my $cs;
+ my %attr;
+ while (<$fh>)
+  {
+   s/#.*$//;
+   last if /^\s*CHARMAP\s*$/i;
+   if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i)
+    {
+     $attr{$1} = $2;
+    }
+  }
+ if (!defined($cs =  $attr{'code_set_name'}))
+  {
+   warn "No <code_set_name> in $name\n";
+  }
+ else
+  {
+   # $name = lc($cs);
+  }
+ my $erep;
+ my $urep;
+ if (exists $attr{'subchar'})
+  {
+   my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/;
+   $erep = join('',map(hex($_),@byte));
+  }
+ warn "Scanning $name ($cs)\n";
+ my $nfb = 0;
+ my $hfb = 0;
+ while (<$fh>)
+  {
+   s/#.*$//;
+   last if /^\s*END\s+CHARMAP\s*$/i;
+   next if /^\s*$/;
+   my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i;
+   my $fb = pop(@byte);
+   if (defined($u))
+    {
+     my $uch = encode_U(hex($u));
+     my $ech = join('',map(chr(hex($_)),@byte));
+     if (length($fb))
+      {
+       $fb = substr($fb,1);
+       $hfb++;
+      }
+     else
+      {
+       $nfb++;
+       $fb = '0';
+      }
+     # $fb is fallback flag
+     # 0 - round trip safe
+     # 1 - fallback for unicode -> enc
+     # 2 - skip sub-char mapping
+     # 3 - fallback enc -> unicode
+     enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
+     enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
+    }
+   else
+    {
+     warn $_;
+    }
+
+  }
+ if ($nfb && $hfb)
+  {
+   die "$nfb entries without fallback, $hfb entries with\n";
+  }
+ if ($doC)
+  {
+   output($ch,$name.'_utf8',$e2u);
+   output($ch,'utf8_'.$name,$u2e);
+   $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
+                       outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)];
+  }
+ elsif ($doEnc)
+  {
+   output_enc($ch,$name,$e2u);
+  }
+ elsif ($doUcm)
+  {
+   output_ucm($ch,$name,$u2e);
+  }
+}
 
 sub compile_enc
 {
@@ -173,8 +289,8 @@ sub compile_enc
        if ($val || (!$ch && !$page))
         {
          my $uch = encode_U($val);
-         enter($e2u,$ech,$uch,$e2u);
-         enter($u2e,$uch,$ech,$u2e);
+         enter($e2u,$ech,$uch,$e2u,0);
+         enter($u2e,$uch,$ech,$u2e,0);
         }
        else
         {
@@ -185,26 +301,37 @@ sub compile_enc
       }
     }
   }
- output($ch,$name.'_utf8',$e2u);
- output($ch,'utf8_'.$name,$u2e);
- $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
-                     outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)];
+ if ($doC)
+  {
+   output($ch,$name.'_utf8',$e2u);
+   output($ch,'utf8_'.$name,$u2e);
+   $encoding{$name} = [$e2u->{Cname},$u2e->{Cname},
+                       outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)];
+  }
+ elsif ($doEnc)
+  {
+   output_enc($ch,$name,$e2u);
+  }
+ elsif ($doUcm)
+  {
+   output_ucm($ch,$name,$u2e);
+  }
 }
 
 sub enter
 {
- my ($a,$s,$d,$t) = @_;
+ my ($a,$s,$d,$t,$fb) = @_;
  $t = $a if @_ < 4;
  my $b = substr($s,0,1);
  my $e = $a->{$b};
  unless ($e)
   {     # 0  1  2  3         4  5
-   $e = [$b,$b,'',{},length($s),0];
+   $e = [$b,$b,'',{},length($s),0,$fb];
    $a->{$b} = $e;
   }
  if (length($s) > 1)
   {
-   enter($e->[3],substr($s,1),$d,$t);
+   enter($e->[3],substr($s,1),$d,$t,$fb);
   }
  else
   {
@@ -260,7 +387,8 @@ sub process
        ord($b) == ord($a->{$l}[1])+1 &&
        $a->{$l}[3] == $a->{$b}[3] &&
        $a->{$l}[4] == $a->{$b}[4] &&
-       $a->{$l}[5] == $a->{$b}[5]
+       $a->{$l}[5] == $a->{$b}[5] &&
+       $a->{$l}[6] == $a->{$b}[6]
        # && length($a->{$l}[2]) < 16
       )
     {
@@ -316,9 +444,10 @@ sub outtable
  print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
  foreach my $b (@{$a->{'Entries'}})
   {
-   my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}};
+   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
    my $sc = ord($s);
    my $ec = ord($e);
+   $end |= 0x80 if $fb;
    print  $fh "{";
    if ($l)
     {
@@ -342,4 +471,60 @@ sub output
  outtable($fh,$a);
 }
 
+sub output_enc
+{
+ my ($fh,$name,$a) = @_;
+ foreach my $b (sort keys %$a)
+  {
+   my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
+  }
+}
+
+sub decode_U
+{
+ my $s = shift;
+
+}
+
+
+sub output_ucm_page
+{
+ my ($fh,$a,$t,$pre) = @_;
+ # warn sprintf("Page %x\n",$pre);
+ foreach my $b (sort keys %$t)
+  {
+   my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}};
+   die "oops $s $e" unless $s eq $e;
+   my $u = ord($s);
+   if ($n != $a && $n != $t)
+    {
+     output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF);
+    }
+   elsif (length($out))
+    {
+     if ($pre)
+      {
+       $u = $pre|($u &0x3f);
+      }
+     printf $fh "<U%04X> ",$u;
+     foreach my $c (split(//,$out))
+      {
+       printf $fh "\\x%02X",ord($c);
+      }
+     printf $fh " |%d\n",($fb ? 1 : 0);
+    }
+   else
+    {
+     warn join(',',@{$t->{$b}},$a,$t);
+    }
+  }
+}
+
+sub output_ucm
+{
+ my ($fh,$name,$a) = @_;
+ print $fh "CHARMAP\n";
+ output_ucm_page($fh,$a,$a,0);
+ print $fh "END CHARMAP\n";
+}
 
diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c
index f31725094d..513ef9ac5b 100644
--- a/ext/Encode/encengine.c
+++ b/ext/Encode/encengine.c
@@ -92,7 +92,7 @@ we add a flag to re-add the removed byte to the source we could handle
 #include "encode.h"
 
 int
-do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout)
+do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx)
 {
  const U8 *s    = src;
  const U8 *send = s+*slen;
@@ -106,9 +106,9 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
    U8 byte = *s;
    while (byte > e->max)
     e++;
-   if (byte >= e->min && e->slen)
+   if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80)))
     {
-     const U8 *cend = s + e->slen;
+     const U8 *cend = s + (e->slen & 0x7f);
      if (cend <= send)
       {
        STRLEN n;
@@ -136,7 +136,11 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR
        enc = e->next;
        s++;
        if (s == cend)
-        last = s;
+        {
+         if (approx && (e->slen & 0x80))
+          code = ENCODE_FALLBACK;
+         last = s;
+        }
       }
      else
       {
diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h
index 604b97f99d..853ad041b4 100644
--- a/ext/Encode/encode.h
+++ b/ext/Encode/encode.h
@@ -28,12 +28,13 @@ struct encode_s
 
 #ifdef U8
 extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen,
-                     U8 *dst, STRLEN dlen, STRLEN *dout);
+                     U8 *dst, STRLEN dlen, STRLEN *dout, int approx);
 
 extern void Encode_DefineEncoding(encode_t *enc);
 #endif
 
-#define ENCODE_NOSPACE 1
-#define ENCODE_PARTIAL 2
-#define ENCODE_NOREP   3
+#define ENCODE_NOSPACE  1
+#define ENCODE_PARTIAL  2
+#define ENCODE_NOREP    3
+#define ENCODE_FALLBACK 4
 #endif
author	Jarkko Hietaniemi <jhi@iki.fi>	2001-01-01 18:11:44 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2001-01-01 18:11:44 +0000
commit	062365604b359a1490884f0ddc7e02f237691b4d (patch)
tree	efb04757523e81d0430d8ddf3f05680f146169e8 /ext/Encode
parent	ab648d5eb0a7286231b7140c37b59641898641f5 (diff)
parent	c8991b40a4a3c9ec8efc70a2420a12ff1ce50eb0 (diff)
download	perl-062365604b359a1490884f0ddc7e02f237691b4d.tar.gz