diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2001-01-01 18:11:44 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2001-01-01 18:11:44 +0000 |
commit | 062365604b359a1490884f0ddc7e02f237691b4d (patch) | |
tree | efb04757523e81d0430d8ddf3f05680f146169e8 /ext/Encode | |
parent | ab648d5eb0a7286231b7140c37b59641898641f5 (diff) | |
parent | c8991b40a4a3c9ec8efc70a2420a12ff1ce50eb0 (diff) | |
download | perl-062365604b359a1490884f0ddc7e02f237691b4d.tar.gz |
Integrate perlio:
[ 8290]
Loose the "Loading..." warning.
Another bug in fallback support
[ 8288]
Fix .ucm reading - forgot a chr()
Start of .ucm write (for .enc to .ucm)
[ 8285]
Start of support of ICU-style .ucm files:
- teach compile how to read a .ucm file
- first guess at how to represent fallbacks in "tries".
- use fallbacks if check == 0
- new return code to indicate we used one.
p4raw-link: @8290 on //depot/perlio: c8991b40a4a3c9ec8efc70a2420a12ff1ce50eb0
p4raw-link: @8288 on //depot/perlio: e0c49a6baed38305536b5d74ba7486451327612c
p4raw-link: @8285 on //depot/perlio: 9b37254de3a0e643859aebea34267588f789f15f
p4raw-id: //depot/perl@8292
Diffstat (limited to 'ext/Encode')
-rw-r--r-- | ext/Encode/Encode.pm | 11 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 6 | ||||
-rwxr-xr-x | ext/Encode/compile | 291 | ||||
-rw-r--r-- | ext/Encode/encengine.c | 12 | ||||
-rw-r--r-- | ext/Encode/encode.h | 9 |
5 files changed, 263 insertions, 66 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 0486a68c85..42c9e8471a 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -342,7 +342,7 @@ sub from_to # The global hash is declared in XS code $encoding{Unicode} = bless({},'Encode::Unicode'); -$encoding{iso10646-1} = bless({},'Encode::iso10646_1'); +$encoding{'iso10646-1'} = bless({},'Encode::iso10646_1'); sub encodings { @@ -378,7 +378,7 @@ sub loadEncoding last unless $type eq '#'; } $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table')); - # warn "Loading $file"; + #warn "Loading $file"; return $class->read($fh,$name,$type); } else @@ -408,7 +408,8 @@ sub getEncoding package Encode::Unicode; -# Dummy package that provides the encode interface +# Dummy package that provides the encode interface but leaves data +# as UTF-8 encoded. It is here so that from_to() sub name { 'Unicode' } @@ -533,7 +534,9 @@ sub fromUnicode return $str; } -package Encode::iso10646_1;# +package Encode::iso10646_1; +# Encoding is 16-bit network order Unicode +# Used for X font encodings sub name { 'iso10646-1' } diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index cca1ddcd7b..b61d89bd96 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -340,10 +340,14 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) U8 *d = (U8 *) SvGROW(dst, 2*slen+1); STRLEN dlen = SvLEN(dst); int code; - while ((code = do_encode(dir,s,&slen,d,dlen,&dlen))) + while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check))) { SvCUR_set(dst,dlen); SvPOK_on(dst); + + if (code == ENCODE_FALLBACK) + break; + switch(code) { case ENCODE_NOSPACE: diff --git a/ext/Encode/compile b/ext/Encode/compile index fbb08cd2e9..b890a04d81 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -4,7 +4,7 @@ use strict; sub encode_U { - # UTF-8 encocde long hand - only covers part of perl's range + # UTF-8 encode long hand - only covers part of perl's range my $uv = shift; if ($uv < 0x80) { @@ -50,32 +50,47 @@ chmod(0666,$cname) if -f $cname && !-w $cname; open(C,">$cname") || die "Cannot open $cname:$!"; my $dname = $cname; $dname =~ s/(\.[^\.]*)?$/.def/; -chmod(0666,$dname) if -f $cname && !-w $dname; -open(D,">$dname") || die "Cannot open $dname:$!"; -my $hname = $cname; -$hname =~ s/(\.[^\.]*)?$/.h/; -chmod(0666,$hname) if -f $cname && !-w $hname; -open(H,">$hname") || die "Cannot open $hname:$!"; - -foreach my $fh (\*C,\*D,\*H) -{ - print $fh <<"END"; + +my ($doC,$doEnc,$doUcm); + +if ($cname =~ /\.(c|xs)$/) + { + $doC = 1; + chmod(0666,$dname) if -f $cname && !-w $dname; + open(D,">$dname") || die "Cannot open $dname:$!"; + my $hname = $cname; + $hname =~ s/(\.[^\.]*)?$/.h/; + chmod(0666,$hname) if -f $cname && !-w $hname; + open(H,">$hname") || die "Cannot open $hname:$!"; + + foreach my $fh (\*C,\*D,\*H) + { + print $fh <<"END"; /* !!!!!!! DO NOT EDIT THIS FILE !!!!!!! This file was autogenerated by: $^X $0 $cname @ARGV */ END -} + } -if ($cname =~ /(\w+)\.xs$/) + if ($cname =~ /(\w+)\.xs$/) + { + print C "#include <EXTERN.h>\n"; + print C "#include <perl.h>\n"; + print C "#include <XSUB.h>\n"; + print C "#define U8 U8\n"; + } + print C "#include \"encode.h\"\n"; + } +elsif ($cname =~ /\.enc$/) + { + $doEnc = 1; + } +elsif ($cname =~ /\.ucm$/) { - print C "#include <EXTERN.h>\n"; - print C "#include <perl.h>\n"; - print C "#include <XSUB.h>\n"; - print C "#define U8 U8\n"; + $doUcm = 1; } -print C "#include \"encode.h\"\n"; my %encoding; my %strings; @@ -96,10 +111,17 @@ sub cmp_name foreach my $enc (sort cmp_name @ARGV) { - my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/; + my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; if (open(E,$enc)) { - compile_enc(\*E,lc($name),\*C); + if ($sfx eq 'enc') + { + compile_enc(\*E,lc($name),\*C); + } + else + { + compile_ucm(\*E,lc($name),\*C); + } } else { @@ -107,33 +129,127 @@ foreach my $enc (sort cmp_name @ARGV) } } -foreach my $enc (sort cmp_name keys %encoding) +if ($doC) { - my $sym = "${enc}_encoding"; - $sym =~ s/\W+/_/g; - print C "encode_t $sym = \n"; - print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n"; - } + foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print C "encode_t $sym = \n"; + print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n"; + } -foreach my $enc (sort cmp_name keys %encoding) - { - my $sym = "${enc}_encoding"; - $sym =~ s/\W+/_/g; - print H "extern encode_t $sym;\n"; - print D " Encode_Define(aTHX_ &$sym);\n"; - } + foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print H "extern encode_t $sym;\n"; + print D " Encode_Define(aTHX_ &$sym);\n"; + } -if ($cname =~ /(\w+)\.xs$/) - { - my $mod = $1; - print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; - print C "BOOT:\n{\n"; - print C "#include \"$dname\"\n"; - print C "}\n"; + if ($cname =~ /(\w+)\.xs$/) + { + my $mod = $1; + print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; + print C "BOOT:\n{\n"; + print C "#include \"$dname\"\n"; + print C "}\n"; + } + close(D); + close(H); } close(C); -close(D); -close(H); + + +sub compile_ucm +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + my $cs; + my %attr; + while (<$fh>) + { + s/#.*$//; + last if /^\s*CHARMAP\s*$/i; + if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) + { + $attr{$1} = $2; + } + } + if (!defined($cs = $attr{'code_set_name'})) + { + warn "No <code_set_name> in $name\n"; + } + else + { + # $name = lc($cs); + } + my $erep; + my $urep; + if (exists $attr{'subchar'}) + { + my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/; + $erep = join('',map(hex($_),@byte)); + } + warn "Scanning $name ($cs)\n"; + my $nfb = 0; + my $hfb = 0; + while (<$fh>) + { + s/#.*$//; + last if /^\s*END\s+CHARMAP\s*$/i; + next if /^\s*$/; + my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i; + my $fb = pop(@byte); + if (defined($u)) + { + my $uch = encode_U(hex($u)); + my $ech = join('',map(chr(hex($_)),@byte)); + if (length($fb)) + { + $fb = substr($fb,1); + $hfb++; + } + else + { + $nfb++; + $fb = '0'; + } + # $fb is fallback flag + # 0 - round trip safe + # 1 - fallback for unicode -> enc + # 2 - skip sub-char mapping + # 3 - fallback enc -> unicode + enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); + enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); + } + else + { + warn $_; + } + + } + if ($nfb && $hfb) + { + die "$nfb entries without fallback, $hfb entries with\n"; + } + if ($doC) + { + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)]; + } + elsif ($doEnc) + { + output_enc($ch,$name,$e2u); + } + elsif ($doUcm) + { + output_ucm($ch,$name,$u2e); + } +} sub compile_enc { @@ -173,8 +289,8 @@ sub compile_enc if ($val || (!$ch && !$page)) { my $uch = encode_U($val); - enter($e2u,$ech,$uch,$e2u); - enter($u2e,$uch,$ech,$u2e); + enter($e2u,$ech,$uch,$e2u,0); + enter($u2e,$uch,$ech,$u2e,0); } else { @@ -185,26 +301,37 @@ sub compile_enc } } } - output($ch,$name.'_utf8',$e2u); - output($ch,'utf8_'.$name,$u2e); - $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, - outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; + if ($doC) + { + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; + } + elsif ($doEnc) + { + output_enc($ch,$name,$e2u); + } + elsif ($doUcm) + { + output_ucm($ch,$name,$u2e); + } } sub enter { - my ($a,$s,$d,$t) = @_; + my ($a,$s,$d,$t,$fb) = @_; $t = $a if @_ < 4; my $b = substr($s,0,1); my $e = $a->{$b}; unless ($e) { # 0 1 2 3 4 5 - $e = [$b,$b,'',{},length($s),0]; + $e = [$b,$b,'',{},length($s),0,$fb]; $a->{$b} = $e; } if (length($s) > 1) { - enter($e->[3],substr($s,1),$d,$t); + enter($e->[3],substr($s,1),$d,$t,$fb); } else { @@ -260,7 +387,8 @@ sub process ord($b) == ord($a->{$l}[1])+1 && $a->{$l}[3] == $a->{$b}[3] && $a->{$l}[4] == $a->{$b}[4] && - $a->{$l}[5] == $a->{$b}[5] + $a->{$l}[5] == $a->{$b}[5] && + $a->{$l}[6] == $a->{$b}[6] # && length($a->{$l}[2]) < 16 ) { @@ -316,9 +444,10 @@ sub outtable print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; foreach my $b (@{$a->{'Entries'}}) { - my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; my $sc = ord($s); my $ec = ord($e); + $end |= 0x80 if $fb; print $fh "{"; if ($l) { @@ -342,4 +471,60 @@ sub output outtable($fh,$a); } +sub output_enc +{ + my ($fh,$name,$a) = @_; + foreach my $b (sort keys %$a) + { + my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; + } +} + +sub decode_U +{ + my $s = shift; + +} + + +sub output_ucm_page +{ + my ($fh,$a,$t,$pre) = @_; + # warn sprintf("Page %x\n",$pre); + foreach my $b (sort keys %$t) + { + my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}}; + die "oops $s $e" unless $s eq $e; + my $u = ord($s); + if ($n != $a && $n != $t) + { + output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF); + } + elsif (length($out)) + { + if ($pre) + { + $u = $pre|($u &0x3f); + } + printf $fh "<U%04X> ",$u; + foreach my $c (split(//,$out)) + { + printf $fh "\\x%02X",ord($c); + } + printf $fh " |%d\n",($fb ? 1 : 0); + } + else + { + warn join(',',@{$t->{$b}},$a,$t); + } + } +} + +sub output_ucm +{ + my ($fh,$name,$a) = @_; + print $fh "CHARMAP\n"; + output_ucm_page($fh,$a,$a,0); + print $fh "END CHARMAP\n"; +} diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c index f31725094d..513ef9ac5b 100644 --- a/ext/Encode/encengine.c +++ b/ext/Encode/encengine.c @@ -92,7 +92,7 @@ we add a flag to re-add the removed byte to the source we could handle #include "encode.h" int -do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout) +do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx) { const U8 *s = src; const U8 *send = s+*slen; @@ -106,9 +106,9 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR U8 byte = *s; while (byte > e->max) e++; - if (byte >= e->min && e->slen) + if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { - const U8 *cend = s + e->slen; + const U8 *cend = s + (e->slen & 0x7f); if (cend <= send) { STRLEN n; @@ -136,7 +136,11 @@ do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STR enc = e->next; s++; if (s == cend) - last = s; + { + if (approx && (e->slen & 0x80)) + code = ENCODE_FALLBACK; + last = s; + } } else { diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h index 604b97f99d..853ad041b4 100644 --- a/ext/Encode/encode.h +++ b/ext/Encode/encode.h @@ -28,12 +28,13 @@ struct encode_s #ifdef U8 extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, - U8 *dst, STRLEN dlen, STRLEN *dout); + U8 *dst, STRLEN dlen, STRLEN *dout, int approx); extern void Encode_DefineEncoding(encode_t *enc); #endif -#define ENCODE_NOSPACE 1 -#define ENCODE_PARTIAL 2 -#define ENCODE_NOREP 3 +#define ENCODE_NOSPACE 1 +#define ENCODE_PARTIAL 2 +#define ENCODE_NOREP 3 +#define ENCODE_FALLBACK 4 #endif |