diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-14 21:50:49 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-14 21:50:49 +0000 |
commit | 5fcc867f61c50d5a1d39c892190ca98e4c69a548 (patch) | |
tree | 837d141b35f05577f15bd27abecef3c591c72da5 /ext | |
parent | ba8309365e8a4d351bec425ce6b123adae09dde0 (diff) | |
parent | 311a0942a841b9a310fcf53d05fa79c643b68a12 (diff) | |
download | perl-5fcc867f61c50d5a1d39c892190ca98e4c69a548.tar.gz |
Integrate perlio:
[ 8106]
"Cold" build correction to Encode's Makefile.PL
[ 8105]
"Compiled" encodings.
Correct replacement character in EBCDIC .enc files
Add 0x7F to ASCII repertoire.
[ 8103]
Beginings of compiled encodings - checked in as a snapshot of thoughts
so far and so it does not get lost.
p4raw-link: @8106 on //depot/perlio: 311a0942a841b9a310fcf53d05fa79c643b68a12
p4raw-link: @8105 on //depot/perlio: 2f2b4ff2c154a8e461857f2e82cb815c238d0d94
p4raw-link: @8103 on //depot/perlio: 017e2addf6da99b3f648d9518de5a848be394ab8
p4raw-id: //depot/perl@8114
Diffstat (limited to 'ext')
-rw-r--r-- | ext/Encode/Encode.pm | 8 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 129 | ||||
-rw-r--r-- | ext/Encode/Encode/ascii.enc | 2 | ||||
-rw-r--r-- | ext/Encode/Makefile.PL | 39 | ||||
-rwxr-xr-x | ext/Encode/compile | 308 | ||||
-rw-r--r-- | ext/Encode/encengine.c | 160 | ||||
-rw-r--r-- | ext/Encode/encode.h | 39 |
7 files changed, 681 insertions, 4 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 8ba7232d5e..db471cb059 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -340,9 +340,9 @@ sub from_to return length($_[0] = $string); } -my %encoding = ( Unicode => bless({},'Encode::Unicode'), - 'iso10646-1' => bless({},'Encode::iso10646_1'), - ); +# The global hash is declared in XS code +$encoding{Unicode} = bless({},'Encode::Unicode'); +$encoding{iso10646-1} = bless({},'Encode::iso10646_1'); sub encodings { @@ -378,6 +378,7 @@ sub loadEncoding last unless $type eq '#'; } $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table')); + warn "Loading $file"; return $class->read($fh,$name,$type); } else @@ -568,6 +569,7 @@ sub fromUnicode return $str; } + package Encode::Escape; use Carp; diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 00c830a67d..cca1ddcd7b 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,6 +1,11 @@ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" +#define U8 U8 +#include "encode.h" +#include "iso8859.h" +#include "EBCDIC.h" +#include "Symbols.h" #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \ Perl_croak(aTHX_ "panic_unimplemented"); \ @@ -313,8 +318,129 @@ PerlIO_funcs PerlIO_encode = { }; #endif +void +Encode_Define(pTHX_ encode_t *enc) +{ + HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI); + HV *stash = gv_stashpv("Encode::XS", TRUE); + SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); + hv_store(hash,enc->name,strlen(enc->name),sv,0); +} + void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {} +static SV * +encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src,slen); + SV *dst = sv_2mortal(newSV(2*slen+1)); + if (slen) + { + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); + STRLEN dlen = SvLEN(dst); + int code; + while ((code = do_encode(dir,s,&slen,d,dlen,&dlen))) + { + SvCUR_set(dst,dlen); + SvPOK_on(dst); + switch(code) + { + case ENCODE_NOSPACE: + { + STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN); + if (need <= SvLEN(dst)) + need += UTF8_MAXLEN; + d = (U8 *) SvGROW(dst, need); + dlen = SvLEN(dst); + slen = SvCUR(src); + break; + } + + case ENCODE_NOREP: + if (dir == enc->f_utf8) + { + if (!check && ckWARN_d(WARN_UTF8)) + { + STRLEN clen; + UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0); + Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name); + /* FIXME: Skip over the character, copy in replacement and continue + * but that is messy so for now just fail. + */ + return &PL_sv_undef; + } + else + { + return &PL_sv_undef; + } + } + else + { + /* UTF-8 is supposed to be "Universal" so should not happen */ + Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8", + enc->name, (SvCUR(src)-slen),s+slen); + } + break; + + case ENCODE_PARTIAL: + if (!check && ckWARN_d(WARN_UTF8)) + { + Perl_warner(aTHX_ WARN_UTF8, "Partial %s character", + (dir == enc->f_utf8) ? "UTF-8" : enc->name); + } + return &PL_sv_undef; + + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from",enc->name); + return &PL_sv_undef; + } + } + SvCUR_set(dst,dlen); + SvPOK_on(dst); + if (check) + { + if (slen < SvCUR(src)) + { + Move(s+slen,s,SvCUR(src)-slen,U8); + } + SvCUR_set(src,SvCUR(src)-slen); + } + } + return dst; +} + +MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_ + +PROTOTYPES: ENABLE + +void +Encode_toUnicode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: + { + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + SvUTF8_on(ST(0)); + XSRETURN(1); + } + +void +Encode_fromUnicode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: + { + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + sv_utf8_upgrade(src); + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); + XSRETURN(1); + } + MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE @@ -548,4 +674,7 @@ BOOT: #ifdef USE_PERLIO PerlIO_define_layer(&PerlIO_encode); #endif +#include "iso8859.def" +#include "EBCDIC.def" +#include "Symbols.def" } diff --git a/ext/Encode/Encode/ascii.enc b/ext/Encode/Encode/ascii.enc index e0320b8c58..284a9f51d2 100644 --- a/ext/Encode/Encode/ascii.enc +++ b/ext/Encode/Encode/ascii.enc @@ -9,7 +9,7 @@ S 0040004100420043004400450046004700480049004A004B004C004D004E004F 0050005100520053005400550056005700580059005A005B005C005D005E005F 0060006100620063006400650066006700680069006A006B006C006D006E006F -0070007100720073007400750076007700780079007A007B007C007D007E0000 +0070007100720073007400750076007700780079007A007B007C007D007E007F 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 329937e0e2..f0d57feeca 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -1,7 +1,10 @@ use ExtUtils::MakeMaker; +my @tables = qw(iso8859 EBCDIC Symbols); + WriteMakefile( NAME => "Encode", VERSION_FROM => 'Encode.pm', + OBJECT => '$(O_FILES)', 'dist' => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', @@ -9,3 +12,39 @@ WriteMakefile( }, MAN3PODS => {}, ); + +package MY; + + +sub post_initialize +{ + my ($self) = @_; + push(@{$self->{'O_FILES'}},map("$_\$(OBJ_EXT)",@tables)); + $self->{'clean'}{'FILES'} .= join(' ',map("$_.*",@tables)); + return ''; +} + +sub clean +{ + my ($self) = @_; + return $self->SUPER::clean . qq[ + \$(RM_F) ].join(' ',map("$_.*",@tables))."\n" +} + +sub postamble +{ + return ' + +Encode$(OBJ_EXT) : iso8859$(OBJ_EXT) EBCDIC$(OBJ_EXT) Symbols$(OBJ_EXT) + +iso8859.c : compile Makefile + $(PERL) compile $@ Encode/ascii.enc Encode/iso8859*.enc Encode/cp1250.enc + +EBCDIC.c : compile Makefile Encode/cp1047.enc Encode/cp37.enc Encode/posix-bc.enc + $(PERL) compile $@ Encode/cp1047.enc Encode/cp37.enc Encode/posix-bc.enc + +Symbols.c : compile Makefile Encode/symbol.enc Encode/dingbats.enc + $(PERL) compile $@ Encode/symbol.enc Encode/dingbats.enc + +' +} diff --git a/ext/Encode/compile b/ext/Encode/compile new file mode 100755 index 0000000000..7020b9f186 --- /dev/null +++ b/ext/Encode/compile @@ -0,0 +1,308 @@ +#!../../perl -w +BEGIN { @INC = '../../lib' }; +use strict; + +sub encode_U +{ + my $uv = shift; + if ($uv < 0x80) + { + return chr($uv) + } + if ($uv < 0x800) + { + return chr(($uv >> 6) | 0xC0). + chr(($uv & 0x3F) | 0x80); + } + return chr(($uv >> 12) | 0xE0). + chr((($uv >> 6) & 0x3F) | 0x80). + chr(($uv & 0x3F) | 0x80); +} + +sub encode_S +{ + my ($ch,$page) = @_; + return chr($ch); +} + +sub encode_D +{ + my ($ch,$page) = @_; + return chr($page).chr($ch); +} + +sub encode_M +{ + my ($ch,$page) = @_; + return &encode_D if $page; + return &encode_S; +} + +my $cname = shift(@ARGV); +chmod(0666,$cname) if -f $cname && !-w $cname; +open(C,">$cname") || die "Cannot open $cname:$!"; +my $dname = $cname; +$dname =~ s/(\.[^\.]*)?$/.def/; +chmod(0666,$dname) if -f $cname && !-w $dname; +open(D,">$dname") || die "Cannot open $dname:$!"; +my $hname = $cname; +$hname =~ s/(\.[^\.]*)?$/.h/; +chmod(0666,$hname) if -f $cname && !-w $hname; +open(H,">$hname") || die "Cannot open $hname:$!"; + +if ($cname =~ /(\w+)\.xs$/) + { + print C "#include <EXTERN.h>\n"; + print C "#include <perl.h>\n"; + print C "#include <XSUB.h>\n"; + print C "#define U8 U8\n"; + } +print C "#include \"encode.h\"\n"; + +my %encoding; +my %strings; + +sub cmp_name +{ + if ($a =~ /^.*-(\d+)/) + { + my $an = $1; + if ($b =~ /^.*-(\d+)/) + { + my $r = $an <=> $1; + return $r if $r; + } + } + return $a cmp $b; +} + +foreach my $enc (sort cmp_name @ARGV) + { + my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/; + if (open(E,$enc)) + { + compile(\*E,$name,\*C); + } + else + { + warn "Cannot open $enc for $name:$!"; + } + } + +foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print C "encode_t $sym = \n"; + print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n"; + } + +foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print H "extern encode_t $sym;\n"; + print D " Encode_Define(aTHX_ &$sym);\n"; + } + +if ($cname =~ /(\w+)\.xs$/) + { + my $mod = $1; + print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; + print C "BOOT:\n{\n"; + print C "#include \"$dname\"\n"; + print C "}\n"; + } +close(C); +close(D); +close(H); + + + +sub compile +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + + my $type; + while ($type = <$fh>) + { + last if $type !~ /^\s*#/; + } + chomp($type); + return if $type eq 'E'; + my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); + warn "t=$type s=$sym d=$def p=$pages $name\n"; + my $rep = ''; + { + my $v = hex($def); + no strict 'refs'; + $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe); + } + while ($pages--) + { + my $line = <$fh>; + chomp($line); + my $page = hex($line); + my $ch = 0; + for (my $i = 0; $i < 16; $i++) + { + my $line = <$fh>; + for (my $j = 0; $j < 16; $j++) + { + no strict 'refs'; + my $ech = &{"encode_$type"}($ch,$page); + my $val = hex(substr($line,0,4,'')); + if ($val || (!$ch && !$page)) + { + my $uch = encode_U($val); + enter($e2u,$ech,$uch,$e2u); + enter($u2e,$uch,$ech,$u2e); + } + else + { + # No character at this position + # enter($e2u,$ech,undef,$e2u); + } + $ch++; + } + } + } + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; +} + +sub enter +{ + my ($a,$s,$d,$t) = @_; + $t = $a if @_ < 4; + my $b = substr($s,0,1); + my $e = $a->{$b}; + unless ($e) + { # 0 1 2 3 4 5 + $e = [$b,$b,'',{},length($s),0]; + $a->{$b} = $e; + } + if (length($s) > 1) + { + enter($e->[3],substr($s,1),$d,$t); + } + else + { + $e->[2] = $d; + $e->[3] = $t; + $e->[5] = length($d); + } +} + + +sub outstring +{ + my ($fh,$name,$s) = @_; + my $sym = $strings{$s}; + unless ($sym) + { + foreach my $o (keys %strings) + { + my $i = index($o,$s); + if ($i >= 0) + { + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + return $sym; + } + } + $strings{$s} = $sym = $name; + printf $fh "static const U8 %s[%d] =\n",$name,length($s); + # Do in chunks of 16 chars to constrain line length + # Assumes ANSI C adjacent string litteral concatenation + while (length($s)) + { + my $c = substr($s,0,16,''); + print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"'; + print $fh "\n" if length($s); + } + printf $fh ";\n\n"; + } + return $sym; +} + + + +sub output +{ + my ($fh,$name,$a) = @_; + $name =~ s/\W+/_/g; + $a->{Cname} = $name; + my @keys = grep(ref($a->{$_}),sort keys %$a); + print $fh "\nstatic encpage_t $name\[\];\n"; + # Sub-tables + my %str; + my $l; + my @ent; + foreach my $b (@keys) + { + my ($s,$f,$out,$t,$end) = @{$a->{$b}}; + if (defined($l) && + ord($b) == ord($a->{$l}[1])+1 && + $a->{$l}[3] == $a->{$b}[3] && + $a->{$l}[4] == $a->{$b}[4] && + $a->{$l}[5] == $a->{$b}[5] + # && length($a->{$l}[2]) < 16 + ) + { + my $i = ord($b)-ord($a->{$l}[0]); + $a->{$l}[1] = $b; + $a->{$l}[2] .= $a->{$b}[2]; + } + else + { + $l = $b; + push(@ent,$b); + } + unless (exists $t->{Cname}) + { + output($fh,sprintf("%s_%02x",$name,ord($s)),$t); + } + } + if (ord($keys[-1]) < 255) + { + my $t = chr(ord($keys[-1])+1); + $a->{$t} = [$t,chr(255),undef,$a,0,0]; + push(@ent,$t); + } + # String tables + foreach my $b (@ent) + { + next unless $a->{$b}[5]; + my $s = ord($a->{$b}[0]); + my $e = ord($a->{$b}[1]); + outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]); + } + + print $fh "\n"; + print $fh "static encpage_t $name\[",scalar(@ent),"] = {\n"; + foreach my $b (@ent) + { + my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + my $sc = ord($s); + my $ec = ord($e); + print $fh "{"; + if ($l) + { + printf $fh outstring($fh,'',$out); + } + else + { + print $fh "0"; + } + print $fh ",",$t->{Cname}; + printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; + } + print $fh "};\n\n"; +} + + diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c new file mode 100644 index 0000000000..f31725094d --- /dev/null +++ b/ext/Encode/encengine.c @@ -0,0 +1,160 @@ +/* +Data structures for encoding transformations. + +Perl works internally in either a native 'byte' encoding or +in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" +representation. When we do we can use utf8_to_uv(). + +Most character encodings are either simple byte mappings or +variable length multi-byte encodings. UTF-8 can be viewed as a +rather extreme case of the latter. + +So to solve an important part of perl's encode needs we need to solve the +"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate +case. (Where one of multi-bytes will usually be UTF-8.) + +The other type of encoding is a shift encoding where a prefix sequence +determines what subsequent bytes mean. Such encodings have state. + +We also need to handle case where a character in one encoding has to be +represented as multiple characters in the other. e.g. letter+diacritic. + +The process can be considered as pseudo perl: + +my $dst = ''; +while (length($src)) + { + my $size = $count($src); + my $in_seq = substr($src,0,$size,''); + my $out_seq = $s2d_hash{$in_seq}; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + } +return $dst; + +That has the following components: + &src_count - a "rule" for how many bytes make up the next character in the + source. + %s2d_hash - a mapping from input sequences to output sequences + +The problem with that scheme is that it does not allow the output +character repertoire to affect the characters considered from the +input. + +So we use a "trie" representation which can also be considered +a state machine: + +my $dst = ''; +my $seq = \@s2d_seq; +my $next = \@s2d_next; +while (length($src)) + { + my $byte = $substr($src,0,1,''); + my $out_seq = $seq->[$byte]; + if (defined $out_seq) + { + $dst .= $out_seq; + } + else + { + # an error condition + } + ($next,$seq) = @$next->[$byte] if $next; + } +return $dst; + +There is now a pair of data structures to represent everything. +It is valid for output sequence at a particular point to +be defined but zero length, that just means "don't know yet". +For the single byte case there is no 'next' so new tables will be the same as +the original tables. For a multi-byte case a prefix byte will flip to the tables +for the next page (adding nothing to the output), then the tables for the page +will provide the actual output and set tables back to original base page. + +This scheme can also handle shift encodings. + +A slight enhancement to the scheme also allows for look-ahead - if +we add a flag to re-add the removed byte to the source we could handle + a" -> ä + ab -> a (and take b back please) + +*/ + +#include <EXTERN.h> +#include <perl.h> +#define U8 U8 +#include "encode.h" + +int +do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout) +{ + const U8 *s = src; + const U8 *send = s+*slen; + const U8 *last = s; + U8 *d = dst; + U8 *dend = d+dlen; + int code = 0; + while (s < send) + { + encpage_t *e = enc; + U8 byte = *s; + while (byte > e->max) + e++; + if (byte >= e->min && e->slen) + { + const U8 *cend = s + e->slen; + if (cend <= send) + { + STRLEN n; + if ((n = e->dlen)) + { + const U8 *out = e->seq+n*(byte - e->min); + U8 *oend = d+n; + if (dst) + { + if (oend <= dend) + { + while (d < oend) + *d++ = *out++; + } + else + { + /* Out of space */ + code = ENCODE_NOSPACE; + break; + } + } + else + d = oend; + } + enc = e->next; + s++; + if (s == cend) + last = s; + } + else + { + /* partial source character */ + code = ENCODE_PARTIAL; + break; + } + } + else + { + /* Cannot represent */ + code = ENCODE_NOREP; + break; + } + } + *slen = last - src; + *dout = d - dst; + return code; +} + + diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h new file mode 100644 index 0000000000..604b97f99d --- /dev/null +++ b/ext/Encode/encode.h @@ -0,0 +1,39 @@ +#ifndef ENCODE_H +#define ENCODE_H +#ifndef U8 +typedef unsigned char U8; +#endif + +typedef struct encpage_s encpage_t; + +struct encpage_s +{ + const U8 *seq; + encpage_t *next; + U8 min; + U8 max; + U8 dlen; + U8 slen; +}; + +typedef struct encode_s encode_t; +struct encode_s +{ + const char *name; + encpage_t *t_utf8; + encpage_t *f_utf8; + const U8 *rep; + int replen; +}; + +#ifdef U8 +extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, + U8 *dst, STRLEN dlen, STRLEN *dout); + +extern void Encode_DefineEncoding(encode_t *enc); +#endif + +#define ENCODE_NOSPACE 1 +#define ENCODE_PARTIAL 2 +#define ENCODE_NOREP 3 +#endif |