diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2000-12-14 20:09:37 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2000-12-14 20:09:37 +0000 |
commit | 2f2b4ff2c154a8e461857f2e82cb815c238d0d94 (patch) | |
tree | bb042541ca37290e3cf81c34f7bdf6faf3301f7d | |
parent | 5689539bfd8a909e5d9e6f7106c2ee986b0fc447 (diff) | |
download | perl-2f2b4ff2c154a8e461857f2e82cb815c238d0d94.tar.gz |
"Compiled" encodings.
Correct replacement character in EBCDIC .enc files
Add 0x7F to ASCII repertoire.
p4raw-id: //depot/perlio@8105
-rw-r--r-- | ext/Encode/Encode.pm | 8 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 129 | ||||
-rw-r--r-- | ext/Encode/Encode/ascii.enc | 2 | ||||
-rw-r--r-- | ext/Encode/Encode/cp1047.enc | 2 | ||||
-rw-r--r-- | ext/Encode/Encode/cp37.enc | 2 | ||||
-rw-r--r-- | ext/Encode/Encode/posix-bc.enc | 2 | ||||
-rw-r--r-- | ext/Encode/Makefile.PL | 21 | ||||
-rwxr-xr-x | ext/Encode/compile | 85 | ||||
-rw-r--r-- | ext/Encode/encengine.c | 68 | ||||
-rw-r--r-- | ext/Encode/encode.h | 13 |
10 files changed, 287 insertions, 45 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 8ba7232d5e..db471cb059 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -340,9 +340,9 @@ sub from_to return length($_[0] = $string); } -my %encoding = ( Unicode => bless({},'Encode::Unicode'), - 'iso10646-1' => bless({},'Encode::iso10646_1'), - ); +# The global hash is declared in XS code +$encoding{Unicode} = bless({},'Encode::Unicode'); +$encoding{iso10646-1} = bless({},'Encode::iso10646_1'); sub encodings { @@ -378,6 +378,7 @@ sub loadEncoding last unless $type eq '#'; } $class .= ('::'.(($type eq 'E') ? 'Escape' : 'Table')); + warn "Loading $file"; return $class->read($fh,$name,$type); } else @@ -568,6 +569,7 @@ sub fromUnicode return $str; } + package Encode::Escape; use Carp; diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index f8901bb5c4..3bd8e95c66 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -1,6 +1,11 @@ #include "EXTERN.h" #include "perl.h" #include "XSUB.h" +#define U8 U8 +#include "encode.h" +#include "iso8859.h" +#include "EBCDIC.h" +#include "Symbols.h" #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \ Perl_croak(aTHX_ "panic_unimplemented"); \ @@ -313,8 +318,129 @@ PerlIO_funcs PerlIO_encode = { }; #endif +void +Encode_Define(pTHX_ encode_t *enc) +{ + HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI); + HV *stash = gv_stashpv("Encode::XS", TRUE); + SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); + hv_store(hash,enc->name,strlen(enc->name),sv,0); +} + void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {} +static SV * +encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) +{ + STRLEN slen; + U8 *s = (U8 *) SvPV(src,slen); + SV *dst = sv_2mortal(newSV(2*slen+1)); + if (slen) + { + U8 *d = (U8 *) SvGROW(dst, 2*slen+1); + STRLEN dlen = SvLEN(dst); + int code; + while ((code = do_encode(dir,s,&slen,d,dlen,&dlen))) + { + SvCUR_set(dst,dlen); + SvPOK_on(dst); + switch(code) + { + case ENCODE_NOSPACE: + { + STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN); + if (need <= SvLEN(dst)) + need += UTF8_MAXLEN; + d = (U8 *) SvGROW(dst, need); + dlen = SvLEN(dst); + slen = SvCUR(src); + break; + } + + case ENCODE_NOREP: + if (dir == enc->f_utf8) + { + if (!check && ckWARN_d(WARN_UTF8)) + { + STRLEN clen; + UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0); + Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name); + /* FIXME: Skip over the character, copy in replacement and continue + * but that is messy so for now just fail. + */ + return &PL_sv_undef; + } + else + { + return &PL_sv_undef; + } + } + else + { + /* UTF-8 is supposed to be "Universal" so should not happen */ + Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8", + enc->name, (SvCUR(src)-slen),s+slen); + } + break; + + case ENCODE_PARTIAL: + if (!check && ckWARN_d(WARN_UTF8)) + { + Perl_warner(aTHX_ WARN_UTF8, "Partial %s character", + (dir == enc->f_utf8) ? "UTF-8" : enc->name); + } + return &PL_sv_undef; + + default: + Perl_croak(aTHX_ "Unexpected code %d converting %s %s", + code, (dir == enc->f_utf8) ? "to" : "from",enc->name); + return &PL_sv_undef; + } + } + SvCUR_set(dst,dlen); + SvPOK_on(dst); + if (check) + { + if (slen < SvCUR(src)) + { + Move(s+slen,s,SvCUR(src)-slen,U8); + } + SvCUR_set(src,SvCUR(src)-slen); + } + } + return dst; +} + +MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_ + +PROTOTYPES: ENABLE + +void +Encode_toUnicode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: + { + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); + SvUTF8_on(ST(0)); + XSRETURN(1); + } + +void +Encode_fromUnicode(obj,src,check = 0) +SV * obj +SV * src +int check +CODE: + { + encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); + sv_utf8_upgrade(src); + ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); + XSRETURN(1); + } + MODULE = Encode PACKAGE = Encode PROTOTYPES: ENABLE @@ -548,4 +674,7 @@ BOOT: #ifdef USE_PERLIO PerlIO_define_layer(&PerlIO_encode); #endif +#include "iso8859.def" +#include "EBCDIC.def" +#include "Symbols.def" } diff --git a/ext/Encode/Encode/ascii.enc b/ext/Encode/Encode/ascii.enc index e0320b8c58..284a9f51d2 100644 --- a/ext/Encode/Encode/ascii.enc +++ b/ext/Encode/Encode/ascii.enc @@ -9,7 +9,7 @@ S 0040004100420043004400450046004700480049004A004B004C004D004E004F 0050005100520053005400550056005700580059005A005B005C005D005E005F 0060006100620063006400650066006700680069006A006B006C006D006E006F -0070007100720073007400750076007700780079007A007B007C007D007E0000 +0070007100720073007400750076007700780079007A007B007C007D007E007F 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 0000000000000000000000000000000000000000000000000000000000000000 diff --git a/ext/Encode/Encode/cp1047.enc b/ext/Encode/Encode/cp1047.enc index 35e0ed3a70..8956fa4558 100644 --- a/ext/Encode/Encode/cp1047.enc +++ b/ext/Encode/Encode/cp1047.enc @@ -1,6 +1,6 @@ # Encoding file: cp1047 (EBCDIC), single-byte S -003F 0 1 +006F 0 1 00 0000000100020003009C00090086007F0097008D008E000B000C000D000E000F 0010001100120013009D000A00080087001800190092008F001C001D001E001F diff --git a/ext/Encode/Encode/cp37.enc b/ext/Encode/Encode/cp37.enc index f4b575ef5a..94d8c335bf 100644 --- a/ext/Encode/Encode/cp37.enc +++ b/ext/Encode/Encode/cp37.enc @@ -1,6 +1,6 @@ # Encoding file: cp37 (EBCDIC), single-byte S -003F 0 1 +006F 0 1 00 0000000100020003009C00090086007F0097008D008E000B000C000D000E000F 0010001100120013009D008500080087001800190092008F001C001D001E001F diff --git a/ext/Encode/Encode/posix-bc.enc b/ext/Encode/Encode/posix-bc.enc index d54f5bb24c..8b533a4933 100644 --- a/ext/Encode/Encode/posix-bc.enc +++ b/ext/Encode/Encode/posix-bc.enc @@ -1,6 +1,6 @@ # Encoding file: posix-bc (EBCDIC), single-byte S -003F 0 1 +006F 0 1 00 0000000100020003009C00090086007F0097008D008E000B000C000D000E000F 0010001100120013009D000A00080087001800190092008F001C001D001E001F diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 329937e0e2..fd742ccc5a 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -2,6 +2,7 @@ use ExtUtils::MakeMaker; WriteMakefile( NAME => "Encode", VERSION_FROM => 'Encode.pm', + OBJECT => '$(O_FILES)', 'dist' => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', @@ -9,3 +10,23 @@ WriteMakefile( }, MAN3PODS => {}, ); + +package MY; + +sub postamble +{ + return ' + +Encode$(OBJ_EXT) : iso8859$(OBJ_EXT) EBCDIC$(OBJ_EXT) Symbols$(OBJ_EXT) + +iso8859.c : compile Makefile + $(PERL) compile $@ Encode/ascii.enc Encode/iso8859*.enc Encode/cp1250.enc + +EBCDIC.c : compile Makefile Encode/cp1047.enc Encode/cp37.enc Encode/posix-bc.enc + $(PERL) compile $@ Encode/cp1047.enc Encode/cp37.enc Encode/posix-bc.enc + +Symbols.c : compile Makefile Encode/symbol.enc Encode/dingbats.enc + $(PERL) compile $@ Encode/symbol.enc Encode/dingbats.enc + +' +} diff --git a/ext/Encode/compile b/ext/Encode/compile index e2db8abb85..7020b9f186 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -1,5 +1,5 @@ #!../../perl -w -@INC = '../../lib'; +BEGIN { @INC = '../../lib' }; use strict; sub encode_U @@ -39,13 +39,44 @@ sub encode_M } my $cname = shift(@ARGV); +chmod(0666,$cname) if -f $cname && !-w $cname; open(C,">$cname") || die "Cannot open $cname:$!"; +my $dname = $cname; +$dname =~ s/(\.[^\.]*)?$/.def/; +chmod(0666,$dname) if -f $cname && !-w $dname; +open(D,">$dname") || die "Cannot open $dname:$!"; +my $hname = $cname; +$hname =~ s/(\.[^\.]*)?$/.h/; +chmod(0666,$hname) if -f $cname && !-w $hname; +open(H,">$hname") || die "Cannot open $hname:$!"; + +if ($cname =~ /(\w+)\.xs$/) + { + print C "#include <EXTERN.h>\n"; + print C "#include <perl.h>\n"; + print C "#include <XSUB.h>\n"; + print C "#define U8 U8\n"; + } print C "#include \"encode.h\"\n"; my %encoding; my %strings; -foreach my $enc (@ARGV) +sub cmp_name +{ + if ($a =~ /^.*-(\d+)/) + { + my $an = $1; + if ($b =~ /^.*-(\d+)/) + { + my $r = $an <=> $1; + return $r if $r; + } + } + return $a cmp $b; +} + +foreach my $enc (sort cmp_name @ARGV) { my ($name) = $enc =~ /^.*?([\w-]+)(\.enc)$/; if (open(E,$enc)) @@ -58,14 +89,33 @@ foreach my $enc (@ARGV) } } -print C "encode_t encodings[] = {\n"; -foreach my $enc (sort keys %encoding) +foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print C "encode_t $sym = \n"; + print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n"; + } + +foreach my $enc (sort cmp_name keys %encoding) { - print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"},\n"; + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print H "extern encode_t $sym;\n"; + print D " Encode_Define(aTHX_ &$sym);\n"; } -print C " {0,0,0,0,0}\n};\n"; +if ($cname =~ /(\w+)\.xs$/) + { + my $mod = $1; + print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; + print C "BOOT:\n{\n"; + print C "#include \"$dname\"\n"; + print C "}\n"; + } close(C); +close(D); +close(H); @@ -155,21 +205,20 @@ sub outstring my $sym = $strings{$s}; unless ($sym) { - if (1) + foreach my $o (keys %strings) { - foreach my $o (keys %strings) + my $i = index($o,$s); + if ($i >= 0) { - my $i = index($o,$s); - if ($i >= 0) - { - $sym = $strings{$o}; - $sym .= sprintf("+0x%02x",$i) if ($i); - return $sym; - } + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + return $sym; } } $strings{$s} = $sym = $name; printf $fh "static const U8 %s[%d] =\n",$name,length($s); + # Do in chunks of 16 chars to constrain line length + # Assumes ANSI C adjacent string litteral concatenation while (length($s)) { my $c = substr($s,0,16,''); @@ -201,7 +250,9 @@ sub output ord($b) == ord($a->{$l}[1])+1 && $a->{$l}[3] == $a->{$b}[3] && $a->{$l}[4] == $a->{$b}[4] && - $a->{$l}[5] == $a->{$b}[5] ) + $a->{$l}[5] == $a->{$b}[5] + # && length($a->{$l}[2]) < 16 + ) { my $i = ord($b)-ord($a->{$l}[0]); $a->{$l}[1] = $b; @@ -249,7 +300,7 @@ sub output print $fh "0"; } print $fh ",",$t->{Cname}; - printf $fh ",0x%02x,0x%02x,$end},\n",$sc,$ec; + printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; } print $fh "};\n\n"; } diff --git a/ext/Encode/encengine.c b/ext/Encode/encengine.c index a73be737e2..f31725094d 100644 --- a/ext/Encode/encengine.c +++ b/ext/Encode/encengine.c @@ -91,44 +91,70 @@ we add a flag to re-add the removed byte to the source we could handle #define U8 U8 #include "encode.h" -STRLEN -translate(encpage_t *enc, const U8 *src, STRLEN slen, U8 *dst, STRLEN dlen) +int +do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout) { - const U8 *send = src+slen; - U8 *dend = dst+dlen; - U8 *dptr = dst; - while (src < send) + const U8 *s = src; + const U8 *send = s+*slen; + const U8 *last = s; + U8 *d = dst; + U8 *dend = d+dlen; + int code = 0; + while (s < send) { encpage_t *e = enc; - U8 byte = *src++; + U8 byte = *s; while (byte > e->max) e++; - if (byte >= e->min) + if (byte >= e->min && e->slen) { - STRLEN n = e->dlen; - if (n) + const U8 *cend = s + e->slen; + if (cend <= send) { - const U8 *out = e->seq+n*(byte - e->min); - STRLEN n = *out++; - if (dptr+n <= dend) + STRLEN n; + if ((n = e->dlen)) { + const U8 *out = e->seq+n*(byte - e->min); + U8 *oend = d+n; if (dst) - Copy(out,dptr,n,U8); - dptr += n; - } - else - { - /* No room */ + { + if (oend <= dend) + { + while (d < oend) + *d++ = *out++; + } + else + { + /* Out of space */ + code = ENCODE_NOSPACE; + break; + } + } + else + d = oend; } + enc = e->next; + s++; + if (s == cend) + last = s; + } + else + { + /* partial source character */ + code = ENCODE_PARTIAL; + break; } - enc = e->next; } else { /* Cannot represent */ + code = ENCODE_NOREP; + break; } } - return dptr-dst; + *slen = last - src; + *dout = d - dst; + return code; } diff --git a/ext/Encode/encode.h b/ext/Encode/encode.h index 10f8386ed2..604b97f99d 100644 --- a/ext/Encode/encode.h +++ b/ext/Encode/encode.h @@ -1,3 +1,5 @@ +#ifndef ENCODE_H +#define ENCODE_H #ifndef U8 typedef unsigned char U8; #endif @@ -24,3 +26,14 @@ struct encode_s int replen; }; +#ifdef U8 +extern int do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, + U8 *dst, STRLEN dlen, STRLEN *dout); + +extern void Encode_DefineEncoding(encode_t *enc); +#endif + +#define ENCODE_NOSPACE 1 +#define ENCODE_PARTIAL 2 +#define ENCODE_NOREP 3 +#endif |