diff options
Diffstat (limited to 'ext/Encode/compile')
-rwxr-xr-x | ext/Encode/compile | 530 |
1 files changed, 530 insertions, 0 deletions
diff --git a/ext/Encode/compile b/ext/Encode/compile new file mode 100755 index 0000000000..b890a04d81 --- /dev/null +++ b/ext/Encode/compile @@ -0,0 +1,530 @@ +#!../../perl -w +BEGIN { @INC = '../../lib' }; +use strict; + +sub encode_U +{ + # UTF-8 encode long hand - only covers part of perl's range + my $uv = shift; + if ($uv < 0x80) + { + return chr($uv) + } + if ($uv < 0x800) + { + return chr(($uv >> 6) | 0xC0). + chr(($uv & 0x3F) | 0x80); + } + return chr(($uv >> 12) | 0xE0). + chr((($uv >> 6) & 0x3F) | 0x80). + chr(($uv & 0x3F) | 0x80); +} + +sub encode_S +{ + # encode single byte + my ($ch,$page) = @_; + return chr($ch); +} + +sub encode_D +{ + # encode double byte MS byte first + my ($ch,$page) = @_; + return chr($page).chr($ch); +} + +sub encode_M +{ + # encode Multi-byte - single for 0..255 otherwise double + my ($ch,$page) = @_; + return &encode_D if $page; + return &encode_S; +} + +# Win32 does not expand globs on command line +eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); + +my $cname = shift(@ARGV); +chmod(0666,$cname) if -f $cname && !-w $cname; +open(C,">$cname") || die "Cannot open $cname:$!"; +my $dname = $cname; +$dname =~ s/(\.[^\.]*)?$/.def/; + +my ($doC,$doEnc,$doUcm); + +if ($cname =~ /\.(c|xs)$/) + { + $doC = 1; + chmod(0666,$dname) if -f $cname && !-w $dname; + open(D,">$dname") || die "Cannot open $dname:$!"; + my $hname = $cname; + $hname =~ s/(\.[^\.]*)?$/.h/; + chmod(0666,$hname) if -f $cname && !-w $hname; + open(H,">$hname") || die "Cannot open $hname:$!"; + + foreach my $fh (\*C,\*D,\*H) + { + print $fh <<"END"; +/* + !!!!!!! DO NOT EDIT THIS FILE !!!!!!! + This file was autogenerated by: + $^X $0 $cname @ARGV +*/ +END + } + + if ($cname =~ /(\w+)\.xs$/) + { + print C "#include <EXTERN.h>\n"; + print C "#include <perl.h>\n"; + print C "#include <XSUB.h>\n"; + print C "#define U8 U8\n"; + } + print C "#include \"encode.h\"\n"; + } +elsif ($cname =~ /\.enc$/) + { + $doEnc = 1; + } +elsif ($cname =~ /\.ucm$/) + { + $doUcm = 1; + } + +my %encoding; +my %strings; + +sub cmp_name +{ + if ($a =~ /^.*-(\d+)/) + { + my $an = $1; + if ($b =~ /^.*-(\d+)/) + { + my $r = $an <=> $1; + return $r if $r; + } + } + return $a cmp $b; +} + +foreach my $enc (sort cmp_name @ARGV) + { + my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/; + if (open(E,$enc)) + { + if ($sfx eq 'enc') + { + compile_enc(\*E,lc($name),\*C); + } + else + { + compile_ucm(\*E,lc($name),\*C); + } + } + else + { + warn "Cannot open $enc for $name:$!"; + } + } + +if ($doC) + { + foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print C "encode_t $sym = \n"; + print C " {",join(',',"\"$enc\"",@{$encoding{$enc}}),"};\n\n"; + } + + foreach my $enc (sort cmp_name keys %encoding) + { + my $sym = "${enc}_encoding"; + $sym =~ s/\W+/_/g; + print H "extern encode_t $sym;\n"; + print D " Encode_Define(aTHX_ &$sym);\n"; + } + + if ($cname =~ /(\w+)\.xs$/) + { + my $mod = $1; + print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n"; + print C "BOOT:\n{\n"; + print C "#include \"$dname\"\n"; + print C "}\n"; + } + close(D); + close(H); + } +close(C); + + +sub compile_ucm +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + my $cs; + my %attr; + while (<$fh>) + { + s/#.*$//; + last if /^\s*CHARMAP\s*$/i; + if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) + { + $attr{$1} = $2; + } + } + if (!defined($cs = $attr{'code_set_name'})) + { + warn "No <code_set_name> in $name\n"; + } + else + { + # $name = lc($cs); + } + my $erep; + my $urep; + if (exists $attr{'subchar'}) + { + my @byte = $attr{'subchar'} =~ /^\s*(?:\\x([0-9a-f]+))+\s*$/; + $erep = join('',map(hex($_),@byte)); + } + warn "Scanning $name ($cs)\n"; + my $nfb = 0; + my $hfb = 0; + while (<$fh>) + { + s/#.*$//; + last if /^\s*END\s+CHARMAP\s*$/i; + next if /^\s*$/; + my ($u,@byte) = /^<U([0-9a-f]+)>\s+(?:\\x([0-9a-f]+))+\s*(\|[0-3]|)\s*$/i; + my $fb = pop(@byte); + if (defined($u)) + { + my $uch = encode_U(hex($u)); + my $ech = join('',map(chr(hex($_)),@byte)); + if (length($fb)) + { + $fb = substr($fb,1); + $hfb++; + } + else + { + $nfb++; + $fb = '0'; + } + # $fb is fallback flag + # 0 - round trip safe + # 1 - fallback for unicode -> enc + # 2 - skip sub-char mapping + # 3 - fallback enc -> unicode + enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/); + enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/); + } + else + { + warn $_; + } + + } + if ($nfb && $hfb) + { + die "$nfb entries without fallback, $hfb entries with\n"; + } + if ($doC) + { + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$erep),length($erep)]; + } + elsif ($doEnc) + { + output_enc($ch,$name,$e2u); + } + elsif ($doUcm) + { + output_ucm($ch,$name,$u2e); + } +} + +sub compile_enc +{ + my ($fh,$name,$ch) = @_; + my $e2u = {}; + my $u2e = {}; + + my $type; + while ($type = <$fh>) + { + last if $type !~ /^\s*#/; + } + chomp($type); + return if $type eq 'E'; + my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>)); + warn "$type encoded $name\n"; + my $rep = ''; + { + my $v = hex($def); + no strict 'refs'; + $rep = &{"encode_$type"}($v & 0xFF, ($v >> 8) & 0xffe); + } + while ($pages--) + { + my $line = <$fh>; + chomp($line); + my $page = hex($line); + my $ch = 0; + for (my $i = 0; $i < 16; $i++) + { + my $line = <$fh>; + for (my $j = 0; $j < 16; $j++) + { + no strict 'refs'; + my $ech = &{"encode_$type"}($ch,$page); + my $val = hex(substr($line,0,4,'')); + if ($val || (!$ch && !$page)) + { + my $uch = encode_U($val); + enter($e2u,$ech,$uch,$e2u,0); + enter($u2e,$uch,$ech,$u2e,0); + } + else + { + # No character at this position + # enter($e2u,$ech,undef,$e2u); + } + $ch++; + } + } + } + if ($doC) + { + output($ch,$name.'_utf8',$e2u); + output($ch,'utf8_'.$name,$u2e); + $encoding{$name} = [$e2u->{Cname},$u2e->{Cname}, + outstring($ch,$e2u->{Cname}.'_def',$rep),length($rep)]; + } + elsif ($doEnc) + { + output_enc($ch,$name,$e2u); + } + elsif ($doUcm) + { + output_ucm($ch,$name,$u2e); + } +} + +sub enter +{ + my ($a,$s,$d,$t,$fb) = @_; + $t = $a if @_ < 4; + my $b = substr($s,0,1); + my $e = $a->{$b}; + unless ($e) + { # 0 1 2 3 4 5 + $e = [$b,$b,'',{},length($s),0,$fb]; + $a->{$b} = $e; + } + if (length($s) > 1) + { + enter($e->[3],substr($s,1),$d,$t,$fb); + } + else + { + $e->[2] = $d; + $e->[3] = $t; + $e->[5] = length($d); + } +} + +sub outstring +{ + my ($fh,$name,$s) = @_; + my $sym = $strings{$s}; + unless ($sym) + { + foreach my $o (keys %strings) + { + my $i = index($o,$s); + if ($i >= 0) + { + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + return $sym; + } + } + $strings{$s} = $sym = $name; + printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s); + # Do in chunks of 16 chars to constrain line length + # Assumes ANSI C adjacent string litteral concatenation + while (length($s)) + { + my $c = substr($s,0,16,''); + print $fh '"',join('',map(sprintf('\x%02x',ord($_)),split(//,$c))),'"'; + print $fh "\n" if length($s); + } + printf $fh ";\n"; + } + return $sym; +} + +sub process +{ + my ($name,$a) = @_; + $name =~ s/\W+/_/g; + $a->{Cname} = $name; + my @keys = grep(ref($a->{$_}),sort keys %$a); + my $l; + my @ent; + foreach my $b (@keys) + { + my ($s,$f,$out,$t,$end) = @{$a->{$b}}; + if (defined($l) && + ord($b) == ord($a->{$l}[1])+1 && + $a->{$l}[3] == $a->{$b}[3] && + $a->{$l}[4] == $a->{$b}[4] && + $a->{$l}[5] == $a->{$b}[5] && + $a->{$l}[6] == $a->{$b}[6] + # && length($a->{$l}[2]) < 16 + ) + { + my $i = ord($b)-ord($a->{$l}[0]); + $a->{$l}[1] = $b; + $a->{$l}[2] .= $a->{$b}[2]; + } + else + { + $l = $b; + push(@ent,$b); + } + if (exists $t->{Cname}) + { + $t->{'Forward'} = 1 if $t != $a; + } + else + { + process(sprintf("%s_%02x",$name,ord($s)),$t); + } + } + if (ord($keys[-1]) < 255) + { + my $t = chr(ord($keys[-1])+1); + $a->{$t} = [$t,chr(255),undef,$a,0,0]; + push(@ent,$t); + } + $a->{'Entries'} = \@ent; +} + +sub outtable +{ + my ($fh,$a) = @_; + my $name = $a->{'Cname'}; + # String tables + foreach my $b (@{$a->{'Entries'}}) + { + next unless $a->{$b}[5]; + my $s = ord($a->{$b}[0]); + my $e = ord($a->{$b}[1]); + outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$a->{$b}[2]); + } + if ($a->{'Forward'}) + { + print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n"; + } + $a->{'Done'} = 1; + foreach my $b (@{$a->{'Entries'}}) + { + my ($s,$e,$out,$t,$end,$l) = @{$a->{$b}}; + outtable($fh,$t) unless $t->{'Done'}; + } + print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n"; + foreach my $b (@{$a->{'Entries'}}) + { + my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; + my $sc = ord($s); + my $ec = ord($e); + $end |= 0x80 if $fb; + print $fh "{"; + if ($l) + { + printf $fh outstring($fh,'',$out); + } + else + { + print $fh "0"; + } + print $fh ",",$t->{Cname}; + printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec; + } + print $fh "};\n"; +} + +sub output +{ + my ($fh,$name,$a) = @_; + process($name,$a); + # Sub-tables + outtable($fh,$a); +} + +sub output_enc +{ + my ($fh,$name,$a) = @_; + foreach my $b (sort keys %$a) + { + my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}}; + } +} + +sub decode_U +{ + my $s = shift; + +} + + +sub output_ucm_page +{ + my ($fh,$a,$t,$pre) = @_; + # warn sprintf("Page %x\n",$pre); + foreach my $b (sort keys %$t) + { + my ($s,$e,$out,$n,$end,$l,$fb) = @{$t->{$b}}; + die "oops $s $e" unless $s eq $e; + my $u = ord($s); + if ($n != $a && $n != $t) + { + output_ucm_page($fh,$a,$n,(($pre|($u &0x3F)) << 6)&0xFFFF); + } + elsif (length($out)) + { + if ($pre) + { + $u = $pre|($u &0x3f); + } + printf $fh "<U%04X> ",$u; + foreach my $c (split(//,$out)) + { + printf $fh "\\x%02X",ord($c); + } + printf $fh " |%d\n",($fb ? 1 : 0); + } + else + { + warn join(',',@{$t->{$b}},$a,$t); + } + } +} + +sub output_ucm +{ + my ($fh,$name,$a) = @_; + print $fh "CHARMAP\n"; + output_ucm_page($fh,$a,$a,0); + print $fh "END CHARMAP\n"; +} + |