summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorDan Kogai <dankogai@dan.co.jp>2002-11-19 12:18:44 +0900
committerhv <hv@crypt.org>2002-11-25 02:20:48 +0000
commitedce026feebc176d8f5fa74e6f5a4a54a4731410 (patch)
tree7b0f59ccdc7abdc63569674af522ba272b8d39f7 /ext
parent637a7e2fd3f9e7c5d3f7b6b7f801366803e0b516 (diff)
downloadperl-edce026feebc176d8f5fa74e6f5a4a54a4731410.tar.gz
[Encode] 1.83 + bleedperl patch released
Message-Id: <2C132F6D-FB22-11D6-87FC-0003939A104C@dan.co.jp> p4raw-id: //depot/perl@18175
Diffstat (limited to 'ext')
-rw-r--r--ext/Encode/AUTHORS3
-rw-r--r--ext/Encode/Changes32
-rw-r--r--ext/Encode/Encode.pm32
-rw-r--r--ext/Encode/Encode.xs12
-rw-r--r--ext/Encode/MANIFEST2
-rw-r--r--ext/Encode/bin/enc2xs165
-rw-r--r--ext/Encode/lib/Encode/JP/JIS7.pm20
-rw-r--r--ext/Encode/t/rt.pl5
8 files changed, 189 insertions, 82 deletions
diff --git a/ext/Encode/AUTHORS b/ext/Encode/AUTHORS
index 01d3530ac6..c559c84263 100644
--- a/ext/Encode/AUTHORS
+++ b/ext/Encode/AUTHORS
@@ -9,7 +9,7 @@
#
# This list is in alphabetical order.
--
-Andreas J. Koenig <andreas.koenig@anima.de>
+Andreas J. Koenig <andreas.koenig@anima.de>
Anton Tagunov <tagunov@motor.ru>
Autrijus Tang <autrijus@autrijus.org>
Benjamin Goldberg <goldbb2@earthlink.net>
@@ -21,6 +21,7 @@ Gerrit P. Haase <gp@familiehaase.de>
Graham Barr <gbarr@pobox.com>
Gurusamy Sarathy <gsar@activestate.com>
H.Merijn Brand <h.m.brand@hccnet.nl>
+Hugo van der Sanden <hv@crypt.org>
Jarkko Hietaniemi <jhi@iki.fi>
Jungshik Shin <jshin@mailaps.org>
Laszlo Molnar <ml1050@freemail.hu>
diff --git a/ext/Encode/Changes b/ext/Encode/Changes
index 52cbda3334..60452d82ce 100644
--- a/ext/Encode/Changes
+++ b/ext/Encode/Changes
@@ -1,9 +1,35 @@
# Revision history for Perl extension Encode.
#
-# $Id: Changes,v 1.80 2002/10/21 20:39:09 dankogai Exp dankogai $
+# $Id: Changes,v 1.83 2002/11/18 17:28:49 dankogai Exp dankogai $
#
-$Revision: 1.80 $ $Date: 2002/10/21 20:39:09 $
+$Revision: 1.83 $ $Date: 2002/11/18 17:28:49 $
+! Encode.xs lib/Encode/JIS7.pm
+ Even more patches from Inaba-san has been applied. With this
+ patch t/uni/tr_7jis.t and t/uni/t_utf8.t of bleedperl will work.
+ Message-Id: <20021115105514D.inaba.hiroto@toshiba-it.co.jp>
+
+1.82 2002/11/14 23:06:12
+! Encode.xs
+ Encode::utf8 (XS Version) assertion botch first found in Cygwin,
+ later found in perls w/ -Dusemymalloc was fixed by NC.
+ Message-Id: <20021114210349.GA288@Bagpuss.unfortu.net>
+
+1.81 2002/11/08 18:29:27
+! Encode.pm Encode.xs
+ Non-XS version of Encode::utf8 is back (with XS being default).
+ Encode::predefine_encodings(0) to turn off XS.
+ This is primarily to cope w/ Cygwin smoke but Sadahiro-san has
+ found that it was Test::More causing the problem, not Encode.
+ But I have already made it configurable so it may be useful in
+ some rare cases....
+ Message-Id: <20021107210110.2EE4.BQW10602@nifty.com>, et al.
+! bin/enc2xs
+ The ingenious patch by Nicholas Clark that reduces shlib sizes by
+ 50% with no penalty and backward compatibility preserved, is in.
+ Message-Id: <20021103231324.GE288@Bagpuss.unfortu.net>
+
+1.80 2002/10/21 20:39:09
! Encode.xs t/mime-header.t
Even more patches from NI-XS regarding Encode::utf8->decode().
And one more test to t/mime-header.t to prove it
@@ -774,7 +800,7 @@ $Revision: 1.80 $ $Date: 2002/10/21 20:39:09 $
Typo fixes and improvements by jhi
Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al.
-1.11 $Date: 2002/10/21 20:39:09 $
+1.11 2002/03/31 22:12:13
+ t/encoding.t
+ t/jperl.t
! MANIFEST
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 62e2ae6e49..01dc8ffbc4 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -1,9 +1,9 @@
#
-# $Id: Encode.pm,v 1.80 2002/10/21 20:38:45 dankogai Exp $
+# $Id: Encode.pm,v 1.83 2002/11/18 17:28:29 dankogai Exp $
#
package Encode;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.80 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.83 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
our $DEBUG = 0;
use XSLoader ();
XSLoader::load(__PACKAGE__, $VERSION);
@@ -191,7 +191,7 @@ sub decode_utf8($)
return $str;
}
-predefine_encodings();
+predefine_encodings(1);
#
# This is to restore %Encoding if really needed;
@@ -199,6 +199,8 @@ predefine_encodings();
sub predefine_encodings{
use Encode::Encoding;
+ no warnings 'redefine';
+ my $use_xs = shift;
if ($ON_EBCDIC) {
# was in Encode::UTF_EBCDIC
package Encode::UTF_EBCDIC;
@@ -243,7 +245,29 @@ sub predefine_encodings{
# was in Encode::utf8
package Encode::utf8;
push @Encode::utf8::ISA, 'Encode::Encoding';
- # encode and decode methods now in Encode.xs
+ #
+ if ($use_xs){
+ $DEBUG and warn __PACKAGE__, " XS on";
+ *decode = \&decode_xs;
+ *encode = \&encode_xs;
+ }else{
+ $DEBUG and warn __PACKAGE__, " XS off";
+ *decode = sub{
+ my ($obj,$octets,$chk) = @_;
+ my $str = Encode::decode_utf8($octets);
+ if (defined $str) {
+ $_[1] = '' if $chk;
+ return $str;
+ }
+ return undef;
+ };
+ *encode = sub {
+ my ($obj,$string,$chk) = @_;
+ my $octets = Encode::encode_utf8($string);
+ $_[1] = '' if $chk;
+ return $octets;
+ };
+ }
$Encode::Encoding{utf8} =
bless {Name => "utf8"} => "Encode::utf8";
}
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index df77b7a5b9..4d30914995 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -1,5 +1,5 @@
/*
- $Id: Encode.xs,v 1.49 2002/10/21 19:47:47 dankogai Exp $
+ $Id: Encode.xs,v 1.52 2002/11/18 17:28:49 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
@@ -241,7 +241,7 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
void
-Method_decode(obj,src,check = 0)
+Method_decode_xs(obj,src,check = 0)
SV * obj
SV * src
int check
@@ -250,7 +250,7 @@ CODE:
STRLEN slen;
U8 *s = (U8 *) SvPV(src, slen);
U8 *e = (U8 *) SvEND(src);
- SV *dst = newSV(slen);
+ SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
SvPOK_only(dst);
SvCUR_set(dst,0);
if (SvUTF8(src)) {
@@ -321,7 +321,7 @@ CODE:
}
void
-Method_encode(obj,src,check = 0)
+Method_encode_xs(obj,src,check = 0)
SV * obj
SV * src
int check
@@ -330,7 +330,7 @@ CODE:
STRLEN slen;
U8 *s = (U8 *) SvPV(src, slen);
U8 *e = (U8 *) SvEND(src);
- SV *dst = newSV(slen);
+ SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */
if (SvUTF8(src)) {
/* Already encoded - trust it and just copy the octets */
sv_setpvn(dst,(char *)s,(e-s));
@@ -338,7 +338,7 @@ CODE:
}
else {
/* Native bytes - can always encode */
- U8 *d = (U8 *) SvGROW(dst,2*slen);
+ U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */
while (s < e) {
UV uv = NATIVE_TO_UNI((UV) *s++);
if (UNI_IS_INVARIANT(uv))
diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST
index 45fd8695d6..77c189e808 100644
--- a/ext/Encode/MANIFEST
+++ b/ext/Encode/MANIFEST
@@ -33,6 +33,7 @@ bin/enc2xs Encode module generator
bin/piconv iconv by perl
bin/ucm2table Table Generator for testing
bin/ucmlint A UCM Lint utility
+bin/ucmsort Sorts UCM lines
bin/unidump Unicode Dump like hexdump(1)
encengine.c Encode extension
encoding.pm Perl Pragmactic Module
@@ -77,6 +78,7 @@ t/ksc5601.enc test data
t/ksc5601.utf test data
t/mime-header.t test script
t/perlio.t test script
+t/rt.pl even more test script
t/unibench.pl benchmark script
ucm/8859-1.ucm Unicode Character Map
ucm/8859-10.ucm Unicode Character Map
diff --git a/ext/Encode/bin/enc2xs b/ext/Encode/bin/enc2xs
index 7100bab49c..ae44c79dc7 100644
--- a/ext/Encode/bin/enc2xs
+++ b/ext/Encode/bin/enc2xs
@@ -6,9 +6,10 @@ BEGIN {
require Config; import Config;
}
use strict;
+use warnings;
use Getopt::Std;
my @orig_ARGV = @ARGV;
-our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.31 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
# These may get re-ordered.
# RAW is a do_now as inserted by &enter
@@ -186,7 +187,7 @@ END
print C "#include <XSUB.h>\n";
print C "#define U8 U8\n";
}
- print C "#include \"encode.h\"\n";
+ print C "#include \"encode.h\"\n\n";
}
elsif ($cname =~ /\.enc$/)
@@ -204,6 +205,9 @@ elsif ($cname =~ /\.pet$/)
my %encoding;
my %strings;
+my $string_acc;
+my %strings_in_acc;
+
my $saved = 0;
my $subsave = 0;
my $strings = 0;
@@ -250,8 +254,19 @@ if ($doC)
foreach my $name (sort cmp_name keys %encoding)
{
my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
- output(\*C,$name.'_utf8',$e2u);
- output(\*C,'utf8_'.$name,$u2e);
+ process($name.'_utf8',$e2u);
+ addstrings(\*C,$e2u);
+
+ process('utf8_'.$name,$u2e);
+ addstrings(\*C,$u2e);
+ }
+ outbigstring(\*C,"enctable");
+ foreach my $name (sort cmp_name keys %encoding)
+ {
+ my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
+ outtable(\*C,$e2u, "enctable");
+ outtable(\*C,$u2e, "enctable");
+
# push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
}
foreach my $enc (sort cmp_name keys %encoding)
@@ -596,43 +611,6 @@ sub enter_fb0 {
}
}
-
-sub outstring
-{
- my ($fh,$name,$s) = @_;
- my $sym = $strings{$s};
- if ($sym)
- {
- $saved += length($s);
- }
- else
- {
- if ($opt{'O'}) {
- foreach my $o (keys %strings)
- {
- next unless (my $i = index($o,$s)) >= 0;
- $sym = $strings{$o};
- # gcc things that 0x0e+0x10 (anything with e+) starts to look like
- # a hexadecimal floating point constant. Silly gcc. Only p
- # introduces a floating point constant. Put the space in to stop it
- # getting confused.
- $sym .= sprintf(" +0x%02x",$i) if ($i);
- $subsave += length($s);
- return $strings{$s} = $sym;
- }
- }
- $strings{$s} = $sym = $name;
- $strings += length($s);
- my $definition = sprintf "static const U8 %s[%d] = { ",$name,length($s);
- # Maybe we should assert that these are all <256.
- $definition .= join(',',unpack "C*",$s);
- # We have a single long line. Split it at convenient commas.
- $definition =~ s/(.{74,77},)/$1\n/g;
- print $fh "$definition };\n\n";
- }
- return $sym;
-}
-
sub process
{
my ($name,$a) = @_;
@@ -693,7 +671,8 @@ sub process
$a->{'Entries'} = \@ent;
}
-sub outtable
+
+sub addstrings
{
my ($fh,$a) = @_;
my $name = $a->{'Cname'};
@@ -701,20 +680,98 @@ sub outtable
foreach my $b (@{$a->{'Entries'}})
{
next unless $b->[AGG_OUT_LEN];
- my $s = $b->[AGG_MIN_IN];
- my $e = $b->[AGG_MAX_IN];
- outstring($fh,sprintf("%s__%02x_%02x",$name,$s,$e),$b->[AGG_OUT_BYTES]);
+ $strings{$b->[AGG_OUT_BYTES]} = undef;
}
if ($a->{'Forward'})
{
my $var = $^O eq 'MacOS' ? 'extern' : 'static';
- print $fh "\n$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+ print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
+ }
+ $a->{'DoneStrings'} = 1;
+ foreach my $b (@{$a->{'Entries'}})
+ {
+ my ($s,$e,$out,$t,$end,$l) = @$b;
+ addstrings($fh,$t) unless $t->{'DoneStrings'};
}
+}
+
+sub outbigstring
+{
+ my ($fh,$name) = @_;
+
+ $string_acc = '';
+
+ # Make the big string in the string accumulator. Longest first, on the hope
+ # that this makes it more likely that we find the short strings later on.
+ # Not sure if it helps sorting strings of the same length lexcically.
+ foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
+ my $index = index $string_acc, $s;
+ if ($index >= 0) {
+ $saved += length($s);
+ $strings_in_acc{$s} = $index;
+ } else {
+ OPTIMISER: {
+ if ($opt{'O'}) {
+ my $sublength = length $s;
+ while (--$sublength > 0) {
+ # progressively lop characters off the end, to see if the start of
+ # the new string overlaps the end of the accumulator.
+ if (substr ($string_acc, -$sublength)
+ eq substr ($s, 0, $sublength)) {
+ $subsave += $sublength;
+ $strings_in_acc{$s} = length ($string_acc) - $sublength;
+ # append the last bit on the end.
+ $string_acc .= substr ($s, $sublength);
+ last OPTIMISER;
+ }
+ # or if the end of the new string overlaps the start of the
+ # accumulator
+ next unless substr ($string_acc, 0, $sublength)
+ eq substr ($s, -$sublength);
+ # well, the last $sublength characters of the accumulator match.
+ # so as we're prepending to the accumulator, need to shift all our
+ # existing offsets forwards
+ $_ += $sublength foreach values %strings_in_acc;
+ $subsave += $sublength;
+ $strings_in_acc{$s} = 0;
+ # append the first bit on the start.
+ $string_acc = substr ($s, 0, -$sublength) . $string_acc;
+ last OPTIMISER;
+ }
+ }
+ # Optimiser (if it ran) found nothing, so just going have to tack the
+ # whole thing on the end.
+ $strings_in_acc{$s} = length $string_acc;
+ $string_acc .= $s;
+ };
+ }
+ }
+
+ $strings = length $string_acc;
+ my $definition = "\nstatic const U8 $name\[$strings] = { " .
+ join(',',unpack "C*",$string_acc);
+ # We have a single long line. Split it at convenient commas.
+ print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
+ print $fh substr ($definition, pos $definition), " };\n";
+}
+
+sub findstring {
+ my ($name,$s) = @_;
+ my $offset = $strings_in_acc{$s};
+ die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
+ unless defined $offset;
+ "$name + $offset";
+}
+
+sub outtable
+{
+ my ($fh,$a,$bigname) = @_;
+ my $name = $a->{'Cname'};
$a->{'Done'} = 1;
foreach my $b (@{$a->{'Entries'}})
{
my ($s,$e,$out,$t,$end,$l) = @$b;
- outtable($fh,$t) unless $t->{'Done'};
+ outtable($fh,$t,$bigname) unless $t->{'Done'};
}
print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
foreach my $b (@{$a->{'Entries'}})
@@ -724,7 +781,7 @@ sub outtable
print $fh "{";
if ($l)
{
- printf $fh outstring($fh,'',$out);
+ printf $fh findstring($bigname,$out);
}
else
{
@@ -736,14 +793,6 @@ sub outtable
print $fh "};\n";
}
-sub output
-{
- my ($fh,$name,$a) = @_;
- process($name,$a);
- # Sub-tables
- outtable($fh,$a);
-}
-
sub output_enc
{
my ($fh,$name,$a) = @_;
@@ -857,7 +906,7 @@ use vars qw(
);
sub find_e2x{
- eval { require File::Find };
+ eval { require File::Find; };
my (@inc, %e2x_dir);
for my $inc (@INC){
push @inc, $inc unless $inc eq '.'; #skip current dir
@@ -869,6 +918,7 @@ sub find_e2x{
= lstat($_) or return;
-f _ or return;
if (/^.*\.e2x$/o){
+ no warnings 'once';
$e2x_dir{$File::Find::dir} ||= $mtime;
}
return;
@@ -927,6 +977,7 @@ sub make_configlocal_pm
eval { require "Encode/$f"; };
$@ and die "Can't require Encode/$f: $@\n";
for my $enc (Encode->encodings()){
+ no warnings 'once';
$in_core{$enc} and next;
$Encode::Config::ExtModule{$enc} and next;
my $mod = "Encode/$f";
diff --git a/ext/Encode/lib/Encode/JP/JIS7.pm b/ext/Encode/lib/Encode/JP/JIS7.pm
index d1e69e6940..d49ec6cb11 100644
--- a/ext/Encode/lib/Encode/JP/JIS7.pm
+++ b/ext/Encode/lib/Encode/JP/JIS7.pm
@@ -1,7 +1,7 @@
package Encode::JP::JIS7;
use strict;
-our $VERSION = do { my @r = (q$Revision: 1.8 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
+our $VERSION = do { my @r = (q$Revision: 1.9 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode qw(:fallbacks);
@@ -62,21 +62,23 @@ sub encode($$;$)
# JIS<->EUC
+our $re_scan_jis = qr{
+ (?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*)
+}x;
sub jis_euc {
+ local ${^ENCODING};
my $r_str = shift;
- $$r_str =~ s(
- ($RE{JIS_0212}|$RE{JIS_0208}|$RE{ISO_ASC}|$RE{JIS_KANA})
- ([^\e]*)
- )
+ $$r_str =~ s($re_scan_jis)
{
- my ($esc, $chunk) = ($1, $2);
- if ($esc !~ /$RE{ISO_ASC}/o) {
+ my ($esc_0212, $esc_asc, $esc_kana, $chunk) =
+ ($1, $2, $3, $4);
+ if (!$esc_asc) {
$chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
- if ($esc =~ /$RE{JIS_KANA}/o) {
+ if ($esc_kana) {
$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
}
- elsif ($esc =~ /$RE{JIS_0212}/o) {
+ elsif ($esc_0212) {
$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
}
}
diff --git a/ext/Encode/t/rt.pl b/ext/Encode/t/rt.pl
index cff5a3f84b..28924b2e68 100644
--- a/ext/Encode/t/rt.pl
+++ b/ext/Encode/t/rt.pl
@@ -1,12 +1,14 @@
#!/usr/local/bin/perl
#
-# $Id: rt.pl,v 1.1 2002/10/20 15:44:00 dankogai Exp $
+# $Id: rt.pl,v 1.2 2002/11/08 18:29:27 dankogai Exp $
#
BEGIN {
+ my $ucmdir = "ucm";
if ($ENV{'PERL_CORE'}){
chdir 't';
unshift @INC, '../lib';
+ $ucmdir = "../ext/Encode/ucm";
}
require Config; import Config;
if ($Config{'extensions'} !~ /\bEncode\b/) {
@@ -19,7 +21,6 @@ BEGIN {
}
use strict;
require Test::More;
- my $ucmdir = "ucm";
our $DEBUG;
our @ucm;
unless(@ARGV){