summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/Encode/Encode.pm16
-rw-r--r--ext/Encode/Encode.xs20
-rw-r--r--ext/Encode/Unicode/Unicode.pm231
-rw-r--r--ext/Encode/Unicode/Unicode.xs39
-rw-r--r--ext/Encode/lib/Encode/Encoding.pm32
-rw-r--r--ext/PerlIO/encoding/encoding.xs8
6 files changed, 96 insertions, 250 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index 37b350f1fe..4959b5fc17 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -15,7 +15,7 @@ use base qw/Exporter/;
our @EXPORT = qw(
decode decode_utf8 encode encode_utf8
- encodings find_encoding
+ encodings find_encoding clone_encoding
);
our @FB_FLAGS = qw(DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
@@ -95,7 +95,7 @@ sub getEncoding
{
my ($class, $name, $skip_external) = @_;
- ref($name) && $name->can('new_sequence') and return $name;
+ ref($name) && $name->can('renew') and return $name;
exists $Encoding{$name} and return $Encoding{$name};
my $lc = lc $name;
exists $Encoding{$lc} and return $Encoding{$lc};
@@ -116,18 +116,26 @@ sub getEncoding
return;
}
-sub find_encoding
+sub find_encoding($;$)
{
my ($name, $skip_external) = @_;
return __PACKAGE__->getEncoding($name,$skip_external);
}
-sub resolve_alias {
+sub resolve_alias($){
my $obj = find_encoding(shift);
defined $obj and return $obj->name;
return;
}
+sub clone_encoding($){
+ my $obj = find_encoding(shift);
+ ref $obj or return;
+ eval { require Storable };
+ $@ and return;
+ return Storable::dclone($obj);
+}
+
sub encode($$;$)
{
my ($name, $string, $check) = @_;
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index 36d5f3dac6..7970058273 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -1,5 +1,5 @@
/*
- $Id: Encode.xs,v 1.55 2003/02/28 01:40:27 dankogai Exp $
+ $Id: Encode.xs,v 1.55 2003/02/28 01:40:27 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
@@ -258,6 +258,16 @@ encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_
+PROTOTYPES: DISABLE
+
+void
+Method_renew(obj)
+SV * obj
+CODE:
+{
+ XSRETURN(1);
+}
+
void
Method_decode_xs(obj,src,check = 0)
SV * obj
@@ -389,6 +399,14 @@ MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
PROTOTYPES: ENABLE
void
+Method_renew(obj)
+SV * obj
+CODE:
+{
+ XSRETURN(1);
+}
+
+void
Method_name(obj)
SV * obj
CODE:
diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm
index 1829218c66..9648fd358f 100644
--- a/ext/Encode/Unicode/Unicode.pm
+++ b/ext/Encode/Unicode/Unicode.pm
@@ -2,6 +2,7 @@ package Encode::Unicode;
use strict;
use warnings;
+no warnings 'redefine';
our $VERSION = do { my @r = (q$Revision: 1.39 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
@@ -14,6 +15,8 @@ XSLoader::load(__PACKAGE__,$VERSION);
require Encode;
+our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32);
+
for my $name (qw(UTF-16 UTF-16BE UTF-16LE
UTF-32 UTF-32BE UTF-32LE
UCS-2BE UCS-2LE))
@@ -35,231 +38,23 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE
endian => $endian,
ucs2 => $ucs2,
} => __PACKAGE__;
-
}
use base qw(Encode::Encoding);
-#
-# three implementations of (en|de)code exist. The XS version is the
-# fastest. *_modern uses an array and *_classic sticks with substr.
-# *_classic is much slower but more memory conservative.
-# *_xs is the default.
-
-sub set_transcoder{
- no warnings qw(redefine);
- my $type = shift;
- if ($type eq "xs"){
- *decode = \&decode_xs;
- *encode = \&encode_xs;
- }elsif($type eq "modern"){
- *decode = \&decode_modern;
- *encode = \&encode_modern;
- }elsif($type eq "classic"){
- *decode = \&decode_classic;
- *encode = \&encode_classic;
- }else{
- require Carp;
- Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)";
- }
-}
-
-set_transcoder("xs");
-
-#
-# Aux. subs & constants
-#
-
-sub FBCHAR(){ 0xFFFd }
-sub BOM_BE(){ 0xFeFF }
-sub BOM16LE(){ 0xFFFe }
-sub BOM32LE(){ 0xFFFe0000 }
-
-sub valid_ucs2($){
- return
- (0 <= $_[0] && $_[0] < 0xD800)
- || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF);
-}
-
-sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF }
-sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 }
-sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF }
-
-sub ensurrogate($){
- use integer; # we have divisions
- my $uni = shift;
- my $hi = ($uni - 0x10000) / 0x400 + 0xD800;
- my $lo = ($uni - 0x10000) % 0x400 + 0xDC00;
- return ($hi, $lo);
-}
-
-sub desurrogate($$){
- my ($hi, $lo) = @_;
- return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00);
-}
-
-sub Mask { {2 => 0xffff, 4 => 0xffffffff} }
-
-#
-# *_modern are much faster but guzzle more memory
-#
-
-sub decode_modern($$;$)
-{
- my ($obj, $str, $chk ) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
-
- # warn "$size, $endian, $ucs2";
- $endian ||= BOMB($size, substr($str, 0, $size, ''))
- or poisoned2death($obj, "Where's the BOM?");
- my $mask = Mask->{$size};
- my $utf8 = '';
- my @ord = unpack("$endian*", $str);
- undef $str; # to conserve memory
- while (@ord){
- my $ord = shift @ord;
- unless ($size == 4 or valid_ucs2($ord &= $mask)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "no surrogates allowed", $ord);
- shift @ord; # skip the next one as well
- $ord = FBCHAR;
- }else{
- unless (isHiSurrogate($ord)){
- poisoned2death($obj, "Malformed HI surrogate", $ord);
- }
- my $lo = shift @ord;
- unless (isLoSurrogate($lo &= $mask)){
- poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
- }
- $ord = desurrogate($ord, $lo);
- }
- }
- $utf8 .= chr($ord);
- }
- utf8::upgrade($utf8);
- return $utf8;
-}
-
-sub encode_modern($$;$)
-{
- my ($obj, $utf8, $chk) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
- my @str = ();
- unless ($endian){
- $endian = ($size == 4) ? 'N' : 'n';
- push @str, BOM_BE;
- }
- my @ord = unpack("U*", $utf8);
- undef $utf8; # to conserve memory
- for my $ord (@ord){
- unless ($size == 4 or valid_ucs2($ord)) {
- unless(issurrogate($ord)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "code point too high", $ord);
-
- push @str, FBCHAR;
- }else{
-
- push @str, ensurrogate($ord);
- }
- }else{ # not supposed to happen
- push @str, FBCHAR;
- }
- }else{
- push @str, $ord;
- }
- }
- return pack("$endian*", @str);
-}
-
-#
-# *_classic are slower but more memory conservative
-#
-
-sub decode_classic($$;$)
-{
- my ($obj, $str, $chk ) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
-
- # warn "$size, $endian, $ucs2";
- $endian ||= BOMB($size, substr($str, 0, $size, ''))
- or poisoned2death($obj, "Where's the BOM?");
- my $mask = Mask->{$size};
- my $utf8 = '';
- my @ord = unpack("$endian*", $str);
- while (length($str)){
- my $ord = unpack($endian, substr($str, 0, $size, ''));
- unless ($size == 4 or valid_ucs2($ord &= $mask)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "no surrogates allowed", $ord);
- substr($str,0,$size,''); # skip the next one as well
- $ord = FBCHAR;
- }else{
- unless (isHiSurrogate($ord)){
- poisoned2death($obj, "Malformed HI surrogate", $ord);
- }
- my $lo = unpack($endian ,substr($str,0,$size,''));
- unless (isLoSurrogate($lo &= $mask)){
- poisoned2death($obj, "Malformed LO surrogate", $ord, $lo);
- }
- $ord = desurrogate($ord, $lo);
- }
- }
- $utf8 .= chr($ord);
- }
- utf8::upgrade($utf8);
- return $utf8;
+sub renew {
+ my $self = shift;
+ $BOM_Unknown{$self->name} or return $self;
+ my $clone = bless { %$self } => ref($self);
+ $clone->{clone} = 1; # so the caller knows it is renewed.
+ return $clone;
}
-sub encode_classic($$;$)
-{
- my ($obj, $utf8, $chk) = @_;
- my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)};
- # warn join ", ", $size, $ucs2, $endian, $mask;
- my $str = '';
- unless ($endian){
- $endian = ($size == 4) ? 'N' : 'n';
- $str .= pack($endian, BOM_BE);
- }
- while (length($utf8)){
- my $ord = ord(substr($utf8,0,1,''));
- unless ($size == 4 or valid_ucs2($ord)) {
- unless(issurrogate($ord)){
- if ($ucs2){
- $chk and
- poisoned2death($obj, "code point too high", $ord);
- $str .= pack($endian, FBCHAR);
- }else{
- $str .= pack($endian.2, ensurrogate($ord));
- }
- }else{ # not supposed to happen
- $str .= pack($endian, FBCHAR);
- }
- }else{
- $str .= pack($endian, $ord);
- }
- }
- return $str;
-}
+# There used to be a perl implemntation of (en|de)code but with
+# XS version is ripe, perl version is zapped for optimal speed
-sub BOMB {
- my ($size, $bom) = @_;
- my $N = $size == 2 ? 'n' : 'N';
- my $ord = unpack($N, $bom);
- return ($ord eq BOM_BE) ? $N :
- ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef;
-}
-
-sub poisoned2death{
- my $obj = shift;
- my $msg = shift;
- my $pair = join(", ", map {sprintf "\\x%x", $_} @_);
- require Carp;
- Carp::croak $obj->name, ":", $msg, "<$pair>.", caller;
-}
+*decode = \&decode_xs;
+*encode = \&encode_xs;
1;
__END__
diff --git a/ext/Encode/Unicode/Unicode.xs b/ext/Encode/Unicode/Unicode.xs
index 2163fb525d..8b02402d1e 100644
--- a/ext/Encode/Unicode/Unicode.xs
+++ b/ext/Encode/Unicode/Unicode.xs
@@ -1,5 +1,5 @@
/*
- $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp $
+ $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp dankogai $
*/
#define PERL_NO_GET_CONTEXT
@@ -84,6 +84,9 @@ MODULE = Encode::Unicode PACKAGE = Encode::Unicode
PROTOTYPES: DISABLE
+#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
+ *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
+
void
decode_xs(obj, str, check = 0)
SV * obj
@@ -91,10 +94,11 @@ SV * str
IV check
CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
+ U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
+ int size = SvIV(attr("size", 4));
+ int ucs2 = SvTRUE(attr("ucs2", 4));
+ int clone = SvTRUE(attr("clone", 5));
+ SV *result = newSVpvn("",0);
STRLEN ulen;
U8 *s = (U8 *)SvPVbyte(str,ulen);
U8 *e = (U8 *)SvEND(str);
@@ -118,9 +122,11 @@ CODE:
bom);
}
}
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#if 1
+ /* Update endian for next sequence */
+ if (clone) {
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+ }
#endif
}
while (s < e && s+size <= e) {
@@ -188,10 +194,11 @@ SV * utf8
IV check
CODE:
{
- int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
- U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
- int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
- SV *result = newSVpvn("",0);
+ U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
+ int size = SvIV(attr("size", 4));
+ int ucs2 = SvTRUE(attr("ucs2", 4));
+ int clone = SvTRUE(attr("clone", 5));
+ SV *result = newSVpvn("",0);
STRLEN ulen;
U8 *s = (U8 *)SvPVutf8(utf8,ulen);
U8 *e = (U8 *)SvEND(utf8);
@@ -199,9 +206,11 @@ CODE:
if (!endian) {
endian = (size == 4) ? 'N' : 'n';
enc_pack(aTHX_ result,size,endian,BOM_BE);
-#if 0
- /* Update endian for this sequence */
- hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+#if 1
+ /* Update endian for next sequence */
+ if (clone){
+ hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
+ }
#endif
}
while (s < e && s+UTF8SKIP(s) <= e) {
diff --git a/ext/Encode/lib/Encode/Encoding.pm b/ext/Encode/lib/Encode/Encoding.pm
index 0bb4350a63..3978e9df89 100644
--- a/ext/Encode/lib/Encode/Encoding.pm
+++ b/ext/Encode/lib/Encode/Encoding.pm
@@ -14,8 +14,10 @@ sub Define
Encode::define_encoding($obj, $canonical, @_);
}
-sub name { return shift->{'Name'} }
-sub new_sequence { return $_[0] }
+sub name { return shift->{'Name'} }
+
+sub renew { return $_[0] }
+*new_sequence = \&renew;
sub needs_lines { 0 };
@@ -24,7 +26,8 @@ sub perlio_ok {
return $@ ? 0 : 1;
}
-# Temporary legacy methods
+# (Temporary|legacy) methods
+
sub toUnicode { shift->decode(@_) }
sub fromUnicode { shift->encode(@_) }
@@ -160,15 +163,28 @@ Predefined As:
MUST return the string representing the canonical name of the encoding.
-=item -E<gt>new_sequence
+=item -E<gt>renew
Predefined As:
- sub new_sequence { return $_[0] }
+ sub renew { return $_[0] }
+
+This method reconstructs the encoding object if necessary. If you need
+to store the state during encoding, this is where you clone your object.
+Here is an example:
+
+ sub renew {
+ my $self = shift;
+ my $clone = bless { %$self } => ref($self);
+ $clone->{clone} = 1; # so the caller can see it
+ return $clone;
+ }
+
+Since most encodings are stateless the default behavior is just return
+itself as shown above.
-This is a placeholder for encodings with state. It should return an
-object which implements this interface. All current implementations
-return the original object.
+PerlIO ALWAYS calls this method to make sure it has its own private
+encoding object.
=item -E<gt>perlio_ok()
diff --git a/ext/PerlIO/encoding/encoding.xs b/ext/PerlIO/encoding/encoding.xs
index 5f7b0dff5e..cd692628a1 100644
--- a/ext/PerlIO/encoding/encoding.xs
+++ b/ext/PerlIO/encoding/encoding.xs
@@ -113,12 +113,13 @@ PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg, PerlIO_funcs *
code = -1;
}
else {
-#ifdef USE_NEW_SEQUENCE
+
+ /* $enc->renew */
PUSHMARK(sp);
XPUSHs(result);
PUTBACK;
- if (call_method("new_sequence",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) {
- Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support new_sequence",
+ if (call_method("renew",G_SCALAR|G_EVAL) != 1 || SvTRUE(ERRSV)) {
+ Perl_warner(aTHX_ packWARN(WARN_IO), "\"%" SVf "\" does not support renew method",
arg);
}
else {
@@ -126,7 +127,6 @@ PerlIOEncode_pushed(pTHX_ PerlIO * f, const char *mode, SV * arg, PerlIO_funcs *
result = POPs;
PUTBACK;
}
-#endif
e->enc = newSVsv(result);
PUSHMARK(sp);
XPUSHs(e->enc);