diff options
Diffstat (limited to 'ext/Encode/Unicode/Unicode.pm')
-rw-r--r-- | ext/Encode/Unicode/Unicode.pm | 231 |
1 files changed, 13 insertions, 218 deletions
diff --git a/ext/Encode/Unicode/Unicode.pm b/ext/Encode/Unicode/Unicode.pm index 1829218c66..9648fd358f 100644 --- a/ext/Encode/Unicode/Unicode.pm +++ b/ext/Encode/Unicode/Unicode.pm @@ -2,6 +2,7 @@ package Encode::Unicode; use strict; use warnings; +no warnings 'redefine'; our $VERSION = do { my @r = (q$Revision: 1.39 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; @@ -14,6 +15,8 @@ XSLoader::load(__PACKAGE__,$VERSION); require Encode; +our %BOM_Unknown = map {$_ => 1} qw(UTF-16 UTF-32); + for my $name (qw(UTF-16 UTF-16BE UTF-16LE UTF-32 UTF-32BE UTF-32LE UCS-2BE UCS-2LE)) @@ -35,231 +38,23 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE endian => $endian, ucs2 => $ucs2, } => __PACKAGE__; - } use base qw(Encode::Encoding); -# -# three implementations of (en|de)code exist. The XS version is the -# fastest. *_modern uses an array and *_classic sticks with substr. -# *_classic is much slower but more memory conservative. -# *_xs is the default. - -sub set_transcoder{ - no warnings qw(redefine); - my $type = shift; - if ($type eq "xs"){ - *decode = \&decode_xs; - *encode = \&encode_xs; - }elsif($type eq "modern"){ - *decode = \&decode_modern; - *encode = \&encode_modern; - }elsif($type eq "classic"){ - *decode = \&decode_classic; - *encode = \&encode_classic; - }else{ - require Carp; - Carp::croak __PACKAGE__, "::set_transcoder(modern|classic|xs)"; - } -} - -set_transcoder("xs"); - -# -# Aux. subs & constants -# - -sub FBCHAR(){ 0xFFFd } -sub BOM_BE(){ 0xFeFF } -sub BOM16LE(){ 0xFFFe } -sub BOM32LE(){ 0xFFFe0000 } - -sub valid_ucs2($){ - return - (0 <= $_[0] && $_[0] < 0xD800) - || ( 0xDFFF < $_[0] && $_[0] <= 0xFFFF); -} - -sub issurrogate($){ 0xD800 <= $_[0] && $_[0] <= 0xDFFF } -sub isHiSurrogate($){ 0xD800 <= $_[0] && $_[0] < 0xDC00 } -sub isLoSurrogate($){ 0xDC00 <= $_[0] && $_[0] <= 0xDFFF } - -sub ensurrogate($){ - use integer; # we have divisions - my $uni = shift; - my $hi = ($uni - 0x10000) / 0x400 + 0xD800; - my $lo = ($uni - 0x10000) % 0x400 + 0xDC00; - return ($hi, $lo); -} - -sub desurrogate($$){ - my ($hi, $lo) = @_; - return 0x10000 + ($hi - 0xD800)*0x400 + ($lo - 0xDC00); -} - -sub Mask { {2 => 0xffff, 4 => 0xffffffff} } - -# -# *_modern are much faster but guzzle more memory -# - -sub decode_modern($$;$) -{ - my ($obj, $str, $chk ) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - - # warn "$size, $endian, $ucs2"; - $endian ||= BOMB($size, substr($str, 0, $size, '')) - or poisoned2death($obj, "Where's the BOM?"); - my $mask = Mask->{$size}; - my $utf8 = ''; - my @ord = unpack("$endian*", $str); - undef $str; # to conserve memory - while (@ord){ - my $ord = shift @ord; - unless ($size == 4 or valid_ucs2($ord &= $mask)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "no surrogates allowed", $ord); - shift @ord; # skip the next one as well - $ord = FBCHAR; - }else{ - unless (isHiSurrogate($ord)){ - poisoned2death($obj, "Malformed HI surrogate", $ord); - } - my $lo = shift @ord; - unless (isLoSurrogate($lo &= $mask)){ - poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); - } - $ord = desurrogate($ord, $lo); - } - } - $utf8 .= chr($ord); - } - utf8::upgrade($utf8); - return $utf8; -} - -sub encode_modern($$;$) -{ - my ($obj, $utf8, $chk) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - my @str = (); - unless ($endian){ - $endian = ($size == 4) ? 'N' : 'n'; - push @str, BOM_BE; - } - my @ord = unpack("U*", $utf8); - undef $utf8; # to conserve memory - for my $ord (@ord){ - unless ($size == 4 or valid_ucs2($ord)) { - unless(issurrogate($ord)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "code point too high", $ord); - - push @str, FBCHAR; - }else{ - - push @str, ensurrogate($ord); - } - }else{ # not supposed to happen - push @str, FBCHAR; - } - }else{ - push @str, $ord; - } - } - return pack("$endian*", @str); -} - -# -# *_classic are slower but more memory conservative -# - -sub decode_classic($$;$) -{ - my ($obj, $str, $chk ) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - - # warn "$size, $endian, $ucs2"; - $endian ||= BOMB($size, substr($str, 0, $size, '')) - or poisoned2death($obj, "Where's the BOM?"); - my $mask = Mask->{$size}; - my $utf8 = ''; - my @ord = unpack("$endian*", $str); - while (length($str)){ - my $ord = unpack($endian, substr($str, 0, $size, '')); - unless ($size == 4 or valid_ucs2($ord &= $mask)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "no surrogates allowed", $ord); - substr($str,0,$size,''); # skip the next one as well - $ord = FBCHAR; - }else{ - unless (isHiSurrogate($ord)){ - poisoned2death($obj, "Malformed HI surrogate", $ord); - } - my $lo = unpack($endian ,substr($str,0,$size,'')); - unless (isLoSurrogate($lo &= $mask)){ - poisoned2death($obj, "Malformed LO surrogate", $ord, $lo); - } - $ord = desurrogate($ord, $lo); - } - } - $utf8 .= chr($ord); - } - utf8::upgrade($utf8); - return $utf8; +sub renew { + my $self = shift; + $BOM_Unknown{$self->name} or return $self; + my $clone = bless { %$self } => ref($self); + $clone->{clone} = 1; # so the caller knows it is renewed. + return $clone; } -sub encode_classic($$;$) -{ - my ($obj, $utf8, $chk) = @_; - my ($size, $endian, $ucs2) = @$obj{qw(size endian ucs2)}; - # warn join ", ", $size, $ucs2, $endian, $mask; - my $str = ''; - unless ($endian){ - $endian = ($size == 4) ? 'N' : 'n'; - $str .= pack($endian, BOM_BE); - } - while (length($utf8)){ - my $ord = ord(substr($utf8,0,1,'')); - unless ($size == 4 or valid_ucs2($ord)) { - unless(issurrogate($ord)){ - if ($ucs2){ - $chk and - poisoned2death($obj, "code point too high", $ord); - $str .= pack($endian, FBCHAR); - }else{ - $str .= pack($endian.2, ensurrogate($ord)); - } - }else{ # not supposed to happen - $str .= pack($endian, FBCHAR); - } - }else{ - $str .= pack($endian, $ord); - } - } - return $str; -} +# There used to be a perl implemntation of (en|de)code but with +# XS version is ripe, perl version is zapped for optimal speed -sub BOMB { - my ($size, $bom) = @_; - my $N = $size == 2 ? 'n' : 'N'; - my $ord = unpack($N, $bom); - return ($ord eq BOM_BE) ? $N : - ($ord eq BOM16LE) ? 'v' : ($ord eq BOM32LE) ? 'V' : undef; -} - -sub poisoned2death{ - my $obj = shift; - my $msg = shift; - my $pair = join(", ", map {sprintf "\\x%x", $_} @_); - require Carp; - Carp::croak $obj->name, ":", $msg, "<$pair>.", caller; -} +*decode = \&decode_xs; +*encode = \&encode_xs; 1; __END__ |