diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-03-31 22:31:29 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-03-31 22:31:29 +0000 |
commit | 3ef515df8b968e34c050cfc9602902de0efbefd6 (patch) | |
tree | 856419a8a26ba37abef58ee10f916805a1c23b71 /lib | |
parent | a23c04e41e8e54701da113d9309ef79fb888afe6 (diff) | |
download | perl-3ef515df8b968e34c050cfc9602902de0efbefd6.tar.gz |
Upgrade to Encode 1.11, from Dan Kogai.
p4raw-id: //depot/perl@15638
Diffstat (limited to 'lib')
-rw-r--r-- | lib/encoding.pm | 122 | ||||
-rw-r--r-- | lib/encoding.t | 186 |
2 files changed, 0 insertions, 308 deletions
diff --git a/lib/encoding.pm b/lib/encoding.pm deleted file mode 100644 index 441be3340a..0000000000 --- a/lib/encoding.pm +++ /dev/null @@ -1,122 +0,0 @@ -package encoding; - -our $VERSION = '1.00'; - -use Encode; - -BEGIN { - if (ord("A") == 193) { - require Carp; - Carp::croak "encoding pragma does not support EBCDIC platforms"; - } -} - -sub import { - my ($class, $name) = @_; - $name = $ENV{PERL_ENCODING} if @_ < 2; - $name = "latin1" unless defined $name; - my $enc = find_encoding($name); - unless (defined $enc) { - require Carp; - Carp::croak "Unknown encoding '$name'"; - } - ${^ENCODING} = $enc; -} - -=pod - -=head1 NAME - -encoding - pragma to control the conversion of legacy data into Unicode - -=head1 SYNOPSIS - - use encoding "iso 8859-7"; - - # The \xDF of ISO 8859-7 (Greek) is \x{3af} in Unicode. - - $a = "\xDF"; - $b = "\x{100}"; - - printf "%#x\n", ord($a); # will print 0x3af, not 0xdf - - $c = $a . $b; - - # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}". - - # chr() is affected, and ... - - print "mega\n" if ord(chr(0xdf)) == 0x3af; - - # ... ord() is affected by the encoding pragma ... - - print "tera\n" if ord(pack("C", 0xdf)) == 0x3af; - - # ... as are eq and cmp ... - - print "peta\n" if "\x{3af}" eq pack("C", 0xdf); - print "exa\n" if "\x{3af}" cmp pack("C", 0xdf) == 0; - - # ... but pack/unpack C are not affected, in case you still - # want back to your native encoding - - print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf; - -=head1 DESCRIPTION - -Normally when legacy 8-bit data is converted to Unicode the data is -expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the -encoding pragma you can change this default. - -The pragma is a per script, not a per block lexical. Only the last -C<use encoding> matters, and it affects B<the whole script>. - -Notice that only literals (string or regular expression) having only -legacy code points are affected: if you mix data like this - - \xDF\x{100} - -the data is assumed to be in (Latin 1 and) Unicode, not in your native -encoding. In other words, this will match in "greek": - - "\xDF" =~ /\x{3af}/ - -but this will not - - "\xDF\x{100}" =~ /\x{3af}\x{100}/ - -since the C<\xDF> on the left will B<not> be upgraded to C<\x{3af}> -because of the C<\x{100}> on the left. You should not be mixing your -legacy data and Unicode in the same string. - -This pragma also affects encoding of the 0x80..0xFF code point range: -normally characters in that range are left as eight-bit bytes (unless -they are combined with characters with code points 0x100 or larger, -in which case all characters need to become UTF-8 encoded), but if -the C<encoding> pragma is present, even the 0x80..0xFF range always -gets UTF-8 encoded. - -If no encoding is specified, the environment variable L<PERL_ENCODING> -is consulted. If that fails, "latin1" (ISO 8859-1) is assumed. If no -encoding can be found, C<Unknown encoding '...'> error will be thrown. - -Note if you want to get back to the original byte encoding, you need -to use things like I/O with encoding discplines (see L<open>) or the -Encode module, since C<no encoding> (or re-C<encoding>) do not work. - -=head1 KNOWN PROBLEMS - -For native multibyte encodings (either fixed or variable length) -the current implementation of the regular expressions may introduce -recoding errors for longer regular expression literals than 127 bytes. - -The encoding pragma is not supported on EBCDIC platforms. -(Porters wanted.) - -=head1 SEE ALSO - -L<perlunicode>, L<Encode> - -=cut - -1; diff --git a/lib/encoding.t b/lib/encoding.t deleted file mode 100644 index ec21c1b3a6..0000000000 --- a/lib/encoding.t +++ /dev/null @@ -1,186 +0,0 @@ -BEGIN { - require Config; import Config; - if ($Config{'extensions'} !~ /\bEncode\b/) { - print "1..0 # Skip: Encode was not built\n"; - exit 0; - } - if (ord("A") == 193) { - print "1..0 # encoding pragma does not support EBCDIC platforms\n"; - exit(0); - } -} - -print "1..29\n"; - -use encoding "latin1"; # ignored (overwritten by the next line) -use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) - -# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is -# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), -# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) - -$a = "\xDF"; -$b = "\x{100}"; - -print "not " unless ord($a) == 0x3af; -print "ok 1\n"; - -print "not " unless ord($b) == 0x100; -print "ok 2\n"; - -my $c; - -$c = $a . $b; - -print "not " unless ord($c) == 0x3af; -print "ok 3\n"; - -print "not " unless length($c) == 2; -print "ok 4\n"; - -print "not " unless ord(substr($c, 1, 1)) == 0x100; -print "ok 5\n"; - -print "not " unless ord(chr(0xdf)) == 0x3af; # spooky -print "ok 6\n"; - -print "not " unless ord(pack("C", 0xdf)) == 0x3af; -print "ok 7\n"; - -# we didn't break pack/unpack, I hope - -print "not " unless unpack("C", pack("C", 0xdf)) == 0xdf; -print "ok 8\n"; - -# the first octet of UTF-8 encoded 0x3af -print "not " unless unpack("C", chr(0xdf)) == 0xce; -print "ok 9\n"; - -print "not " unless unpack("U", pack("U", 0xdf)) == 0xdf; -print "ok 10\n"; - -print "not " unless unpack("U", chr(0xdf)) == 0x3af; -print "ok 11\n"; - -# charnames must still work -use charnames ':full'; -print "not " unless ord("\N{LATIN SMALL LETTER SHARP S}") == 0xdf; -print "ok 12\n"; - -# combine - -$c = "\xDF\N{LATIN SMALL LETTER SHARP S}" . chr(0xdf); - -print "not " unless ord($c) == 0x3af; -print "ok 13\n"; - -print "not " unless ord(substr($c, 1, 1)) == 0xdf; -print "ok 14\n"; - -print "not " unless ord(substr($c, 2, 1)) == 0x3af; -print "ok 15\n"; - -# regex literals - -print "not " unless "\xDF" =~ /\x{3AF}/; -print "ok 16\n"; - -print "not " unless "\x{3AF}" =~ /\xDF/; -print "ok 17\n"; - -print "not " unless "\xDF" =~ /\xDF/; -print "ok 18\n"; - -print "not " unless "\x{3AF}" =~ /\x{3AF}/; -print "ok 19\n"; - -# eq, cmp - -my ($byte,$bytes,$U,$Ub,$g1,$g2,$l) = ( - pack("C*", 0xDF ), # byte - pack("C*", 0xDF, 0x20), # ($bytes2 cmp $U) > 0 - pack("U*", 0x3AF), # $U eq $byte - pack("U*", 0xDF ), # $Ub would eq $bytev w/o use encoding - pack("U*", 0x3B1), # ($g1 cmp $byte) > 0; === chr(0xe1) - pack("U*", 0x3AF, 0x20), # ($g2 cmp $byte) > 0; - pack("U*", 0x3AB), # ($l cmp $byte) < 0; === chr(0xdb) -); - -# all the tests in this section that compare a byte encoded string -# ato UTF-8 encoded are run in all possible vairants -# all of the eq, ne, cmp operations tested, -# $v z $u tested as well as $u z $v - -sub alleq($$){ - my ($a,$b) = (shift, shift); - $a eq $b && $b eq $a && - !( $a ne $b ) && !( $b ne $a ) && - ( $a cmp $b ) == 0 && ( $b cmp $a ) == 0; -} - -sub anyeq($$){ - my ($a,$b) = (shift, shift); - $a eq $b || $b eq $a || - !( $a ne $b ) || !( $b ne $a ) || - ( $a cmp $b ) == 0 || ( $b cmp $a ) == 0; -} - -sub allgt($$){ - my ($a,$b) = (shift, shift); - ( $a cmp $b ) == 1 && ( $b cmp $a ) == -1; -} -#match the correct UTF-8 string -print "not " unless alleq($byte, $U); -print "ok 20\n"; - -#do not match a wrong UTF-8 string -print "not " if anyeq($byte, $Ub); -print "ok 21\n"; - -#string ordering -print "not " unless allgt ( $g1, $byte ) && - allgt ( $g2, $byte ) && - allgt ( $byte, $l ) && - allgt ( $bytes, $U ); -print "ok 22\n"; - -# upgrade, downgrade - -my ($u,$v,$v2); -$u = $v = $v2 = pack("C*", 0xDF); -utf8::upgrade($v); #explicit upgrade -$v2 = substr( $v2."\x{410}", 0, -1); #implicit upgrade - -# implicit upgrade === explicit upgrade -print "not " if do{{use bytes; $v ne $v2}} || $v ne $v2; -print "ok 23\n"; - -# utf8::upgrade is transparent and does not break equality -print "not " unless alleq( $u, $v ); -print "ok 24\n"; - -$u = $v = pack("C*", 0xDF); -utf8::upgrade($v); -#test for a roundtrip, we should get back from where we left -eval {utf8::downgrade( $v )}; -print "not " if $@ !~ /^Wide / || do{{use bytes; $u eq $v}} || $u ne $v; -print "ok 25\n"; - -# some more eq, cmp - -my $byte=pack("C*", 0xDF); - -print "not " unless pack("U*", 0x3AF) eq $byte; -print "ok 26\n"; - -print "not " if chr(0xDF) cmp $byte; -print "ok 27\n"; - -print "not " unless ((pack("U*", 0x3B0) cmp $byte) == 1) && - ((pack("U*", 0x3AE) cmp $byte) == -1) && - ((pack("U*", 0x3AF, 0x20) cmp $byte) == 1) && - ((pack("U*", 0x3AF) cmp pack("C*",0xDF,0x20))==-1); -print "ok 28\n"; - -# Used to core dump in 5.7.3 -print ord undef == 0 ? "ok 29\n" : "not ok 29\n"; |