diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2001-02-24 17:05:23 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2001-02-24 17:05:23 +0000 |
commit | 4411f3b60960504e67715e05eb0ed3192bfff8fc (patch) | |
tree | e97680603c76eb78227139f832ff5e9e179c35bf | |
parent | ebf1a2feae7df8a971c8e3f6df7764df635eadae (diff) | |
download | perl-4411f3b60960504e67715e05eb0ed3192bfff8fc.tar.gz |
Encode API documentation nd tidy up.
Minor additions to test.
Tweaks sv_utf8_upgrade() to force SvPV, and always SvUTF8_on and return length.
Adds STRLEN to standard typemap.
p4raw-id: //depot/perlio@8923
-rwxr-xr-x | embed.pl | 8 | ||||
-rw-r--r-- | ext/Encode/Encode.pm | 836 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 101 | ||||
-rw-r--r-- | lib/ExtUtils/typemap | 1 | ||||
-rw-r--r-- | perlapi.c | 4 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | sv.c | 39 | ||||
-rw-r--r-- | t/lib/encode.t | 17 |
8 files changed, 573 insertions, 435 deletions
@@ -2180,10 +2180,10 @@ Ap |char* |sv_2pvbyte_nolen|SV* sv Ap |char* |sv_pv |SV *sv Ap |char* |sv_pvutf8 |SV *sv Ap |char* |sv_pvbyte |SV *sv -Apd |void |sv_utf8_upgrade|SV *sv -ApdM |bool |sv_utf8_downgrade|SV *sv|bool fail_ok -ApdM |void |sv_utf8_encode |SV *sv -Ap |bool |sv_utf8_decode |SV *sv +Apd |STRLEN |sv_utf8_upgrade|SV *sv +ApdM |bool |sv_utf8_downgrade|SV *sv|bool fail_ok +Apd |void |sv_utf8_encode |SV *sv +ApdM |bool |sv_utf8_decode |SV *sv Ap |void |sv_force_normal|SV *sv Ap |void |sv_force_normal_flags|SV *sv|U32 flags Ap |void |tmps_grow |I32 n diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index 1f4ffb1485..769b8daf46 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -7,345 +7,37 @@ require Exporter; @ISA = qw(Exporter DynaLoader); +# Public, encouraged API is exported by default +@EXPORT = qw ( + encode + decode + encode_utf8 + decode_utf8 + find_encoding +); + @EXPORT_OK = qw( - bytes_to_utf8 - utf8_to_bytes - chars_to_utf8 - utf8_to_chars - utf8_to_chars_check - bytes_to_chars - chars_to_bytes + encodings from_to is_utf8 - on_utf8 - off_utf8 - utf_to_utf - encodings - utf8_decode - utf8_encode + is_8bit + is_16bit utf8_upgrade utf8_downgrade + _utf8_on + _utf8_off ); bootstrap Encode (); -=pod - -=head1 NAME - -Encode - character encodings - -=head2 TERMINOLOGY - -=over - -=item * - -I<char>: a character in the range 0..maxint (at least 2**32-1) - -=item * - -I<byte>: a character in the range 0..255 - -=back - -The marker [INTERNAL] marks Internal Implementation Details, in -general meant only for those who think they know what they are doing, -and such details may change in future releases. - -=head2 bytes - -=over 4 - -=item * - - bytes_to_utf8(STRING [, FROM]) - -The bytes in STRING are recoded in-place into UTF-8. If no FROM is -specified the bytes are expected to be encoded in US-ASCII or ISO -8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if -there's a failure. - -[INTERNAL] Also the UTF-8 flag of STRING is turned on. - -=item * - - utf8_to_bytes(STRING [, TO [, CHECK]]) - -The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding -is specified the bytes are expected to be encoded in US-ASCII or ISO -8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if -there's a failure. - -What if there are characters > 255? What if the UTF-8 in STRING is -malformed? See L</"Handling Malformed Data">. - -[INTERNAL] The UTF-8 flag of STRING is not checked. - -=back - -=head2 chars - -=over 4 - -=item * - - chars_to_utf8(STRING) - -The chars in STRING are encoded in-place into UTF-8. Returns the new -size of STRING, or C<undef> if there's a failure. - -No assumptions are made on the encoding of the chars. If you want to -assume that the chars are Unicode and to trap illegal Unicode -characters, you must use C<from_to('Unicode', ...)>. - -[INTERNAL] Also the UTF-8 flag of STRING is turned on. - -=over 4 - -=item * - - utf8_to_chars(STRING) - -The UTF-8 in STRING is decoded in-place into chars. Returns the new -size of STRING, or C<undef> if there's a failure. - -If the UTF-8 in STRING is malformed C<undef> is returned, and also an -optional lexical warning (category utf8) is given. - -[INTERNAL] The UTF-8 flag of STRING is not checked. - -=item * - - utf8_to_chars_check(STRING [, CHECK]) - -(Note that special naming of this interface since a two-argument -utf8_to_chars() has different semantics.) - -The UTF-8 in STRING is decoded in-place into chars. Returns the new -size of STRING, or C<undef> if there is a failure. - -If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. - -[INTERNAL] The UTF-8 flag of STRING is not checked. - -=back - -=head2 chars With Encoding - -=over 4 - -=item * - - chars_to_utf8(STRING, FROM [, CHECK]) - -The chars in STRING encoded in FROM are recoded in-place into UTF-8. -Returns the new size of STRING, or C<undef> if there's a failure. - -No assumptions are made on the encoding of the chars. If you want to -assume that the chars are Unicode and to trap illegal Unicode -characters, you must use C<from_to('Unicode', ...)>. - -[INTERNAL] Also the UTF-8 flag of STRING is turned on. - -=item * - - utf8_to_chars(STRING, TO [, CHECK]) - -The UTF-8 in STRING is decoded in-place into chars encoded in TO. -Returns the new size of STRING, or C<undef> if there's a failure. - -If the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. - -[INTERNAL] The UTF-8 flag of STRING is not checked. - -=item * - - bytes_to_chars(STRING, FROM [, CHECK]) - -The bytes in STRING encoded in FROM are recoded in-place into chars. -Returns the new size of STRING in bytes, or C<undef> if there's a -failure. - -If the mapping is impossible? See L</"Handling Malformed Data">. - -=item * - - chars_to_bytes(STRING, TO [, CHECK]) - -The chars in STRING are recoded in-place to bytes encoded in TO. -Returns the new size of STRING in bytes, or C<undef> if there's a -failure. - -If the mapping is impossible? See L</"Handling Malformed Data">. - -=item * - - from_to(STRING, FROM, TO [, CHECK]) - -The chars in STRING encoded in FROM are recoded in-place into TO. -Returns the new size of STRING, or C<undef> if there's a failure. - -If mapping between the encodings is impossible? -See L</"Handling Malformed Data">. - -[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on. - -=back - -=head2 Testing For UTF-8 - -=over 4 - -=item * - - is_utf8(STRING [, CHECK]) - -[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. -If CHECK is true, also checks the data in STRING for being -well-formed UTF-8. Returns true if successful, false otherwise. - -=back - -=head2 Toggling UTF-8-ness - -=over 4 - -=item * - - on_utf8(STRING) - -[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is -B<not> checked for being well-formed UTF-8. Do not use unless you -B<know> that the STRING is well-formed UTF-8. Returns the previous -state of the UTF-8 flag (so please don't test the return value as -I<not> success or failure), or C<undef> if STRING is not a string. - -=item * - - off_utf8(STRING) - -[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. -Returns the previous state of the UTF-8 flag (so please don't test the -return value as I<not> success or failure), or C<undef> if STRING is -not a string. - -=back - -=head2 UTF-16 and UTF-32 Encodings - -=over 4 - -=item * - - utf_to_utf(STRING, FROM, TO [, CHECK]) - -The data in STRING is converted from Unicode Transfer Encoding FROM to -Unicode Transfer Encoding TO. Both FROM and TO may be any of the -following tags (case-insensitive, with or without 'utf' or 'utf-' prefix): - - tag meaning - - '7' UTF-7 - '8' UTF-8 - '16be' UTF-16 big-endian - '16le' UTF-16 little-endian - '16' UTF-16 native-endian - '32be' UTF-32 big-endian - '32le' UTF-32 little-endian - '32' UTF-32 native-endian - -UTF-16 is also known as UCS-2, 16 bit or 2-byte chunks, and UTF-32 as -UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or -C<undef> is there's a failure. - -If FROM is UTF-8 and the UTF-8 in STRING is malformed? See -L</"Handling Malformed Data">. - -[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of -STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is -turned on. Identical FROM and TO are fine. - -=back - -=head2 Handling Malformed Data - -If CHECK is not set, C<undef> is returned. If the data is supposed to -be UTF-8, an optional lexical warning (category utf8) is given. If -CHECK is true but not a code reference, dies. If CHECK is a code -reference, it is called with the arguments - - (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR) - -Two return values are expected from the call: the string to be used in -the result string in place of the malformed section, and the length of -the malformed section in bytes. - -=cut - -sub bytes_to_utf8 { - &_bytes_to_utf8; -} - -sub utf8_to_bytes { - &_utf8_to_bytes; -} - -sub chars_to_utf8 { - &C_to_utf8; -} - -sub utf8_to_chars { - &_utf8_to_chars; -} - -sub utf8_to_chars_check { - &_utf8_to_chars_check; -} - -sub bytes_to_chars { - &_bytes_to_chars; -} - -sub chars_to_bytes { - &_chars_to_bytes; -} - -sub is_utf8 { - &_is_utf8; -} - -sub on_utf8 { - &_on_utf8; -} - -sub off_utf8 { - &_off_utf8; -} - -sub utf_to_utf { - &_utf_to_utf; -} +# Documentation moved after __END__ for speed - NI-S use Carp; -sub from_to -{ - my ($string,$from,$to,$check) = @_; - my $f = __PACKAGE__->getEncoding($from); - croak("Unknown encoding '$from'") unless $f; - my $t = __PACKAGE__->getEncoding($to); - croak("Unknown encoding '$to'") unless $t; - my $uni = $f->toUnicode($string,$check); - return undef if ($check && length($string)); - $string = $t->fromUnicode($uni,$check); - return undef if ($check && length($uni)); - return length($_[0] = $string); -} - # The global hash is declared in XS code -$encoding{Unicode} = bless({},'Encode::Unicode'); +$encoding{Unicode} = bless({},'Encode::Unicode'); +$encoding{utf8} = bless({},'Encode::utf8'); $encoding{'iso10646-1'} = bless({},'Encode::iso10646_1'); sub encodings @@ -410,6 +102,60 @@ sub getEncoding return $enc; } +sub find_encoding +{ + my ($name) = @_; + return __PACKAGE__->getEncoding($name); +} + +sub encode +{ + my ($name,$string,$check) = @_; + my $enc = find_encoding($name); + croak("Unknown encoding '$name'") unless defined $enc; + my $octets = $enc->fromUnicode($string,$check); + return undef if ($check && length($string)); + return $octets; +} + +sub decode +{ + my ($name,$octets,$check) = @_; + my $enc = find_encoding($name); + croak("Unknown encoding '$name'") unless defined $enc; + my $string = $enc->toUnicode($octets,$check); + return undef if ($check && length($octets)); + return $string; +} + +sub from_to +{ + my ($string,$from,$to,$check) = @_; + my $f = find_encoding($from); + croak("Unknown encoding '$from'") unless defined $f; + my $t = find_encoding($to); + croak("Unknown encoding '$to'") unless defined $t; + my $uni = $f->toUnicode($string,$check); + return undef if ($check && length($string)); + $string = $t->fromUnicode($uni,$check); + return undef if ($check && length($uni)); + return length($_[0] = $string); +} + +sub encode_utf8 +{ + my ($str) = @_; + utf8_encode($str); + return $str; +} + +sub decode_utf8 +{ + my ($str) = @_; + return undef unless utf8_decode($str); + return $str; +} + package Encode::Unicode; # Dummy package that provides the encode interface but leaves data @@ -427,6 +173,37 @@ sub toUnicode *fromUnicode = \&toUnicode; +package Encode::utf8; + +# package to allow long-hand +# $octets = encode( utf8 => $string ); +# + +sub name { 'utf8' } + +sub toUnicode +{ + my ($obj,$octets,$chk) = @_; + my $str = decode_utf8($octets); + if (defined $str) + { + $_[1] = '' if $chk; + return $str; + } + return undef; +} + +sub fromUnicode +{ + my ($obj,$string,$chk) = @_; + my $octets = encode_utf8($string); + $_[1] = '' if $chk; + return $octets; + +} + +*fromUnicode = \&toUnicode; + package Encode::Table; sub read @@ -613,6 +390,415 @@ sub fromUnicode croak("Not implemented yet"); } +# switch back to Encode package in case we ever add AutoLoader +package Encode; + 1; +=head1 NAME + +Encode - character encodings + +=head1 SYNOPSIS + + use Encode; + +=head1 DESCRIPTION + +The C<Encode> module provides the interfaces between perl's strings +and the rest of the system. Perl strings are sequences of B<characters>. + +The repertoire of characters that Perl can represent is at least that +defined by the Unicode Consortium. On most platforms the ordinal values +of the characters (as returned by C<ord(ch)>) is the "Unicode codepoint" for +the character (the exceptions are those platforms where the legacy +encoding is some variant of EBCDIC rather than a super-set of ASCII +- see L<perlebcdic>). + +Traditionaly computer data has been moved around in 8-bit chunks +often called "bytes". These chunks are also known as "octets" in +networking standards. Perl is widely used to manipulate data of +many types - not only strings of characters representing human or +computer languages but also "binary" data being the machines representation +of numbers, pixels in an image - or just about anything. + +When perl is processing "binary data" the programmer wants perl to process +"sequences of bytes". This is not a problem for perl - as a byte has 256 +possible values it easily fits in perl's much larger "logical character". + +=head2 TERMINOLOGY + +=over + +=item * + +I<character>: a character in the range 0..(2**32-1) (or more). +(What perl's strings are made of.) + +=item * + +I<byte>: a character in the range 0..255 +(A special case of a perl character.) + +=item * + +I<octet>: 8 bits of data, with ordinal values 0..255 +(Term for bytes passed to or from a non-perl context, e.g. disk file.) + +=back + +The marker [INTERNAL] marks Internal Implementation Details, in +general meant only for those who think they know what they are doing, +and such details may change in future releases. + +=head1 ENCODINGS + +=head2 Characteristics of an Encoding + +An encoding has a "repertoire" of characters that it can represent, +and for each representable character there is at least one sequence of +octets that represents it. + +=head2 Types of Encodings + +Encodings can be divided into the following types: + +=over 4 + +=item * Fixed length 8-bit (or less) encodings. + +Each character is a single octet so may have a repertoire of up to +256 characters. ASCII and iso-8859-* are typical examples. + +=item * Fixed length 16-bit encodings + +Each character is two octets so may have a repertoire of up to +65,536 characters. Unicode's UCS-2 is an example. Also used for +encodings for East Asian languages. + +=item * Fixed length 32-bit encodings. + +Not really very "encoded" encodings. The Unicode code points +are just represented as 4-octet integers. None the less because +different architectures use different representations of integers +(so called "endian") there at least two disctinct encodings. + +=item * Multi-byte encodings + +The number of octets needed to represent a character varies. +UTF-8 is a particularly complex but regular case of a multi-byte +encoding. Several East Asian countries use a multi-byte encoding +where 1-octet is used to cover western roman characters and Asian +characters get 2-octets. +(UTF-16 is strictly a multi-byte encoding taking either 2 or 4 octets +to represent a Unicode code point.) + +=item * "Escape" encodings. + +These encodings embed "escape sequences" into the octet sequence +which describe how the following octets are to be interpreted. +The iso-2022-* family is typical. Following the escape sequence +octets are encoded by an "embedded" encoding (which will be one +of the above types) until another escape sequence switches to +a different "embedded" encoding. + +These schemes are very flexible and can handle mixed languages but are +very complex to process (and have state). +No escape encodings are implemented for perl yet. + +=back + +=head2 Specifying Encodings + +Encodings can be specified to the API described below in two ways: + +=over 4 + +=item 1. By name + +Encoding names are strings with characters taken from a restricted repertoire. +See L</"Encoding Names">. + +=item 2. As an object + +Encoding objects are returned by C<find_encoding($name)>. + +=back + +=head2 Encoding Names + +Encoding names are case insensitive. White space in names is ignored. +In addition an encoding may have aliases. Each encoding has one "canonical" name. +The "canonical" name is chosen from the names of the encoding by picking +the first in the following sequence: + +=over 4 + +=item * The MIME name as defined in IETF RFC-XXXX. + +=item * The name in the IANA registry. + +=item * The name used by the the organization that defined it. + +=back + +Because of all the alias issues, and because in the general case +encodings have state C<Encode> uses the encoding object internally +once an operation is in progress. + +I<Aliasing is not yet implemented.> + +=head1 PERL ENCODING API + +=head2 Generic Encoding Interface + +=over 4 + +=item * + + $bytes = encode(ENCODING, $string[, CHECK]) + +Encodes string from perl's internal form into I<ENCODING> and returns a +sequence of octets. +See L</"Handling Malformed Data">. + +=item * + + $string = decode(ENCODING, $bytes[, CHECK]) + +Decode sequence of octets assumed to be in I<ENCODING> into perls internal +form and returns the resuting string. +See L</"Handling Malformed Data">. + +=back + +=head2 Handling Malformed Data + +If CHECK is not set, C<undef> is returned. If the data is supposed to +be UTF-8, an optional lexical warning (category utf8) is given. +If CHECK is true but not a code reference, dies. + +It would desirable to have a way to indicate that transform should use the +encodings "replacement character" - no such mechanism is defined yet. + +It is also planned to allow I<CHECK> to be a code reference. + +This is not yet implemented as there are design issues with what its arguments +should be and how it returns its results. + +=over 4 + +=item Scheme 1 + +Passed remaining fragment of string being processed. +Modifies it in place to remove bytes/characters it can understand +and returns a string used to represent them. +e.g. + + sub fixup { + my $ch = substr($_[0],0,1,''); + return sprintf("\x{%02X}",ord($ch); + } + +This scheme is close to how underlying C code for Encode works, but gives +the fixup routine very little context. + +=item Scheme 2 + +Passed original string, and an index into it of the problem area, +and output string so far. +Appends what it will to output string and returns new index into +original string. +e.g. + + sub fixup { + # my ($s,$i,$d) = @_; + my $ch = substr($_[0],$_[1],1); + $_[2] .= sprintf("\x{%02X}",ord($ch); + return $_[1]+1; + } + +This scheme gives maximal control to the fixup routine but is more complicated +to code, and may need internals of Encode to be tweaked to keep original +string intact. + +=item Other Schemes + +Hybrids of above. + +Multiple return values rather than in-place modifications. + +Index into the string could be pos($str) allowing s/\G...//. + +=back + +=head2 UTF-8 / utf8 + +The Unicode consortium defines the UTF-8 standard as a way of encoding +the entire Unicode repertiore as sequences of octets. This encoding +is expected to become very widespread. Perl can use this form internaly +to represent strings, so conversions to and from this form are particularly +efficient (as octets in memory do not have to change, just the meta-data +that tells perl how to treat them). + +=over 4 + +=item * + + $bytes = encode_utf8($string); + +The characters that comprise string are encoded in perl's superset of UTF-8 +and the resulting octets returned as a sequence of bytes. All possible +characters have a UTF-8 representation so this function cannot fail. + +=item * + + $string = decode_utf8($bytes [,CHECK]); + +The sequence of octets represented by $bytes is decoded from UTF-8 into +a sequence of logical characters. Not all sequences of octets form valid +UTF-8 encodings, so it is possible for this call to fail. +See L</"Handling Malformed Data">. + +=back + +=head2 Other Encodings of Unicode + +UTF-16 is similar to UCS-2, 16 bit or 2-byte chunks. +UCS-2 can only represent 0..0xFFFF, while UTF-16 has a "surogate pair" +scheme which allows it to cover the whole Unicode range. + +Encode implements big-endian UCS-2 as the encoding "iso10646-1" as that +happens to be the name used by that representation when used with X11 fonts. + +UTF-32 or UCS-4 is 32-bit or 4-byte chunks. Perl's logical characters +can be considered as being in this form without encoding. An encoding +to transfer strings in this form (e.g. to write them to a file) would need to + + pack('L',map(chr($_),split(//,$string))); # native + or + pack('V',map(chr($_),split(//,$string))); # little-endian + or + pack('N',map(chr($_),split(//,$string))); # big-endian + +depending on the endian required. + +No UTF-32 encodings are not yet implemented. + +Both UCS-2 and UCS-4 style encodings can have "byte order marks" by representing +the code point 0xFFFE as the very first thing in a file. + +=head1 Encoding and IO + +It is very common to want to do encoding transformations when +reading or writing files, network connections, pipes etc. +If perl is configured to use the new 'perlio' IO system then +C<Encode> provides a "layer" (See L<perliol>) which can transform +data as it is read or written. + + open(my $ilyad,'>:encoding(iso8859-7)','ilyad.greek'); + print $ilyad @epic; + +In addition the new IO system can also be configured to read/write +UTF-8 encoded characters (as noted above this is efficient): + + open(my $fh,'>:utf8','anything'); + print $fh "Any \x{0021} string \N{SMILEY FACE}\n"; + +Either of the above forms of "layer" specifications can be made the default +for a lexical scope with the C<use open ...> pragma. See L<open>. + +Once a handle is open is layers can be altered using C<binmode>. + +Without any such configuration, or if perl itself is built using +system's own IO, then write operations assume that file handle accepts +only I<bytes> and will C<die> if a character larger than 255 is +written to the handle. When reading, each octet from the handle +becomes a byte-in-a-character. Note that this default is the same +behaviour as bytes-only languages (including perl before v5.6) would have, +and is sufficient to handle native 8-bit encodings e.g. iso-8859-1, +EBCDIC etc. and any legacy mechanisms for handling other encodings +and binary data. + +In other cases it is the programs responsibility +to transform characters into bytes using the API above before +doing writes, and to transform the bytes read from a handle into characters +before doing "character operations" (e.g. C<lc>, C</\W+/>, ...). + +=head1 Encoding How to ... + +To do: + +=over 4 + +=item * IO with mixed content (faking iso-2020-*) + +=item * MIME's Content-Length: + +=item * UTF-8 strings in binary data. + +=item * perl/Encode wrappers on non-Unicode XS modules. + +=back + +=head1 Messing with Perl's Internals + +The following API uses parts of perl's internals in the current implementation. +As such they are efficient, but may change. + +=over 4 + +=item * + + $num_octets = utf8_upgrade($string); + +Converts internal representation of string to the UTF-8 form. +Returns the number of octets necessary to represent the string as UTF-8. + +=item * utf8_downgrade($string[, CHECK]) + +Converts internal representation of string to be un-encoded bytes. + +=item * is_utf8(STRING [, CHECK]) + +[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. +If CHECK is true, also checks the data in STRING for being +well-formed UTF-8. Returns true if successful, false otherwise. + +=item * valid_utf8(STRING) + +[INTERNAL] Test whether STRING is in a consistent state. +Will return true if string is held as bytes, or is well-formed UTF-8 +and has the UTF-8 flag on. +Main reason for this routine is to allow perl's testsuite to check +that operations have left strings in a consistent state. + +=item * + + _utf8_on(STRING) + +[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is +B<not> checked for being well-formed UTF-8. Do not use unless you +B<know> that the STRING is well-formed UTF-8. Returns the previous +state of the UTF-8 flag (so please don't test the return value as +I<not> success or failure), or C<undef> if STRING is not a string. + +=item * + + _utf8_off(STRING) + +[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. +Returns the previous state of the UTF-8 flag (so please don't test the +return value as I<not> success or failure), or C<undef> if STRING is +not a string. + +=back + +=head1 SEE ALSO + +L<perlunicode>, L<perlebcdic>, L<perlfunc/open> + +=cut + + __END__ diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs index 8aa51ff33d..3913fafb48 100644 --- a/ext/Encode/Encode.xs +++ b/ext/Encode/Encode.xs @@ -453,7 +453,7 @@ bool sv_utf8_decode(sv) SV * sv -void +STRLEN sv_utf8_upgrade(sv) SV * sv @@ -584,88 +584,16 @@ _utf8_to_bytes(sv, ...) OUTPUT: RETVAL -SV * -_chars_to_utf8(sv, from, ...) - SV * sv - SV * from - CODE: - { - SV * check = items == 3 ? ST(2) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - -SV * -_utf8_to_chars(sv, to, ...) - SV * sv - SV * to - CODE: - { - SV * check = items == 3 ? ST(2) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - -SV * -_utf8_to_chars_check(sv, ...) - SV * sv - CODE: - { - SV * check = items == 2 ? ST(1) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - -SV * -_bytes_to_chars(sv, from, ...) - SV * sv - SV * from - CODE: - { - SV * check = items == 3 ? ST(2) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - -SV * -_chars_to_bytes(sv, to, ...) - SV * sv - SV * to - CODE: - { - SV * check = items == 3 ? ST(2) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - -SV * -_from_to(sv, from, to, ...) - SV * sv - SV * from - SV * to - CODE: - { - SV * check = items == 4 ? ST(3) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - bool -_is_utf8(sv, ...) - SV * sv +is_utf8(sv, check = FALSE) +SV * sv +bool check CODE: { - SV * check = items == 2 ? ST(1) : Nullsv; if (SvPOK(sv)) { - RETVAL = SvUTF8(sv) ? 1 : 0; + RETVAL = SvUTF8(sv) ? TRUE : FALSE; if (RETVAL && - SvTRUE(check) && + check && !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) RETVAL = FALSE; } else { @@ -676,7 +604,7 @@ _is_utf8(sv, ...) RETVAL SV * -_on_utf8(sv) +_utf8_on(sv) SV * sv CODE: { @@ -692,7 +620,7 @@ _on_utf8(sv) RETVAL SV * -_off_utf8(sv) +_utf8_off(sv) SV * sv CODE: { @@ -707,19 +635,6 @@ _off_utf8(sv) OUTPUT: RETVAL -SV * -_utf_to_utf(sv, from, to, ...) - SV * sv - SV * from - SV * to - CODE: - { - SV * check = items == 4 ? ST(3) : Nullsv; - RETVAL = &PL_sv_undef; - } - OUTPUT: - RETVAL - BOOT: { #ifdef USE_PERLIO diff --git a/lib/ExtUtils/typemap b/lib/ExtUtils/typemap index c59c3dc88a..bf94afcb64 100644 --- a/lib/ExtUtils/typemap +++ b/lib/ExtUtils/typemap @@ -32,6 +32,7 @@ UV T_UV I32 T_IV I16 T_IV I8 T_IV +STRLEN T_IV U32 T_U_LONG U16 T_U_SHORT U8 T_UV @@ -3875,10 +3875,10 @@ Perl_sv_pvbyte(pTHXo_ SV *sv) } #undef Perl_sv_utf8_upgrade -void +STRLEN Perl_sv_utf8_upgrade(pTHXo_ SV *sv) { - ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv); + return ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv); } #undef Perl_sv_utf8_downgrade @@ -922,7 +922,7 @@ PERL_CALLCONV char* Perl_sv_2pvbyte_nolen(pTHX_ SV* sv); PERL_CALLCONV char* Perl_sv_pv(pTHX_ SV *sv); PERL_CALLCONV char* Perl_sv_pvutf8(pTHX_ SV *sv); PERL_CALLCONV char* Perl_sv_pvbyte(pTHX_ SV *sv); -PERL_CALLCONV void Perl_sv_utf8_upgrade(pTHX_ SV *sv); +PERL_CALLCONV STRLEN Perl_sv_utf8_upgrade(pTHX_ SV *sv); PERL_CALLCONV bool Perl_sv_utf8_downgrade(pTHX_ SV *sv, bool fail_ok); PERL_CALLCONV void Perl_sv_utf8_encode(pTHX_ SV *sv); PERL_CALLCONV bool Perl_sv_utf8_decode(pTHX_ SV *sv); @@ -2943,18 +2943,27 @@ Perl_sv_2bool(pTHX_ register SV *sv) =for apidoc sv_utf8_upgrade Convert the PV of an SV to its UTF8-encoded form. +Forces the SV to string form it it is not already. +Always sets the SvUTF8 flag to avoid future validity checks even +if all the bytes have hibit clear. =cut */ -void +STRLEN Perl_sv_utf8_upgrade(pTHX_ register SV *sv) { char *s, *t, *e; int hibit = 0; - if (!sv || !SvPOK(sv) || SvUTF8(sv)) - return; + if (!sv) + return 0; + + if (!SvPOK(sv)) + (void) SvPV_nolen(sv); + + if (SvUTF8(sv)) + return SvCUR(sv); /* This function could be much more efficient if we had a FLAG in SVs * to signal if there are any hibit chars in the PV. @@ -2981,8 +2990,10 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) if (SvLEN(sv) != 0) Safefree(s); /* No longer using what was there before. */ SvLEN(sv) = len; /* No longer know the real size. */ - SvUTF8_on(sv); } + /* Mark as UTF-8 even if no hibit - saves scanning loop */ + SvUTF8_on(sv); + return SvCUR(sv); } /* @@ -3030,7 +3041,8 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok) =for apidoc sv_utf8_encode Convert the PV of an SV to UTF8-encoded, but then turn off the C<SvUTF8> -flag so that it looks like bytes again. Nothing calls this. +flag so that it looks like octets again. Used as a building block +for encode_utf8 in Encode.xs =cut */ @@ -3038,10 +3050,22 @@ flag so that it looks like bytes again. Nothing calls this. void Perl_sv_utf8_encode(pTHX_ register SV *sv) { - sv_utf8_upgrade(sv); + (void) sv_utf8_upgrade(sv); SvUTF8_off(sv); } +/* +=for apidoc sv_utf8_decode + +Convert the octets in the PV from UTF-8 to chars. Scan for validity and then +turn of SvUTF8 if needed so that we see characters. Used as a building block +for decode_utf8 in Encode.xs + +=cut +*/ + + + bool Perl_sv_utf8_decode(pTHX_ register SV *sv) { @@ -3049,6 +3073,7 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv) char *c; char *e; + /* The octets may have got themselves encoded - get them back as bytes */ if (!sv_utf8_downgrade(sv, TRUE)) return FALSE; @@ -7821,7 +7846,7 @@ S_gv_share(pTHX_ SV *sstr) return Nullsv; } - /* + /* * write attempts will die with * "Modification of a read-only value attempted" */ diff --git a/t/lib/encode.t b/t/lib/encode.t index 280c2d0ed5..5c911f0f3a 100644 --- a/t/lib/encode.t +++ b/t/lib/encode.t @@ -8,7 +8,7 @@ BEGIN { } } use Test; -use Encode qw(from_to); +use Encode qw(from_to encode decode encode_utf8 decode_utf8); use charnames qw(greek); my @encodings = grep(/iso8859/,Encode::encodings()); my $n = 2; @@ -16,7 +16,7 @@ my @character_set = ('0'..'9', 'A'..'Z', 'a'..'z'); my @source = qw(ascii iso8859-1 cp1250); my @destiny = qw(cp1047 cp37 posix-bc); my @ebcdic_sets = qw(cp1047 cp37 posix-bc); -plan test => 21+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256; +plan test => 33+$n*@encodings + 2*@source*@destiny*@character_set + 2*@ebcdic_sets*256; my $str = join('',map(chr($_),0x20..0x7E)); my $cpy = $str; ok(length($str),from_to($cpy,'iso8859-1','Unicode'),"Length Wrong"); @@ -91,7 +91,7 @@ foreach my $enc_eb (@ebcdic_sets) } } -for $i (256,128,129,256) +for my $i (256,128,129,256) { my $c = chr($i); my $s = "$c\n".sprintf("%02X",$i); @@ -100,3 +100,14 @@ for $i (256,128,129,256) ok(Encode::valid_utf8($s),1,"concat of $i botched"); } +# Spot check a few points in/out of utf8 +for my $i (0x41,128,256,0x20AC) + { + my $c = chr($i); + my $o = encode_utf8($c); + ok(decode_utf8($o),$c,"decode_utf8 not inverse of encode_utf8 for $i"); + ok(encode('utf8',$c),$o,"utf8 encode by name broken for $i"); + ok(decode('utf8',$o),$c,"utf8 decode by name broken for $i"); + } + + |