diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2000-09-13 14:18:39 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-09-13 14:18:39 +0000 |
commit | 2c674647e4cd32dc05dd268c2de5090815265695 (patch) | |
tree | 85789e2b80dd80aa9f617e9758594d1f4c5deef7 | |
parent | d0d1d9b9406c0402a9e154640146d087c367cd8a (diff) | |
download | perl-2c674647e4cd32dc05dd268c2de5090815265695.tar.gz |
Add the Encode extension. The code is still largely just skeleton.
p4raw-id: //depot/perl@7068
-rw-r--r-- | MANIFEST | 4 | ||||
-rw-r--r-- | Todo-5.6 | 19 | ||||
-rw-r--r-- | ext/Encode/Encode.pm | 329 | ||||
-rw-r--r-- | ext/Encode/Encode.xs | 169 | ||||
-rw-r--r-- | ext/Encode/Makefile.PL | 11 | ||||
-rw-r--r-- | ext/Encode/Todo | 49 |
6 files changed, 573 insertions, 8 deletions
@@ -188,6 +188,10 @@ ext/DynaLoader/hints/aix.pl Hint for DynaLoader for named architecture ext/DynaLoader/hints/linux.pl Hint for DynaLoader for named architecture ext/DynaLoader/hints/netbsd.pl Hint for DynaLoader for named architecture ext/DynaLoader/hints/openbsd.pl Hint for DynaLoader for named architecture +ext/Encode/Encode.pm Encode extension +ext/Encode/Encode.xs Encode extension +ext/Encode/Makefile.PL Encode extension +ext/Encode/Todo Encode extension ext/Errno/ChangeLog Errno perl module change log ext/Errno/Errno_pm.PL Errno perl module create script ext/Errno/Makefile.PL Errno extension makefile writer @@ -16,6 +16,17 @@ Unicode support to work similarly to Unicode tech reports and Java notation \uXXXX (and already existing \x{XXXX))? more than four hexdigits? make also \U+XXXX work? + + see ext/Encode/Todo for notes and references about proper detection + of malformed UTF-8 + + SCSU? http://www.unicode.org/unicode/reports/tr6/ + Collation? http://www.unicode.org/unicode/reports/tr10/ + Normalization? http://www.unicode.org/unicode/reports/tr15/ + EBCDIC? http://www.unicode.org/unicode/reports/tr16/ + Regexes? http://www.unicode.org/unicode/reports/tr18/ + Case Mappings? http://www.unicode.org/unicode/reports/tr21/ + See also "Locales", "Regexen", and "Miscellaneous". Multi-threading @@ -156,14 +167,6 @@ Miscellaneous sendmsg, recvmsg? (Configure doesn't probe for these but the units exist) setitimer, getitimer? (the metaconfig units exist) -Unicode - SCSU? http://www.unicode.org/unicode/reports/tr6/ - Collation? http://www.unicode.org/unicode/reports/tr10/ - Normalization? http://www.unicode.org/unicode/reports/tr15/ - EBCDIC? http://www.unicode.org/unicode/reports/tr16/ - Regexes? http://www.unicode.org/unicode/reports/tr18/ - Case Mappings? 
http://www.unicode.org/unicode/reports/tr21/ - Ongoing keep filenames 8.3 friendly, where feasible upgrade to newer versions of all independently maintained modules diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm new file mode 100644 index 0000000000..57677e690b --- /dev/null +++ b/ext/Encode/Encode.pm @@ -0,0 +1,329 @@ +package Encode; + +$VERSION = 0.01; + +require DynaLoader; +require Exporter; + +@ISA = qw(Exporter DynaLoader); + +@EXPORT_OK = + qw( + bytes_to_utf8 + utf8_to_bytes + chars_to_utf8 + utf8_to_chars + utf8_to_chars_check + bytes_to_chars + chars_to_bytes + from_to + is_utf8 + on_utf8 + off_utf8 + utf_to_utf + ); + +bootstrap Encode (); + +=pod + +=head1 NAME + +Encode - character encodings + +=head2 TERMINOLOGY + +=over + +=item * + +I<char>: a character in the range 0..maxint (at least 2**32-1) + +=item * + +I<byte>: a character in the range 0..255 + +=back + +The marker [INTERNAL] marks Internal Implementation Details, in +general meant only for those who think they know what they are doing, +and such details may change in future releases. + +=head2 bytes + +=over 4 + +=item * + + bytes_to_utf8(STRING [, FROM]) + +The bytes in STRING are recoded in-place into UTF-8. If no FROM is +specified the bytes are expected to be encoded in US-ASCII or ISO +8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if +there's a failure. + +[INTERNAL] Also the UTF-8 flag of STRING is turned on. + +=item * + + utf8_to_bytes(STRING [, TO [, CHECK]]) + +The UTF-8 in STRING is decoded in-place into bytes. If no TO encoding +is specified the bytes are expected to be encoded in US-ASCII or ISO +8859-1 (Latin 1). Returns the new size of STRING, or C<undef> if +there's a failure. + +What if there are characters > 255? What if the UTF-8 in STRING is +malformed? See L</"Handling Malformed Data">. + +[INTERNAL] The UTF-8 flag of STRING is not checked. 
+ +=back + +=head2 chars + +=over 4 + +=item * + + chars_to_utf8(STRING) + +The chars in STRING are encoded in-place into UTF-8. Returns the new +size of STRING, or C<undef> if there's a failure. + +No assumptions are made on the encoding of the chars. If you want to +assume that the chars are Unicode and to trap illegal Unicode +characters, you must use C<from_to('Unicode', ...)>. + +[INTERNAL] Also the UTF-8 flag of STRING is turned on. + +=over 4 + +=item * + + utf8_to_chars(STRING) + +The UTF-8 in STRING is decoded in-place into chars. Returns the new +size of STRING, or C<undef> if there's a failure. + +If the UTF-8 in STRING is malformed, C<undef> is returned, and also an +optional lexical warning (category utf8) is given. + +[INTERNAL] The UTF-8 flag of STRING is not checked. + +=item * + + utf8_to_chars_check(STRING [, CHECK]) + +(Note the special naming of this interface: a two-argument +utf8_to_chars() has different semantics.) + +The UTF-8 in STRING is decoded in-place into chars. Returns the new +size of STRING, or C<undef> if there is a failure. + +What if the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. + +[INTERNAL] The UTF-8 flag of STRING is not checked. + +=back + +=head2 chars With Encoding + +=over 4 + +=item * + + chars_to_utf8(STRING, FROM [, CHECK]) + +The chars in STRING encoded in FROM are recoded in-place into UTF-8. +Returns the new size of STRING, or C<undef> if there's a failure. + +No assumptions are made on the encoding of the chars. If you want to +assume that the chars are Unicode and to trap illegal Unicode +characters, you must use C<from_to('Unicode', ...)>. + +[INTERNAL] Also the UTF-8 flag of STRING is turned on. + +=item * + + utf8_to_chars(STRING, TO [, CHECK]) + +The UTF-8 in STRING is decoded in-place into chars encoded in TO. +Returns the new size of STRING, or C<undef> if there's a failure. + +What if the UTF-8 in STRING is malformed? See L</"Handling Malformed Data">. 
+ +[INTERNAL] The UTF-8 flag of STRING is not checked. + +=item * + + bytes_to_chars(STRING, FROM [, CHECK]) + +The bytes in STRING encoded in FROM are recoded in-place into chars. +Returns the new size of STRING in bytes, or C<undef> if there's a +failure. + +What if the mapping is impossible? See L</"Handling Malformed Data">. + +=item * + + chars_to_bytes(STRING, TO [, CHECK]) + +The chars in STRING are recoded in-place to bytes encoded in TO. +Returns the new size of STRING in bytes, or C<undef> if there's a +failure. + +What if the mapping is impossible? See L</"Handling Malformed Data">. + +=item * + + from_to(STRING, FROM, TO [, CHECK]) + +The chars in STRING encoded in FROM are recoded in-place into TO. +Returns the new size of STRING, or C<undef> if there's a failure. + +What if mapping between the encodings is impossible? +See L</"Handling Malformed Data">. + +[INTERNAL] If TO is UTF-8, also the UTF-8 flag of STRING is turned on. + +=back + +=head2 Testing For UTF-8 + +=over 4 + +=item * + + is_utf8(STRING [, CHECK]) + +[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING. +If CHECK is true, also checks the data in STRING for being +well-formed UTF-8. Returns true if successful, false otherwise. + +=back + +=head2 Toggling UTF-8-ness + +=over 4 + +=item * + + on_utf8(STRING) + +[INTERNAL] Turn on the UTF-8 flag in STRING. The data in STRING is +B<not> checked for being well-formed UTF-8. Do not use unless you +B<know> that the STRING is well-formed UTF-8. Returns the previous +state of the UTF-8 flag (so please do I<not> treat the return value as +success or failure), or C<undef> if STRING is not a string. + +=item * + + off_utf8(STRING) + +[INTERNAL] Turn off the UTF-8 flag in STRING. Do not use frivolously. +Returns the previous state of the UTF-8 flag (so please do I<not> treat +the return value as success or failure), or C<undef> if STRING is +not a string. 
+ +=back + +=head2 UTF-16 and UTF-32 Encodings + +=over 4 + +=item * + + utf_to_utf(STRING, FROM, TO [, CHECK]) + +The data in STRING is converted from Unicode Transformation Format FROM to +Unicode Transformation Format TO. Both FROM and TO may be any of the +following tags (case-insensitive, with or without 'utf' or 'utf-' prefix): + + tag meaning + + '7' UTF-7 + '8' UTF-8 + '16be' UTF-16 big-endian + '16le' UTF-16 little-endian + '16' UTF-16 native-endian + '32be' UTF-32 big-endian + '32le' UTF-32 little-endian + '32' UTF-32 native-endian + +UTF-16 is also known as UCS-2, 16-bit or 2-byte chunks, and UTF-32 as +UCS-4, 32-bit or 4-byte chunks. Returns the new size of STRING, or +C<undef> if there's a failure. + +What if FROM is UTF-8 and the UTF-8 in STRING is malformed? See +L</"Handling Malformed Data">. + +[INTERNAL] Even if CHECK is true and FROM is UTF-8, the UTF-8 flag of +STRING is not checked. If TO is UTF-8, also the UTF-8 flag of STRING is +turned on. Identical FROM and TO are fine. + +=back + +=head2 Handling Malformed Data + +If CHECK is not set, C<undef> is returned. If the data is supposed to +be UTF-8, an optional lexical warning (category utf8) is given. If +CHECK is true but not a code reference, the function dies. If CHECK is a code +reference, it is called with the arguments + + (MALFORMED_STRING, STRING_FROM_SO_FAR, STRING_TO_SO_FAR) + +Two return values are expected from the call: the string to be used in +the result string in place of the malformed section, and the length of +the malformed section in bytes. 
+ +=cut + +sub bytes_to_utf8 { + &_bytes_to_utf8; # "&NAME;" passes the caller's @_ through unchanged +} + +sub utf8_to_bytes { + &_utf8_to_bytes; +} + +sub chars_to_utf8 { + &_chars_to_utf8; # XS routine is _chars_to_utf8 (was the undefined C_to_utf8) +} + +sub utf8_to_chars { + &_utf8_to_chars; +} + +sub utf8_to_chars_check { + &_utf8_to_chars_check; +} + +sub bytes_to_chars { + &_bytes_to_chars; +} + +sub chars_to_bytes { + &_chars_to_bytes; +} + +sub from_to { + &_from_to; +} + +sub is_utf8 { + &_is_utf8; +} + +sub on_utf8 { + &_on_utf8; +} + +sub off_utf8 { + &_off_utf8; +} + +sub utf_to_utf { + &_utf_to_utf; +} + diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs new file mode 100644 index 0000000000..cc0a86a117 --- /dev/null +++ b/ext/Encode/Encode.xs @@ -0,0 +1,169 @@ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +MODULE = Encode PACKAGE = Encode + +PROTOTYPES: ENABLE + +SV * +_bytes_to_utf8(sv, ...) + SV * sv + CODE: + { + SV * encoding = items > 1 ? ST(1) : Nullsv; /* optional FROM; only read ST(1) if it was passed */ + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_utf8_to_bytes(sv, ...) + SV * sv + CODE: + { + SV * to = items > 1 ? ST(1) : Nullsv; + SV * check = items > 2 ? ST(2) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_chars_to_utf8(sv, from, ...) + SV * sv + SV * from + CODE: + { + SV * check = items == 3 ? ST(2) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_utf8_to_chars(sv, to, ...) + SV * sv + SV * to + CODE: + { + SV * check = items == 3 ? ST(2) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_utf8_to_chars_check(sv, ...) + SV * sv + CODE: + { + SV * check = items == 2 ? ST(1) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_bytes_to_chars(sv, from, ...) + SV * sv + SV * from + CODE: + { + SV * check = items == 3 ? ST(2) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_chars_to_bytes(sv, to, ...) + SV * sv + SV * to + CODE: + { + SV * check = items == 3 ? ST(2) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +SV * +_from_to(sv, from, to, ...) 
+ SV * sv + SV * from + SV * to + CODE: + { + SV * check = items == 4 ? ST(3) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + +bool +_is_utf8(sv, ...) + SV * sv + CODE: + { + SV * check = items == 2 ? ST(1) : Nullsv; + if (SvPOK(sv)) { + RETVAL = SvUTF8(sv) ? TRUE : FALSE; /* normalize the flag mask so a narrow bool cannot truncate it to 0 */ + if (RETVAL && + SvTRUE(check) && + !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) + RETVAL = FALSE; + } else { + RETVAL = FALSE; + } + } + OUTPUT: + RETVAL + +SV * +_on_utf8(sv) + SV * sv + CODE: + { + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + /* no sv_2mortal() here: the SV * typemap already mortalizes RETVAL */ + RETVAL = rsv; + SvUTF8_on(sv); + } else { + RETVAL = &PL_sv_undef; + } + } + OUTPUT: + RETVAL + +SV * +_off_utf8(sv) + SV * sv + CODE: + { + if (SvPOK(sv)) { + SV *rsv = newSViv(SvUTF8(sv)); + /* no sv_2mortal() here: the SV * typemap already mortalizes RETVAL */ + RETVAL = rsv; + SvUTF8_off(sv); + } else { + RETVAL = &PL_sv_undef; + } + } + OUTPUT: + RETVAL + +SV * +_utf_to_utf(sv, from, to, ...) + SV * sv + SV * from + SV * to + CODE: + { + SV * check = items == 4 ? ST(3) : Nullsv; + RETVAL = &PL_sv_undef; + } + OUTPUT: + RETVAL + diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL new file mode 100644 index 0000000000..329937e0e2 --- /dev/null +++ b/ext/Encode/Makefile.PL @@ -0,0 +1,11 @@ +use ExtUtils::MakeMaker; +WriteMakefile( + NAME => "Encode", + VERSION_FROM => 'Encode.pm', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, +); diff --git a/ext/Encode/Todo b/ext/Encode/Todo new file mode 100644 index 0000000000..c59622b462 --- /dev/null +++ b/ext/Encode/Todo @@ -0,0 +1,49 @@ +Use Markus Kuhn's UTF-8 Decode Stress Tester at + + http://www.cl.cam.ac.uk/~mgk25/ucs/examples/ + +Markus: +> +> What exactly is malformed UTF-8 data here? +> +> Obviously at least everything listed in section R.7 of ISO 10646-1/Amd.2. +> +> Does it also cover overlong UTF-8 sequences, i.e. 
any string +> containing any of the five bit sequences +> +> 1100000x, +> 11100000 100xxxxx, +> 11110000 1000xxxx, +> 11111000 10000xxx, +> 11111100 100000xx +> +> Does it also cover UTF-8 encoded code positions U+D800 to U+DFFF (UTF-16 +> surrogates) as well as U+FFFE (anti-BOM) and U+FFFF, all of which must +> not occur in proper UTF-8 and UTF-32 data according to the standard +> (see note 3 in section R.4 of UCS)? +> +> It might be useful, if the spec were clearer here. +> +> References: +> +> - ISO/IEC 10646-1:1993(E), Amd. 2, +> http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html +> +> - http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 +> + +Markus: +> +> It is commonly considered to be good practice to reject at least +> overlong UTF-8 sequences, otherwise one permits multiple encodings for +> characters, which makes pattern matching far more difficult in +> applications where strings are processed in both coded and decoded form. +> It has been argued that this could easily lead to security +> vulnerabilities. See +> +> http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 +> http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt (section 4) +> ftp://sunsite.doc.ic.ac.uk/packages/rfc/rfc2279.txt (section 6) +> +> for a brief discussion. +> |