summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Ing-Simmons <nik@tiuk.ti.com>2001-03-18 14:18:12 +0000
committerNick Ing-Simmons <nik@tiuk.ti.com>2001-03-18 14:18:12 +0000
commit1b026014ba0f5424fabe070eda050db5e7df518a (patch)
tree60a3c121ee35a593bb71b0cfe1bf6288be335630
parentbe4731d2ab91c4f6213bf88a0084f6128a0db383 (diff)
downloadperl-1b026014ba0f5424fabe070eda050db5e7df518a.tar.gz
UTF-X encoding invariance for Encode:
- move Encode::utf8_encode to utf8::encode (likewise decode,upgrade,downgrade,valid) - move the XS code for those to universal.c (so in miniperl) - add utf8::unicode_to_native and its inverse to allow EBCDIC to work in true unicode. - change ext/Encode/compile to use above. - Fix t/lib/encode.t for above - Teach t/lib/b.t to expect -uutf8 - In utf8.c look for SWASHNEW rather than just utf8:: package to see if utf8.pm is needed. p4raw-id: //depot/perlio@9198
-rw-r--r--ext/Encode/Encode.pm19
-rw-r--r--ext/Encode/Encode.xs32
-rwxr-xr-xext/Encode/compile19
-rw-r--r--lib/utf8.pm28
-rwxr-xr-xt/lib/b.t16
-rw-r--r--t/lib/encode.t6
-rw-r--r--universal.c121
-rw-r--r--utf8.c3
8 files changed, 170 insertions, 74 deletions
diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm
index b5ba929a54..fd85520311 100644
--- a/ext/Encode/Encode.pm
+++ b/ext/Encode/Encode.pm
@@ -188,14 +188,14 @@ sub from_to
sub encode_utf8
{
my ($str) = @_;
- utf8_encode($str);
+ utf8::encode($str);
return $str;
}
sub decode_utf8
{
my ($str) = @_;
- return undef unless utf8_decode($str);
+ return undef unless utf8::decode($str);
return $str;
}
@@ -226,14 +226,14 @@ package Encode::Unicode;
use base 'Encode::Encoding';
# Dummy package that provides the encode interface but leaves data
-# as UTF-8 encoded. It is here so that from_to() works.
+# as UTF-X encoded. It is here so that from_to() works.
__PACKAGE__->Define('Unicode');
sub decode
{
my ($obj,$str,$chk) = @_;
- Encode::utf8_upgrade($str);
+ utf8::upgrade($str);
$_[1] = '' if $chk;
return $str;
}
@@ -717,17 +717,6 @@ As such they are efficient, but may change.
=over 4
-=item *
-
- $num_octets = utf8_upgrade($string);
-
-Converts internal representation of string to the UTF-8 form.
-Returns the number of octets necessary to represent the string as UTF-8.
-
-=item * utf8_downgrade($string[, CHECK])
-
-Converts internal representation of string to be un-encoded bytes.
-
=item * is_utf8(STRING [, CHECK])
[INTERNAL] Test whether the UTF-8 flag is turned on in the STRING.
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cdb1965bda..4d62501775 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -433,38 +433,6 @@ encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
return dst;
}
-MODULE = Encode PACKAGE = Encode PREFIX = sv_
-
-void
-valid_utf8(sv)
-SV * sv
-CODE:
- {
- STRLEN len;
- char *s = SvPV(sv,len);
- if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
- XSRETURN_YES;
- else
- XSRETURN_NO;
- }
-
-void
-sv_utf8_encode(sv)
-SV * sv
-
-bool
-sv_utf8_decode(sv)
-SV * sv
-
-STRLEN
-sv_utf8_upgrade(sv)
-SV * sv
-
-bool
-sv_utf8_downgrade(sv,failok=0)
-SV * sv
-bool failok
-
MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_
PROTOTYPES: ENABLE
diff --git a/ext/Encode/compile b/ext/Encode/compile
index 8201043fde..d0611f719f 100755
--- a/ext/Encode/compile
+++ b/ext/Encode/compile
@@ -8,23 +8,16 @@ use Getopt::Std;
my @orig_ARGV = @ARGV;
my $perforce = '$Id$';
-
sub encode_U
{
# UTF-8 encode long hand - only covers part of perl's range
my $uv = shift;
- if ($uv < 0x80)
- {
- return chr($uv)
- }
- if ($uv < 0x800)
- {
- return chr(($uv >> 6) | 0xC0).
- chr(($uv & 0x3F) | 0x80);
- }
- return chr(($uv >> 12) | 0xE0).
- chr((($uv >> 6) & 0x3F) | 0x80).
- chr(($uv & 0x3F) | 0x80);
+ # chr() works in native space so convert value from table
+ # into that space before using chr().
+ my $ch = chr(utf8::unicode_to_native($uv));
+ # Now get core perl to encode that the way it likes.
+ utf8::encode($ch);
+ return $ch;
}
sub encode_S
diff --git a/lib/utf8.pm b/lib/utf8.pm
index f9055b5dd9..7c9a7dfb4e 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -52,7 +52,7 @@ source text. Until UTF-8 becomes the default format for source
text, this pragma should be used to recognize UTF-8 in the source.
When UTF-8 becomes the standard source format, this pragma will
effectively become a no-op. This pragma already is a no-op on
-EBCDIC platforms (where it is alright to code perl in EBCDIC
+EBCDIC platforms (where it is alright to code perl in EBCDIC
rather than UTF-8).
Enabling the C<utf8> pragma has the following effects:
@@ -81,6 +81,32 @@ of byte semantics.
=back
+=head2 Utility functions
+
+The following functions are defined in the C<utf8::> package by the perl core.
+
+=over 4
+
+=item * $num_octets = utf8::upgrade($string);
+
+Converts internal representation of string to the perls internal UTF-X form.
+Returns the number of octets necessary to represent the string as UTF-X.
+
+=item * utf8::downgrade($string[, CHECK])
+
+Converts internal representation of string to be un-encoded bytes.
+
+=item * utf8::encode($string)
+
+Converts (in-place) I<$string> from logical characters to octet sequence
+representing it in perl's UTF-X encoding.
+
+=item * $flag = utf8::decode($string)
+
+Attempts to converts I<$string> in-place from perl's UTF-X encoding into logical characters.
+
+=back
+
=head1 SEE ALSO
L<perlunicode>, L<bytes>
diff --git a/t/lib/b.t b/t/lib/b.t
index 397fdba869..019a1e8437 100755
--- a/t/lib/b.t
+++ b/t/lib/b.t
@@ -2,11 +2,11 @@
BEGIN {
chdir 't' if -d 't';
- if ($^O eq 'MacOS') {
- @INC = qw(: ::lib ::macos:lib);
- } else {
- @INC = '.';
- push @INC, '../lib';
+ if ($^O eq 'MacOS') {
+ @INC = qw(: ::lib ::macos:lib);
+ } else {
+ @INC = '.';
+ push @INC, '../lib';
}
}
@@ -141,7 +141,7 @@ $a =~ s/-u(Cwd|File|File::Copy|OS2),//g if $^O eq 'os2';
$a =~ s/-uCwd,// if $^O eq 'cygwin';
if ($Config{static_ext} eq ' ') {
$b = '-uCarp,-uCarp::Heavy,-uDB,-uExporter,-uExporter::Heavy,-uattributes,'
- . '-umain,-ustrict,-uwarnings';
+ . '-umain,-ustrict,-uutf8,-uwarnings';
if (ord('A') == 193) { # EBCDIC sort order is qw(a A) not qw(A a)
$b = join ',', sort split /,/, $b;
}
@@ -157,7 +157,7 @@ if ($is_thread) {
$a = `$^X $path "-MO=Showlex" -e "my %one" $redir`;
if (ord('A') != 193) { # ASCIIish
print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%one.*sv_undef.*HV/s;
- }
+ }
else { # EBCDICish C<1: PVNV (0x1a7ede34) "%\226\225\205">
print "# [$a]\nnot " unless $a =~ /sv_undef.*PVNV.*%\\[0-9].*sv_undef.*HV/s;
}
@@ -171,6 +171,6 @@ my $foo = $deparse->coderef2text(sub { { 234; }});
print "not " unless $foo =~ /{.*{.*234;.*}.*}/sm;
ok;
$foo = $deparse->coderef2text(sub { { 234; } continue { 123; } });
-print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm;
+print "not " unless $foo =~ /{.*{.*234;.*}.*continue.*{.*123.*}/sm;
ok;
}
diff --git a/t/lib/encode.t b/t/lib/encode.t
index af1f34bb4f..d4a13eeeaa 100644
--- a/t/lib/encode.t
+++ b/t/lib/encode.t
@@ -104,9 +104,9 @@ for my $i (256,128,129,256)
{
my $c = chr($i);
my $s = "$c\n".sprintf("%02X",$i);
- ok(Encode::valid_utf8($s),1,"concat of $i botched");
- Encode::utf8_upgrade($s);
- ok(Encode::valid_utf8($s),1,"concat of $i botched");
+ ok(utf8::valid($s),1,"concat of $i botched");
+ utf8::upgrade($s);
+ ok(utf8::valid($s),1,"concat of $i botched");
}
# Spot check a few points in/out of utf8
diff --git a/universal.c b/universal.c
index 12d31e58b1..3e14a68bd7 100644
--- a/universal.c
+++ b/universal.c
@@ -130,9 +130,18 @@ Perl_sv_derived_from(pTHX_ SV *sv, const char *name)
: FALSE ;
}
+#include "XSUB.h"
+
void XS_UNIVERSAL_isa(pTHXo_ CV *cv);
void XS_UNIVERSAL_can(pTHXo_ CV *cv);
void XS_UNIVERSAL_VERSION(pTHXo_ CV *cv);
+XS(XS_utf8_valid);
+XS(XS_utf8_encode);
+XS(XS_utf8_decode);
+XS(XS_utf8_upgrade);
+XS(XS_utf8_downgrade);
+XS(XS_utf8_unicode_to_native);
+XS(XS_utf8_native_to_unicode);
void
Perl_boot_core_UNIVERSAL(pTHX)
@@ -142,9 +151,15 @@ Perl_boot_core_UNIVERSAL(pTHX)
newXS("UNIVERSAL::isa", XS_UNIVERSAL_isa, file);
newXS("UNIVERSAL::can", XS_UNIVERSAL_can, file);
newXS("UNIVERSAL::VERSION", XS_UNIVERSAL_VERSION, file);
+ newXS("utf8::valid", XS_utf8_valid, file);
+ newXS("utf8::encode", XS_utf8_encode, file);
+ newXS("utf8::decode", XS_utf8_decode, file);
+ newXS("utf8::upgrade", XS_utf8_upgrade, file);
+ newXS("utf8::downgrade", XS_utf8_downgrade, file);
+ newXS("utf8::native_to_unicode", XS_utf8_native_to_unicode, file);
+ newXS("utf8::unicode_to_native", XS_utf8_unicode_to_native, file);
}
-#include "XSUB.h"
XS(XS_UNIVERSAL_isa)
{
@@ -299,3 +314,107 @@ finish:
XSRETURN(1);
}
+XS(XS_utf8_valid)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::valid(sv)");
+ {
+ SV * sv = ST(0);
+ {
+ STRLEN len;
+ char *s = SvPV(sv,len);
+ if (!SvUTF8(sv) || is_utf8_string((U8*)s,len))
+ XSRETURN_YES;
+ else
+ XSRETURN_NO;
+ }
+ }
+ XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_encode)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::encode(sv)");
+ {
+ SV * sv = ST(0);
+
+ sv_utf8_encode(sv);
+ }
+ XSRETURN_EMPTY;
+}
+
+XS(XS_utf8_decode)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::decode(sv)");
+ {
+ SV * sv = ST(0);
+ bool RETVAL;
+
+ RETVAL = sv_utf8_decode(sv);
+ ST(0) = boolSV(RETVAL);
+ sv_2mortal(ST(0));
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_upgrade)
+{
+ dXSARGS;
+ if (items != 1)
+ Perl_croak(aTHX_ "Usage: utf8::upgrade(sv)");
+ {
+ SV * sv = ST(0);
+ STRLEN RETVAL;
+ dXSTARG;
+
+ RETVAL = sv_utf8_upgrade(sv);
+ XSprePUSH; PUSHi((IV)RETVAL);
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_downgrade)
+{
+ dXSARGS;
+ if (items < 1 || items > 2)
+ Perl_croak(aTHX_ "Usage: utf8::downgrade(sv, failok=0)");
+ {
+ SV * sv = ST(0);
+ bool failok;
+ bool RETVAL;
+
+ if (items < 2)
+ failok = 0;
+ else {
+ failok = (int)SvIV(ST(1));
+ }
+
+ RETVAL = sv_utf8_downgrade(sv, failok);
+ ST(0) = boolSV(RETVAL);
+ sv_2mortal(ST(0));
+ }
+ XSRETURN(1);
+}
+
+XS(XS_utf8_native_to_unicode)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(NATIVE_TO_UNI(uv)));
+ XSRETURN(1);
+}
+
+XS(XS_utf8_unicode_to_native)
+{
+ dXSARGS;
+ UV uv = SvUV(ST(0));
+ ST(0) = sv_2mortal(newSViv(UNI_TO_NATIVE(uv)));
+ XSRETURN(1);
+}
+
+
diff --git a/utf8.c b/utf8.c
index 7ca3cc7bd0..81fb44db35 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1282,8 +1282,9 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
SV* retval;
SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
dSP;
+ HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
- if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */
+ if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ENTER;
Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
LEAVE;