summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-03-22 04:07:13 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-03-22 04:07:13 +0000
commit574c8022b1fdc7312bf9a5af037c8f777b60b6db (patch)
tree06b4317b44c20a0a8683822193a3359385f3c9bf
parent3fbcfac442ddabdaab668242ba16ca26c5edd56c (diff)
downloadperl-574c8022b1fdc7312bf9a5af037c8f777b60b6db.tar.gz
If Unicode keys are entered to a hash, a bit is turned on.
If the bit is on, when the keys are fetched from the hash (%h, each %h, keys %h), the Unicodified versions of the keys are returned if needed. This solution errs on the size of over-Unicodifying, the old solution erred on the side of under-Unicodifying. As long as the hash keys can be a mix of byte and Unicode strings, a perfect fit is hard to come by. p4raw-id: //depot/perl@15407
-rw-r--r--doop.c15
-rw-r--r--dump.c1
-rw-r--r--ext/Devel/Peek/Peek.t8
-rw-r--r--hv.c16
-rw-r--r--hv.h5
-rw-r--r--pod/perlunicode.pod32
-rw-r--r--pp.c12
-rw-r--r--sv.h1
-rwxr-xr-xt/op/pat.t35
9 files changed, 95 insertions, 30 deletions
diff --git a/doop.c b/doop.c
index e2faa87426..20379a98e5 100644
--- a/doop.c
+++ b/doop.c
@@ -1336,8 +1336,19 @@ Perl_do_kv(pTHX)
PUTBACK; /* hv_iternext and hv_iterval might clobber stack_sp */
while ((entry = hv_iternext(keys))) {
SPAGAIN;
- if (dokeys)
- XPUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */
+ if (dokeys) {
+ SV* sv = hv_iterkeysv(entry);
+ if (HvUTF8KEYS((SV*)hv) && !DO_UTF8(sv)) {
+ STRLEN len, i;
+ char* s = SvPV(sv, len);
+ for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+ if (i < len) {
+ sv = newSVsv(sv);
+ sv_utf8_upgrade(sv);
+ }
+ }
+ XPUSHs(sv); /* won't clobber stack_sp */
+ }
if (dovalues) {
PUTBACK;
tmpstr = realhv ?
diff --git a/dump.c b/dump.c
index b4b37bbd63..48a3b38468 100644
--- a/dump.c
+++ b/dump.c
@@ -980,6 +980,7 @@ Perl_do_sv_dump(pTHX_ I32 level, PerlIO *file, SV *sv, I32 nest, I32 maxnest, bo
case SVt_PVHV:
if (HvSHAREKEYS(sv)) sv_catpv(d, "SHAREKEYS,");
if (HvLAZYDEL(sv)) sv_catpv(d, "LAZYDEL,");
+ if (HvUTF8KEYS(sv)) sv_catpv(d, "UTF8,");
break;
case SVt_PVGV:
if (GvINTRO(sv)) sv_catpv(d, "INTRO,");
diff --git a/ext/Devel/Peek/Peek.t b/ext/Devel/Peek/Peek.t
index bd42d93c58..f577369880 100644
--- a/ext/Devel/Peek/Peek.t
+++ b/ext/Devel/Peek/Peek.t
@@ -347,8 +347,8 @@ do_test(19,
RV = $ADDR
SV = PVHV\\($ADDR\\) at $ADDR
REFCNT = 2
- FLAGS = \\(SHAREKEYS\\)
- IV = 1
+ FLAGS = \\(SHAREKEYS,UTF8\\)
+ UV = 1
NV = $FLOAT
ARRAY = $ADDR \\(0:7, 1:1\\)
hash quality = 100.0%
@@ -373,8 +373,8 @@ do_test(19,
RV = $ADDR
SV = PVHV\\($ADDR\\) at $ADDR
REFCNT = 2
- FLAGS = \\(SHAREKEYS\\)
- IV = 1
+ FLAGS = \\(SHAREKEYS,UTF8\\)
+ UV = 1
NV = 0
ARRAY = $ADDR \\(0:7, 1:1\\)
hash quality = 100.0%
diff --git a/hv.c b/hv.c
index 41aa8bbe54..f92e31e427 100644
--- a/hv.c
+++ b/hv.c
@@ -488,11 +488,13 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
#endif
}
}
+
if (is_utf8) {
STRLEN tmplen = klen;
/* See the note in hv_fetch(). --jhi */
key = (char*)bytes_from_utf8((U8*)key, &tmplen, &is_utf8);
klen = tmplen;
+ HvUTF8KEYS_on((SV*)hv);
}
if (!hash)
@@ -615,8 +617,10 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
- if (is_utf8)
+ if (is_utf8) {
key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+ HvUTF8KEYS_on((SV*)hv);
+ }
if (!hash)
PERL_HASH(hash, key, klen);
@@ -773,6 +777,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
xhv->xhv_placeholders--;
return Nullsv;
}
@@ -810,6 +816,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
}
return sv;
}
@@ -920,6 +928,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
xhv->xhv_placeholders--;
return Nullsv;
}
@@ -956,6 +966,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
else
hv_free_ent(hv, entry);
xhv->xhv_keys--; /* HvKEYS(hv)-- */
+ if (xhv->xhv_keys == 0)
+ HvUTF8KEYS_off(hv);
}
return sv;
}
@@ -1478,6 +1490,8 @@ Perl_hv_clear(pTHX_ HV *hv)
if (SvRMAGICAL(hv))
mg_clear((SV*)hv);
+
+ HvUTF8KEYS_off(hv);
}
STATIC void
diff --git a/hv.h b/hv.h
index 369bf3c549..3d5107500f 100644
--- a/hv.h
+++ b/hv.h
@@ -159,11 +159,14 @@ C<SV*>.
#define HvTOTALKEYS(hv) XHvTOTALKEYS((XPVHV*) SvANY(hv))
#define HvPLACEHOLDERS(hv) XHvPLACEHOLDERS((XPVHV*) SvANY(hv))
-
#define HvSHAREKEYS(hv) (SvFLAGS(hv) & SVphv_SHAREKEYS)
#define HvSHAREKEYS_on(hv) (SvFLAGS(hv) |= SVphv_SHAREKEYS)
#define HvSHAREKEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_SHAREKEYS)
+#define HvUTF8KEYS(hv) (SvFLAGS(hv) & SVphv_UTF8KEYS)
+#define HvUTF8KEYS_on(hv) (SvFLAGS(hv) |= SVphv_UTF8KEYS)
+#define HvUTF8KEYS_off(hv) (SvFLAGS(hv) &= ~SVphv_UTF8KEYS)
+
#define HvLAZYDEL(hv) (SvFLAGS(hv) & SVphv_LAZYDEL)
#define HvLAZYDEL_on(hv) (SvFLAGS(hv) |= SVphv_LAZYDEL)
#define HvLAZYDEL_off(hv) (SvFLAGS(hv) &= ~SVphv_LAZYDEL)
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 4cb83252f0..9ba32ee3e0 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -113,8 +113,8 @@ Character semantics have the following effects:
=item *
-Strings and patterns may contain characters that have an ordinal value
-larger than 255.
+Strings (including hash keys) and regular expression patterns may
+contain characters that have an ordinal value larger than 255.
If you use a Unicode editor to edit your program, Unicode characters
may occur directly within the literal strings in one of the various
@@ -128,18 +128,20 @@ hexadecimal, into the curlies. For instance, a smiley face is C<\x{263A}>.
This works only for characters with a code 0x100 and above.
Additionally, if you
+
use charnames ':full';
+
you can use the C<\N{...}> notation, putting the official Unicode character
name within the curlies. For example, C<\N{WHITE SMILING FACE}>.
This works for all characters that have names.
=item *
-If an appropriate L<encoding> is specified,
-identifiers within the Perl script may contain Unicode alphanumeric
-characters, including ideographs. (You are currently on your own when
-it comes to using the canonical forms of characters--Perl doesn't
-(yet) attempt to canonicalize variable names for you.)
+If an appropriate L<encoding> is specified, identifiers within the
+Perl script may contain Unicode alphanumeric characters, including
+ideographs. (You are currently on your own when it comes to using the
+canonical forms of characters--Perl doesn't (yet) attempt to
+canonicalize variable names for you.)
=item *
@@ -846,8 +848,7 @@ B<any subsequent file open>, is UTF-8.
Perl tries really hard to work both with Unicode and the old byte
oriented world: most often this is nice, but sometimes this causes
-problems. See L</BUGS> for example how sometimes using locales
-with Unicode can help with these problems.
+problems.
=back
@@ -959,19 +960,10 @@ Use of locales with Unicode data may lead to odd results. Currently
there is some attempt to apply 8-bit locale info to characters in the
range 0..255, but this is demonstrably incorrect for locales that use
characters above that range when mapped into Unicode. It will also
-tend to run slower. Avoidance of locales is strongly encouraged,
-with one known expection, see the next paragraph.
-
-If the keys of a hash are "mixed", that is, some keys are Unicode,
-while some keys are "byte", the keys may behave differently in regular
-expressions since the definition of character classes like C</\w/>
-is different for byte strings and character strings. This problem can
-sometimes be helped by using an appropriate locale (see L<perllocale>).
-Another way is to force all the strings to be character encoded by
-using utf8::upgrade() (see L<utf8>).
+tend to run slower. Use of locales with Unicode is discouraged.
Some functions are slower when working on UTF-8 encoded strings than
-on byte encoded strings. All functions that need to hop over
+on byte encoded strings. All functions that need to hop over
characters such as length(), substr() or index() can work B<much>
faster when the underlying data are byte-encoded. Witness the
following benchmark:
diff --git a/pp.c b/pp.c
index 15bf3515b0..757b4f0984 100644
--- a/pp.c
+++ b/pp.c
@@ -3686,7 +3686,17 @@ PP(pp_each)
EXTEND(SP, 2);
if (entry) {
- PUSHs(hv_iterkeysv(entry)); /* won't clobber stack_sp */
+ SV* sv = hv_iterkeysv(entry);
+ if (HvUTF8KEYS((SV*)hash) && !DO_UTF8(sv)) {
+ STRLEN len, i;
+ char* s = SvPV(sv, len);
+ for (i = 0; i < len && NATIVE_IS_INVARIANT(s[i]); i++);
+ if (i < len) {
+ sv = newSVsv(sv);
+ sv_utf8_upgrade(sv);
+ }
+ }
+ PUSHs(sv); /* won't clobber stack_sp */
if (gimme == G_ARRAY) {
SV *val;
PUTBACK;
diff --git a/sv.h b/sv.h
index b9567685b0..9671bd7210 100644
--- a/sv.h
+++ b/sv.h
@@ -235,6 +235,7 @@ perform the upgrade if necessary. See C<svtype>.
#define SVphv_SHAREKEYS 0x20000000 /* keys live on shared string table */
#define SVphv_LAZYDEL 0x40000000 /* entry in xhv_eiter must be deleted */
+#define SVphv_UTF8KEYS 0x80000000 /* keys when fetched are UTF8 */
#define SVprv_WEAKREF 0x80000000 /* Weak reference */
diff --git a/t/op/pat.t b/t/op/pat.t
index b5dff4b7e3..001a5b0ac5 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -6,7 +6,7 @@
$| = 1;
-print "1..892\n";
+print "1..903\n";
BEGIN {
chdir 't' if -d 't';
@@ -2771,3 +2771,36 @@ print "# some Unicode properties\n";
++$test;
}
}
+
+
+{
+ my $test = 893;
+
+ print "# Unicode hash keys and \\w\n";
+ # This is not really a regex test but regexes bring
+ # out the issue nicely.
+ use strict;
+ my $u3 = "f\x{df}\x{100}";
+ my $u2 = substr($u3,0,2);
+ my $u1 = substr($u2,0,1);
+ my %u = ( $u1 => $u1, $u2 => $u2, $u3 => $u3 );
+
+ for (keys %u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+
+ for (each %u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+
+ for (%u) {
+ print /^\w+$/ && $u{$_} =~ /^\w+$/ ?
+ "ok $test\n" : "not ok $test\n";
+ $test++;
+ }
+}
+