summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChip Salzenberg <chip@pobox.com>2009-08-26 14:33:15 -0700
committerChip Salzenberg <chip@pobox.com>2009-08-26 14:33:15 -0700
commiteaf7a4d2ee7805b14e26e634fba0893913924a6c (patch)
tree4c932a2ae4599ff971a6e2b488ea8208d2510182
parent8ff3e83ba54cff6118a4f9fd629260c21194f290 (diff)
downloadperl-eaf7a4d2ee7805b14e26e634fba0893913924a6c.tar.gz
In C<use utf8; a=>'b'>, do not set utf8 flag on 'a' [perl #68812]
-rw-r--r--embed.fnc1
-rw-r--r--embed.h2
-rw-r--r--global.sym1
-rw-r--r--proto.h5
-rw-r--r--t/op/utfhash.t14
-rw-r--r--toke.c4
-rw-r--r--utf8.c35
7 files changed, 59 insertions, 3 deletions
diff --git a/embed.fnc b/embed.fnc
index 3f9ddcd06f..3e829b68bf 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -481,6 +481,7 @@ ApPR |bool |is_uni_lower_lc|UV c
ApPR |bool |is_uni_print_lc|UV c
ApPR |bool |is_uni_punct_lc|UV c
ApPR |bool |is_uni_xdigit_lc|UV c
+Apd |bool |is_ascii_string|NN const U8 *s|STRLEN len
Apd |STRLEN |is_utf8_char |NN const U8 *s
Apd |bool |is_utf8_string |NN const U8 *s|STRLEN len
Apdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p
diff --git a/embed.h b/embed.h
index 5968fb6228..ba78d6062b 100644
--- a/embed.h
+++ b/embed.h
@@ -363,6 +363,7 @@
#define is_uni_print_lc Perl_is_uni_print_lc
#define is_uni_punct_lc Perl_is_uni_punct_lc
#define is_uni_xdigit_lc Perl_is_uni_xdigit_lc
+#define is_ascii_string Perl_is_ascii_string
#define is_utf8_char Perl_is_utf8_char
#define is_utf8_string Perl_is_utf8_string
#define is_utf8_string_loclen Perl_is_utf8_string_loclen
@@ -2701,6 +2702,7 @@
#define is_uni_print_lc(a) Perl_is_uni_print_lc(aTHX_ a)
#define is_uni_punct_lc(a) Perl_is_uni_punct_lc(aTHX_ a)
#define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a)
+#define is_ascii_string(a,b) Perl_is_ascii_string(aTHX_ a,b)
#define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a)
#define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b)
#define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d)
diff --git a/global.sym b/global.sym
index 115490ac75..a5c9f93ebb 100644
--- a/global.sym
+++ b/global.sym
@@ -220,6 +220,7 @@ Perl_is_uni_lower_lc
Perl_is_uni_print_lc
Perl_is_uni_punct_lc
Perl_is_uni_xdigit_lc
+Perl_is_ascii_string
Perl_is_utf8_char
Perl_is_utf8_string
Perl_is_utf8_string_loc
diff --git a/proto.h b/proto.h
index 0dc4aab456..8c52f5a24d 100644
--- a/proto.h
+++ b/proto.h
@@ -1281,6 +1281,11 @@ PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ UV c)
__attribute__warn_unused_result__
__attribute__pure__;
+PERL_CALLCONV bool Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+ __attribute__nonnull__(pTHX_1);
+#define PERL_ARGS_ASSERT_IS_ASCII_STRING \
+ assert(s)
+
PERL_CALLCONV STRLEN Perl_is_utf8_char(pTHX_ const U8 *s)
__attribute__nonnull__(pTHX_1);
#define PERL_ARGS_ASSERT_IS_UTF8_CHAR \
diff --git a/t/op/utfhash.t b/t/op/utfhash.t
index 32a182635a..a9af502fbf 100644
--- a/t/op/utfhash.t
+++ b/t/op/utfhash.t
@@ -5,7 +5,7 @@ BEGIN {
@INC = '../lib';
require './test.pl';
- plan(tests => 97);
+ plan(tests => 99);
}
use strict;
@@ -196,6 +196,12 @@ __END__
is($hash{тест}, $hash{'тест'});
is($hash{тест}, 123);
is($hash{'тест'}, 123);
+
+ # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+ my %foo = (a => 'b', 'c' => 'd');
+ for my $key (keys %foo) {
+ ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+ }
}
__END__
{
@@ -209,4 +215,10 @@ __END__
is($hash{}, $hash{''});
is($hash{}, 123);
is($hash{''}, 123);
+
+ # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812]
+ my %foo = (a => 'b', 'c' => 'd');
+ for my $key (keys %foo) {
+ ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag";
+ }
}
diff --git a/toke.c b/toke.c
index 24b3c40f80..35ea21816a 100644
--- a/toke.c
+++ b/toke.c
@@ -1384,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
{
dVAR;
SV * const sv = newSVpvn_utf8(start, len,
- UTF && !IN_BYTES
+ !IN_BYTES
+ && UTF
+ && !is_ascii_string((const U8*)start, len)
&& is_utf8_string((const U8*)start, len));
return sv;
}
diff --git a/utf8.c b/utf8.c
index b5a380962a..4bf4705aad 100644
--- a/utf8.c
+++ b/utf8.c
@@ -51,6 +51,38 @@ Unicode characters as a variable number of bytes, in such a way that
characters in the ASCII range are unmodified, and a zero byte never appears
within non-zero characters.
+=cut
+*/
+
+/*
+=for apidoc is_ascii_string
+
+Returns true if first C<len> bytes of the given string are ASCII (i.e. none
+of them even raise the question of UTF-8-ness).
+
+See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+
+=cut
+*/
+
+bool
+Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len)
+{
+ const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* x = s;
+
+ PERL_ARGS_ASSERT_IS_ASCII_STRING;
+ PERL_UNUSED_CONTEXT;
+
+ for (; x < send; ++x) {
+ if (!UTF8_IS_INVARIANT(*x))
+ break;
+ }
+
+ return x == send;
+}
+
+/*
=for apidoc uvuni_to_utf8_flags
Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
@@ -266,6 +298,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s)
return is_utf8_char_slow(s, len);
}
+
/*
=for apidoc is_utf8_string
@@ -274,7 +307,7 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
not mean 'a string that contains code points above 0x7F encoded in UTF-8'
because a valid ASCII string is a valid UTF-8 string.
-See also is_utf8_string_loclen() and is_utf8_string_loc().
+See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
=cut
*/