diff options
author | Chip Salzenberg <chip@pobox.com> | 2009-08-26 14:33:15 -0700 |
---|---|---|
committer | Chip Salzenberg <chip@pobox.com> | 2009-08-26 14:33:15 -0700 |
commit | eaf7a4d2ee7805b14e26e634fba0893913924a6c (patch) | |
tree | 4c932a2ae4599ff971a6e2b488ea8208d2510182 | |
parent | 8ff3e83ba54cff6118a4f9fd629260c21194f290 (diff) | |
download | perl-eaf7a4d2ee7805b14e26e634fba0893913924a6c.tar.gz |
In C<use utf8; a=>'b'>, do not set utf8 flag on 'a' [perl #68812]
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | global.sym | 1 | ||||
-rw-r--r-- | proto.h | 5 | ||||
-rw-r--r-- | t/op/utfhash.t | 14 | ||||
-rw-r--r-- | toke.c | 4 | ||||
-rw-r--r-- | utf8.c | 35 |
7 files changed, 59 insertions, 3 deletions
@@ -481,6 +481,7 @@ ApPR |bool |is_uni_lower_lc|UV c ApPR |bool |is_uni_print_lc|UV c ApPR |bool |is_uni_punct_lc|UV c ApPR |bool |is_uni_xdigit_lc|UV c +Apd |bool |is_ascii_string|NN const U8 *s|STRLEN len Apd |STRLEN |is_utf8_char |NN const U8 *s Apd |bool |is_utf8_string |NN const U8 *s|STRLEN len Apdmb |bool |is_utf8_string_loc|NN const U8 *s|STRLEN len|NULLOK const U8 **p @@ -363,6 +363,7 @@ #define is_uni_print_lc Perl_is_uni_print_lc #define is_uni_punct_lc Perl_is_uni_punct_lc #define is_uni_xdigit_lc Perl_is_uni_xdigit_lc +#define is_ascii_string Perl_is_ascii_string #define is_utf8_char Perl_is_utf8_char #define is_utf8_string Perl_is_utf8_string #define is_utf8_string_loclen Perl_is_utf8_string_loclen @@ -2701,6 +2702,7 @@ #define is_uni_print_lc(a) Perl_is_uni_print_lc(aTHX_ a) #define is_uni_punct_lc(a) Perl_is_uni_punct_lc(aTHX_ a) #define is_uni_xdigit_lc(a) Perl_is_uni_xdigit_lc(aTHX_ a) +#define is_ascii_string(a,b) Perl_is_ascii_string(aTHX_ a,b) #define is_utf8_char(a) Perl_is_utf8_char(aTHX_ a) #define is_utf8_string(a,b) Perl_is_utf8_string(aTHX_ a,b) #define is_utf8_string_loclen(a,b,c,d) Perl_is_utf8_string_loclen(aTHX_ a,b,c,d) diff --git a/global.sym b/global.sym index 115490ac75..a5c9f93ebb 100644 --- a/global.sym +++ b/global.sym @@ -220,6 +220,7 @@ Perl_is_uni_lower_lc Perl_is_uni_print_lc Perl_is_uni_punct_lc Perl_is_uni_xdigit_lc +Perl_is_ascii_string Perl_is_utf8_char Perl_is_utf8_string Perl_is_utf8_string_loc @@ -1281,6 +1281,11 @@ PERL_CALLCONV bool Perl_is_uni_xdigit_lc(pTHX_ UV c) __attribute__warn_unused_result__ __attribute__pure__; +PERL_CALLCONV bool Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_IS_ASCII_STRING \ + assert(s) + PERL_CALLCONV STRLEN Perl_is_utf8_char(pTHX_ const U8 *s) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_IS_UTF8_CHAR \ diff --git a/t/op/utfhash.t b/t/op/utfhash.t index 32a182635a..a9af502fbf 100644 --- a/t/op/utfhash.t +++ b/t/op/utfhash.t @@ -5,7 +5,7 @@ BEGIN { @INC = '../lib'; require './test.pl'; - plan(tests => 97); + plan(tests => 99); } use strict; @@ -196,6 +196,12 @@ __END__ is($hash{тест}, $hash{'тест'}); is($hash{тест}, 123); is($hash{'тест'}, 123); + + # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812] + my %foo = (a => 'b', 'c' => 'd'); + for my $key (keys %foo) { + ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag"; + } } __END__ { @@ -209,4 +215,10 @@ __END__ is($hash{}, $hash{''}); is($hash{}, 123); is($hash{''}, 123); + + # See if plain ASCII strings quoted with '=>' erroneously get utf8 flag [perl #68812] + my %foo = (a => 'b', 'c' => 'd'); + for my $key (keys %foo) { + ok !utf8::is_utf8($key), "'$key' shouldn't have utf8 flag"; + } } @@ -1384,7 +1384,9 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len) { dVAR; SV * const sv = newSVpvn_utf8(start, len, - UTF && !IN_BYTES + !IN_BYTES + && UTF + && !is_ascii_string((const U8*)start, len) && is_utf8_string((const U8*)start, len)); return sv; } @@ -51,6 +51,38 @@ Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. +=cut +*/ + +/* +=for apidoc is_ascii_string + +Returns true if first C<len> bytes of the given string are ASCII (i.e. none +of them even raise the question of UTF-8-ness). + +See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). + +=cut +*/ + +bool +Perl_is_ascii_string(pTHX_ const U8 *s, STRLEN len) +{ + const U8* const send = s + (len ? len : strlen((const char *)s)); + const U8* x = s; + + PERL_ARGS_ASSERT_IS_ASCII_STRING; + PERL_UNUSED_CONTEXT; + + for (; x < send; ++x) { + if (!UTF8_IS_INVARIANT(*x)) + break; + } + + return x == send; +} + +/* =for apidoc uvuni_to_utf8_flags Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end @@ -266,6 +298,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s) return is_utf8_char_slow(s, len); } + /* =for apidoc is_utf8_string @@ -274,7 +307,7 @@ UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. -See also is_utf8_string_loclen() and is_utf8_string_loc(). +See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). =cut */ |