diff options
-rw-r--r-- | lib/Unicode/Collate.pm | 26 | ||||
-rw-r--r-- | lib/charnames.t | 8 | ||||
-rw-r--r-- | pp.c | 4 | ||||
-rw-r--r-- | pp_pack.c | 4 | ||||
-rw-r--r-- | regexec.c | 68 | ||||
-rw-r--r-- | t/lib/warnings/utf8 | 21 | ||||
-rw-r--r-- | utf8.c | 16 | ||||
-rw-r--r-- | utf8.h | 10 |
8 files changed, 98 insertions, 59 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 8522a79f2c..43446162a9 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -137,24 +137,26 @@ sub parseEntry # get element my($e, $k) = split /;/, $line; my @e = _getHexArray($e); - $ele = pack('U*', @e); + { no warnings 'utf8'; $ele = pack('U*', @e); } return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; # get sort key - if( + { no warnings 'utf8'; + if( defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ || defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/ - ) - { - $self->{entries}{$ele} = $self->{ignored}{$ele} = 1; - } - else - { - foreach my $arr ($k =~ /\[(\S+)\]/g) { - my $var = $arr =~ /\*/; - push @key, $self->altCE( $var, _getHexArray($arr) ); + ) + { + $self->{entries}{$ele} = $self->{ignored}{$ele} = 1; + } + else + { + foreach my $arr ($k =~ /\[(\S+)\]/g) { + my $var = $arr =~ /\*/; + push @key, $self->altCE( $var, _getHexArray($arr) ); + } + $self->{entries}{$ele} = \@key; } - $self->{entries}{$ele} = \@key; } $self->{maxlength}{ord $ele} = scalar @e if @e > 1; } diff --git a/lib/charnames.t b/lib/charnames.t index 1beecf3f71..31231270a6 100644 --- a/lib/charnames.t +++ b/lib/charnames.t @@ -12,7 +12,7 @@ BEGIN { $| = 1; -print "1..38\n"; +print "1..39\n"; use charnames ':full'; @@ -220,3 +220,9 @@ print "ok 33\n"; print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE"; print "ok 38\n"; +{ + use warnings; + print "not " unless ord("\N{BOM}") == 0xFEFF; + print "ok 39\n"; +} + @@ -3241,7 +3241,9 @@ PP(pp_ord) argsv = tmpsv; } - XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff)); + XPUSHu(DO_UTF8(argsv) ? + utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) : + (*s & 0xff)); RETURN; } @@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char * if (checksum) { while (len-- > 0 && s < strend) { STRLEN alen; - auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0)); + auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV)); along = alen; s += along; if (checksum > bits_in_uv) @@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char * EXTEND_MORTAL(len); while (len-- > 0 && s < strend) { STRLEN alen; - auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0)); + auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV)); along = alen; s += along; sv = NEWSV(37, 0); @@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta to_utf8_lower((U8*)m, tmpbuf1, &ulen1); to_utf8_upper((U8*)m, tmpbuf2, &ulen2); - c1 = utf8_to_uvchr(tmpbuf1, 0); - c2 = utf8_to_uvchr(tmpbuf2, 0); + c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC, + 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } else { c1 = *(U8*)m; @@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (c1 == c2) { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if ( c == c1 && (ln == len || ibcmp_utf8(s, (char **)0, 0, do_utf8, @@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta } else { while (s <= e) { - c = utf8_to_uvchr((U8*)s, &len); + c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); /* Handle some of the three Greek sigmas cases. * Note that not all the possible combinations @@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*(U8*)s) != - utf8_to_uvuni((U8*)l, &ulen)) + utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; l += ulen; s ++; @@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog) if (l >= PL_regeol) sayNO; if (NATIVE_TO_UNI(*((U8*)l)) != - utf8_to_uvuni((U8*)s, &ulen)) + utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY)) sayNO; s += ulen; l ++; @@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog) to_utf8_lower((U8*)s, tmpbuf1, &ulen1); to_utf8_upper((U8*)s, tmpbuf2, &ulen2); - c1 = utf8_to_uvuni(tmpbuf1, 0); - c2 = utf8_to_uvuni(tmpbuf2, 0); + c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); + c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } else { - c2 = c1 = utf8_to_uvchr(s, NULL); + c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); } } } @@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog) else { STRLEN len; if (c1 == c2) { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e && - utf8_to_uvchr((U8*)locinput, &len) != c1) { + utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY) != c1) { locinput += len; count++; } } else { - /* count initialised to utf8_distance(old, locinput) */ + /* count initialised to + * utf8_distance(old, locinput) */ while (locinput <= e) { - UV c = utf8_to_uvchr((U8*)locinput, &len); + UV c = utf8n_to_uvchr((U8*)locinput, + UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); if (c == c1 || c == c2) break; locinput += len; @@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog) UV c; if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); /* If it could work, try it. */ @@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog) while (n >= ln) { if (c1 != -1000) { if (do_utf8) - c = utf8_to_uvchr((U8*)PL_reginput, NULL); + c = utf8n_to_uvchr((U8*)PL_reginput, + UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); else c = UCHARAT(PL_reginput); } @@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b STRLEN len = 0; STRLEN plen; - c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; + c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p; plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); if (do_utf8 || (flags & ANYOF_UNICODE)) { diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8 index 747436ab27..5cd0e051b3 100644 --- a/t/lib/warnings/utf8 +++ b/t/lib/warnings/utf8 @@ -39,7 +39,6 @@ my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); @@ -50,7 +49,6 @@ my $d800 = chr(0xD800); my $dfff = chr(0xDFFF); my $e000 = chr(0xE000); my $fffd = chr(0xFFFD); -my $fffe = chr(0xFFFE); my $ffff = chr(0xFFFF); my $hex4 = chr(0x10000); my $hex5 = chr(0x100000); @@ -58,9 +56,8 @@ my $max = chr(0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. ######## use warnings 'utf8'; my $d7ff = pack("U", 0xD7FF); @@ -68,7 +65,6 @@ my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); @@ -79,7 +75,6 @@ my $d800 = pack("U", 0xD800); my $dfff = pack("U", 0xDFFF); my $e000 = pack("U", 0xE000); my $fffd = pack("U", 0xFFFD); -my $fffe = pack("U", 0xFFFE); my $ffff = pack("U", 0xFFFF); my $hex4 = pack("U", 0x10000); my $hex5 = pack("U", 0x100000); @@ -87,9 +82,8 @@ my $max = pack("U", 0x10FFFF); EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. ######## use warnings 'utf8'; my $d7ff = "\x{D7FF}"; @@ -97,7 +91,6 @@ my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; @@ -108,7 +101,6 @@ my $d800 = "\x{D800}"; my $dfff = "\x{DFFF}"; my $e000 = "\x{E000}"; my $fffd = "\x{FFFD}"; -my $fffe = "\x{FFFE}"; my $ffff = "\x{FFFF}"; my $hex4 = "\x{10000}"; my $hex5 = "\x{100000}"; @@ -116,6 +108,5 @@ my $max = "\x{10FFFF}"; EXPECT UTF-16 surrogate 0xd800 at - line 3. UTF-16 surrogate 0xdfff at - line 4. -Unicode character 0xfffe is illegal at - line 7. -Unicode character 0xffff is illegal at - line 8. -Unicode character 0x10ffff is illegal at - line 11. +Unicode character 0xffff is illegal at - line 7. +Unicode character 0x10ffff is illegal at - line 10. @@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) ((uv >= 0xFDD0 && uv <= 0xFDEF && !(flags & UNICODE_ALLOW_FDD0)) || - ((uv & 0xFFFF) == 0xFFFE && - !(flags & UNICODE_ALLOW_FFFE)) + (UNICODE_IS_BYTE_ORDER_MARK(uv) && + !(flags & UNICODE_ALLOW_BOM)) || ((uv & 0xFFFF) == 0xFFFF && !(flags & UNICODE_ALLOW_FFFF))) && /* UNICODE_ALLOW_SUPER includes - * FFFEs and FFFFs beyond 0x10FFFF. */ + * FFFFs beyond 0x10FFFF. */ ((uv <= PERL_UNICODE_MAX) || !(flags & UNICODE_ALLOW_SUPER)) ) @@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen) { - return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -523,7 +524,8 @@ UV Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen) { /* Call the low level routine asking for checks */ - return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0); + return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, + ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* @@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8) /* We use utf8n_to_uvuni() as we want an index into Unicode tables, not a native character number. */ - UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0); + UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0, + ckWARN(WARN_UTF8) ? + 0 : UTF8_ALLOW_ANY); SV *errsv_save; ENTER; SAVETMPS; @@ -188,24 +188,24 @@ encoded character. #define UNICODE_SURROGATE_FIRST 0xd800 #define UNICODE_SURROGATE_LAST 0xdfff #define UNICODE_REPLACEMENT 0xfffd -#define UNICODE_BYTER_ORDER_MARK 0xfffe +#define UNICODE_BYTE_ORDER_MARK 0xfeff #define UNICODE_ILLEGAL 0xffff /* Though our UTF-8 encoding can go beyond this, - * let's be conservative. */ + * let's be conservative and do as Unicode 3.2 says. */ #define PERL_UNICODE_MAX 0x10FFFF #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ #define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */ -#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */ -#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */ +#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */ +#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */ #define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */ #define UNICODE_ALLOW_ANY 0xFFFF #define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \ (c) <= UNICODE_SURROGATE_LAST) #define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT) -#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK) +#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTE_ORDER_MARK) #define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL) #ifdef HAS_QUAD |