summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Unicode/Collate.pm26
-rw-r--r--lib/charnames.t8
-rw-r--r--pp.c4
-rw-r--r--pp_pack.c4
-rw-r--r--regexec.c68
-rw-r--r--t/lib/warnings/utf821
-rw-r--r--utf8.c16
-rw-r--r--utf8.h10
8 files changed, 98 insertions, 59 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 8522a79f2c..43446162a9 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -137,24 +137,26 @@ sub parseEntry
# get element
my($e, $k) = split /;/, $line;
my @e = _getHexArray($e);
- $ele = pack('U*', @e);
+ { no warnings 'utf8'; $ele = pack('U*', @e); }
return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
# get sort key
- if(
+ { no warnings 'utf8';
+ if(
defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
- )
- {
- $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
- }
- else
- {
- foreach my $arr ($k =~ /\[(\S+)\]/g) {
- my $var = $arr =~ /\*/;
- push @key, $self->altCE( $var, _getHexArray($arr) );
+ )
+ {
+ $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
+ }
+ else
+ {
+ foreach my $arr ($k =~ /\[(\S+)\]/g) {
+ my $var = $arr =~ /\*/;
+ push @key, $self->altCE( $var, _getHexArray($arr) );
+ }
+ $self->{entries}{$ele} = \@key;
}
- $self->{entries}{$ele} = \@key;
}
$self->{maxlength}{ord $ele} = scalar @e if @e > 1;
}
diff --git a/lib/charnames.t b/lib/charnames.t
index 1beecf3f71..31231270a6 100644
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -12,7 +12,7 @@ BEGIN {
$| = 1;
-print "1..38\n";
+print "1..39\n";
use charnames ':full';
@@ -220,3 +220,9 @@ print "ok 33\n";
print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE";
print "ok 38\n";
+{
+ use warnings;
+ print "not " unless ord("\N{BOM}") == 0xFEFF;
+ print "ok 39\n";
+}
+
diff --git a/pp.c b/pp.c
index 757b4f0984..fbe4737497 100644
--- a/pp.c
+++ b/pp.c
@@ -3241,7 +3241,9 @@ PP(pp_ord)
argsv = tmpsv;
}
- XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+ XPUSHu(DO_UTF8(argsv) ?
+ utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+ (*s & 0xff));
RETURN;
}
diff --git a/pp_pack.c b/pp_pack.c
index 1c5ee315ea..452a2b0a5b 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
if (checksum) {
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+ auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
along = alen;
s += along;
if (checksum > bits_in_uv)
@@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+ auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
along = alen;
s += along;
sv = NEWSV(37, 0);
diff --git a/regexec.c b/regexec.c
index 8db2dc27b5..29b870436a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
- c1 = utf8_to_uvchr(tmpbuf1, 0);
- c2 = utf8_to_uvchr(tmpbuf2, 0);
+ c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
+ 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+ 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
else {
c1 = *(U8*)m;
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (c1 == c2) {
while (s <= e) {
- c = utf8_to_uvchr((U8*)s, &len);
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
if ( c == c1
&& (ln == len ||
ibcmp_utf8(s, (char **)0, 0, do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
else {
while (s <= e) {
- c = utf8_to_uvchr((U8*)s, &len);
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
/* Handle some of the three Greek sigmas cases.
* Note that not all the possible combinations
@@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*(U8*)s) !=
- utf8_to_uvuni((U8*)l, &ulen))
+ utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY))
sayNO;
l += ulen;
s ++;
@@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*((U8*)l)) !=
- utf8_to_uvuni((U8*)s, &ulen))
+ utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY))
sayNO;
s += ulen;
l ++;
@@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
- c1 = utf8_to_uvuni(tmpbuf1, 0);
- c2 = utf8_to_uvuni(tmpbuf2, 0);
+ c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
+ c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
}
else {
- c2 = c1 = utf8_to_uvchr(s, NULL);
+ c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
}
}
}
@@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog)
else {
STRLEN len;
if (c1 == c2) {
- /* count initialised to utf8_distance(old, locinput) */
+ /* count initialised to
+ * utf8_distance(old, locinput) */
while (locinput <= e &&
- utf8_to_uvchr((U8*)locinput, &len) != c1) {
+ utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY) != c1) {
locinput += len;
count++;
}
} else {
- /* count initialised to utf8_distance(old, locinput) */
+ /* count initialised to
+ * utf8_distance(old, locinput) */
while (locinput <= e) {
- UV c = utf8_to_uvchr((U8*)locinput, &len);
+ UV c = utf8n_to_uvchr((U8*)locinput,
+ UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
if (c == c1 || c == c2)
break;
locinput += len;
@@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
UV c;
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
/* If it could work, try it. */
@@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
}
@@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
while (n >= ln) {
if (c1 != -1000) {
if (do_utf8)
- c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+ c = utf8n_to_uvchr((U8*)PL_reginput,
+ UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
else
c = UCHARAT(PL_reginput);
}
@@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
STRLEN len = 0;
STRLEN plen;
- c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+ c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
if (do_utf8 || (flags & ANYOF_UNICODE)) {
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 747436ab27..5cd0e051b3 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -39,7 +39,6 @@ my $d800 = chr(0xD800);
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
my $fffd = chr(0xFFFD);
-my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
@@ -50,7 +49,6 @@ my $d800 = chr(0xD800);
my $dfff = chr(0xDFFF);
my $e000 = chr(0xE000);
my $fffd = chr(0xFFFD);
-my $fffe = chr(0xFFFE);
my $ffff = chr(0xFFFF);
my $hex4 = chr(0x10000);
my $hex5 = chr(0x100000);
@@ -58,9 +56,8 @@ my $max = chr(0x10FFFF);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
########
use warnings 'utf8';
my $d7ff = pack("U", 0xD7FF);
@@ -68,7 +65,6 @@ my $d800 = pack("U", 0xD800);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
my $fffd = pack("U", 0xFFFD);
-my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
@@ -79,7 +75,6 @@ my $d800 = pack("U", 0xD800);
my $dfff = pack("U", 0xDFFF);
my $e000 = pack("U", 0xE000);
my $fffd = pack("U", 0xFFFD);
-my $fffe = pack("U", 0xFFFE);
my $ffff = pack("U", 0xFFFF);
my $hex4 = pack("U", 0x10000);
my $hex5 = pack("U", 0x100000);
@@ -87,9 +82,8 @@ my $max = pack("U", 0x10FFFF);
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
########
use warnings 'utf8';
my $d7ff = "\x{D7FF}";
@@ -97,7 +91,6 @@ my $d800 = "\x{D800}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
my $fffd = "\x{FFFD}";
-my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
@@ -108,7 +101,6 @@ my $d800 = "\x{D800}";
my $dfff = "\x{DFFF}";
my $e000 = "\x{E000}";
my $fffd = "\x{FFFD}";
-my $fffe = "\x{FFFE}";
my $ffff = "\x{FFFF}";
my $hex4 = "\x{10000}";
my $hex5 = "\x{100000}";
@@ -116,6 +108,5 @@ my $max = "\x{10FFFF}";
EXPECT
UTF-16 surrogate 0xd800 at - line 3.
UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
diff --git a/utf8.c b/utf8.c
index 85a22a1ffd..0100eb17a7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
((uv >= 0xFDD0 && uv <= 0xFDEF &&
!(flags & UNICODE_ALLOW_FDD0))
||
- ((uv & 0xFFFF) == 0xFFFE &&
- !(flags & UNICODE_ALLOW_FFFE))
+ (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
+ !(flags & UNICODE_ALLOW_BOM))
||
((uv & 0xFFFF) == 0xFFFF &&
!(flags & UNICODE_ALLOW_FFFF))) &&
/* UNICODE_ALLOW_SUPER includes
- * FFFEs and FFFFs beyond 0x10FFFF. */
+ * FFFFs beyond 0x10FFFF. */
((uv <= PERL_UNICODE_MAX) ||
!(flags & UNICODE_ALLOW_SUPER))
)
@@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1.
UV
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
{
- return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
@@ -523,7 +524,8 @@ UV
Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
{
/* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+ ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
@@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
/* We use utf8n_to_uvuni() as we want an index into
Unicode tables, not a native character number.
*/
- UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+ ckWARN(WARN_UTF8) ?
+ 0 : UTF8_ALLOW_ANY);
SV *errsv_save;
ENTER;
SAVETMPS;
diff --git a/utf8.h b/utf8.h
index a5312ca19e..3787832f81 100644
--- a/utf8.h
+++ b/utf8.h
@@ -188,24 +188,24 @@ encoded character.
#define UNICODE_SURROGATE_FIRST 0xd800
#define UNICODE_SURROGATE_LAST 0xdfff
#define UNICODE_REPLACEMENT 0xfffd
-#define UNICODE_BYTER_ORDER_MARK 0xfffe
+#define UNICODE_BYTE_ORDER_MARK 0xfeff
#define UNICODE_ILLEGAL 0xffff
/* Though our UTF-8 encoding can go beyond this,
- * let's be conservative. */
+ * let's be conservative and do as Unicode 3.2 says. */
#define PERL_UNICODE_MAX 0x10FFFF
#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */
-#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_BOM 0x0004 /* Allow 0xFEFF */
+#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFF, 0x1FFFF, ... */
#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */
#define UNICODE_ALLOW_ANY 0xFFFF
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
(c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_BYTE_ORDER_MARK(c) ((c) == UNICODE_BYTE_ORDER_MARK)
#define UNICODE_IS_ILLEGAL(c) ((c) == UNICODE_ILLEGAL)
#ifdef HAS_QUAD