8 files changed, 98 insertions, 59 deletions
diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm
index 8522a79f2c..43446162a9 100644
--- a/lib/Unicode/Collate.pm
+++ b/lib/Unicode/Collate.pm
@@ -137,24 +137,26 @@ sub parseEntry
   # get element
   my($e, $k) = split /;/, $line;
   my @e = _getHexArray($e);
-  $ele = pack('U*', @e);
+  { no warnings 'utf8'; $ele = pack('U*', @e); }
   return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 
   # get sort key
-  if(
+  { no warnings 'utf8';
+    if(
      defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
      defined $self->{ignoreChar} && $ele  =~ /$self->{ignoreChar}/
-  )
-  {
-     $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
-  }
-  else
-  {
-    foreach my $arr ($k =~ /\[(\S+)\]/g) {
-      my $var = $arr =~ /\*/;
-      push @key, $self->altCE( $var, _getHexArray($arr) );
+       )
+    {
+	$self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
+    }
+    else
+    {
+	foreach my $arr ($k =~ /\[(\S+)\]/g) {
+	    my $var = $arr =~ /\*/;
+	    push @key, $self->altCE( $var, _getHexArray($arr) );
+	}
+	$self->{entries}{$ele} = \@key;
     }
-    $self->{entries}{$ele} = \@key;
   }
   $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
 }
diff --git a/lib/charnames.t b/lib/charnames.t
index 1beecf3f71..31231270a6 100644
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -12,7 +12,7 @@ BEGIN {
 
 $| = 1;
 
-print "1..38\n";
+print "1..39\n";
 
 use charnames ':full';
 
@@ -220,3 +220,9 @@ print "ok 33\n";
 print "not " unless charnames::viacode(0xFEFF) eq "ZERO WIDTH NO-BREAK SPACE";
 print "ok 38\n";
 
+{
+    use warnings;
+    print "not " unless ord("\N{BOM}") == 0xFEFF;
+    print "ok 39\n";
+}
+
diff --git a/pp.c b/pp.c
index 757b4f0984..fbe4737497 100644
--- a/pp.c
+++ b/pp.c
@@ -3241,7 +3241,9 @@ PP(pp_ord)
         argsv = tmpsv;
     }
 
-    XPUSHu(DO_UTF8(argsv) ? utf8_to_uvchr(s, 0) : (*s & 0xff));
+    XPUSHu(DO_UTF8(argsv) ?
+	   utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+	   (*s & 0xff));
 
     RETURN;
 }
diff --git a/pp_pack.c b/pp_pack.c
index 1c5ee315ea..452a2b0a5b 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -770,7 +770,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
 	    if (checksum) {
 		while (len-- > 0 && s < strend) {
 		    STRLEN alen;
-		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
 		    along = alen;
 		    s += along;
 		    if (checksum > bits_in_uv)
@@ -784,7 +784,7 @@ Perl_unpack_str(pTHX_ char *pat, register char *patend, register char *s, char *
 		EXTEND_MORTAL(len);
 		while (len-- > 0 && s < strend) {
 		    STRLEN alen;
-		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, 0));
+		    auint = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, strend - s, &alen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANYUV));
 		    along = alen;
 		    s += along;
 		    sv = NEWSV(37, 0);
diff --git a/regexec.c b/regexec.c
index 8db2dc27b5..29b870436a 100644
--- a/regexec.c
+++ b/regexec.c
@@ -999,8 +999,10 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
 		to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
 
-		c1 = utf8_to_uvchr(tmpbuf1, 0);
-		c2 = utf8_to_uvchr(tmpbuf2, 0);
+		c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC, 
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+		c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+				    0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 	    }
 	    else {
 		c1 = *(U8*)m;
@@ -1037,7 +1039,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		
 		if (c1 == c2) {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		        c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 			if ( c == c1
 			     && (ln == len ||
 				 ibcmp_utf8(s, (char **)0, 0,  do_utf8,
@@ -1062,7 +1066,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
 		}
 		else {
 		    while (s <= e) {
-		        c = utf8_to_uvchr((U8*)s, &len);
+		      c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 
 			/* Handle some of the three Greek sigmas cases.
 			 * Note that not all the possible combinations
@@ -2390,7 +2396,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			     sayNO;
 			if (NATIVE_TO_UNI(*(U8*)s) !=
-			    utf8_to_uvuni((U8*)l, &ulen))
+			    utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			     sayNO;
 			l += ulen;
 			s ++;
@@ -2402,7 +2410,9 @@ S_regmatch(pTHX_ regnode *prog)
 			if (l >= PL_regeol)
 			    sayNO;
 			if (NATIVE_TO_UNI(*((U8*)l)) !=
-			    utf8_to_uvuni((U8*)s, &ulen))
+			    utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY))
 			    sayNO;
 			s += ulen;
 			l ++;
@@ -3545,11 +3555,17 @@ S_regmatch(pTHX_ regnode *prog)
 			     to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
 			     to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
 
-			     c1 = utf8_to_uvuni(tmpbuf1, 0);
-			     c2 = utf8_to_uvuni(tmpbuf2, 0);
+			     c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
+			     c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+						 ckWARN(WARN_UTF8) ?
+						 0 : UTF8_ALLOW_ANY);
 			}
 			else {
-			    c2 = c1 = utf8_to_uvchr(s, NULL);
+			    c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+						     ckWARN(WARN_UTF8) ?
+						     0 : UTF8_ALLOW_ANY);
 			}
 		    }
 		}
@@ -3605,16 +3621,24 @@ S_regmatch(pTHX_ regnode *prog)
 			else {
 			    STRLEN len;
 			    if (c1 == c2) {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e &&
-				       utf8_to_uvchr((U8*)locinput, &len) != c1) {
+				       utf8n_to_uvchr((U8*)locinput,
+						      UTF8_MAXLEN, &len,
+						      ckWARN(WARN_UTF8) ?
+						      0 : UTF8_ALLOW_ANY) != c1) {
 				    locinput += len;
 				    count++;
 				}
 			    } else {
-				/* count initialised to utf8_distance(old, locinput) */
+				/* count initialised to
+				 * utf8_distance(old, locinput) */
 				while (locinput <= e) {
-				    UV c = utf8_to_uvchr((U8*)locinput, &len);
+				    UV c = utf8n_to_uvchr((U8*)locinput,
+							  UTF8_MAXLEN, &len,
+							  ckWARN(WARN_UTF8) ?
+							  0 : UTF8_ALLOW_ANY);
 				    if (c == c1 || c == c2)
 					break;
 				    locinput += len;
@@ -3648,7 +3672,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    UV c;
 		    if (c1 != -1000) {
 			if (do_utf8)
-			    c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+			    c = utf8n_to_uvchr((U8*)PL_reginput,
+					       UTF8_MAXLEN, 0,
+					       ckWARN(WARN_UTF8) ?
+					       0 : UTF8_ALLOW_ANY);
 			else
 			    c = UCHARAT(PL_reginput);
 			/* If it could work, try it. */
@@ -3695,7 +3722,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -3715,7 +3745,10 @@ S_regmatch(pTHX_ regnode *prog)
 		    while (n >= ln) {
 			if (c1 != -1000) {
 			    if (do_utf8)
-				c = utf8_to_uvchr((U8*)PL_reginput, NULL);
+				c = utf8n_to_uvchr((U8*)PL_reginput,
+						   UTF8_MAXLEN, 0,
+						   ckWARN(WARN_UTF8) ?
+						   0 : UTF8_ALLOW_ANY);
 			    else
 				c = UCHARAT(PL_reginput);
 			}
@@ -4297,7 +4330,8 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
     STRLEN len = 0;
     STRLEN plen;
 
-    c = do_utf8 ? utf8_to_uvchr(p, &len) : *p;
+    c = do_utf8 ? utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+				 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY) : *p;
 
     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
     if (do_utf8 || (flags & ANYOF_UNICODE)) {
diff --git a/t/lib/warnings/utf8 b/t/lib/warnings/utf8
index 747436ab27..5cd0e051b3 100644
--- a/t/lib/warnings/utf8
+++ b/t/lib/warnings/utf8
@@ -39,7 +39,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -50,7 +49,6 @@ my $d800  = chr(0xD800);
 my $dfff  = chr(0xDFFF);
 my $e000  = chr(0xE000);
 my $fffd  = chr(0xFFFD);
-my $fffe  = chr(0xFFFE);
 my $ffff  = chr(0xFFFF);
 my $hex4  = chr(0x10000);
 my $hex5  = chr(0x100000);
@@ -58,9 +56,8 @@ my $max   = chr(0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = pack("U", 0xD7FF);
@@ -68,7 +65,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -79,7 +75,6 @@ my $d800  = pack("U", 0xD800);
 my $dfff  = pack("U", 0xDFFF);
 my $e000  = pack("U", 0xE000);
 my $fffd  = pack("U", 0xFFFD);
-my $fffe  = pack("U", 0xFFFE);
 my $ffff  = pack("U", 0xFFFF);
 my $hex4  = pack("U", 0x10000);
 my $hex5  = pack("U", 0x100000);
@@ -87,9 +82,8 @@ my $max   = pack("U", 0x10FFFF);
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
 ########
 use warnings 'utf8';
 my $d7ff  = "\x{D7FF}";
@@ -97,7 +91,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -108,7 +101,6 @@ my $d800  = "\x{D800}";
 my $dfff  = "\x{DFFF}";
 my $e000  = "\x{E000}";
 my $fffd  = "\x{FFFD}";
-my $fffe  = "\x{FFFE}";
 my $ffff  = "\x{FFFF}";
 my $hex4  = "\x{10000}";
 my $hex5  = "\x{100000}";
@@ -116,6 +108,5 @@ my $max   = "\x{10FFFF}";
 EXPECT
 UTF-16 surrogate 0xd800 at - line 3.
 UTF-16 surrogate 0xdfff at - line 4.
-Unicode character 0xfffe is illegal at - line 7.
-Unicode character 0xffff is illegal at - line 8.
-Unicode character 0x10ffff is illegal at - line 11.
+Unicode character 0xffff is illegal at - line 7.
+Unicode character 0x10ffff is illegal at - line 10.
diff --git a/utf8.c b/utf8.c
index 85a22a1ffd..0100eb17a7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -64,13 +64,13 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
 		    !(flags & UNICODE_ALLOW_FDD0))
 		   ||
-		   ((uv & 0xFFFF) == 0xFFFE &&
-		    !(flags & UNICODE_ALLOW_FFFE))
+		   (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
+		    !(flags & UNICODE_ALLOW_BOM))
 		   ||
 		   ((uv & 0xFFFF) == 0xFFFF &&
 		    !(flags & UNICODE_ALLOW_FFFF))) &&
 		  /* UNICODE_ALLOW_SUPER includes
-		   * FFFEs and FFFFs beyond 0x10FFFF. */
+		   * FFFFs beyond 0x10FFFF. */
 		  ((uv <= PERL_UNICODE_MAX) ||
 		   !(flags & UNICODE_ALLOW_SUPER))
 		  )
@@ -500,7 +500,8 @@ returned and retlen is set, if possible, to -1.
 UV
 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
 {
-    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -523,7 +524,8 @@ UV
 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
 {
     /* Call the low level routine asking for checks */
-    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
+    return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
 }
 
 /*
@@ -1626,7 +1628,9 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
 	    /* We use utf8n_to_uvuni() as we want an index into
 	       Unicode tables, not a native character number.
 	     */
-	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
+	    UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+					   ckWARN(WARN_UTF8) ?
+					   0 : UTF8_ALLOW_ANY);
 	    SV *errsv_save;
 	    ENTER;
 	    SAVETMPS;
diff --git a/utf8.h b/utf8.h
index a5312ca19e..3787832f81 100644
--- a/utf8.h
+++ b/utf8.h
@@ -188,24 +188,24 @@ encoded character.
 #define UNICODE_SURROGATE_FIRST		0xd800
 #define UNICODE_SURROGATE_LAST		0xdfff
 #define UNICODE_REPLACEMENT		0xfffd
-#define UNICODE_BYTER_ORDER_MARK	0xfffe
+#define UNICODE_BYTE_ORDER_MARK		0xfeff
 #define UNICODE_ILLEGAL			0xffff
 
 /* Though our UTF-8 encoding can go beyond this,
- * let's be conservative. */
+ * let's be conservative and do as Unicode 3.2 says. */
 #define PERL_UNICODE_MAX	0x10FFFF
 
 #define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
 #define UNICODE_ALLOW_FDD0	0x0002	/* Allow the U+FDD0...U+FDEF */
-#define UNICODE_ALLOW_FFFE	0x0004	/* Allow 0xFFFE, 0x1FFFE, ... */
-#define UNICODE_ALLOW_FFFF	0x0008	/* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_BOM	0x0004	/* Allow 0xFEFF */
+#define UNICODE_ALLOW_FFFF	0x0008	/* Allow 0xFFFF, 0x1FFFF, ... */
 #define UNICODE_ALLOW_SUPER	0x0010	/* Allow past 10xFFFF */
 #define UNICODE_ALLOW_ANY	0xFFFF
 
 #define UNICODE_IS_SURROGATE(c)		((c) >= UNICODE_SURROGATE_FIRST && \
 					 (c) <= UNICODE_SURROGATE_LAST)
 #define UNICODE_IS_REPLACEMENT(c)	((c) == UNICODE_REPLACEMENT)
-#define UNICODE_IS_BYTE_ORDER_MARK(c)	((c) == UNICODE_BYTER_ORDER_MARK)
+#define UNICODE_IS_BYTE_ORDER_MARK(c)	((c) == UNICODE_BYTE_ORDER_MARK)
 #define UNICODE_IS_ILLEGAL(c)		((c) == UNICODE_ILLEGAL)
 
 #ifdef HAS_QUAD