Allow noncharacter code points in unicode encoding and decoding

The two noncharacter code points 16#FFFE and 16#FFFF were not allowed to be encoded or decoded using the unicode module or bit syntax. That causes an inconsistency, since the noncharacters 16#FDD0 to 16#FDEF could be encoded/decoded. There are two ways to fix that inconsistency. We have chosen to allow 16#FFFE and 16#FFFF to be encoded and decoded, because the noncharacters could be useful internally within an application and it will make encoding and decoding slightly faster. Reported-by: Alisdair Sullivan
author: Björn Gustavsson <bjorn@erlang.org> 2011-08-30 11:51:11 +0200
committer: Björn Gustavsson <bjorn@erlang.org> 2011-10-13 14:16:00 +0200
commit: 34db76765561487e526fe66d3d19ecf3b3fb9dc8 (patch)
tree: 9141e3c5729e46d03c8b27b14da3b29b1e54abca /erts/emulator/beam/erl_unicode.c
parent: 6ca6dd3c670fb8185ebb9a20c2a731a7375c1cac (diff)
download: erlang-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.gz
1 files changed, 5 insertions, 19 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 158eb361a4..bd5f3cc4c1 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -348,12 +348,6 @@ static int copy_utf8_bin(byte *target, byte *source, Uint size,
 		return copied;
 	    }
 
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		*err_pos = source;
-		return copied;
-	    }
-		
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
@@ -714,9 +708,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    target[(*pos)++] = (((byte) (x & 0x3F)) | 
 						((byte) 0x80));
 			} else if (x < 0x10000) {
-			    if ((x >= 0xD800 && x <= 0xDFFF) ||
-				(x == 0xFFFE) ||
-				(x == 0xFFFF)) { /* Invalid unicode range */
+			    if (x >= 0xD800 && x <= 0xDFFF) {
+				/* Invalid unicode range */
 				*err = 1;
 				goto done;
 			    }
@@ -1230,10 +1223,6 @@ int erts_analyze_utf8(byte *source, Uint size,
 		((source[1] & 0x20) != 0)) {
 		return ERTS_UTF8_ERROR;
 	    }
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		return ERTS_UTF8_ERROR;
-	    }
 	    source += 3;
 	    size -= 3;
 	} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
@@ -2166,9 +2155,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    } else if (x < 0x800) {
 				need += 2;
 			    } else if (x < 0x10000) {
-				if ((x >= 0xD800 && x <= 0xDFFF) ||
-				    (x == 0xFFFE) ||
-				    (x == 0xFFFF)) { /* Invalid unicode range */
+				if (x >= 0xD800 && x <= 0xDFFF) {
+				    /* Invalid unicode range */
 				    DESTROY_ESTACK(stack);
 				    return ((Sint) -1);
 				}
@@ -2314,9 +2302,7 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 				*p++ = (((byte) (x & 0x3F)) | 
 					((byte) 0x80));
 			    } else if (x < 0x10000) {
-				ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
-					 (x == 0xFFFE) ||
-					 (x == 0xFFFF)));
+				ASSERT(!(x >= 0xD800 && x <= 0xDFFF));
 				*p++ = (((byte) (x >> 12)) | 
 					((byte) 0xE0));
 				*p++ = ((((byte) (x >> 6)) & 0x3F)  |
author	Björn Gustavsson <bjorn@erlang.org>	2011-08-30 11:51:11 +0200
committer	Björn Gustavsson <bjorn@erlang.org>	2011-10-13 14:16:00 +0200
commit	34db76765561487e526fe66d3d19ecf3b3fb9dc8 (patch)
tree	9141e3c5729e46d03c8b27b14da3b29b1e54abca /erts/emulator/beam/erl_unicode.c
parent	6ca6dd3c670fb8185ebb9a20c2a731a7375c1cac (diff)
download	erlang-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.gz