1 files changed, 120 insertions, 110 deletions
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
index 620e8a76f7..c2c30973db 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c
@@ -30,15 +30,17 @@
 #include "mbfilter.h"
 #include "mbfilter_utf16.h"
 
+static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter);
+
 static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL};
 
 const mbfl_encoding mbfl_encoding_utf16 = {
 	mbfl_no_encoding_utf16,
 	"UTF-16",
 	"UTF-16",
-	(const char *(*)[])&mbfl_encoding_utf16_aliases,
+	mbfl_encoding_utf16_aliases,
 	NULL,
-	MBFL_ENCTYPE_MWC2BE,
+	MBFL_ENCTYPE_MWC2,
 	&vtbl_utf16_wchar,
 	&vtbl_wchar_utf16
 };
@@ -49,7 +51,7 @@ const mbfl_encoding mbfl_encoding_utf16be = {
 	"UTF-16BE",
 	NULL,
 	NULL,
-	MBFL_ENCTYPE_MWC2BE,
+	MBFL_ENCTYPE_MWC2,
 	&vtbl_utf16be_wchar,
 	&vtbl_wchar_utf16be
 };
@@ -60,7 +62,7 @@ const mbfl_encoding mbfl_encoding_utf16le = {
 	"UTF-16LE",
 	NULL,
 	NULL,
-	MBFL_ENCTYPE_MWC2LE,
+	MBFL_ENCTYPE_MWC2,
 	&vtbl_utf16le_wchar,
 	&vtbl_wchar_utf16le
 };
@@ -71,7 +73,7 @@ const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
 	mbfl_filt_conv_common_ctor,
 	NULL,
 	mbfl_filt_conv_utf16_wchar,
-	mbfl_filt_conv_common_flush,
+	mbfl_filt_conv_utf16_wchar_flush,
 	NULL,
 };
 
@@ -91,7 +93,7 @@ const struct mbfl_convert_vtbl vtbl_utf16be_wchar = {
 	mbfl_filt_conv_common_ctor,
 	NULL,
 	mbfl_filt_conv_utf16be_wchar,
-	mbfl_filt_conv_common_flush,
+	mbfl_filt_conv_utf16_wchar_flush,
 	NULL,
 };
 
@@ -111,7 +113,7 @@ const struct mbfl_convert_vtbl vtbl_utf16le_wchar = {
 	mbfl_filt_conv_common_ctor,
 	NULL,
 	mbfl_filt_conv_utf16le_wchar,
-	mbfl_filt_conv_common_flush,
+	mbfl_filt_conv_utf16_wchar_flush,
 	NULL,
 };
 
@@ -127,111 +129,89 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf16le = {
 
 #define CK(statement)	do { if ((statement) < 0) return (-1); } while (0)
 
-/*
- * UTF-16 => wchar
- */
 int mbfl_filt_conv_utf16_wchar(int c, mbfl_convert_filter *filter)
 {
-	int n, endian;
-
-	endian = filter->status & 0xff00;
-	switch (filter->status & 0x0f) {
-	case 0:
-		if (endian) {
-			n = c & 0xff;
-		} else {
-			n = (c & 0xff) << 8;
-		}
-		filter->cache |= n;
-		filter->status++;
-		break;
-	default:
-		if (endian) {
-			n = (c & 0xff) << 8;
+	/* Start with the assumption that the string is big-endian;
+	 * If we find a little-endian BOM, then we will change that assumption */
+	if (filter->status == 0) {
+		filter->cache = c & 0xFF;
+		filter->status = 1;
+	} else {
+		int n = (filter->cache << 8) | (c & 0xFF);
+		if (n == 0xFFFE) {
+			/* Switch to little-endian mode */
+			filter->filter_function = mbfl_filt_conv_utf16le_wchar;
+			filter->cache = filter->status = 0;
 		} else {
-			n = c & 0xff;
-		}
-		n |= filter->cache & 0xffff;
-		filter->status &= ~0x0f;
-		if (n >= 0xd800 && n < 0xdc00) {
-			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-		} else if (n >= 0xdc00 && n < 0xe000) {
-			n &= 0x3ff;
-			n |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {		/* illegal character */
-				n &= MBFL_WCSGROUP_MASK;
-				n |= MBFL_WCSGROUP_THROUGH;
+			filter->filter_function = mbfl_filt_conv_utf16be_wchar;
+			if (n >= 0xD800 && n <= 0xDBFF) {
+				filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+				filter->status = 2;
+				return c;
+			} else if (n >= 0xDC00 && n <= 0xDFFF) {
+				/* This is wrong; second part of surrogate pair has come first */
+				CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+			} else if (n != 0xFEFF) {
 				CK((*filter->output_function)(n, filter->data));
 			}
-		} else {
-			int is_first = filter->status & 0x10;
-			filter->cache = 0;
-			filter->status |= 0x10;
-			if (!is_first) {
-				if (n == 0xfffe) {
-					if (endian) {
-						filter->status &= ~0x100;		/* big-endian */
-					} else {
-						filter->status |= 0x100;		/* little-endian */
-					}
-					break;
-				} else if (n == 0xfeff) {
-					break;
-				}
-			}
-			CK((*filter->output_function)(n, filter->data));
+			filter->cache = filter->status = 0;
 		}
-		break;
 	}
 
 	return c;
 }
 
-/*
- * UTF-16BE => wchar
- */
 int mbfl_filt_conv_utf16be_wchar(int c, mbfl_convert_filter *filter)
 {
 	int n;
 
 	switch (filter->status) {
-	case 0:
+	case 0: /* First byte */
+		filter->cache = c & 0xFF;
 		filter->status = 1;
-		n = (c & 0xff) << 8;
-		filter->cache |= n;
 		break;
-	default:
-		filter->status = 0;
-		n = (filter->cache & 0xff00) | (c & 0xff);
-		if (n >= 0xd800 && n < 0xdc00) {
-			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-		} else if (n >= 0xdc00 && n < 0xe000) {
-			n &= 0x3ff;
-			n |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {		/* illegal character */
-				n &= MBFL_WCSGROUP_MASK;
-				n |= MBFL_WCSGROUP_THROUGH;
-				CK((*filter->output_function)(n, filter->data));
-			}
+
+	case 1: /* Second byte */
+		n = (filter->cache << 8) | (c & 0xFF);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			filter->cache = n & 0x3FF; /* Pick out 10 data bits */
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			/* This is wrong; second part of surrogate pair has come first */
+			CK((*filter->output_function)(n | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->status = 0;
 		} else {
-			filter->cache = 0;
 			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
 		}
 		break;
+
+	case 2: /* Second part of surrogate, first byte */
+		filter->cache = (filter->cache << 8) | (c & 0xFF);
+		filter->status = 3;
+		break;
+
+	case 3: /* Second part of surrogate, second byte */
+		n = ((filter->cache & 0xFF) << 8) | (c & 0xFF);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			/* Wrong; that's the first half of a surrogate pair, not the second */
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->cache = n & 0x3FF;
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			n = ((filter->cache & 0x3FF00) << 2) + (n & 0x3FF) + 0x10000;
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		} else {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 8) | MBFL_WCSGROUP_THROUGH, filter->data));
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		}
 	}
 
 	return c;
 }
 
-/*
- * wchar => UTF-16BE
- */
 int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
 {
 	int n;
@@ -253,38 +233,53 @@ int mbfl_filt_conv_wchar_utf16be(int c, mbfl_convert_filter *filter)
 	return c;
 }
 
-/*
- * UTF-16LE => wchar
- */
 int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 {
 	int n;
 
 	switch (filter->status) {
 	case 0:
+		filter->cache = c & 0xff;
 		filter->status = 1;
-		n = c & 0xff;
-		filter->cache |= n;
 		break;
-	default:
-		filter->status = 0;
-		n = (filter->cache & 0xff) | ((c & 0xff) << 8);
-		if (n >= 0xd800 && n < 0xdc00) {
-			filter->cache = ((n & 0x3ff) << 16) + 0x400000;
-		} else if (n >= 0xdc00 && n < 0xe000) {
-			n &= 0x3ff;
-			n |= (filter->cache & 0xfff0000) >> 6;
-			filter->cache = 0;
-			if (n >= MBFL_WCSPLANE_SUPMIN && n < MBFL_WCSPLANE_SUPMAX) {
-				CK((*filter->output_function)(n, filter->data));
-			} else {		/* illegal character */
-				n &= MBFL_WCSGROUP_MASK;
-				n |= MBFL_WCSGROUP_THROUGH;
-				CK((*filter->output_function)(n, filter->data));
-			}
+
+	case 1:
+		if ((c & 0xfc) == 0xd8) {
+			/* Looks like we have a surrogate pair here */
+			filter->cache += ((c & 0x3) << 8);
+			filter->status = 2;
+		} else if ((c & 0xfc) == 0xdc) {
+			/* This is wrong; the second part of the surrogate pair has come first
+			 * Flag it with `MBFL_WCSGROUP_THROUGH`; the following filter will handle
+			 * the error */
+			n = (filter->cache + ((c & 0xff) << 8)) | MBFL_WCSGROUP_THROUGH;
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
 		} else {
-			filter->cache = 0;
+			CK((*filter->output_function)(filter->cache + ((c & 0xff) << 8), filter->data));
+			filter->status = 0;
+		}
+		break;
+
+	case 2:
+		filter->cache = (filter->cache << 10) + (c & 0xff);
+		filter->status = 3;
+		break;
+
+	case 3:
+		n = (filter->cache & 0xFF) | ((c & 0xFF) << 8);
+		if (n >= 0xD800 && n <= 0xDBFF) {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
+			filter->cache = n & 0x3FF;
+			filter->status = 2;
+		} else if (n >= 0xDC00 && n <= 0xDFFF) {
+			n = filter->cache + ((c & 0x3) << 8) + 0x10000;
+			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
+		} else {
+			CK((*filter->output_function)((0xD8 << 10) | (filter->cache >> 10) | MBFL_WCSGROUP_THROUGH, filter->data));
 			CK((*filter->output_function)(n, filter->data));
+			filter->status = 0;
 		}
 		break;
 	}
@@ -292,9 +287,6 @@ int mbfl_filt_conv_utf16le_wchar(int c, mbfl_convert_filter *filter)
 	return c;
 }
 
-/*
- * wchar => UTF-16LE
- */
 int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
 {
 	int n;
@@ -315,3 +307,21 @@ int mbfl_filt_conv_wchar_utf16le(int c, mbfl_convert_filter *filter)
 
 	return c;
 }
+
+static int mbfl_filt_conv_utf16_wchar_flush(mbfl_convert_filter *filter)
+{
+	int status = filter->status;
+	int cache = filter->cache;
+	filter->status = filter->cache = 0;
+
+	if (status) {
+		/* Input string was truncated */
+		CK((*filter->output_function)(cache | MBFL_WCSGROUP_THROUGH, filter->data));
+	}
+
+	if (filter->flush_function) {
+		(*filter->flush_function)(filter->data);
+	}
+
+	return 0;
+}