S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs.

Easier said than done.
author: Nicholas Clark <nick@ccl4.org> 2009-10-22 19:39:30 +0100
committer: Nicholas Clark <nick@ccl4.org> 2009-10-22 19:39:30 +0100
commit: ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6 (patch)
tree: 0a70cdd56cb6415fc3e02ae6e1fefb69934e2eb0 /toke.c
parent: b3766b12c64c46e0bcc2c1dc58cc7b96d8bef10c (diff)
download: perl-ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6.tar.gz
1 file changed, 20 insertions, 1 deletion
diff --git a/toke.c b/toke.c
index f795707e0d..f105505ea4 100644
--- a/toke.c
+++ b/toke.c
@@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
 	    sv_chop(utf8_buffer, nl);
 	    break;
 	}
+
 	/* OK, not a complete line there, so need to read some more UTF-16.
 	   Read an extra octect if the buffer currently has an odd number. */
+	while (1) {
+	    if (status <= 0)
+		break;
+	    if (SvCUR(utf16_buffer) >= 2) {
+		/* Location of the high octet of the last complete code point.
+		   Gosh, UTF-16 is a pain. All the benefits of variable length,
+		   *coupled* with all the benefits of partial reads and
+		   endianness.  */
+		const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
+		    + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
+
+		if (*last_hi < 0xd8 || *last_hi > 0xdb) {
+		    break;
+		}
+
+		/* We have the first half of a surrogate. Read more.  */
+		DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
+	    }
 
-	while(SvCUR(utf16_buffer) < 2 && status > 0) {
 	    status = FILTER_READ(idx + 1, utf16_buffer,
 				 160 + (SvCUR(utf16_buffer) & 1));
 	    DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer)));
+	    DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
 	    if (status < 0) {
 		/* Error */
 		IoPAGE(filter) = status;
author	Nicholas Clark <nick@ccl4.org>	2009-10-22 19:39:30 +0100
committer	Nicholas Clark <nick@ccl4.org>	2009-10-22 19:39:30 +0100
commit	ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6 (patch)
tree	0a70cdd56cb6415fc3e02ae6e1fefb69934e2eb0 /toke.c
parent	b3766b12c64c46e0bcc2c1dc58cc7b96d8bef10c (diff)
download	perl-ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6.tar.gz