summary | refs | log | tree | commit | diff
path: root/toke.c
diff options
context:
space:
mode:
author: Nicholas Clark <nick@ccl4.org> 2009-10-22 19:39:30 +0100
committer: Nicholas Clark <nick@ccl4.org> 2009-10-22 19:39:30 +0100
commit: ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6 (patch)
tree: 0a70cdd56cb6415fc3e02ae6e1fefb69934e2eb0 /toke.c
parent: b3766b12c64c46e0bcc2c1dc58cc7b96d8bef10c (diff)
download: perl-ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6.tar.gz
S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs.
Easier said than done.
Diffstat (limited to 'toke.c')
-rw-r--r-- toke.c 21
1 files changed, 20 insertions, 1 deletions
diff --git a/toke.c b/toke.c
index f795707e0d..f105505ea4 100644
--- a/toke.c
+++ b/toke.c
@@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
sv_chop(utf8_buffer, nl);
break;
}
+
/* OK, not a complete line there, so need to read some more UTF-16.
Read an extra octect if the buffer currently has an odd number. */
+ while (1) {
+ if (status <= 0)
+ break;
+ if (SvCUR(utf16_buffer) >= 2) {
+ /* Location of the high octet of the last complete code point.
+ Gosh, UTF-16 is a pain. All the benefits of variable length,
+ *coupled* with all the benefits of partial reads and
+ endianness. */
+ const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
+ + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
+
+ if (*last_hi < 0xd8 || *last_hi > 0xdb) {
+ break;
+ }
+
+ /* We have the first half of a surrogate. Read more. */
+ DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
+ }
- while(SvCUR(utf16_buffer) < 2 && status > 0) {
status = FILTER_READ(idx + 1, utf16_buffer,
160 + (SvCUR(utf16_buffer) & 1));
DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer)));
+ DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
if (status < 0) {
/* Error */
IoPAGE(filter) = status;