diff options
author | Nicholas Clark <nick@ccl4.org> | 2009-10-22 19:39:30 +0100 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2009-10-22 19:39:30 +0100 |
commit | ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6 (patch) | |
tree | 0a70cdd56cb6415fc3e02ae6e1fefb69934e2eb0 /toke.c | |
parent | b3766b12c64c46e0bcc2c1dc58cc7b96d8bef10c (diff) | |
download | perl-ba77e4cc9d1ceebf472c9c5c18b2377ee47062e6.tar.gz |
S_utf16_textfilter() needs to avoid splitting UTF-16 surrogate pairs.
Easier said than done.
Diffstat (limited to 'toke.c')
-rw-r--r-- | toke.c | 21 |
1 file changed, 20 insertions, 1 deletion
```diff
@@ -12822,13 +12822,32 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
                 sv_chop(utf8_buffer, nl);
                 break;
             }
+
         /* OK, not a complete line there, so need to read some more UTF-16.
            Read an extra octet if the buffer currently has an odd number. */
+        while (1) {
+            if (status <= 0)
+                break;
+            if (SvCUR(utf16_buffer) >= 2) {
+                /* Location of the high octet of the last complete code point.
+                   Gosh, UTF-16 is a pain. All the benefits of variable length,
+                   *coupled* with all the benefits of partial reads and
+                   endianness. */
+                const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
+                    + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
+
+                if (*last_hi < 0xd8 || *last_hi > 0xdb) {
+                    break;
+                }
+
+                /* We have the first half of a surrogate. Read more. */
+                DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
+            }
 
-        while(SvCUR(utf16_buffer) < 2 && status > 0) {
             status = FILTER_READ(idx + 1, utf16_buffer,
                                  160 + (SvCUR(utf16_buffer) & 1));
             DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer)));
+            DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
             if (status < 0) {
                 /* Error */
                 IoPAGE(filter) = status;
```