summary | refs | log | tree | commit | diff
path: root/encoding.c
diff options
context:
space:
mode:
author David Kilzer <ddkilzer@apple.com> 2021-07-07 18:23:18 -0700
committer Nick Wellnhofer <wellnhofer@aevum.de> 2022-01-16 14:07:17 +0100
commit 03bb929390a5cac451f720ab581817684ecddb8e (patch)
tree 6eb19deabe1a8c3c20ad1f0307cb6a37bcf087a0 /encoding.c
parent e6adc19fff785120140a45e9e43e43d1599895a2 (diff)
download libxml2-03bb929390a5cac451f720ab581817684ecddb8e.tar.gz
Fix parse failure when a 4-byte character in UTF-16 BE is split across a chunk boundary
This makes the logic in UTF16BEToUTF8() match UTF16LEToUTF8().

* encoding.c:
(UTF16LEToUTF8):
- Fix comment to describe what the code does.
(UTF16BEToUTF8):
- Fix undefined behavior; the same fix was applied to UTF16LEToUTF8() in 2f9382033e.
- Add a bounds check to the while() loop; the same check was applied to UTF16LEToUTF8() in be803967db.
- Do not return -2 when (in >= inend), which fixes the bug. The same change was applied to UTF16LEToUTF8() in 496a1cf592.
- Inline the (<< 8) statements to match UTF16LEToUTF8().

Add the following tests and results:
- test/text-4-byte-UTF-16-BE-offset.xml
- test/text-4-byte-UTF-16-BE.xml
- test/text-4-byte-UTF-16-LE-offset.xml
- test/text-4-byte-UTF-16-LE.xml
Diffstat (limited to 'encoding.c')
-rw-r--r-- encoding.c 23
1 files changed, 12 insertions, 11 deletions
diff --git a/encoding.c b/encoding.c
index 5e50c153..5d28e4f1 100644
--- a/encoding.c
+++ b/encoding.c
@@ -527,7 +527,7 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
in++;
}
if ((c & 0xFC00) == 0xD800) { /* surrogates */
- if (in >= inend) { /* (in > inend) shouldn't happens */
+ if (in >= inend) { /* handle split mutli-byte characters */
break;
}
if (xmlLittleEndian) {
@@ -744,38 +744,39 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
{
unsigned char* outstart = out;
const unsigned char* processed = inb;
- unsigned char* outend = out + *outlen;
+ unsigned char* outend;
unsigned short* in = (unsigned short*) inb;
unsigned short* inend;
unsigned int c, d, inlen;
unsigned char *tmp;
int bits;
+ if (*outlen == 0) {
+ *inlenb = 0;
+ return(0);
+ }
+ outend = out + *outlen;
if ((*inlenb % 2) == 1)
(*inlenb)--;
inlen = *inlenb / 2;
inend= in + inlen;
- while (in < inend) {
+ while ((in < inend) && (out - outstart + 5 < *outlen)) {
if (xmlLittleEndian) {
tmp = (unsigned char *) in;
c = *tmp++;
- c = c << 8;
- c = c | (unsigned int) *tmp;
+ c = (c << 8) | (unsigned int) *tmp;
in++;
} else {
c= *in++;
}
if ((c & 0xFC00) == 0xD800) { /* surrogates */
- if (in >= inend) { /* (in > inend) shouldn't happens */
- *outlen = out - outstart;
- *inlenb = processed - inb;
- return(-2);
+ if (in >= inend) { /* handle split mutli-byte characters */
+ break;
}
if (xmlLittleEndian) {
tmp = (unsigned char *) in;
d = *tmp++;
- d = d << 8;
- d = d | (unsigned int) *tmp;
+ d = (d << 8) | (unsigned int) *tmp;
in++;
} else {
d= *in++;