diff options
author | David Kilzer <ddkilzer@apple.com> | 2021-07-07 18:23:18 -0700 |
---|---|---|
committer | Nick Wellnhofer <wellnhofer@aevum.de> | 2022-01-16 14:07:17 +0100 |
commit | 03bb929390a5cac451f720ab581817684ecddb8e (patch) | |
tree | 6eb19deabe1a8c3c20ad1f0307cb6a37bcf087a0 /encoding.c | |
parent | e6adc19fff785120140a45e9e43e43d1599895a2 (diff) | |
download | libxml2-03bb929390a5cac451f720ab581817684ecddb8e.tar.gz |
Fix parse failure when 4-byte character in UTF-16 BE is split across a chunk
This makes the logic in UTF16BEToUTF8() match UTF16LEToUTF8().
* encoding.c:
(UTF16LEToUTF8):
- Fix comment to describe what the code does.
(UTF16BEToUTF8):
- Fix undefined behavior which was applied to UTF16LEToUTF8() in
2f9382033e.
- Add bounds check to while() loop which was applied to
UTF16LEToUTF8() in be803967db.
- Do not return -2 when (in >= inend) to fix the bug. This was
applied to UTF16LEToUTF8() in 496a1cf592.
- Inline (<< 8) statements to match UTF16LEToUTF8().
Add the following tests and results:
test/text-4-byte-UTF-16-BE-offset.xml
test/text-4-byte-UTF-16-BE.xml
test/text-4-byte-UTF-16-LE-offset.xml
test/text-4-byte-UTF-16-LE.xml
Diffstat (limited to 'encoding.c')
-rw-r--r-- | encoding.c | 23 |
1 files changed, 12 insertions, 11 deletions
@@ -527,7 +527,7 @@ UTF16LEToUTF8(unsigned char* out, int *outlen, in++; } if ((c & 0xFC00) == 0xD800) { /* surrogates */ - if (in >= inend) { /* (in > inend) shouldn't happens */ + if (in >= inend) { /* handle split mutli-byte characters */ break; } if (xmlLittleEndian) { @@ -744,38 +744,39 @@ UTF16BEToUTF8(unsigned char* out, int *outlen, { unsigned char* outstart = out; const unsigned char* processed = inb; - unsigned char* outend = out + *outlen; + unsigned char* outend; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen; unsigned char *tmp; int bits; + if (*outlen == 0) { + *inlenb = 0; + return(0); + } + outend = out + *outlen; if ((*inlenb % 2) == 1) (*inlenb)--; inlen = *inlenb / 2; inend= in + inlen; - while (in < inend) { + while ((in < inend) && (out - outstart + 5 < *outlen)) { if (xmlLittleEndian) { tmp = (unsigned char *) in; c = *tmp++; - c = c << 8; - c = c | (unsigned int) *tmp; + c = (c << 8) | (unsigned int) *tmp; in++; } else { c= *in++; } if ((c & 0xFC00) == 0xD800) { /* surrogates */ - if (in >= inend) { /* (in > inend) shouldn't happens */ - *outlen = out - outstart; - *inlenb = processed - inb; - return(-2); + if (in >= inend) { /* handle split mutli-byte characters */ + break; } if (xmlLittleEndian) { tmp = (unsigned char *) in; d = *tmp++; - d = d << 8; - d = d | (unsigned int) *tmp; + d = (d << 8) | (unsigned int) *tmp; in++; } else { d= *in++; |