summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKen Sharp <ken.sharp@artifex.com>2023-04-10 15:50:54 +0100
committerKen Sharp <ken.sharp@artifex.com>2023-04-10 15:50:54 +0100
commit91943811904f562b101b0ac410da60974b4186f2 (patch)
treef2335bc513f33247767bd54f5eefbfb36568b7f3
parent7fc1b376bba20c5a207f07c0222827705a2c8fa5 (diff)
downloadghostpdl-91943811904f562b101b0ac410da60974b4186f2.tar.gz
pdfwrite - support surrogate pairs in XMP UTF-8 strings
Bug #706551 "ps2pdf corrupts Unicode title in PDF 1.4 XML metadata" This is not in fact part of the original report and should have been reported separately. Still.... The bug arose from my removing a partial implementation of surrogate pairs when we started using Coverity, because Coverity complained about the code and it was simpler to remove it. I clearly forgot to go back and finish it. This just adds code to deal with the (documented as unusual) case of UTF-16 surrogate pairs. Seems to work with all the tests I can concoct.
-rw-r--r--devices/vector/gdevpdfe.c41
1 files changed, 31 insertions, 10 deletions
diff --git a/devices/vector/gdevpdfe.c b/devices/vector/gdevpdfe.c
index db0c0b883..35ebdee01 100644
--- a/devices/vector/gdevpdfe.c
+++ b/devices/vector/gdevpdfe.c
@@ -359,6 +359,7 @@ static const char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
static int gs_ConvertUTF16(unsigned char *UTF16, size_t UTF16Len, unsigned char **UTF8Start, int UTF8Len)
{
size_t i, bytes = 0;
+ uint32_t U32 = 0;
unsigned short U16;
unsigned char *UTF8 = *UTF8Start;
unsigned char *UTF8End = UTF8 + UTF8Len;
@@ -372,21 +373,38 @@ static int gs_ConvertUTF16(unsigned char *UTF16, size_t UTF16Len, unsigned char
U16 += *UTF16++;
if (U16 >= 0xD800 && U16 <= 0xDBFF) {
- return gs_note_error(gs_error_rangecheck);
- }
- if (U16 >= 0xDC00 && U16 <= 0xDFFF) {
- return gs_note_error(gs_error_rangecheck);
- }
+ /* Ensure at least two bytes of input left */
+ if (i == (UTF16Len / sizeof(short)) - 1)
+ return gs_note_error(gs_error_rangecheck);
+
+ U32 += (U16 & 0x3FF) << 10;
+ U16 = (*(UTF16++) << 8);
+ U16 += *(UTF16++);
+ i++;
- if(U16 < 0x80) {
- bytes = 1;
+ /* Ensure a high order surrogate is followed by a low order surrogate */
+ if (U16 < 0xDC00 || U16 > 0xDFFF)
+ return gs_note_error(gs_error_rangecheck);
+
+ U32 += (U16 & 0x3FF) | 0x10000;
+ bytes = 4;
} else {
- if (U16 < 0x800) {
- bytes = 2;
+ if (U16 >= 0xDC00 && U16 <= 0xDFFF) {
+ /* We got a low order surrogate without a preceding high-order */
+ return gs_note_error(gs_error_rangecheck);
+ }
+
+ if(U16 < 0x80) {
+ bytes = 1;
} else {
- bytes = 3;
+ if (U16 < 0x800) {
+ bytes = 2;
+ } else {
+ bytes = 3;
+ }
}
}
+
if (UTF8 + bytes > UTF8End)
return gs_note_error(gs_error_VMerror);
@@ -394,6 +412,9 @@ static int gs_ConvertUTF16(unsigned char *UTF16, size_t UTF16Len, unsigned char
UTF8 += bytes;
switch(bytes) {
+ case 4:
+ *--UTF8 = (unsigned char)((U32 | 0x80) & 0xBF);
+ U16 = U32 >> 6;
case 3:
*--UTF8 = (unsigned char)((U16 | 0x80) & 0xBF);
U16 >>= 6;