From 0af357c7e59baa371a56467395ae1bf64c8c2b9c Mon Sep 17 00:00:00 2001 From: Robin Watts Date: Tue, 4 Oct 2022 17:36:56 +0100 Subject: Bug 705911: Fix Ghostscript's encoding/decoding of UTF-8 from UTF-16. We were not coping with high/low surrogate pairs in UTF-16, meaning that we could encode/decode strings fine for our own purposes, but when we passed them off to other users (such as SmartOffice), it would fail to understand our utf-8 encoded surrogate pairs. Thanks to Pete, Joseph and Fred for their help here, and to Silver for having spotted it! --- base/gp_wutf8.c | 162 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 35 deletions(-) diff --git a/base/gp_wutf8.c b/base/gp_wutf8.c index b7b1d0758..920114cd1 100644 --- a/base/gp_wutf8.c +++ b/base/gp_wutf8.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2001-2021 Artifex Software, Inc. +/* Copyright (C) 2001-2022 Artifex Software, Inc. All Rights Reserved. This software is provided AS-IS with no warranty, either express or @@ -16,6 +16,56 @@ #include "windows_.h" +static int +decode_utf8(const char **inp, unsigned int i) +{ + const char *in = *inp; + unsigned char c; + + if (i < 0x80) { + } else if ((i & 0xE0) == 0xC0) { + i &= 0x1F; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } else if ((i & 0xF0) == 0xE0) { + i &= 0xF; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } else if ((i & 0xF8) == 0xF0) { + i &= 0x7; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } + if (0) + { + /* If we fail, unread the last one, and return the unicode replacement char. */ +fail: + in--; + i = 0xfffd; + } + *inp = in; + + return i; +} + int utf8_to_wchar(wchar_t *out, const char *in) { unsigned int i; @@ -24,47 +74,37 @@ int utf8_to_wchar(wchar_t *out, const char *in) if (out) { while (i = *(unsigned char *)in++) { - if (i < 0x80) { - *out++ = (wchar_t)i; - len++; - } else if ((i & 0xE0) == 0xC0) { - i &= 0x1F; - c = (unsigned char)*in++; - if ((c & 0xC0) != 0x80) - return -1; - i = (i<<6) | (c & 0x3f); - *out++ = (wchar_t)i; - len++; - } else if ((i & 0xF0) == 0xE0) { - i &= 0xF; - c = (unsigned char)*in++; - if ((c & 0xC0) != 0x80) - return -1; - i = (i<<6) | (c & 0x3f); - c = (unsigned char)*in++; - if ((c & 0xC0) != 0x80) - return -1; - i = (i<<6) | (c & 0x3f); - *out++ = (wchar_t)i; + /* Decode UTF-8 */ + i = decode_utf8(&in, i); + + /* Encode, allowing for surrogates. */ + if (i >= 0x10000 && i <= 0x10ffff) + { + i -= 0x10000; + *out++ = 0xd800 + (i>>10); + *out++ = 0xdc00 + (i & 0x3ff); len++; - } else { + } + else if (i > 0x10000) + { return -1; } + else + *out++ = (wchar_t)i; + len++; } *out = 0; } else { while (i = *(unsigned char *)in++) { - if (i < 0x80) { - len++; - } else if ((i & 0xE0) == 0xC0) { - in++; - len++; - } else if ((i & 0xF0) == 0xE0) { - in+=2; + /* Decode UTF-8 */ + i = decode_utf8(&in, i); + + /* Encode, allowing for surrogates. */ + if (i >= 0x10000 && i <= 0x10ffff) len++; - } else { + else if (i > 0x10000) return -1; - } + len++; } } return len; @@ -74,9 +114,32 @@ int wchar_to_utf8(char *out, const wchar_t *in) { unsigned int i; unsigned int len = 1; + int hi = -1; if (out) { while (i = (unsigned int)*in++) { + /* Decode surrogates */ + if (i >= 0xD800 && i <= 0xDBFF) + { + /* High surrogate. Must be followed by a low surrogate, or this is a failure. */ + int hi = i & 0x3ff; + int j = (unsigned int)*in++; + if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF)) + { + /* Failure! Unicode replacement char! */ + in--; + i = 0xfffd; + } else { + /* Decode surrogates */ + i = 0x10000 + (hi<<10) + (j & 0x3ff); + } + } else if (i >= 0xDC00 && i <= 0xDFFF) + { + /* Lone low surrogate. Failure. Unicode replacement char. */ + i = 0xfffd; + } + + /* Encode output */ if (i < 0x80) { *out++ = (char)i; len++; @@ -84,22 +147,51 @@ int wchar_to_utf8(char *out, const wchar_t *in) *out++ = 0xC0 | ( i>> 6 ); *out++ = 0x80 | ( i & 0x3F); len+=2; - } else /* if (i < 0x10000) */ { + } else if (i < 0x10000) { *out++ = 0xE0 | ( i>>12 ); *out++ = 0x80 | ((i>> 6) & 0x3F); *out++ = 0x80 | ( i & 0x3F); len+=3; + } else { + *out++ = 0xF0 | ( i>>18 ); + *out++ = 0x80 | ((i>>12) & 0x3F); + *out++ = 0x80 | ((i>> 6) & 0x3F); + *out++ = 0x80 | ( i & 0x3F); + len+=4; } } *out = 0; } else { while (i = (unsigned int)*in++) { + /* Decode surrogates */ + if (i >= 0xD800 && i <= 0xDBFF) + { + /* High surrogate. Must be followed by a low surrogate, or this is a failure. */ + int hi = i & 0x3ff; + int j = (unsigned int)*in++; + if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF)) + { + /* Failure! Unicode replacement char! */ + in--; + i = 0xfffd; + } else { + /* Decode surrogates */ + i = 0x10000 + (hi<<10) + (j & 0x3ff); + } + } else if (i >= 0xDC00 && i <= 0xDFFF) + { + /* Lone low surrogate. Failure. Unicode replacement char. */ + i = 0xfffd; + } + if (i < 0x80) { len++; } else if (i < 0x800) { len += 2; - } else /* if (i < 0x10000) */ { + } else if (i < 0x10000) { len += 3; + } else { + len += 4; } } } -- cgit v1.2.1