summaryrefslogtreecommitdiff
path: root/base/gp_utf8.c
diff options
context:
space:
mode:
authorRobin Watts <Robin.Watts@artifex.com>2022-11-21 16:35:17 +0000
committerRobin Watts <Robin.Watts@artifex.com>2022-11-21 18:58:51 +0000
commite15f8ff9db30c4e780973cef79e162baed6047c6 (patch)
treed73af5935cceb56721479331fdd2a901b7b083c7 /base/gp_utf8.c
parent31f96d10d419ca83c6d9bcafb2c635df35bb537c (diff)
downloadghostpdl-e15f8ff9db30c4e780973cef79e162baed6047c6.tar.gz
Move gp_wutf8 functions to be gp_utf8.
Promote previously windows specific functions to be generically available (and give them names that reflect that).
Diffstat (limited to 'base/gp_utf8.c')
-rw-r--r--base/gp_utf8.c199
1 files changed, 199 insertions, 0 deletions
diff --git a/base/gp_utf8.c b/base/gp_utf8.c
new file mode 100644
index 000000000..31a6fba6b
--- /dev/null
+++ b/base/gp_utf8.c
@@ -0,0 +1,199 @@
+/* Copyright (C) 2001-2022 Artifex Software, Inc.
+ All Rights Reserved.
+
+ This software is provided AS-IS with no warranty, either express or
+ implied.
+
+ This software is distributed under license and may not be copied,
+ modified or distributed except as expressly authorized under the terms
+ of the license contained in the file LICENSE in this distribution.
+
+ Refer to licensing information at http://www.artifex.com or contact
+ Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato,
+ CA 94945, U.S.A., +1(415)492-9861, for further information.
+*/
+
+
+#include "gp_utf8.h"
+
+static int
+decode_utf8(const char **inp, unsigned int i)
+{
+ const char *in = *inp;
+ unsigned char c;
+
+ if (i < 0x80) {
+ } else if ((i & 0xE0) == 0xC0) {
+ i &= 0x1F;
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ } else if ((i & 0xF0) == 0xE0) {
+ i &= 0xF;
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ } else if ((i & 0xF8) == 0xF0) {
+ i &= 0x7;
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ c = (unsigned char)*in++;
+ if ((c & 0xC0) != 0x80)
+ goto fail;
+ i = (i<<6) | (c & 0x3f);
+ }
+ if (0)
+ {
+ /* If we fail, unread the last one, and return the unicode replacement char. */
+fail:
+ in--;
+ i = 0xfffd;
+ }
+ *inp = in;
+
+ return i;
+}
+
+int gp_utf8_to_uint16(unsigned short *out, const char *in)
+{
+ unsigned int i;
+ unsigned int len = 1;
+ unsigned char c;
+
+ if (out) {
+ while (i = *(unsigned char *)in++) {
+ /* Decode UTF-8 */
+ i = decode_utf8(&in, i);
+
+ /* Encode, allowing for surrogates. */
+ if (i >= 0x10000 && i <= 0x10ffff)
+ {
+ i -= 0x10000;
+ *out++ = 0xd800 + (i>>10);
+ *out++ = 0xdc00 + (i & 0x3ff);
+ len++;
+ }
+ else if (i > 0x10000)
+ {
+ return -1;
+ }
+ else
+ *out++ = (unsigned short)i;
+ len++;
+ }
+ *out = 0;
+ } else {
+ while (i = *(unsigned char *)in++) {
+ /* Decode UTF-8 */
+ i = decode_utf8(&in, i);
+
+ /* Encode, allowing for surrogates. */
+ if (i >= 0x10000 && i <= 0x10ffff)
+ len++;
+ else if (i > 0x10000)
+ return -1;
+ len++;
+ }
+ }
+ return len;
+}
+
+int gp_uint16_to_utf8(char *out, const unsigned short *in)
+{
+ unsigned int i;
+ unsigned int len = 1;
+ int hi = -1;
+
+ if (out) {
+ while (i = (unsigned int)*in++) {
+ /* Decode surrogates */
+ if (i >= 0xD800 && i <= 0xDBFF)
+ {
+ /* High surrogate. Must be followed by a low surrogate, or this is a failure. */
+ int hi = i & 0x3ff;
+ int j = (unsigned int)*in++;
+ if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF))
+ {
+ /* Failure! Unicode replacement char! */
+ in--;
+ i = 0xfffd;
+ } else {
+ /* Decode surrogates */
+ i = 0x10000 + (hi<<10) + (j & 0x3ff);
+ }
+ } else if (i >= 0xDC00 && i <= 0xDFFF)
+ {
+ /* Lone low surrogate. Failure. Unicode replacement char. */
+ i = 0xfffd;
+ }
+
+ /* Encode output */
+ if (i < 0x80) {
+ *out++ = (char)i;
+ len++;
+ } else if (i < 0x800) {
+ *out++ = 0xC0 | ( i>> 6 );
+ *out++ = 0x80 | ( i & 0x3F);
+ len+=2;
+ } else if (i < 0x10000) {
+ *out++ = 0xE0 | ( i>>12 );
+ *out++ = 0x80 | ((i>> 6) & 0x3F);
+ *out++ = 0x80 | ( i & 0x3F);
+ len+=3;
+ } else {
+ *out++ = 0xF0 | ( i>>18 );
+ *out++ = 0x80 | ((i>>12) & 0x3F);
+ *out++ = 0x80 | ((i>> 6) & 0x3F);
+ *out++ = 0x80 | ( i & 0x3F);
+ len+=4;
+ }
+ }
+ *out = 0;
+ } else {
+ while (i = (unsigned int)*in++) {
+ /* Decode surrogates */
+ if (i >= 0xD800 && i <= 0xDBFF)
+ {
+ /* High surrogate. Must be followed by a low surrogate, or this is a failure. */
+ int hi = i & 0x3ff;
+ int j = (unsigned int)*in++;
+ if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF))
+ {
+ /* Failure! Unicode replacement char! */
+ in--;
+ i = 0xfffd;
+ } else {
+ /* Decode surrogates */
+ i = 0x10000 + (hi<<10) + (j & 0x3ff);
+ }
+ } else if (i >= 0xDC00 && i <= 0xDFFF)
+ {
+ /* Lone low surrogate. Failure. Unicode replacement char. */
+ i = 0xfffd;
+ }
+
+ if (i < 0x80) {
+ len++;
+ } else if (i < 0x800) {
+ len += 2;
+ } else if (i < 0x10000) {
+ len += 3;
+ } else {
+ len += 4;
+ }
+ }
+ }
+ return len;
+}