Move gp_wutf8 functions to be gp_utf8.

Promote previously Windows-specific functions to be generically available (and give them names that reflect that).
author: Robin Watts <Robin.Watts@artifex.com> 2022-11-21 16:35:17 +0000
committer: Robin Watts <Robin.Watts@artifex.com> 2022-11-21 18:58:51 +0000
commit: e15f8ff9db30c4e780973cef79e162baed6047c6 (patch)
tree: d73af5935cceb56721479331fdd2a901b7b083c7 /base/gp_utf8.c
parent: 31f96d10d419ca83c6d9bcafb2c635df35bb537c (diff)
download: ghostpdl-e15f8ff9db30c4e780973cef79e162baed6047c6.tar.gz
1 files changed, 199 insertions, 0 deletions
diff --git a/base/gp_utf8.c b/base/gp_utf8.c
new file mode 100644
index 000000000..31a6fba6b
--- /dev/null
+++ b/base/gp_utf8.c
@@ -0,0 +1,199 @@
+/* Copyright (C) 2001-2022 Artifex Software, Inc.
+   All Rights Reserved.
+
+   This software is provided AS-IS with no warranty, either express or
+   implied.
+
+   This software is distributed under license and may not be copied,
+   modified or distributed except as expressly authorized under the terms
+   of the license contained in the file LICENSE in this distribution.
+
+   Refer to licensing information at http://www.artifex.com or contact
+   Artifex Software, Inc.,  1305 Grant Avenue - Suite 200, Novato,
+   CA 94945, U.S.A., +1(415)492-9861, for further information.
+*/
+
+
+#include "gp_utf8.h"
+
+/*
+ * Decode the remainder of a UTF-8 sequence whose lead byte has already
+ * been read by the caller.
+ *
+ *   inp  In/out. On entry *inp points at the byte following the lead
+ *        byte; on return it points just past the last byte consumed.
+ *   i    The lead byte (0..0xFF).
+ *
+ * Returns the decoded value, or the Unicode replacement character
+ * (U+FFFD) if the sequence is ill-formed: an invalid lead byte, a missing
+ * or invalid continuation byte, or an overlong encoding.
+ *
+ * A byte that fails the continuation check is left unread, so a string
+ * terminator that cuts a sequence short is not skipped over.
+ *
+ * No upper bound is enforced here: a four-byte sequence can produce a
+ * value above 0x10FFFF, which is returned unchanged.
+ */
+static int
+decode_utf8(const char **inp, unsigned int i)
+{
+    const char *in = *inp;
+    unsigned int min;   /* Smallest value that legitimately needs this length. */
+    int trail;          /* Continuation bytes still expected. */
+
+    if (i < 0x80) {
+        /* Plain ASCII: nothing further to read. */
+        return (int)i;
+    } else if ((i & 0xE0) == 0xC0) {
+        i &= 0x1F;
+        trail = 1;
+        min = 0x80;
+    } else if ((i & 0xF0) == 0xE0) {
+        i &= 0xF;
+        trail = 2;
+        min = 0x800;
+    } else if ((i & 0xF8) == 0xF0) {
+        i &= 0x7;
+        trail = 3;
+        min = 0x10000;
+    } else {
+        /* A stray continuation byte (0x80..0xBF) or 0xF8..0xFF can never
+         * start a sequence. Consume nothing more. */
+        return 0xfffd;
+    }
+
+    while (trail-- > 0) {
+        unsigned char c = (unsigned char)*in;
+
+        if ((c & 0xC0) != 0x80) {
+            /* Not a continuation byte: leave it unread for the caller. */
+            *inp = in;
+            return 0xfffd;
+        }
+        in++;
+        i = (i << 6) | (c & 0x3f);
+    }
+    *inp = in;
+
+    /* Reject overlong forms (e.g. C0 80 for NUL); the whole sequence has
+     * been consumed, so it is replaced by a single U+FFFD. */
+    if (i < min)
+        return 0xfffd;
+
+    return (int)i;
+}
+
+/*
+ * Convert the zero-terminated UTF-8 string 'in' to UTF-16.
+ *
+ * If 'out' is non-NULL the 16-bit units are stored there, followed by a
+ * zero terminator. If 'out' is NULL nothing is written and the input is
+ * only measured.
+ *
+ * Returns the number of 16-bit units required, counting the terminator,
+ * or -1 if a decoded value lies above 0x10FFFF (in which case no
+ * terminator has been written to 'out').
+ */
+int gp_utf8_to_uint16(unsigned short *out, const char *in)
+{
+    unsigned int units = 1; /* Start at one to account for the terminator. */
+    unsigned int cp;
+
+    for (;;) {
+        cp = *(const unsigned char *)in++;
+        if (cp == 0)
+            break;
+
+        /* The lead byte is already consumed; decode_utf8 reads the rest. */
+        cp = decode_utf8(&in, cp);
+
+        if (cp > 0x10ffff)
+            return -1;
+
+        if (cp >= 0x10000) {
+            /* Beyond the BMP: takes a high/low surrogate pair. */
+            if (out) {
+                cp -= 0x10000;
+                *out++ = 0xd800 + (cp >> 10);
+                *out++ = 0xdc00 + (cp & 0x3ff);
+            }
+            units += 2;
+        } else {
+            if (out)
+                *out++ = (unsigned short)cp;
+            units++;
+        }
+    }
+
+    if (out)
+        *out = 0;
+    return units;
+}
+
+/*
+ * Convert the zero-terminated UTF-16 string 'in' to UTF-8.
+ *
+ * If 'out' is non-NULL the UTF-8 bytes are stored there, followed by a
+ * zero terminator. If 'out' is NULL nothing is written and the input is
+ * only measured.
+ *
+ * A high surrogate (0xD800..0xDBFF) followed by a low surrogate
+ * (0xDC00..0xDFFF) is combined into one code point. An unpaired surrogate
+ * of either kind is replaced by U+FFFD.
+ *
+ * Returns the number of bytes required, counting the terminator.
+ */
+int gp_uint16_to_utf8(char *out, const unsigned short *in)
+{
+    unsigned int i;
+    unsigned int len = 1; /* Start at one to account for the terminator. */
+
+    while ((i = (unsigned int)*in++) != 0) {
+        /* Decode surrogates */
+        if (i >= 0xD800 && i <= 0xDBFF) {
+            /* High surrogate. Peek at the next unit: it is consumed only
+             * if it is a low surrogate, so a terminator or an ordinary
+             * character following a lone high surrogate is not lost. */
+            unsigned int j = (unsigned int)*in;
+
+            if (j >= 0xDC00 && j <= 0xDFFF) {
+                in++;
+                i = 0x10000 + ((i & 0x3ff) << 10) + (j & 0x3ff);
+            } else {
+                /* Failure! Unicode replacement char! */
+                i = 0xfffd;
+            }
+        } else if (i >= 0xDC00 && i <= 0xDFFF) {
+            /* Lone low surrogate. Failure. Unicode replacement char. */
+            i = 0xfffd;
+        }
+
+        /* Encode output (or just count it when out is NULL). */
+        if (i < 0x80) {
+            if (out)
+                *out++ = (char)i;
+            len++;
+        } else if (i < 0x800) {
+            if (out) {
+                *out++ = (char)(0xC0 | ( i>> 6        ));
+                *out++ = (char)(0x80 | ( i      & 0x3F));
+            }
+            len += 2;
+        } else if (i < 0x10000) {
+            if (out) {
+                *out++ = (char)(0xE0 | ( i>>12        ));
+                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
+                *out++ = (char)(0x80 | ( i      & 0x3F));
+            }
+            len += 3;
+        } else {
+            if (out) {
+                *out++ = (char)(0xF0 | ( i>>18        ));
+                *out++ = (char)(0x80 | ((i>>12) & 0x3F));
+                *out++ = (char)(0x80 | ((i>> 6) & 0x3F));
+                *out++ = (char)(0x80 | ( i      & 0x3F));
+            }
+            len += 4;
+        }
+    }
+
+    if (out)
+        *out = 0;
+    return (int)len;
+}
author	Robin Watts <Robin.Watts@artifex.com>	2022-11-21 16:35:17 +0000
committer	Robin Watts <Robin.Watts@artifex.com>	2022-11-21 18:58:51 +0000
commit	e15f8ff9db30c4e780973cef79e162baed6047c6 (patch)
tree	d73af5935cceb56721479331fdd2a901b7b083c7 /base/gp_utf8.c
parent	31f96d10d419ca83c6d9bcafb2c635df35bb537c (diff)
download	ghostpdl-e15f8ff9db30c4e780973cef79e162baed6047c6.tar.gz