diff options
author | Robin Watts <Robin.Watts@artifex.com> | 2022-11-21 16:35:17 +0000 |
---|---|---|
committer | Robin Watts <Robin.Watts@artifex.com> | 2022-11-21 18:58:51 +0000 |
commit | e15f8ff9db30c4e780973cef79e162baed6047c6 (patch) | |
tree | d73af5935cceb56721479331fdd2a901b7b083c7 /base/gp_utf8.c | |
parent | 31f96d10d419ca83c6d9bcafb2c635df35bb537c (diff) | |
download | ghostpdl-e15f8ff9db30c4e780973cef79e162baed6047c6.tar.gz |
Move gp_wutf8 functions to be gp_utf8.
Promote previously windows specific functions to be
generically available (and give them names that
reflect that).
Diffstat (limited to 'base/gp_utf8.c')
-rw-r--r-- | base/gp_utf8.c | 199 |
1 files changed, 199 insertions, 0 deletions
diff --git a/base/gp_utf8.c b/base/gp_utf8.c new file mode 100644 index 000000000..31a6fba6b --- /dev/null +++ b/base/gp_utf8.c @@ -0,0 +1,199 @@ +/* Copyright (C) 2001-2022 Artifex Software, Inc. + All Rights Reserved. + + This software is provided AS-IS with no warranty, either express or + implied. + + This software is distributed under license and may not be copied, + modified or distributed except as expressly authorized under the terms + of the license contained in the file LICENSE in this distribution. + + Refer to licensing information at http://www.artifex.com or contact + Artifex Software, Inc., 1305 Grant Avenue - Suite 200, Novato, + CA 94945, U.S.A., +1(415)492-9861, for further information. +*/ + + +#include "gp_utf8.h" + +static int +decode_utf8(const char **inp, unsigned int i) +{ + const char *in = *inp; + unsigned char c; + + if (i < 0x80) { + } else if ((i & 0xE0) == 0xC0) { + i &= 0x1F; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } else if ((i & 0xF0) == 0xE0) { + i &= 0xF; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } else if ((i & 0xF8) == 0xF0) { + i &= 0x7; + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + c = (unsigned char)*in++; + if ((c & 0xC0) != 0x80) + goto fail; + i = (i<<6) | (c & 0x3f); + } + if (0) + { + /* If we fail, unread the last one, and return the unicode replacement char. */ +fail: + in--; + i = 0xfffd; + } + *inp = in; + + return i; +} + +int gp_utf8_to_uint16(unsigned short *out, const char *in) +{ + unsigned int i; + unsigned int len = 1; + unsigned char c; + + if (out) { + while (i = *(unsigned char *)in++) { + /* Decode UTF-8 */ + i = decode_utf8(&in, i); + + /* Encode, allowing for surrogates. */ + if (i >= 0x10000 && i <= 0x10ffff) + { + i -= 0x10000; + *out++ = 0xd800 + (i>>10); + *out++ = 0xdc00 + (i & 0x3ff); + len++; + } + else if (i > 0x10000) + { + return -1; + } + else + *out++ = (unsigned short)i; + len++; + } + *out = 0; + } else { + while (i = *(unsigned char *)in++) { + /* Decode UTF-8 */ + i = decode_utf8(&in, i); + + /* Encode, allowing for surrogates. */ + if (i >= 0x10000 && i <= 0x10ffff) + len++; + else if (i > 0x10000) + return -1; + len++; + } + } + return len; +} + +int gp_uint16_to_utf8(char *out, const unsigned short *in) +{ + unsigned int i; + unsigned int len = 1; + int hi = -1; + + if (out) { + while (i = (unsigned int)*in++) { + /* Decode surrogates */ + if (i >= 0xD800 && i <= 0xDBFF) + { + /* High surrogate. Must be followed by a low surrogate, or this is a failure. */ + int hi = i & 0x3ff; + int j = (unsigned int)*in++; + if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF)) + { + /* Failure! Unicode replacement char! */ + in--; + i = 0xfffd; + } else { + /* Decode surrogates */ + i = 0x10000 + (hi<<10) + (j & 0x3ff); + } + } else if (i >= 0xDC00 && i <= 0xDFFF) + { + /* Lone low surrogate. Failure. Unicode replacement char. */ + i = 0xfffd; + } + + /* Encode output */ + if (i < 0x80) { + *out++ = (char)i; + len++; + } else if (i < 0x800) { + *out++ = 0xC0 | ( i>> 6 ); + *out++ = 0x80 | ( i & 0x3F); + len+=2; + } else if (i < 0x10000) { + *out++ = 0xE0 | ( i>>12 ); + *out++ = 0x80 | ((i>> 6) & 0x3F); + *out++ = 0x80 | ( i & 0x3F); + len+=3; + } else { + *out++ = 0xF0 | ( i>>18 ); + *out++ = 0x80 | ((i>>12) & 0x3F); + *out++ = 0x80 | ((i>> 6) & 0x3F); + *out++ = 0x80 | ( i & 0x3F); + len+=4; + } + } + *out = 0; + } else { + while (i = (unsigned int)*in++) { + /* Decode surrogates */ + if (i >= 0xD800 && i <= 0xDBFF) + { + /* High surrogate. Must be followed by a low surrogate, or this is a failure. */ + int hi = i & 0x3ff; + int j = (unsigned int)*in++; + if (j == 0 || (j <= 0xDC00 || j >= 0xDFFF)) + { + /* Failure! Unicode replacement char! */ + in--; + i = 0xfffd; + } else { + /* Decode surrogates */ + i = 0x10000 + (hi<<10) + (j & 0x3ff); + } + } else if (i >= 0xDC00 && i <= 0xDFFF) + { + /* Lone low surrogate. Failure. Unicode replacement char. */ + i = 0xfffd; + } + + if (i < 0x80) { + len++; + } else if (i < 0x800) { + len += 2; + } else if (i < 0x10000) { + len += 3; + } else { + len += 4; + } + } + } + return len; +} |