summary refs log tree commit diff
path: root/lib/striconveh.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/striconveh.c')
-rw-r--r-- lib/striconveh.c 106
1 files changed, 88 insertions, 18 deletions
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 78a5e0296..5b60a7e0a 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -1,5 +1,5 @@
/* Character set conversion with error handling.
- Copyright (C) 2001-2021 Free Software Foundation, Inc.
+ Copyright (C) 2001-2022 Free Software Foundation, Inc.
Written by Bruno Haible and Simon Josefsson.
This file is free software: you can redistribute it and/or modify
@@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
if (cd2 == (iconv_t)(-1))
{
/* TO_CODESET is UTF-8. */
- /* Error handling can produce up to 1 byte of output. */
- if (length + 1 + extra_alloc > allocated)
+ /* Error handling can produce up to 1 or 3 bytes of
+ output. */
+ size_t extra_need =
+ (handler == iconveh_replacement_character ? 3 : 1);
+ if (length + extra_need + extra_alloc > allocated)
{
char *memory;
allocated = 2 * allocated;
- if (length + 1 + extra_alloc > allocated)
+ if (length + extra_need + extra_alloc > allocated)
+ allocated = 2 * allocated;
+ if (length + extra_need + extra_alloc > allocated)
abort ();
if (result == initial_result)
memory = (char *) malloc (allocated);
@@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
grow = false;
}
/* The input is invalid in FROM_CODESET. Eat up one byte
- and emit a question mark. */
+ and emit a replacement character or a question mark. */
if (!incremented)
{
if (insize == 0)
@@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
inptr++;
insize--;
}
- result[length] = '?';
- length++;
+ if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ result[length+0] = '\357';
+ result[length+1] = '\277';
+ result[length+2] = '\275';
+ length += 3;
+ }
+ else
+ {
+ result[length] = '?';
+ length++;
+ }
}
else
goto indirectly;
@@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
{
const bool slowly = (offsets != NULL || handler == iconveh_error);
# define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
- char utf8buf[utf8bufsize + 1];
+ char utf8buf[utf8bufsize + 3];
size_t utf8len = 0;
const char *in1ptr = src;
size_t in1size = srclen;
@@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
&& errno == EILSEQ && handler != iconveh_error)
{
/* The input is invalid in FROM_CODESET. Eat up one byte and
- emit a question mark. Room for the question mark was allocated
- at the end of utf8buf. */
+ emit a U+FFFD character or a question mark. Room for this
+ character was allocated at the end of utf8buf. */
if (!incremented1)
{
if (in1size == 0)
@@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
in1ptr++;
in1size--;
}
- *out1ptr++ = '?';
+ if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ out1ptr[0] = '\357';
+ out1ptr[1] = '\277';
+ out1ptr[2] = '\275';
+ out1ptr += 3;
+ }
+ else
+ *out1ptr++ = '?';
res1 = 0;
}
errno1 = errno;
@@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
break;
else if (errno == EILSEQ && handler != iconveh_error)
{
- /* Error handling can produce up to 10 bytes of ASCII
+ /* Error handling can produce up to 10 bytes of UTF-8
output. But TO_CODESET may be UCS-2, UTF-16 or
UCS-4, so use CD2 here as well. */
char scratchbuf[10];
@@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
scratchbuf[scratchlen++] = hex[uc & 15];
}
+ else if (handler == iconveh_replacement_character)
+ {
+ /* U+FFFD in UTF-8 encoding. */
+ scratchbuf[0] = '\357';
+ scratchbuf[1] = '\277';
+ scratchbuf[2] = '\275';
+ scratchlen = 3;
+ }
else
{
scratchbuf[0] = '?';
@@ -813,9 +846,45 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
inptr = scratchbuf;
insize = scratchlen;
if (cd2 != (iconv_t)(-1))
- res = iconv (cd2,
- (ICONV_CONST char **) &inptr, &insize,
- &out2ptr, &out2size);
+ {
+ char *out2ptr_try = out2ptr;
+ size_t out2size_try = out2size;
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr, &insize,
+ &out2ptr_try, &out2size_try);
+ if (handler == iconveh_replacement_character
+ && (res == (size_t)(-1)
+ ? errno == EILSEQ
+ /* FreeBSD iconv(), NetBSD iconv(), and
+ Solaris 11 iconv() insert a '?' if they
+ cannot convert. This is what we want.
+ But IRIX iconv() inserts a NUL byte if it
+ cannot convert.
+ And musl libc iconv() inserts a '*' if it
+ cannot convert. */
+ : (res > 0
+ && !(out2ptr_try - out2ptr == 1
+ && *out2ptr == '?'))))
+ {
+ /* The iconv() call failed.
+ U+FFFD can't be converted to TO_CODESET.
+ Use '?' instead. */
+ scratchbuf[0] = '?';
+ scratchlen = 1;
+ inptr = scratchbuf;
+ insize = scratchlen;
+ res = iconv (cd2,
+ (ICONV_CONST char **) &inptr, &insize,
+ &out2ptr, &out2size);
+ }
+ else
+ {
+ /* Accept the results of the iconv() call. */
+ out2ptr = out2ptr_try;
+ out2size = out2size_try;
+ res = 0;
+ }
+ }
else
{
/* TO_CODESET is UTF-8. */
@@ -880,9 +949,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
length = out2ptr - result;
}
# if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
- /* Irix iconv() inserts a NUL byte if it cannot convert.
- NetBSD iconv() inserts a question mark if it cannot
- convert.
+ /* IRIX iconv() inserts a NUL byte if it cannot convert.
+ FreeBSD iconv(), NetBSD iconv(), and Solaris 11
+ iconv() insert a '?' if they cannot convert.
+ musl libc iconv() inserts a '*' if it cannot convert.
Only GNU libiconv and GNU libc are known to prefer
to fail rather than doing a lossy conversion. */
if (res != (size_t)(-1) && res > 0)