1 files changed, 88 insertions, 18 deletions
diff --git a/lib/striconveh.c b/lib/striconveh.c
index 78a5e0296..5b60a7e0a 100644
--- a/lib/striconveh.c
+++ b/lib/striconveh.c
@@ -1,5 +1,5 @@
 /* Character set conversion with error handling.
-   Copyright (C) 2001-2021 Free Software Foundation, Inc.
+   Copyright (C) 2001-2022 Free Software Foundation, Inc.
    Written by Bruno Haible and Simon Josefsson.
 
    This file is free software: you can redistribute it and/or modify
@@ -457,13 +457,18 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                 if (cd2 == (iconv_t)(-1))
                   {
                     /* TO_CODESET is UTF-8.  */
-                    /* Error handling can produce up to 1 byte of output.  */
-                    if (length + 1 + extra_alloc > allocated)
+                    /* Error handling can produce up to 1 or 3 bytes of
+                       output.  */
+                    size_t extra_need =
+                      (handler == iconveh_replacement_character ? 3 : 1);
+                    if (length + extra_need + extra_alloc > allocated)
                       {
                         char *memory;
 
                         allocated = 2 * allocated;
-                        if (length + 1 + extra_alloc > allocated)
+                        if (length + extra_need + extra_alloc > allocated)
+                          allocated = 2 * allocated;
+                        if (length + extra_need + extra_alloc > allocated)
                           abort ();
                         if (result == initial_result)
                           memory = (char *) malloc (allocated);
@@ -482,7 +487,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         grow = false;
                       }
                     /* The input is invalid in FROM_CODESET.  Eat up one byte
-                       and emit a question mark.  */
+                       and emit a replacement character or a question mark.  */
                     if (!incremented)
                       {
                         if (insize == 0)
@@ -490,8 +495,19 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         inptr++;
                         insize--;
                       }
-                    result[length] = '?';
-                    length++;
+                    if (handler == iconveh_replacement_character)
+                      {
+                        /* U+FFFD in UTF-8 encoding.  */
+                        result[length+0] = '\357';
+                        result[length+1] = '\277';
+                        result[length+2] = '\275';
+                        length += 3;
+                      }
+                    else
+                      {
+                        result[length] = '?';
+                        length++;
+                      }
                   }
                 else
                   goto indirectly;
@@ -594,7 +610,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
   {
     const bool slowly = (offsets != NULL || handler == iconveh_error);
 # define utf8bufsize 4096 /* may also be smaller or larger than tmpbufsize */
-    char utf8buf[utf8bufsize + 1];
+    char utf8buf[utf8bufsize + 3];
     size_t utf8len = 0;
     const char *in1ptr = src;
     size_t in1size = srclen;
@@ -682,8 +698,8 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
             && errno == EILSEQ && handler != iconveh_error)
           {
             /* The input is invalid in FROM_CODESET.  Eat up one byte and
-               emit a question mark.  Room for the question mark was allocated
-               at the end of utf8buf.  */
+               emit a U+FFFD character or a question mark.  Room for this
+               character was allocated at the end of utf8buf.  */
             if (!incremented1)
               {
                 if (in1size == 0)
@@ -691,7 +707,16 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                 in1ptr++;
                 in1size--;
               }
-            *out1ptr++ = '?';
+            if (handler == iconveh_replacement_character)
+              {
+                /* U+FFFD in UTF-8 encoding.  */
+                out1ptr[0] = '\357';
+                out1ptr[1] = '\277';
+                out1ptr[2] = '\275';
+                out1ptr += 3;
+              }
+            else
+              *out1ptr++ = '?';
             res1 = 0;
           }
         errno1 = errno;
@@ -756,7 +781,7 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                       break;
                     else if (errno == EILSEQ && handler != iconveh_error)
                       {
-                        /* Error handling can produce up to 10 bytes of ASCII
+                        /* Error handling can produce up to 10 bytes of UTF-8
                            output.  But TO_CODESET may be UCS-2, UTF-16 or
                            UCS-4, so use CD2 here as well.  */
                         char scratchbuf[10];
@@ -804,6 +829,14 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                             scratchbuf[scratchlen++] = hex[(uc>>4) & 15];
                             scratchbuf[scratchlen++] = hex[uc & 15];
                           }
+                        else if (handler == iconveh_replacement_character)
+                          {
+                            /* U+FFFD in UTF-8 encoding.  */
+                            scratchbuf[0] = '\357';
+                            scratchbuf[1] = '\277';
+                            scratchbuf[2] = '\275';
+                            scratchlen = 3;
+                          }
                         else
                           {
                             scratchbuf[0] = '?';
@@ -813,9 +846,45 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                         inptr = scratchbuf;
                         insize = scratchlen;
                         if (cd2 != (iconv_t)(-1))
-                          res = iconv (cd2,
-                                       (ICONV_CONST char **) &inptr, &insize,
-                                       &out2ptr, &out2size);
+                          {
+                            char *out2ptr_try = out2ptr;
+                            size_t out2size_try = out2size;
+                            res = iconv (cd2,
+                                         (ICONV_CONST char **) &inptr, &insize,
+                                         &out2ptr_try, &out2size_try);
+                            if (handler == iconveh_replacement_character
+                                && (res == (size_t)(-1)
+                                    ? errno == EILSEQ
+                                    /* FreeBSD iconv(), NetBSD iconv(), and
+                                       Solaris 11 iconv() insert a '?' if they
+                                       cannot convert.  This is what we want.
+                                       But IRIX iconv() inserts a NUL byte if it
+                                       cannot convert.
+                                       And musl libc iconv() inserts a '*' if it
+                                       cannot convert.  */
+                                    : (res > 0
+                                       && !(out2ptr_try - out2ptr == 1
+                                            && *out2ptr == '?'))))
+                              {
+                                /* The iconv() call failed.
+                                   U+FFFD can't be converted to TO_CODESET.
+                                   Use '?' instead.  */
+                                scratchbuf[0] = '?';
+                                scratchlen = 1;
+                                inptr = scratchbuf;
+                                insize = scratchlen;
+                                res = iconv (cd2,
+                                             (ICONV_CONST char **) &inptr, &insize,
+                                             &out2ptr, &out2size);
+                              }
+                            else
+                              {
+                                /* Accept the results of the iconv() call.  */
+                                out2ptr = out2ptr_try;
+                                out2size = out2size_try;
+                                res = 0;
+                              }
+                          }
                         else
                           {
                             /* TO_CODESET is UTF-8.  */
@@ -880,9 +949,10 @@ mem_cd_iconveh_internal (const char *src, size_t srclen,
                             length = out2ptr - result;
                           }
 # if !defined _LIBICONV_VERSION && !(defined __GLIBC__ && !defined __UCLIBC__)
-                        /* Irix iconv() inserts a NUL byte if it cannot convert.
-                           NetBSD iconv() inserts a question mark if it cannot
-                           convert.
+                        /* IRIX iconv() inserts a NUL byte if it cannot convert.
+                           FreeBSD iconv(), NetBSD iconv(), and Solaris 11
+                           iconv() insert a '?' if they cannot convert.
+                           musl libc iconv() inserts a '*' if it cannot convert.
                            Only GNU libiconv and GNU libc are known to prefer
                            to fail rather than doing a lossy conversion.  */
                         if (res != (size_t)(-1) && res > 0)