diff options
-rw-r--r-- | libarchive/archive_string.c | 68 |
1 files changed, 17 insertions, 51 deletions
diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c index f6d1b893..dcffe6a0 100644 --- a/libarchive/archive_string.c +++ b/libarchive/archive_string.c @@ -131,12 +131,7 @@ struct archive_string_conv { #define UNICODE_MAX 0x10FFFF #define UNICODE_R_CHAR 0xFFFD /* Replacement character. */ /* Set U+FFFD(Replacement character) in UTF-8. */ -#define UTF8_SET_R_CHAR(outp) do { \ - (outp)[0] = 0xef; \ - (outp)[1] = 0xbf; \ - (outp)[2] = 0xbd; \ -} while (0) -#define UTF8_R_CHAR_SIZE 3 +const static char utf8_replacement_char[] = {0xef, 0xbf, 0xbd}; static struct archive_string_conv *find_sconv_object(struct archive *, const char *, const char *); @@ -2041,7 +2036,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p, if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { size_t rbytes; if (sc->flag & SCONV_TO_UTF8) - rbytes = UTF8_R_CHAR_SIZE; + rbytes = sizeof(utf8_replacement_char); else rbytes = 2; @@ -2057,7 +2052,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p, - as->length - to_size; } if (sc->flag & SCONV_TO_UTF8) - UTF8_SET_R_CHAR(outp); + memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char)); else if (sc->flag & SCONV_TO_UTF16BE) archive_be16enc(outp, UNICODE_R_CHAR); else @@ -2206,9 +2201,7 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p, size_t length, struct archive_string_conv *sc) { size_t remaining; - char *otp; const uint8_t *itp; - size_t avail; int return_value = 0; /* success */ /* @@ -2227,46 +2220,25 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p, * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, * a Replacement Character in Unicode. */ - if (archive_string_ensure(as, as->length + length + 1) == NULL) - return (-1); remaining = length; itp = (const uint8_t *)_p; - otp = as->s + as->length; - avail = as->buffer_length - as->length -1; while (*itp && remaining > 0) { - if (*itp > 127 && (sc->flag & SCONV_TO_UTF8)) { - if (avail < UTF8_R_CHAR_SIZE) { - as->length = otp - as->s; - if (NULL == archive_string_ensure(as, - as->buffer_length + remaining + - UTF8_R_CHAR_SIZE)) - return (-1); - otp = as->s + as->length; - avail = as->buffer_length - as->length -1; + if (*itp > 127) { + // Non-ASCII: Substitute with suitable replacement + if (sc->flag & SCONV_TO_UTF8) { + if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { + __archive_errx(1, "Out of memory"); + } + } else { + archive_strappend_char(as, '?'); } - /* - * When coping a string in UTF-8, unknown character - * should be U+FFFD (replacement character). - */ - UTF8_SET_R_CHAR(otp); - otp += UTF8_R_CHAR_SIZE; - avail -= UTF8_R_CHAR_SIZE; - itp++; - remaining--; - return_value = -1; - } else if (*itp > 127) { - *otp++ = '?'; - itp++; - remaining--; return_value = -1; } else { - *otp++ = (char)*itp++; - remaining--; + archive_strappend_char(as, *itp); } + ++itp; } - as->length = otp - as->s; - as->s[as->length] = '\0'; return (return_value); } @@ -2492,6 +2464,9 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc) { char *_p = p; + /* Invalid Unicode char maps to Replacement character */ + if (uc > UNICODE_MAX) + uc = UNICODE_R_CHAR; /* Translate code point to UTF8 */ if (uc <= 0x7f) { if (remaining == 0) @@ -2508,22 +2483,13 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc) *p++ = 0xe0 | ((uc >> 12) & 0x0f); *p++ = 0x80 | ((uc >> 6) & 0x3f); *p++ = 0x80 | (uc & 0x3f); - } else if (uc <= UNICODE_MAX) { + } else { if (remaining < 4) return (0); *p++ = 0xf0 | ((uc >> 18) & 0x07); *p++ = 0x80 | ((uc >> 12) & 0x3f); *p++ = 0x80 | ((uc >> 6) & 0x3f); *p++ = 0x80 | (uc & 0x3f); - } else { - /* - * Undescribed code point should be U+FFFD - * (replacement character). - */ - if (remaining < UTF8_R_CHAR_SIZE) - return (0); - UTF8_SET_R_CHAR(p); - p += UTF8_R_CHAR_SIZE; } return (p - _p); } |