summaryrefslogtreecommitdiff
path: root/libarchive/archive_string.c
diff options
context:
space:
mode:
authorTim Kientzle <kientzle@acm.org>2015-08-08 20:52:19 -0700
committerTim Kientzle <kientzle@acm.org>2015-08-08 20:52:19 -0700
commit6322b68e8e0cdcb52c2aa441a8ddfad3f9f4b01b (patch)
tree76f475c1a85cfc4ac2bbbc5e14eec057aef44ded /libarchive/archive_string.c
parent3c7a6dc6694d9b26400d2bd672e04d09ed8a4276 (diff)
downloadlibarchive-6322b68e8e0cdcb52c2aa441a8ddfad3f9f4b01b.tar.gz
Issue 551: Fix the best-effort UTF8 conversion
If a valid character set conversion is impossible, the code falls back to a best-effort conversion that preserves ASCII bytes and converts the rest to Unicode Replacement Characters (if the output is UTF8) or '?' (otherwise). This code did not correctly track the remaining bytes in the output buffer; I've replaced this with slower and simpler code that utilizes the safe string append functions.
Diffstat (limited to 'libarchive/archive_string.c')
-rw-r--r--libarchive/archive_string.c68
1 files changed, 17 insertions, 51 deletions
diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c
index f6d1b893..dcffe6a0 100644
--- a/libarchive/archive_string.c
+++ b/libarchive/archive_string.c
@@ -131,12 +131,7 @@ struct archive_string_conv {
#define UNICODE_MAX 0x10FFFF
#define UNICODE_R_CHAR 0xFFFD /* Replacement character. */
/* Set U+FFFD(Replacement character) in UTF-8. */
-#define UTF8_SET_R_CHAR(outp) do { \
- (outp)[0] = 0xef; \
- (outp)[1] = 0xbf; \
- (outp)[2] = 0xbd; \
-} while (0)
-#define UTF8_R_CHAR_SIZE 3
+const static char utf8_replacement_char[] = {0xef, 0xbf, 0xbd};
static struct archive_string_conv *find_sconv_object(struct archive *,
const char *, const char *);
@@ -2041,7 +2036,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) {
size_t rbytes;
if (sc->flag & SCONV_TO_UTF8)
- rbytes = UTF8_R_CHAR_SIZE;
+ rbytes = sizeof(utf8_replacement_char);
else
rbytes = 2;
@@ -2057,7 +2052,7 @@ iconv_strncat_in_locale(struct archive_string *as, const void *_p,
- as->length - to_size;
}
if (sc->flag & SCONV_TO_UTF8)
- UTF8_SET_R_CHAR(outp);
+ memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char));
else if (sc->flag & SCONV_TO_UTF16BE)
archive_be16enc(outp, UNICODE_R_CHAR);
else
@@ -2206,9 +2201,7 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
size_t length, struct archive_string_conv *sc)
{
size_t remaining;
- char *otp;
const uint8_t *itp;
- size_t avail;
int return_value = 0; /* success */
/*
@@ -2227,46 +2220,25 @@ best_effort_strncat_in_locale(struct archive_string *as, const void *_p,
* byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD,
* a Replacement Character in Unicode.
*/
- if (archive_string_ensure(as, as->length + length + 1) == NULL)
- return (-1);
remaining = length;
itp = (const uint8_t *)_p;
- otp = as->s + as->length;
- avail = as->buffer_length - as->length -1;
while (*itp && remaining > 0) {
- if (*itp > 127 && (sc->flag & SCONV_TO_UTF8)) {
- if (avail < UTF8_R_CHAR_SIZE) {
- as->length = otp - as->s;
- if (NULL == archive_string_ensure(as,
- as->buffer_length + remaining +
- UTF8_R_CHAR_SIZE))
- return (-1);
- otp = as->s + as->length;
- avail = as->buffer_length - as->length -1;
+ if (*itp > 127) {
+ // Non-ASCII: Substitute with suitable replacement
+ if (sc->flag & SCONV_TO_UTF8) {
+ if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) {
+ __archive_errx(1, "Out of memory");
+ }
+ } else {
+ archive_strappend_char(as, '?');
}
- /*
- * When coping a string in UTF-8, unknown character
- * should be U+FFFD (replacement character).
- */
- UTF8_SET_R_CHAR(otp);
- otp += UTF8_R_CHAR_SIZE;
- avail -= UTF8_R_CHAR_SIZE;
- itp++;
- remaining--;
- return_value = -1;
- } else if (*itp > 127) {
- *otp++ = '?';
- itp++;
- remaining--;
return_value = -1;
} else {
- *otp++ = (char)*itp++;
- remaining--;
+ archive_strappend_char(as, *itp);
}
+ ++itp;
}
- as->length = otp - as->s;
- as->s[as->length] = '\0';
return (return_value);
}
@@ -2492,6 +2464,9 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
{
char *_p = p;
+ /* Invalid Unicode char maps to Replacement character */
+ if (uc > UNICODE_MAX)
+ uc = UNICODE_R_CHAR;
/* Translate code point to UTF8 */
if (uc <= 0x7f) {
if (remaining == 0)
@@ -2508,22 +2483,13 @@ unicode_to_utf8(char *p, size_t remaining, uint32_t uc)
*p++ = 0xe0 | ((uc >> 12) & 0x0f);
*p++ = 0x80 | ((uc >> 6) & 0x3f);
*p++ = 0x80 | (uc & 0x3f);
- } else if (uc <= UNICODE_MAX) {
+ } else {
if (remaining < 4)
return (0);
*p++ = 0xf0 | ((uc >> 18) & 0x07);
*p++ = 0x80 | ((uc >> 12) & 0x3f);
*p++ = 0x80 | ((uc >> 6) & 0x3f);
*p++ = 0x80 | (uc & 0x3f);
- } else {
- /*
- * Undescribed code point should be U+FFFD
- * (replacement character).
- */
- if (remaining < UTF8_R_CHAR_SIZE)
- return (0);
- UTF8_SET_R_CHAR(p);
- p += UTF8_R_CHAR_SIZE;
}
return (p - _p);
}