diff options
author | Bruno Haible <bruno@clisp.org> | 2022-01-01 19:48:54 +0100 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2022-01-01 19:48:54 +0100 |
commit | a7b896b830ae70699473440199889ffc0088cf29 (patch) | |
tree | 96cadc34b803f59ef7eb1bb135fd84e1ba233068 /tests | |
parent | ff468794101e86e0e198fdfdba23df7f575cc570 (diff) | |
download | gnulib-a7b896b830ae70699473440199889ffc0088cf29.tar.gz |
striconveh: Support an error handler that produces a Unicode U+FFFD.
Suggested by Marc Nieper-Wißkirchen in
<https://lists.gnu.org/archive/html/bug-gnulib/2021-12/msg00175.html>.
* lib/iconveh.h (iconveh_replacement_character): New enum value.
* lib/striconveh.c (mem_cd_iconveh_internal): When the handler is
iconveh_replacement_character, try to produce U+FFFD when possible,
instead of '?'.
* tests/test-striconveh.c (main): Add GB18030 tests. Test also
iconveh_replacement_character.
Diffstat (limited to 'tests')
-rw-r--r-- | tests/test-striconveh.c | 387 |
1 files changed, 385 insertions, 2 deletions
diff --git a/tests/test-striconveh.c b/tests/test-striconveh.c index 438b7b0877..781aa52541 100644 --- a/tests/test-striconveh.c +++ b/tests/test-striconveh.c @@ -46,14 +46,19 @@ main () { #if HAVE_ICONV static enum iconv_ilseq_handler handlers[] = - { iconveh_error, iconveh_question_mark, iconveh_escape_sequence }; + { + iconveh_error, + iconveh_question_mark, + iconveh_replacement_character, + iconveh_escape_sequence + }; size_t indirect; size_t h; size_t o; size_t i; /* Assume that iconv() supports at least the encodings ASCII, ISO-8859-1, - ISO-8859-2, and UTF-8. */ + ISO-8859-2, UTF-8, and with libiconv or glibc also GB18030. */ iconv_t cd_ascii_to_88591 = iconv_open ("ISO-8859-1", "ASCII"); iconv_t cd_88591_to_88592 = iconv_open ("ISO-8859-2", "ISO-8859-1"); iconv_t cd_88592_to_88591 = iconv_open ("ISO-8859-1", "ISO-8859-2"); @@ -63,6 +68,12 @@ main () iconv_t cd_88592_to_utf8 = iconv_open ("UTF-8", "ISO-8859-2"); iconv_t cd_utf8_to_88592 = iconv_open ("ISO-8859-2", "UTF-8"); iconv_t cd_utf7_to_utf8 = iconv_open ("UTF-8", "UTF-7"); +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + iconv_t cd_ascii_to_gb18030 = iconv_open ("GB18030", "ASCII"); + iconv_t cd_utf8_to_gb18030 = iconv_open ("GB18030", "UTF-8"); + iconv_t cd_88591_to_gb18030 = iconv_open ("GB18030", "ISO-8859-1"); + iconv_t cd_utf7_to_gb18030 = iconv_open ("GB18030", "UTF-7"); +# endif iconveh_t cdeh_ascii_to_88591; iconveh_t cdeh_ascii_to_88591_indirectly; iconveh_t cdeh_88592_to_88591; @@ -71,12 +82,21 @@ main () iconveh_t cdeh_88591_to_utf8; iconveh_t cdeh_utf8_to_88591; iconveh_t cdeh_utf7_to_utf8; +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + iconveh_t cdeh_ascii_to_gb18030; + iconveh_t cdeh_88591_to_gb18030; + iconveh_t cdeh_utf7_to_gb18030; +# endif ASSERT (cd_ascii_to_utf8 != (iconv_t)(-1)); ASSERT (cd_88591_to_utf8 != (iconv_t)(-1)); ASSERT (cd_utf8_to_88591 != (iconv_t)(-1)); ASSERT (cd_88592_to_utf8 != (iconv_t)(-1)); ASSERT (cd_utf8_to_88592 != (iconv_t)(-1)); +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + ASSERT (cd_ascii_to_gb18030 != (iconv_t)(-1)); + ASSERT (cd_utf8_to_gb18030 != (iconv_t)(-1)); +# endif cdeh_ascii_to_88591.cd = cd_ascii_to_88591; cdeh_ascii_to_88591.cd1 = cd_ascii_to_utf8; @@ -110,6 +130,20 @@ main () cdeh_utf7_to_utf8.cd1 = cd_utf7_to_utf8; cdeh_utf7_to_utf8.cd2 = (iconv_t)(-1); +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + cdeh_ascii_to_gb18030.cd = cd_ascii_to_gb18030; + cdeh_ascii_to_gb18030.cd1 = cd_ascii_to_utf8; + cdeh_ascii_to_gb18030.cd2 = cd_utf8_to_gb18030; + + cdeh_88591_to_gb18030.cd = cd_88591_to_gb18030; + cdeh_88591_to_gb18030.cd1 = cd_88591_to_utf8; + cdeh_88591_to_gb18030.cd2 = cd_utf8_to_gb18030; + + cdeh_utf7_to_gb18030.cd = cd_utf7_to_gb18030; + cdeh_utf7_to_gb18030.cd1 = cd_utf7_to_utf8; + cdeh_utf7_to_gb18030.cd2 = cd_utf8_to_gb18030; +# endif + /* ------------------------ Test mem_cd_iconveh() ------------------------ */ /* Test conversion from ISO-8859-2 to ISO-8859-1 with no errors. */ @@ -175,6 +209,7 @@ main () free (offsets); break; case iconveh_question_mark: + case iconveh_replacement_character: case iconveh_escape_sequence: { static const char expected[] = "Rafa? Maszkowski"; @@ -224,6 +259,7 @@ main () free (offsets); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (retval == 0); @@ -294,6 +330,41 @@ main () } } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; + static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118"; + for (o = 0; o < 2; o++) + { + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); + char *result = NULL; + size_t length = 0; + int retval = mem_cd_iconveh (input, strlen (input), + &cdeh_88591_to_gb18030, + handler, + offsets, + &result, &length); + ASSERT (retval == 0); + ASSERT (length == strlen (expected)); + ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); + if (o) + { + for (i = 0; i < 37; i++) + ASSERT (offsets[i] == (i < 1 ? i : + i < 12 ? i + 3 : + i < 18 ? i + 6 : + i + 7)); + ASSERT (offsets[37] == MAGIC); + free (offsets); + } + free (result); + } + } +# endif + /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ for (h = 0; h < SIZEOF (handlers); h++) { @@ -371,10 +442,88 @@ main () free (result); } break; + case iconveh_replacement_character: + { + static const char expected[] = "Rafa\357\277\275 Maszkowski"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected)); + ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); + if (o) + { + for (i = 0; i < 16; i++) + ASSERT (offsets[i] == (i < 5 ? i : i + 2)); + ASSERT (offsets[16] == MAGIC); + free (offsets); + } + free (result); + } + break; } } } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */ + for (o = 0; o < 2; o++) + { + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); + char *result = NULL; + size_t length = 0; + int retval = mem_cd_iconveh (input, strlen (input), + &cdeh_ascii_to_gb18030, + handler, + offsets, + &result, &length); + switch (handler) + { + case iconveh_error: + ASSERT (retval == -1 && errno == EILSEQ); + ASSERT (result == NULL); + if (o) + free (offsets); + break; + case iconveh_question_mark: + case iconveh_escape_sequence: + { + static const char expected[] = "Rafa? Maszkowski"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected)); + ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); + if (o) + { + for (i = 0; i < 16; i++) + ASSERT (offsets[i] == i); + ASSERT (offsets[16] == MAGIC); + free (offsets); + } + free (result); + } + break; + case iconveh_replacement_character: + { + static const char expected[] = "Rafa\2041\2447 Maszkowski"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected)); + ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); + if (o) + { + for (i = 0; i < 16; i++) + ASSERT (offsets[i] == (i < 5 ? i : i + 3)); + ASSERT (offsets[16] == MAGIC); + free (offsets); + } + free (result); + } + break; + } + } + } +# endif + /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ for (h = 0; h < SIZEOF (handlers); h++) { @@ -399,6 +548,7 @@ main () free (offsets); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (retval == 0); @@ -496,6 +646,34 @@ main () free (result); } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from UTF-7 to GB18030 with EINVAL. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + /* This is base64 encoded 0x54 0x32 0xD8 0x3F 0xD8 0x40. It would + convert to U+5432 U+D83F U+D840 but these are Unicode surrogates. */ + static const char input[] = "+VDLYP9hA"; + static const char expected1[] = "\337\305"; /* 吲 glibc */ + static const char expected2[] = ""; /* libiconv */ + char *result = NULL; + size_t length = 0; + int retval = mem_cd_iconveh (input, 7, + &cdeh_utf7_to_gb18030, + handler, + NULL, + &result, &length); + ASSERT (retval == 0); + ASSERT (length == strlen (expected1) || length == strlen (expected2)); + ASSERT (result != NULL); + if (length == strlen (expected1)) + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0); + else + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0); + free (result); + } +# endif + /* Disabled on NetBSD, because NetBSD 5.0 iconv() is buggy: it converts the input "+2D/YQNhB" to U+1FED8 U+3FD8 U+40D8. */ # if !(defined __NetBSD__ && !defined _LIBICONV_VERSION) @@ -544,8 +722,98 @@ main () free (result); } break; + case iconveh_replacement_character: + { + /* glibc result */ + static const char expected1[] = "\357\277\275\357\277\275\357\277\275\357\277\275\357\277\275"; + /* libiconv <= 1.12 result */ + static const char expected2[] = "\357\277\2752D/YQNhB"; + /* libiconv >= 1.13 result */ + static const char expected3[] = "\357\277\275\340\277\266\341\200\266"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected1) + || length == strlen (expected2) + || length == strlen (expected3)); + ASSERT (result != NULL); + if (length == strlen (expected1)) + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0); + else if (length == strlen (expected2)) + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0); + else + ASSERT (memcmp (result, expected3, strlen (expected3)) == 0); + free (result); + } + } + } + +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from UTF-7 to GB18030 with EILSEQ. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + /* This is base64 encoded 0xD8 0x3F 0xD8 0x40 0xD8 0x41. It would + convert to U+D83F U+D840 U+D841 but these are Unicode surrogates. */ + static const char input[] = "+2D/YQNhB"; + char *result = NULL; + size_t length = 0; + int retval = mem_cd_iconveh (input, strlen (input), + &cdeh_utf7_to_gb18030, + handler, + NULL, + &result, &length); + switch (handler) + { + case iconveh_error: + ASSERT (retval == -1 && errno == EILSEQ); + ASSERT (result == NULL); + break; + case iconveh_question_mark: + case iconveh_escape_sequence: + { + /* glibc result */ + static const char expected1[] = "?????"; + /* libiconv <= 1.12 result */ + static const char expected2[] = "?2D/YQNhB"; + /* libiconv behaviour changed in version 1.13: the result is + '?' U+0FF6 U+1036; this is U+D83F U+D840 U+D841 shifted left + by 6 bits. */ + static const char expected3[] = "?\2013\2030\2013\2114"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected1) + || length == strlen (expected2) + || length == strlen (expected3)); + ASSERT (result != NULL); + if (length == strlen (expected1)) + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0); + else if (length == strlen (expected2)) + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0 + || memcmp (result, expected3, strlen (expected3)) == 0); + free (result); + } + break; + case iconveh_replacement_character: + { + /* glibc result */ + static const char expected1[] = "\2041\2447\2041\2447\2041\2447\2041\2447\2041\2447"; + /* libiconv <= 1.12 result */ + static const char expected2[] = "\2041\24472D/YQNhB"; + /* libiconv >= 1.13 result */ + static const char expected3[] = "\2041\2447\2013\2030\2013\2114"; + ASSERT (retval == 0); + ASSERT (length == strlen (expected1) + || length == strlen (expected2) + || length == strlen (expected3)); + ASSERT (result != NULL); + if (length == strlen (expected1)) + ASSERT (memcmp (result, expected1, strlen (expected1)) == 0); + else if (length == strlen (expected2)) + ASSERT (memcmp (result, expected2, strlen (expected2)) == 0 + || memcmp (result, expected3, strlen (expected3)) == 0); + free (result); + } } } +# endif # endif # endif } @@ -589,6 +857,7 @@ main () ASSERT (result == NULL && errno == EILSEQ); break; case iconveh_question_mark: + case iconveh_replacement_character: case iconveh_escape_sequence: { static const char expected[] = "Rafa? Maszkowski"; @@ -619,6 +888,7 @@ main () ASSERT (result == NULL && errno == EILSEQ); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (result != NULL); @@ -652,6 +922,22 @@ main () free (result); } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; + static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118"; + char *result = str_cd_iconveh (input, + &cdeh_88591_to_gb18030, + handler); + ASSERT (result != NULL); + ASSERT (strcmp (result, expected) == 0); + free (result); + } +# endif + /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ for (h = 0; h < SIZEOF (handlers); h++) { @@ -688,8 +974,51 @@ main () free (result); } break; + case iconveh_replacement_character: + { + static const char expected[] = "Rafa\357\277\275 Maszkowski"; + ASSERT (result != NULL); + ASSERT (strcmp (result, expected) == 0); + free (result); + } + break; + } + } + +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ASCII to GB18030 with invalid input (EILSEQ). */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "Rafa\263 Maszkowski"; /* Rafa? Maszkowski */ + char *result = str_cd_iconveh (input, + &cdeh_ascii_to_gb18030, + handler); + switch (handler) + { + case iconveh_error: + ASSERT (result == NULL && errno == EILSEQ); + break; + case iconveh_question_mark: + case iconveh_escape_sequence: + { + static const char expected[] = "Rafa? Maszkowski"; + ASSERT (result != NULL); + ASSERT (strcmp (result, expected) == 0); + free (result); + } + break; + case iconveh_replacement_character: + { + static const char expected[] = "Rafa\2041\2447 Maszkowski"; + ASSERT (result != NULL); + ASSERT (strcmp (result, expected) == 0); + free (result); + } + break; } } +# endif /* Test conversion from UTF-8 to ISO-8859-1 with EILSEQ. */ for (h = 0; h < SIZEOF (handlers); h++) @@ -705,6 +1034,7 @@ main () ASSERT (result == NULL && errno == EILSEQ); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Costs: 27 ?"; ASSERT (result != NULL); @@ -801,6 +1131,7 @@ main () free (offsets); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (retval == 0); @@ -870,6 +1201,41 @@ main () } } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; + static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118"; + for (o = 0; o < 2; o++) + { + size_t *offsets = (o ? new_offsets (strlen (input)) : NULL); + char *result = NULL; + size_t length = 0; + int retval = mem_iconveh (input, strlen (input), + "ISO-8859-1", "GB18030", + handler, + offsets, + &result, &length); + ASSERT (retval == 0); + ASSERT (length == strlen (expected)); + ASSERT (result != NULL && memcmp (result, expected, strlen (expected)) == 0); + if (o) + { + for (i = 0; i < 37; i++) + ASSERT (offsets[i] == (i < 1 ? i : + i < 12 ? i + 3 : + i < 18 ? i + 6 : + i + 7)); + ASSERT (offsets[37] == MAGIC); + free (offsets); + } + free (result); + } + } +# endif + /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ for (h = 0; h < SIZEOF (handlers); h++) { @@ -931,6 +1297,7 @@ main () free (offsets); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (retval == 0); @@ -1023,6 +1390,7 @@ main () ASSERT (result == NULL && errno == EILSEQ); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Rafa? Maszkowski"; ASSERT (result != NULL); @@ -1053,6 +1421,20 @@ main () free (result); } +# if defined _LIBICONV_VERSION || (defined __GLIBC__ && !defined __UCLIBC__) + /* Test conversion from ISO-8859-1 to GB18030 with no errors. */ + for (h = 0; h < SIZEOF (handlers); h++) + { + enum iconv_ilseq_handler handler = handlers[h]; + static const char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; + static const char expected[] = "\2010\2072rger mit b\2010\2132sen B\250\271bchen ohne Augenma\2010\2118"; + char *result = str_iconveh (input, "ISO-8859-1", "GB18030", handler); + ASSERT (result != NULL); + ASSERT (strcmp (result, expected) == 0); + free (result); + } +# endif + /* Test conversion from UTF-8 to ISO-8859-1 with no errors. */ for (h = 0; h < SIZEOF (handlers); h++) { @@ -1077,6 +1459,7 @@ main () ASSERT (result == NULL && errno == EILSEQ); break; case iconveh_question_mark: + case iconveh_replacement_character: { static const char expected[] = "Costs: 27 ?"; ASSERT (result != NULL); |