diff options
Diffstat (limited to 'libarchive/test/test_read_format_zip_utf8_paths.c')
-rw-r--r-- | libarchive/test/test_read_format_zip_utf8_paths.c | 263 |
1 files changed, 219 insertions, 44 deletions
diff --git a/libarchive/test/test_read_format_zip_utf8_paths.c b/libarchive/test/test_read_format_zip_utf8_paths.c index a7034162..ea4738b4 100644 --- a/libarchive/test/test_read_format_zip_utf8_paths.c +++ b/libarchive/test/test_read_format_zip_utf8_paths.c @@ -26,68 +26,243 @@ #include "test.h" __FBSDID("$FreeBSD$"); -static void -verify(struct archive *a) { +/* + * This collection of tests tries to verify that libarchive correctly + * handles Zip UTF-8 filenames stored in various fashions, including + * boundary cases where the different copies of the filename don't + * agree with each other. + * + * A UTF8 filename can appear in a Zip file in three different fashions. + * + * Unmarked: If bit 11 of the GP bit flag is not set, then the + * filename is stored in an unspecified encoding which may or may not + * be UTF-8. Practically speaking, decoders can make no assumptions + * about the filename encoding. + * + * GP bit flag #11: If this bit is set, then the Filename and File + * comment should be stored in UTF-8. + * + * Extra field 0x7075: This field was added by Info-ZIP. It stores a + * second copy of the filename in UTF-8. Note this second filename + * may not be the same encoding -- or even the same name -- as the primary + * filename. It makes no assertion about the character set used by + * the file comment. + * + * Also note that the above can appear in the local file header or the + * central directory or both and may or may not agree in any of those + * cases. In the worst case, we may have four different filenames for + * a single entry: The local file header can have both a regular filename + * (in UTF-8 or not) and the 0x7075 extension, the central directory + * would also have both, and all four names could be different. + */ + +/* + * Case 1: Use GP#11 to flag UTF-8 filename in local file header, + * but central directory has a different name. 
+ */ +static const unsigned char case1[] = { + /* Local file header */ + 0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */ + 0x20, 0x00, /* Version needed to extract: 2.0 */ + 0x00, 0x08, /* General purpose bit flag: 0x0800 == UTF8 filename */ + 0x00, 0x00, /* Compression method: None */ + 0x00, 0x00, /* Last mod time */ + 0x00, 0x00, /* Last mod date */ + 0x00, 0x00, 0x00, 0x00, /* CRC32 */ + 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */ + 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */ + 0x0a, 0x00, /* Filename length: 10 */ + 0x00, 0x00, /* Extra field length: 0 */ + 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */ + /* Extra field: Not present */ + + /* File data */ + 0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */ + + /* Central directory header */ + 0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */ + 0x20, 0x00, /* Version made by: 2.0 for MSDOS */ + 0x20, 0x00, /* Version needed to extract: 2.0 */ + 0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */ + 0x00, 0x00, /* Compression method: None */ + 0x00, 0x00, /* Last mod time */ + 0x00, 0x00, /* Last mod date */ + 0x00, 0x00, 0x00, 0x00, /* CRC32 */ + 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */ + 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */ + 0x05, 0x00, /* Filename length */ + 0x00, 0x00, /* Extra field length: 0 */ + 0x00, 0x00, /* Comment length: 0 */ + 0x00, 0x00, /* Disk number start: 0 */ + 0x00, 0x00, /* Internal file attributes */ + 0x00, 0x00, 0x00, 0x00, /* External file attributes */ + 0x00, 0x00, 0x00, 0x00, /* Offset of local header */ + 0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */ + /* Extra field: not present */ + /* File comment: not present */ + + /* End of central directory record */ + 0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */ + 0x00, 0x00, /* Number of this disk: 0 */ + 0x00, 0x00, /* Central directory starts on this disk: 0 */ + 0x01, 0x00, /* Total CD entries on this disk: 1 */ + 0x01, 0x00, /* Total CD entries: 1 */ + 0x33, 0x00, 0x00,
0x00, /* Size of CD in bytes */ + 0x2c, 0x00, 0x00, 0x00, /* Offset of start of CD */ + 0x00, 0x00, /* Length of archive comment: 0 */ + /* Archive comment: not present */ +}; + +DEFINE_TEST(test_read_format_zip_utf8_paths_case1_seeking) +{ + struct archive *a; struct archive_entry *ae; - const wchar_t *wp; - int file, i; - - /* - * Test file has a pattern to all names: They all have a - * number followed by " - " and an accented character. This - * archive was created by Windows and has regular filenames in - * some MBCS and uses the Zip 0x7075 extension to hold UTF-8 - * pathnames. The code below checks that the correct - * (Unicode) characters are decoded by comparing the number to - * the expected accented character. - */ - - for (file = 0; file < 20; ++file) { - assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); - assert((wp = archive_entry_pathname_w(ae)) != NULL); - if (wp) { - for (i = 0; wp[i] != 0; ++i) { - if (wp[i] == '2') { - failure("Unicode 'o with umlaut' expected"); - assertEqualInt(wp[i + 4], 0xF6); - } else if (wp[i] == '3') { - failure("Unicode 'a with umlaut' expected"); - assertEqualInt(wp[i + 4], 0xE4); - } else if (wp[i] == '4') { - failure("Unicode 'a with ring' expected"); - assertEqualInt(wp[i + 4], 0xE5); - } - } - } - } - assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae)); + + /* Verify with seeking reader. 
*/ + assert((a = archive_read_new()) != NULL); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); + assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case1, sizeof(case1), 7)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); + assertEqualString(archive_entry_pathname(ae), NULL); + assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt"); + + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a)); } -DEFINE_TEST(test_read_format_zip_utf8_paths) +DEFINE_TEST(test_read_format_zip_utf8_paths_case1_streaming) { - const char *refname = "test_read_format_zip_utf8_paths.zip"; struct archive *a; - char *p; - size_t s; + struct archive_entry *ae; - extract_reference_file(refname); + /* Verify with streaming reader. */ + assert((a = archive_read_new()) != NULL); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); + assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case1, sizeof(case1), 31)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); + assertEqualString(archive_entry_pathname(ae), NULL); + assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt"); + + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); + assertEqualIntA(a, ARCHIVE_OK, archive_free(a)); +} + +/* + * TODO: Case 2: GP#11 is used, but filename is not valid UTF-8. + * This should always cause an error; malformed UTF-8 should never happen. + */ + +/* + * TODO: Case 3: Store UTF-8 filename using extra field 0x7075. + * The 0x7075 filename and the regular filename have identical bytes, but + * the regular filename is not marked with GP#11 bit. + * + * Note: Central dir entry has only "A.txt" and no 0x7075 extension.
+ */ +static const unsigned char case3[] = { + /* Local file header */ + 0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */ + 0x20, 0x00, /* Version needed to extract: 2.0 */ + 0x00, 0x00, /* General purpose bit flag: 0x0000 */ + 0x00, 0x00, /* Compression method: None */ + 0x00, 0x00, /* Last mod time */ + 0x00, 0x00, /* Last mod date */ + 0x00, 0x00, 0x00, 0x00, /* CRC32 */ + 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */ + 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */ + 0x0a, 0x00, /* Filename length: 10 */ + 0x0e, 0x00, /* Extra field length: 14 */ + 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */ + 0x75, 0x70, 0x0a, 0x00, 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Extra field: 0x7075 */ + + /* File data */ + 0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */ + + /* Central directory header */ + 0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */ + 0x20, 0x00, /* Version made by: 2.0 for MSDOS */ + 0x20, 0x00, /* Version needed to extract: 2.0 */ + 0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */ + 0x00, 0x00, /* Compression method: None */ + 0x00, 0x00, /* Last mod time */ + 0x00, 0x00, /* Last mod date */ + 0x00, 0x00, 0x00, 0x00, /* CRC32 */ + 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */ + 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */ + 0x05, 0x00, /* Filename length */ + 0x00, 0x00, /* Extra field length: 0 */ + 0x00, 0x00, /* Comment length: 0 */ + 0x00, 0x00, /* Disk number start: 0 */ + 0x00, 0x00, /* Internal file attributes */ + 0x00, 0x00, 0x00, 0x00, /* External file attributes */ + 0x00, 0x00, 0x00, 0x00, /* Offset of local header */ + 0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */ + /* No extra fields */ + /* File comment: not present */ + + /* End of central directory record */ + 0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */ + 0x00, 0x00, /* Number of this disk: 0 */ + 0x00, 0x00, /* Central directory starts on this disk: 0 */ + 0x01, 0x00, /* Total CD entries on this disk: 1 */ + 
0x01, 0x00, /* Total CD entries: 1 */ + 0x33, 0x00, 0x00, 0x00, /* Size of CD in bytes */ + 0x3a, 0x00, 0x00, 0x00, /* Offset of start of CD */ + 0x00, 0x00, /* Length of archive comment: 0 */ + /* Archive comment: not present */ +}; + +DEFINE_TEST(test_read_format_zip_utf8_paths_case3_seeking) +{ + struct archive *a; + struct archive_entry *ae; /* Verify with seeking reader. */ assert((a = archive_read_new()) != NULL); assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); - assertEqualIntA(a, ARCHIVE_OK, archive_read_open_filename(a, refname, 10240)); - verify(a); + assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case3, sizeof(case3), 7)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); + assertEqualString(archive_entry_pathname(ae), NULL); + assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt"); + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a)); +} + +DEFINE_TEST(test_read_format_zip_utf8_paths_case3_streaming) +{ + struct archive *a; + struct archive_entry *ae; /* Verify with streaming reader. */ - p = slurpfile(&s, refname); assert((a = archive_read_new()) != NULL); assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a)); - assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, p, s, 31)); - verify(a); + assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case3, sizeof(case3), 31)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); + assertEqualString(archive_entry_pathname(ae), NULL); + assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt"); + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); assertEqualIntA(a, ARCHIVE_OK, archive_free(a)); } + + +/* + * TODO: Case 4: As with Case 3, but the two filenames are not + * the same. 
+ */ + +/* + * TODO: Case 5: GP#11 and extra field 0x7075 both used, but + * store different names. + */ + +/* + * TODO: Similar cases where the local file header and central directory + * disagree. Seeking reader should always use the CD version, streaming + * reader must necessarily always use the local file header version. + */ |