summaryrefslogtreecommitdiff
path: root/libarchive/test/test_read_format_zip_utf8_paths.c
diff options
context:
space:
mode:
Diffstat (limited to 'libarchive/test/test_read_format_zip_utf8_paths.c')
-rw-r--r--libarchive/test/test_read_format_zip_utf8_paths.c263
1 files changed, 219 insertions, 44 deletions
diff --git a/libarchive/test/test_read_format_zip_utf8_paths.c b/libarchive/test/test_read_format_zip_utf8_paths.c
index a7034162..ea4738b4 100644
--- a/libarchive/test/test_read_format_zip_utf8_paths.c
+++ b/libarchive/test/test_read_format_zip_utf8_paths.c
@@ -26,68 +26,243 @@
#include "test.h"
__FBSDID("$FreeBSD$");
-static void
-verify(struct archive *a) {
+/*
+ * This collection of tests tries to verify that libarchive correctly
+ * handles Zip UTF-8 filenames stored in various fashions, including
+ * boundary cases where the different copies of the filename don't
+ * agree with each other.
+ *
+ * A UTF8 filename can appear in a Zip file in three different fashions.
+ *
+ * Unmarked: If bit 11 of the GP bit flag is not set, then the
+ * filename is stored in an unspecified encoding which may or may not
+ * be UTF-8. Practically speaking, decoders can make no assumptions
+ * about the filename encoding.
+ *
+ * GP bit flag #11: If this bit is set, then the Filename and File
+ * comment should be stored in UTF-8.
+ *
+ * Extra field 0x7075: This field was added by Info-ZIP. It stores a
+ * second copy of the filename in UTF-8. Note this second filename
+ * may not be the same encoding -- or even the same name -- as the primary
+ * filename. It makes no assertion about the character set used by
+ * the file comment.
+ *
+ * Also note that the above can appear in the local file header or the
+ * central directory or both and may or may not agree in any of those
+ * cases. In the worst case, we may have four different filenames for
+ * a single entry: The local file header can have both a regular filename
+ * (in UTF-8 or not) and the 0x7075 extension, the central directory
+ * would also have both, and all four names could be different.
+ */
+
+/*
+ * Case 1: Use GP#11 to flag UTF-8 filename in local file header,
+ * but central directory has a different name.
+ */
+static const unsigned char case1[] = {
+ /* Local file header */
+ 0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */
+ 0x20, 0x00, /* Version needed to extract: 2.0 */
+ 0x00, 0x08, /* General purpose bit flag: 0x0800 == UTF8 filename */
+ 0x00, 0x00, /* Compression method: None */
+ 0x00, 0x00, /* Last mod time */
+ 0x00, 0x00, /* Last mod date */
+ 0x00, 0x00, 0x00, 0x00, /* CRC32 */
+ 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+ 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+ 0x0a, 0x00, /* Filename length: 5 */
+ 0x00, 0x00, /* Extra field lenght: 0 */
+ 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */
+ /* Extra field: Not present */
+
+ /* File data */
+ 0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */
+
+ /* Central directory header */
+ 0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */
+ 0x20, 0x00, /* Version made by: 2.0 for MSDOS */
+ 0x20, 0x00, /* Version needed to extract: 2.0 */
+ 0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */
+ 0x00, 0x00, /* Compression method: None */
+ 0x00, 0x00, /* Last mod time */
+ 0x00, 0x00, /* Last mod date */
+ 0x00, 0x00, 0x00, 0x00, /* CRC32 */
+ 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+ 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+ 0x05, 0x00, /* Filename length */
+ 0x00, 0x00, /* Extra field length: 0 */
+ 0x00, 0x00, /* Comment length: 0 */
+ 0x00, 0x00, /* Disk number start: 0 */
+ 0x00, 0x00, /* Internal file attributes */
+ 0x00, 0x00, 0x00, 0x00, /* External file attributes */
+ 0x00, 0x00, 0x00, 0x00, /* Offset of local header */
+ 0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */
+ /* Extra field: not present */
+ /* File comment: not present */
+
+ /* End of central directory record */
+ 0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */
+ 0x00, 0x00, /* Number of this disk: 0 */
+ 0x00, 0x00, /* Central directory starts on this disk: 0 */
+ 0x01, 0x00, /* Total CD entries on this disk: 1 */
+ 0x01, 0x00, /* Total CD entries: 1 */
+ 0x33, 0x00, 0x00, 0x00, /* Size of CD in bytes */
+ 0x2c, 0x00, 0x00, 0x00, /* Offset of start of CD */
+ 0x00, 0x00, /* Length of archive comment: 0 */
+ /* Archive comment: not present */
+};
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case1_seeking)
+{
+ struct archive *a;
struct archive_entry *ae;
- const wchar_t *wp;
- int file, i;
-
- /*
- * Test file has a pattern to all names: They all have a
- * number followed by " - " and an accented character. This
- * archive was created by Windows and has regular filenames in
- * some MBCS and uses the Zip 0x7075 extension to hold UTF-8
- * pathnames. The code below checks that the correct
- * (Unicode) characters are decoded by comparing the number to
- * the expected accented character.
- */
-
- for (file = 0; file < 20; ++file) {
- assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
- assert((wp = archive_entry_pathname_w(ae)) != NULL);
- if (wp) {
- for (i = 0; wp[i] != 0; ++i) {
- if (wp[i] == '2') {
- failure("Unicode 'o with umlaut' expected");
- assertEqualInt(wp[i + 4], 0xF6);
- } else if (wp[i] == '3') {
- failure("Unicode 'a with umlaut' expected");
- assertEqualInt(wp[i + 4], 0xE4);
- } else if (wp[i] == '4') {
- failure("Unicode 'a with ring' expected");
- assertEqualInt(wp[i + 4], 0xE5);
- }
- }
- }
- }
- assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae));
+
+ /* Verify with seeking reader. */
+ assert((a = archive_read_new()) != NULL);
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+ assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case1, sizeof(case1), 7));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+ assertEqualString(archive_entry_pathname(ae), NULL);
+ assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a));
}
-DEFINE_TEST(test_read_format_zip_utf8_paths)
+DEFINE_TEST(test_read_format_zip_utf8_paths_case1_streaming)
{
- const char *refname = "test_read_format_zip_utf8_paths.zip";
struct archive *a;
- char *p;
- size_t s;
+ struct archive_entry *ae;
- extract_reference_file(refname);
+ /* Verify with streaming reader. */
+ assert((a = archive_read_new()) != NULL);
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+ assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case1, sizeof(case1), 31));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+ assertEqualString(archive_entry_pathname(ae), NULL);
+ assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
+ assertEqualIntA(a, ARCHIVE_OK, archive_free(a));
+}
+
+/*
+ * TODO: Case 2: GP#11 is used, but filename is not valid UTF-8.
+ * This should always cause an error; malformed UTF-8 should never happen.
+ */
+
+/*
+ * TODO: Case 3: Store UTF-8 filename using extra field 0x7075
+ * 0x7075 filename and regular filename have identical bytes but
+ * regular filename is not marked with GP#11 bit.
+ *
+ * Note: Central dir entry has only "A.txt" and no 0x7075 extension.
+ */
+static const unsigned char case3[] = {
+ /* Local file header */
+ 0x50, 0x4b, 0x03, 0x04, /* PK\003\004 */
+ 0x20, 0x00, /* Version needed to extract: 2.0 */
+ 0x00, 0x00, /* General purpose bit flag: 0x0000 */
+ 0x00, 0x00, /* Compression method: None */
+ 0x00, 0x00, /* Last mod time */
+ 0x00, 0x00, /* Last mod date */
+ 0x00, 0x00, 0x00, 0x00, /* CRC32 */
+ 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+ 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+ 0x0a, 0x00, /* Filename length: 10 */
+ 0x0e, 0x00, /* Extra field length: 14 */
+ 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Filename: ABC<right arrow>.txt */
+ 0x75, 0x70, 0x0a, 0x00, 0x41, 0x42, 0x43, 0xE2, 0x86, 0x92, 0x2e, 0x74, 0x78, 0x74, /* Extra field: 0x7075 */
+
+ /* File data */
+ 0x41, 0x42, 0x43, 0x0a, /* "ABC\n" */
+
+ /* Central directory header */
+ 0x50, 0x4b, 0x01, 0x02, /* PK\001\002 */
+ 0x20, 0x00, /* Version made by: 2.0 for MSDOS */
+ 0x20, 0x00, /* Version needed to extract: 2.0 */
+ 0x00, 0x08, /* General purpose bit flag: bit 11 = UTF8 filename */
+ 0x00, 0x00, /* Compression method: None */
+ 0x00, 0x00, /* Last mod time */
+ 0x00, 0x00, /* Last mod date */
+ 0x00, 0x00, 0x00, 0x00, /* CRC32 */
+ 0x04, 0x00, 0x00, 0x00, /* Compressed size: 4 */
+ 0x04, 0x00, 0x00, 0x00, /* Uncompressed size: 4 */
+ 0x05, 0x00, /* Filename length */
+ 0x00, 0x00, /* Extra field length: 0 */
+ 0x00, 0x00, /* Comment length: 0 */
+ 0x00, 0x00, /* Disk number start: 0 */
+ 0x00, 0x00, /* Internal file attributes */
+ 0x00, 0x00, 0x00, 0x00, /* External file attributes */
+ 0x00, 0x00, 0x00, 0x00, /* Offset of local header */
+ 0x41, 0x2e, 0x74, 0x78, 0x74, /* File name */
+ /* No extra fields */
+ /* File comment: not present */
+
+ /* End of central directory record */
+ 0x50, 0x4b, 0x05, 0x06, /* PK\005\006 */
+ 0x00, 0x00, /* Number of this disk: 0 */
+ 0x00, 0x00, /* Central directory starts on this disk: 0 */
+ 0x01, 0x00, /* Total CD entries on this disk: 1 */
+ 0x01, 0x00, /* Total CD entries: 1 */
+ 0x33, 0x00, 0x00, 0x00, /* Size of CD in bytes */
+ 0x3a, 0x00, 0x00, 0x00, /* Offset of start of CD */
+ 0x00, 0x00, /* Length of archive comment: 0 */
+ /* Archive comment: not present */
+};
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case3_seeking)
+{
+ struct archive *a;
+ struct archive_entry *ae;
/* Verify with seeking reader. */
assert((a = archive_read_new()) != NULL);
assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
- assertEqualIntA(a, ARCHIVE_OK, archive_read_open_filename(a, refname, 10240));
- verify(a);
+ assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, case3, sizeof(case3), 7));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+ assertEqualString(archive_entry_pathname(ae), NULL);
+ assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
assertEqualIntA(a, ARCHIVE_OK, archive_read_free(a));
+}
+
+DEFINE_TEST(test_read_format_zip_utf8_paths_case3_streaming)
+{
+ struct archive *a;
+ struct archive_entry *ae;
/* Verify with streaming reader. */
- p = slurpfile(&s, refname);
assert((a = archive_read_new()) != NULL);
assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
- assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, p, s, 31));
- verify(a);
+ assertEqualIntA(a, ARCHIVE_OK, read_open_memory(a, case3, sizeof(case3), 31));
+ assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+ assertEqualString(archive_entry_pathname(ae), NULL);
+ assertEqualString(archive_entry_pathname_utf8(ae), "ABC\xe2\x86\x92.txt");
+
assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
assertEqualIntA(a, ARCHIVE_OK, archive_free(a));
}
+
+
+/*
+ * TODO: Case 4: As with Case 3, but the two filenames are not
+ * the same.
+ */
+
+/*
+ * TODO: Case 5: GP#11 and extra field 0x7075 both used, but
+ * store different names.
+ */
+
+/*
+ * TODO: Similar cases where the local file header and central directory
+ * disagree. Seeking reader should always use the CD version, streaming
+ * reader must necessarily always use the local file header version.
+ */