summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastian Pipping <sebastian@pipping.org>2022-02-18 20:12:32 +0100
committerGitHub <noreply@github.com>2022-02-18 20:12:32 +0100
commit306b72134f157bbfd1637b20a22cabf4acfa136a (patch)
tree6e699421a26247924bf6c250486bc83917e78e08
parent2cc97e875ef84da4bcf55156c83599116f7523b4 (diff)
parentc16300f0bc4318f31f9e27eb2702ddbffe086fea (diff)
downloadlibexpat-git-306b72134f157bbfd1637b20a22cabf4acfa136a.tar.gz
Merge pull request #562 from libexpat/utf8-security
[CVE-2022-25235] lib: Protect against malformed encoding (e.g. malformed UTF-8)
-rw-r--r--expat/Changes7
-rw-r--r--expat/lib/xmltok.c5
-rw-r--r--expat/lib/xmltok_impl.c18
-rw-r--r--expat/tests/runtests.c109
4 files changed, 127 insertions, 12 deletions
diff --git a/expat/Changes b/expat/Changes
index 6198e4ff..2a898778 100644
--- a/expat/Changes
+++ b/expat/Changes
@@ -4,6 +4,13 @@ NOTE: We are looking for help with a few things:
Release X.X.X XXX XXXXXXX XX XXXX
Security fixes:
+ #562 CVE-2022-25235 -- Passing malformed 2- and 3-byte UTF-8
+ sequences (e.g. from start tag names) to the XML
+ processing application on top of Expat can cause
+ arbitrary damage (e.g. code execution) depending
+ on how invalid UTF-8 is handled inside the XML
+ processor; validation was not their job but Expat's.
+ Exploits with code execution are known to exist.
#561 CVE-2022-25236 -- Passing (one or more) namespace separator
characters in "xmlns[:prefix]" attribute values
made Expat send malformed tag names to the XML
diff --git a/expat/lib/xmltok.c b/expat/lib/xmltok.c
index a72200e8..3bddf125 100644
--- a/expat/lib/xmltok.c
+++ b/expat/lib/xmltok.c
@@ -98,11 +98,6 @@
+ ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \
& (1u << (((byte)[2]) & 0x1F)))
-#define UTF8_GET_NAMING(pages, p, n) \
- ((n) == 2 \
- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
- : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0))
-
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
with the additional restriction of not allowing the Unicode
diff --git a/expat/lib/xmltok_impl.c b/expat/lib/xmltok_impl.c
index 0430591b..84ff35f9 100644
--- a/expat/lib/xmltok_impl.c
+++ b/expat/lib/xmltok_impl.c
@@ -69,7 +69,7 @@
case BT_LEAD##n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (! IS_NAME_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -98,7 +98,7 @@
case BT_LEAD##n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
case BT_LEAD##n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
+ if (IS_INVALID_CHAR(enc, ptr, n)) { \
+ *nextTokPtr = ptr; \
+ return XML_TOK_INVALID; \
+ } \
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
ptr += n; \
tok = XML_TOK_NAME; \
@@ -1270,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
switch (BYTE_TYPE(enc, ptr)) {
# define LEAD_CASE(n) \
case BT_LEAD##n: \
- ptr += n; \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
break;
LEAD_CASE(2)
LEAD_CASE(3)
@@ -1339,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
switch (BYTE_TYPE(enc, ptr)) {
# define LEAD_CASE(n) \
case BT_LEAD##n: \
- ptr += n; \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
break;
LEAD_CASE(2)
LEAD_CASE(3)
@@ -1518,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax,
state = inName; \
}
# define LEAD_CASE(n) \
- case BT_LEAD##n: \
+ case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
START_NAME ptr += (n - MINBPC(enc)); \
break;
LEAD_CASE(2)
@@ -1730,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) {
switch (BYTE_TYPE(enc, ptr)) {
# define LEAD_CASE(n) \
case BT_LEAD##n: \
- ptr += n; \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
break;
LEAD_CASE(2)
LEAD_CASE(3)
@@ -1775,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end,
switch (BYTE_TYPE(enc, ptr)) {
# define LEAD_CASE(n) \
case BT_LEAD##n: \
- ptr += n; \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
pos->columnNumber++; \
break;
LEAD_CASE(2)
diff --git a/expat/tests/runtests.c b/expat/tests/runtests.c
index bc5344b1..9b155b82 100644
--- a/expat/tests/runtests.c
+++ b/expat/tests/runtests.c
@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
}
END_TEST
+START_TEST(test_utf8_in_start_tags) {
+ struct test_case {
+ bool goodName;
+ bool goodNameStart;
+ const char *tagName;
+ };
+
+ // The idea with the tests below is this:
+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+ // go to isNever and are hence not a concern.
+ //
+ // We start with a character that is a valid name character
+ // (or even name-start character, see XML 1.0r4 spec) and then we flip
+ // single bits at places where (1) the result leaves the UTF-8 encoding space
+ // and (2) we stay in the same n-byte sequence family.
+ //
+ // The flipped bits are highlighted in angle brackets in comments,
+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+ // the most significant bit to 1 to leave UTF-8 encoding space.
+ struct test_case cases[] = {
+ // 1-byte UTF-8: [0xxx xxxx]
+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
+ {false, false, "\xBA"}, // [<1>011 1010]
+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
+ {false, false, "\xB9"}, // [<1>011 1001]
+
+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
+ // Arabic small waw U+06E5
+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
+ // combining char U+0301
+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
+ // Devanagari Letter A U+0905
+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
+ // combining char U+0901
+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+ };
+ const bool atNameStart[] = {true, false};
+
+ size_t i = 0;
+ char doc[1024];
+ size_t failCount = 0;
+
+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ size_t j = 0;
+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+ const bool expectedSuccess
+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+ sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
+ XML_Parser parser = XML_ParserCreate(NULL);
+
+ const enum XML_Status status
+ = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+ bool success = true;
+ if ((status == XML_STATUS_OK) != expectedSuccess) {
+ success = false;
+ }
+ if ((status == XML_STATUS_ERROR)
+ && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+ success = false;
+ }
+
+ if (! success) {
+ fprintf(
+ stderr,
+ "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+ (unsigned)i + 1u, atNameStart[j] ? " " : "not ",
+ (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+ failCount++;
+ }
+
+ XML_ParserFree(parser);
+ }
+ }
+
+ if (failCount > 0) {
+ fail("UTF-8 regression detected");
+ }
+}
+END_TEST
+
/* Test trailing spaces in elements are accepted */
static void XMLCALL
record_element_end_handler(void *userData, const XML_Char *name) {
@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
}
END_TEST
+START_TEST(test_bad_doctype_utf8) {
+ const char *text = "<!DOCTYPE \xDB\x25"
+ "doc><doc/>"; // [1101 1011] [<0>010 0101]
+ expect_failure(text, XML_ERROR_INVALID_TOKEN,
+ "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
START_TEST(test_bad_doctype_utf16) {
const char text[] =
/* <!DOCTYPE doc [ \x06f2 ]><doc/>
@@ -11870,6 +11977,7 @@ make_suite(void) {
tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
tcase_add_test(tc_basic, test_utf8_in_cdata_section);
tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
+ tcase_add_test(tc_basic, test_utf8_in_start_tags);
tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
tcase_add_test(tc_basic, test_utf16_attribute);
tcase_add_test(tc_basic, test_utf16_second_attr);
@@ -11878,6 +11986,7 @@ make_suite(void) {
tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
tcase_add_test(tc_basic, test_bad_doctype);
+ tcase_add_test(tc_basic, test_bad_doctype_utf8);
tcase_add_test(tc_basic, test_bad_doctype_utf16);
tcase_add_test(tc_basic, test_bad_doctype_plus);
tcase_add_test(tc_basic, test_bad_doctype_star);