File expat-CVE-2022-25235.patch of Package expat.35320
From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 17:37:14 +0100
Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING
---
expat/lib/xmltok.c | 5 -----
1 file changed, 5 deletions(-)
Index: expat-2.1.0/lib/xmltok.c
===================================================================
--- expat-2.1.0.orig/lib/xmltok.c
+++ expat-2.1.0/lib/xmltok.c
@@ -71,13 +71,6 @@
+ ((((byte)[2]) >> 5) & 1)] \
& (1 << (((byte)[2]) & 0x1F)))
-#define UTF8_GET_NAMING(pages, p, n) \
- ((n) == 2 \
- ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
- : ((n) == 3 \
- ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
- : 0))
-
/* Detection of invalid UTF-8 sequences is based on Table 3.1B
of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
with the additional restriction of not allowing the Unicode
Index: expat-2.1.0/lib/xmltok_impl.c
===================================================================
--- expat-2.1.0.orig/lib/xmltok_impl.c
+++ expat-2.1.0/lib/xmltok_impl.c
@@ -34,7 +34,7 @@
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (!IS_NAME_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -62,7 +62,7 @@
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
- if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
+ if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@@ -1097,6 +1097,10 @@ PREFIX(prologTok)(const ENCODING *enc, c
case BT_LEAD ## n: \
if (end - ptr < n) \
return XML_TOK_PARTIAL_CHAR; \
+ if (IS_INVALID_CHAR(enc, ptr, n)) { \
+ *nextTokPtr = ptr; \
+ return XML_TOK_INVALID; \
+ } \
if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
ptr += n; \
tok = XML_TOK_NAME; \
@@ -1210,7 +1214,9 @@ PREFIX(attributeValueTok)(const ENCODING
while (ptr < end) {
switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
- case BT_LEAD ## n: ptr += n; break;
+ case BT_LEAD ## n: \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
+ break;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
case BT_AMP:
@@ -1268,7 +1274,9 @@ PREFIX(entityValueTok)(const ENCODING *e
while (ptr < end) {
switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
- case BT_LEAD ## n: ptr += n; break;
+ case BT_LEAD ## n: \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
+ break;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
case BT_AMP:
@@ -1447,7 +1455,8 @@ PREFIX(getAtts)(const ENCODING *enc, con
state = inName; \
}
#define LEAD_CASE(n) \
- case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
+ case BT_LEAD ## n: /* NOTE: The encoding has already been validated. */ \
+ START_NAME ptr += (n - MINBPC(enc)); break;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
case BT_NONASCII:
@@ -1702,8 +1711,10 @@ PREFIX(nameLength)(const ENCODING *enc,
for (;;) {
switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
- case BT_LEAD ## n: ptr += n; break;
- LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
+ case BT_LEAD ## n: \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
+ break;
+ LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
case BT_NONASCII:
case BT_NMSTRT:
@@ -1748,7 +1759,7 @@ PREFIX(updatePosition)(const ENCODING *e
switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
case BT_LEAD ## n: \
- ptr += n; \
+ ptr += n; /* NOTE: The encoding has already been validated. */ \
break;
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
Index: expat-2.1.0/tests/runtests.c
===================================================================
--- expat-2.1.0.orig/tests/runtests.c
+++ expat-2.1.0/tests/runtests.c
@@ -13,6 +13,7 @@
#include <stdio.h>
#include <string.h>
#include <stdint.h>
+#include <stdbool.h>
#include <limits.h> // INT_MAX
#include "expat.h"
@@ -29,6 +30,29 @@
#define XML_FMT_INT_MOD "l"
#endif
+#ifdef XML_UNICODE_WCHAR_T
+# define XML_FMT_CHAR "lc"
+# define XML_FMT_STR "ls"
+# include <wchar.h>
+# define xcstrlen(s) wcslen(s)
+# define xcstrcmp(s, t) wcscmp((s), (t))
+# define xcstrncmp(s, t, n) wcsncmp((s), (t), (n))
+# define XCS(s) _XCS(s)
+# define _XCS(s) L ## s
+#else
+# ifdef XML_UNICODE
+# error "No support for UTF-16 character without wchar_t in tests"
+# else
+# define XML_FMT_CHAR "c"
+# define XML_FMT_STR "s"
+# define xcstrlen(s) strlen(s)
+# define xcstrcmp(s, t) strcmp((s), (t))
+# define xcstrncmp(s, t, n) strncmp((s), (t), (n))
+# define XCS(s) s
+# endif /* XML_UNICODE */
+#endif /* XML_UNICODE_WCHAR_T */
+
+
static XML_Parser parser;
@@ -149,6 +173,9 @@ dummy_start_element(void *userData,
const XML_Char *name, const XML_Char **atts)
{}
+static void XMLCALL
+dummy_end_element(void *userData, const XML_Char *name)
+{}
/*
* Character & encoding tests.
@@ -320,6 +347,8 @@ START_TEST(test_utf8_false_rejection)
}
END_TEST
+
+
/* Regression test for SF bug #477667.
This test assures that any 8-bit character followed by a 7-bit
character will not be mistakenly interpreted as a valid UTF-8
@@ -346,6 +375,105 @@ START_TEST(test_illegal_utf8)
}
END_TEST
+START_TEST(test_utf8_in_start_tags) {
+ struct test_case {
+ bool goodName;
+ bool goodNameStart;
+ const char *tagName;
+ };
+
+ // The idea with the tests below is this:
+ // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+ // go to isNever and are hence not a concern.
+ //
+ // We start with a character that is a valid name character
+ // (or even name-start character, see XML 1.0r4 spec) and then we flip
+ // single bits at places where (1) the result leaves the UTF-8 encoding space
+ // and (2) we stay in the same n-byte sequence family.
+ //
+ // The flipped bits are highlighted in angle brackets in comments,
+ // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+ // the most significant bit to 1 to leave UTF-8 encoding space.
+ struct test_case cases[] = {
+ // 1-byte UTF-8: [0xxx xxxx]
+ {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':'
+ {false, false, "\xBA"}, // [<1>011 1010]
+ {true, false, "\x39"}, // [0011 1001] = ASCII nine '9'
+ {false, false, "\xB9"}, // [<1>011 1001]
+
+ // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+ {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] =
+ // Arabic small waw U+06E5
+ {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+ {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+ {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+ {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] =
+ // combining char U+0301
+ {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+ {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+ {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+ // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+ {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] =
+ // Devanagari Letter A U+0905
+ {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+ {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+ {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+ {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+ {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+ {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] =
+ // combining char U+0901
+ {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+ {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+ {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+ {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+ {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+ };
+ const bool atNameStart[] = {true, false};
+
+ size_t i = 0;
+ char doc[1024];
+ size_t failCount = 0;
+
+ for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ size_t j = 0;
+ for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+ const bool expectedSuccess
+ = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+ sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
+ XML_Parser parser = XML_ParserCreate(NULL);
+
+ const enum XML_Status status
+ = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+ bool success = true;
+ if ((status == XML_STATUS_OK) != expectedSuccess) {
+ success = false;
+ }
+ if ((status == XML_STATUS_ERROR)
+ && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+ success = false;
+ }
+
+ if (! success) {
+ fprintf(
+ stderr,
+ "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+ (unsigned)i + 1u, atNameStart[j] ? " " : "not ",
+ (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+ failCount++;
+ }
+
+ XML_ParserFree(parser);
+ }
+ }
+
+ if (failCount > 0) {
+ fail("UTF-8 regression detected");
+ }
+}
+END_TEST
+
START_TEST(test_utf16)
{
/* <?xml version="1.0" encoding="UTF-16"?>
@@ -1138,6 +1266,15 @@ START_TEST(test_suspend_parser_between_c
END_TEST
+START_TEST(test_bad_doctype_utf8) {
+ char *text = "<!DOCTYPE \xDB\x25"
+ "doc><doc/>"; // [1101 1011] [<0>010 0101]
+ expect_failure(text, XML_ERROR_INVALID_TOKEN,
+ "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
+
/*
* Namespaces tests.
*/
@@ -1618,6 +1755,7 @@ make_suite(void)
tcase_add_test(tc_basic, test_bom_utf16_be);
tcase_add_test(tc_basic, test_bom_utf16_le);
tcase_add_test(tc_basic, test_illegal_utf8);
+ tcase_add_test(tc_basic, test_utf8_in_start_tags);
tcase_add_test(tc_basic, test_utf16);
tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
tcase_add_test(tc_basic, test_latin1_umlauts);
@@ -1657,6 +1795,7 @@ make_suite(void)
#ifdef XML_DTD
tcase_add_test(tc_basic, test_misc_deny_internal_entity_closing_doctype_issue_317);
#endif
+ tcase_add_test(tc_basic, test_bad_doctype_utf8);
suite_add_tcase(s, tc_namespace);
tcase_add_checked_fixture(tc_namespace,