File expat-CVE-2022-25235.patch of Package expat.35320

From ee2a5b50e7d1940ba8745715b62ceb9efd3a96da Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 17:37:14 +0100
Subject: [PATCH 1/5] lib: Drop unused macro UTF8_GET_NAMING

---
 expat/lib/xmltok.c | 5 -----
 1 file changed, 5 deletions(-)

Index: expat-2.1.0/lib/xmltok.c
===================================================================
--- expat-2.1.0.orig/lib/xmltok.c
+++ expat-2.1.0/lib/xmltok.c
@@ -71,13 +71,6 @@
                       + ((((byte)[2]) >> 5) & 1)] \
          & (1 << (((byte)[2]) & 0x1F)))
 
-#define UTF8_GET_NAMING(pages, p, n) \
-  ((n) == 2 \
-  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
-  : ((n) == 3 \
-     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
-     : 0))
-
 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
    with the additional restriction of not allowing the Unicode
Index: expat-2.1.0/lib/xmltok_impl.c
===================================================================
--- expat-2.1.0.orig/lib/xmltok_impl.c
+++ expat-2.1.0/lib/xmltok_impl.c
@@ -34,7 +34,7 @@
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
-     if (!IS_NAME_CHAR(enc, ptr, n)) { \
+     if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
@@ -62,7 +62,7 @@
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
-     if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
+     if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_INVALID; \
      } \
@@ -1097,6 +1097,10 @@ PREFIX(prologTok)(const ENCODING *enc, c
   case BT_LEAD ## n: \
     if (end - ptr < n) \
       return XML_TOK_PARTIAL_CHAR; \
+    if (IS_INVALID_CHAR(enc, ptr, n)) {                                        \
+      *nextTokPtr = ptr;                                                       \
+      return XML_TOK_INVALID;                                                  \
+    }                                                                          \
     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
       ptr += n; \
       tok = XML_TOK_NAME; \
@@ -1210,7 +1214,9 @@ PREFIX(attributeValueTok)(const ENCODING
   while (ptr < end) {
     switch (BYTE_TYPE(enc, ptr)) {
 #define LEAD_CASE(n) \
-    case BT_LEAD ## n: ptr += n; break;
+    case BT_LEAD ## n: \
+      ptr += n; /* NOTE: The encoding has already been validated. */ \
+      break;
     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 #undef LEAD_CASE
     case BT_AMP:
@@ -1268,7 +1274,9 @@ PREFIX(entityValueTok)(const ENCODING *e
   while (ptr < end) {
     switch (BYTE_TYPE(enc, ptr)) {
 #define LEAD_CASE(n) \
-    case BT_LEAD ## n: ptr += n; break;
+    case BT_LEAD ## n: \
+      ptr += n; /* NOTE: The encoding has already been validated. */ \
+      break;
     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 #undef LEAD_CASE
     case BT_AMP:
@@ -1447,7 +1455,8 @@ PREFIX(getAtts)(const ENCODING *enc, con
         state = inName; \
       }
 #define LEAD_CASE(n) \
-    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
+    case BT_LEAD ## n: /* NOTE: The encoding has already been validated. */ \
+      START_NAME ptr += (n - MINBPC(enc)); break;
     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 #undef LEAD_CASE
     case BT_NONASCII:
@@ -1702,8 +1711,10 @@ PREFIX(nameLength)(const ENCODING *enc,
   for (;;) {
     switch (BYTE_TYPE(enc, ptr)) {
 #define LEAD_CASE(n) \
-    case BT_LEAD ## n: ptr += n; break;
-    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
+    case BT_LEAD ## n: \
+      ptr += n; /* NOTE: The encoding has already been validated. */ \
+      break; 
+        LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 #undef LEAD_CASE
     case BT_NONASCII:
     case BT_NMSTRT:
@@ -1748,7 +1759,7 @@ PREFIX(updatePosition)(const ENCODING *e
     switch (BYTE_TYPE(enc, ptr)) {
 #define LEAD_CASE(n) \
     case BT_LEAD ## n: \
-      ptr += n; \
+      ptr += n; /* NOTE: The encoding has already been validated. */ \
       break;
     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
 #undef LEAD_CASE
Index: expat-2.1.0/tests/runtests.c
===================================================================
--- expat-2.1.0.orig/tests/runtests.c
+++ expat-2.1.0/tests/runtests.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <limits.h> // INT_MAX
 
 #include "expat.h"
@@ -29,6 +30,29 @@
 #define XML_FMT_INT_MOD "l"
 #endif
 
+#ifdef XML_UNICODE_WCHAR_T
+# define XML_FMT_CHAR "lc"
+# define XML_FMT_STR "ls"
+# include <wchar.h>
+# define xcstrlen(s) wcslen(s)
+# define xcstrcmp(s, t) wcscmp((s), (t))
+# define xcstrncmp(s, t, n) wcsncmp((s), (t), (n))
+# define XCS(s) _XCS(s)
+# define _XCS(s) L ## s
+#else
+# ifdef XML_UNICODE
+#  error "No support for UTF-16 character without wchar_t in tests"
+# else
+#  define XML_FMT_CHAR "c"
+#  define XML_FMT_STR "s"
+#  define xcstrlen(s) strlen(s)
+#  define xcstrcmp(s, t) strcmp((s), (t))
+#  define xcstrncmp(s, t, n) strncmp((s), (t), (n))
+#  define XCS(s) s
+# endif /* XML_UNICODE */
+#endif /* XML_UNICODE_WCHAR_T */
+
+
 static XML_Parser parser;
 
 
@@ -149,6 +173,9 @@ dummy_start_element(void *userData,
                     const XML_Char *name, const XML_Char **atts)
 {}
 
+static void XMLCALL
+dummy_end_element(void *userData, const XML_Char *name)
+{}
 
 /*
  * Character & encoding tests.
@@ -320,6 +347,8 @@ START_TEST(test_utf8_false_rejection)
 }
 END_TEST
 
+
+
 /* Regression test for SF bug #477667.
    This test assures that any 8-bit character followed by a 7-bit
    character will not be mistakenly interpreted as a valid UTF-8
@@ -346,6 +375,105 @@ START_TEST(test_illegal_utf8)
 }
 END_TEST
 
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;
+    bool goodNameStart;
+    const char *tagName;
+  };
+
+  // The idea with the tests below is this:
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+  // go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+
+  size_t i = 0;
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    size_t j = 0;
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+      sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
+      XML_Parser parser = XML_ParserCreate(NULL);
+
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+      bool success = true;
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
+        success = false;
+      }
+      if ((status == XML_STATUS_ERROR)
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+        success = false;
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+        failCount++;
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
+
 START_TEST(test_utf16)
 {
     /* <?xml version="1.0" encoding="UTF-16"?>
@@ -1138,6 +1266,15 @@ START_TEST(test_suspend_parser_between_c
 END_TEST
 
 
+START_TEST(test_bad_doctype_utf8) {
+  char *text = "<!DOCTYPE \xDB\x25"
+                     "doc><doc/>"; // [1101 1011] [<0>010 0101]
+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
+
 /*
  * Namespaces tests.
  */
@@ -1618,6 +1755,7 @@ make_suite(void)
     tcase_add_test(tc_basic, test_bom_utf16_be);
     tcase_add_test(tc_basic, test_bom_utf16_le);
     tcase_add_test(tc_basic, test_illegal_utf8);
+    tcase_add_test(tc_basic, test_utf8_in_start_tags);
     tcase_add_test(tc_basic, test_utf16);
     tcase_add_test(tc_basic, test_utf16_le_epilog_newline);
     tcase_add_test(tc_basic, test_latin1_umlauts);
@@ -1657,6 +1795,7 @@ make_suite(void)
 #ifdef XML_DTD
     tcase_add_test(tc_basic, test_misc_deny_internal_entity_closing_doctype_issue_317);
 #endif
+    tcase_add_test(tc_basic, test_bad_doctype_utf8);
 
     suite_add_tcase(s, tc_namespace);
     tcase_add_checked_fixture(tc_namespace,
openSUSE Build Service is sponsored by