File v8-icu73-simple-case-folding.patch of Package nodejs-electron

From 572b80f2e906a826a499c4c5561b90b97a687f0e Mon Sep 17 00:00:00 2001
From: pthier <pthier@chromium.org>
Date: Tue, 18 Jul 2023 16:27:28 +0200
Subject: [PATCH] [regexp] Remove special handling for simple case folding

ICU 73 introduced creating closures using simple case folding.
We can directly use this method instead of our own special handling where simple case folding (required by JS spec) differs from full case
folding (the previously only supported mode in ICU).

Bug: v8:13377
Change-Id: I42bbcc37fe5c1f33a1d6c36f0d4ceb18a67a9b43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/4694009
Commit-Queue: Patrick Thier <pthier@chromium.org>
Reviewed-by: Jakob Linke <jgruber@chromium.org>
Cr-Commit-Position: refs/heads/main@{#89024}
---
 src/regexp/gen-regexp-special-case.cc | 48 ---------------------------
 src/regexp/regexp-ast.h               |  6 ----
 src/regexp/regexp-compiler-tonode.cc  | 24 +-------------
 src/regexp/regexp-parser.cc           |  2 +-
 src/regexp/special-case.h             | 10 ------
 5 files changed, 2 insertions(+), 88 deletions(-)

diff --git a/src/regexp/gen-regexp-special-case.cc b/src/regexp/gen-regexp-special-case.cc
index 55618f11783..86f6b212c93 100644
--- a/v8/src/regexp/gen-regexp-special-case.cc
+++ b/v8/src/regexp/gen-regexp-special-case.cc
@@ -9,7 +9,6 @@
 
 #include "src/base/strings.h"
 #include "src/regexp/special-case.h"
-#include "unicode/usetiter.h"
 
 namespace v8 {
 namespace internal {
@@ -127,52 +126,6 @@ void PrintSpecial(std::ofstream& out) {
   PrintSet(out, "SpecialAddSet", special_add);
 }
 
-void PrintUnicodeSpecial(std::ofstream& out) {
-  icu::UnicodeSet non_simple_folding;
-  icu::UnicodeSet current;
-  UErrorCode status = U_ZERO_ERROR;
-  // Look at all characters except white spaces.
-  icu::UnicodeSet interestingCP(u"[^[:White_Space:]]", status);
-  CHECK_EQ(status, U_ZERO_ERROR);
-  icu::UnicodeSetIterator iter(interestingCP);
-  while (iter.next()) {
-    UChar32 c = iter.getCodepoint();
-    current.set(c, c);
-    current.closeOver(USET_CASE_INSENSITIVE).removeAllStrings();
-    CHECK(!current.isBogus());
-    // Remove characters from the closeover that have a simple case folding.
-    icu::UnicodeSet toRemove;
-    icu::UnicodeSetIterator closeOverIter(current);
-    while (closeOverIter.next()) {
-      UChar32 closeOverChar = closeOverIter.getCodepoint();
-      UChar32 closeOverSCF = u_foldCase(closeOverChar, U_FOLD_CASE_DEFAULT);
-      if (closeOverChar != closeOverSCF) {
-        toRemove.add(closeOverChar);
-      }
-    }
-    CHECK(!toRemove.isBogus());
-    current.removeAll(toRemove);
-
-    // The current character and its simple case folding are also always OK.
-    UChar32 scf = u_foldCase(c, U_FOLD_CASE_DEFAULT);
-    current.remove(c);
-    current.remove(scf);
-
-    // If there are any characters remaining, they were added due to full case
-    // foldings and shouldn't match the current charcter according to the spec.
-    if (!current.isEmpty()) {
-      // Ensure that the character doesn't have a simple case folding.
-      // Otherwise the current approach of simply removing the character from
-      // the set before calling closeOver won't work.
-      CHECK_EQ(c, scf);
-      non_simple_folding.add(c);
-    }
-  }
-  CHECK(!non_simple_folding.isBogus());
-
-  PrintSet(out, "UnicodeNonSimpleCloseOverSet", non_simple_folding);
-}
-
 void WriteHeader(const char* header_filename) {
   std::ofstream out(header_filename);
   out << std::hex << std::setfill('0') << std::setw(4);
@@ -193,7 +146,6 @@ void WriteHeader(const char* header_filename) {
       << "namespace internal {\n\n";
 
   PrintSpecial(out);
-  PrintUnicodeSpecial(out);
 
   out << "\n"
       << "}  // namespace internal\n"
diff --git a/src/regexp/regexp-ast.h b/src/regexp/regexp-ast.h
index e7453ad3f8f..8e3bb12fce2 100644
--- a/v8/src/regexp/regexp-ast.h
+++ b/v8/src/regexp/regexp-ast.h
@@ -134,12 +134,6 @@ class CharacterRange {
   static void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                         Zone* zone);
 
-#ifdef V8_INTL_SUPPORT
-  // Creates the closeOver of the given UnicodeSet, removing all
-  // characters/strings that can't be derived via simple case folding.
-  static void UnicodeSimpleCloseOver(icu::UnicodeSet& set);
-#endif  // V8_INTL_SUPPORT
-
   bool Contains(base::uc32 i) const { return from_ <= i && i <= to_; }
   base::uc32 from() const { return from_; }
   base::uc32 to() const { return to_; }
diff --git a/src/regexp/regexp-compiler-tonode.cc b/src/regexp/regexp-compiler-tonode.cc
index 5ff16ee71d2..9c83e2332e8 100644
--- a/v8/src/regexp/regexp-compiler-tonode.cc
+++ b/v8/src/regexp/regexp-compiler-tonode.cc
@@ -423,27 +423,6 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
 
 }  // namespace
 
-#ifdef V8_INTL_SUPPORT
-// static
-void CharacterRange::UnicodeSimpleCloseOver(icu::UnicodeSet& set) {
-  // Remove characters for which closeOver() adds full-case-folding equivalents
-  // because we should work only with simple case folding mappings.
-  icu::UnicodeSet non_simple = icu::UnicodeSet(set);
-  non_simple.retainAll(RegExpCaseFolding::UnicodeNonSimpleCloseOverSet());
-  set.removeAll(non_simple);
-
-  set.closeOver(USET_CASE_INSENSITIVE);
-  // Full case folding maps single characters to multiple characters.
-  // Those are represented as strings in the set. Remove them so that
-  // we end up with only simple and common case mappings.
-  set.removeAllStrings();
-
-  // Add characters that have non-simple case foldings again (they match
-  // themselves).
-  set.addAll(non_simple);
-}
-#endif  // V8_INTL_SUPPORT
-
 // static
 void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
                                                Zone* zone) {
@@ -465,8 +444,7 @@ void CharacterRange::AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges,
   }
   // Clear the ranges list without freeing the backing store.
   ranges->Rewind(0);
-
-  UnicodeSimpleCloseOver(set);
+  set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
   for (int i = 0; i < set.getRangeCount(); i++) {
     ranges->Add(Range(set.getRangeStart(i), set.getRangeEnd(i)), zone);
   }
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 76ca02bf240..730dfb9da86 100644
--- a/v8/src/regexp/regexp-parser.cc
+++ b/v8/src/regexp/regexp-parser.cc
@@ -1897,7 +1897,7 @@ bool LookupPropertyValueName(UProperty property,
       ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
     }
     const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
-    if (needs_case_folding) CharacterRange::UnicodeSimpleCloseOver(set);
+    if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
     set.removeAllStrings();
     if (negate) set.complement();
     for (int i = 0; i < set.getRangeCount(); i++) {
diff --git a/src/regexp/special-case.h b/src/regexp/special-case.h
index c80b94e976a..753c9231ede 100644
--- a/v8/src/regexp/special-case.h
+++ b/v8/src/regexp/special-case.h
@@ -71,21 +71,11 @@ namespace internal {
 // another character. Characters that match no other characters in
 // their equivalence class are added to IgnoreSet. Characters that
 // match at least one other character are added to SpecialAddSet.
-//
-// For unicode ignoreCase ("iu" and "iv"),
-// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) adds all characters that are in
-// the same equivalence class. This includes characaters that are in the same
-// equivalence class using full case folding. According to the spec, only
-// simple case folding shall be considered. We therefore create
-// UnicodeNonSimpleCloseOverSet containing all characters for which
-// UnicodeSet::closeOver adds characters that are not simple case folds. This
-// set should be used similar to IgnoreSet described above.
 
 class RegExpCaseFolding final : public AllStatic {
  public:
   static const icu::UnicodeSet& IgnoreSet();
   static const icu::UnicodeSet& SpecialAddSet();
-  static const icu::UnicodeSet& UnicodeNonSimpleCloseOverSet();
 
   // This implements ECMAScript 2020 21.2.2.8.2 (Runtime Semantics:
   // Canonicalize) step 3, which is used to determine whether
Places

File v8-icu73-simple-case-folding.patch of Package nodejs-electron

Places