File 0001-CVE-2020-27783.patch of Package python-lxml
From 89e7aad6e7ff9ecd88678ff25f885988b184b26e Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Sun, 18 Oct 2020 10:06:46 +0200
Subject: [PATCH] Prevent combinations of <noscript> and <style> from sneaking
JavaScript through the HTML cleaner.
---
src/lxml/html/clean.py | 3 +++
src/lxml/html/tests/test_clean.py | 10 ++++++++++
2 files changed, 13 insertions(+)
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index d43b9bafa..7b51981d7 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -519,6 +519,9 @@ def _has_sneaky_javascript(self, style):
return True
if '@import' in style:
return True
+ if '</noscript' in style:
+ # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ return True
return False
def clean_html(self, html):
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index 447733793..3c8ee252f 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -151,6 +151,16 @@ def test_clean_with_comments(self):
expected,
cleaner.clean_html(html))
+ def test_sneaky_noscript_in_style(self):
+ # This gets parsed as <noscript> -> <style>"...</noscript>..."</style>
+ # thus passing the </noscript> through into the output.
+ html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ s = lxml.html.fragment_fromstring(html)
+
+ self.assertEqual(
+ b'<noscript><style>/* deleted */</style></noscript>',
+ lxml.html.tostring(clean_html(s)))
+
def test_suite():
suite = unittest.TestSuite()