File CVE-2025-14009.patch of Package python-nltk
From 36c38b60a562f2ff8591386f35565c31ddcc569f Mon Sep 17 00:00:00 2001
From: hyperps <sarvesharmy6809@gmail.com>
Date: Thu, 20 Nov 2025 06:41:45 +0530
Subject: [PATCH 01/18] Fix: Secure ZIP extraction to prevent Zip-Slip and RCE
in nltk.downloader
---
nltk/downloader.py | 236 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 229 insertions(+), 7 deletions(-)
Index: nltk-3.9.1/nltk/downloader.py
===================================================================
--- nltk-3.9.1.orig/nltk/downloader.py
+++ nltk-3.9.1/nltk/downloader.py
@@ -2234,20 +2234,58 @@ def unzip(filename, root, verbose=True):
def _unzip_iter(filename, root, verbose=True):
+ """
+ Secure ZIP extraction with minimal behavioural changes.
+
+ - Prevents classic Zip-Slip (.., absolute paths, drive letters)
+ - Prevents writes through pre-existing symlinks
+ - Preserves original extraction behaviour for valid archives
+ """
+
if verbose:
sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
sys.stdout.flush()
try:
zf = zipfile.ZipFile(filename)
- except zipfile.BadZipFile:
- yield ErrorMessage(filename, "Error with downloaded zip file")
- return
except Exception as e:
yield ErrorMessage(filename, e)
return
- zf.extractall(root)
+ # Canonical root
+ root_abs = os.path.abspath(root)
+ root_real = os.path.realpath(root_abs)
+ root_prefix = root_real.rstrip(os.sep) + os.sep
+
+ # Ensure the extraction root directory exists
+ os.makedirs(root, exist_ok=True)
+
+ try:
+ for member in zf.namelist():
+
+ # Construct target path
+ raw_target = os.path.join(root_abs, member)
+ target_abs = os.path.abspath(raw_target)
+
+ # Zip-Slip check (absolute/traversal/drive-letter cases)
+ if not target_abs.startswith(root_prefix):
+ yield ErrorMessage(filename, f"Zip Slip blocked: {member}")
+ continue
+
+ # Symlink-escape check
+ target_real = os.path.realpath(target_abs)
+ if not target_real.startswith(root_prefix):
+ yield ErrorMessage(filename, f"Symlink escape blocked: {member}")
+ continue
+
+ # Safe extraction
+ try:
+ zf.extract(member, root)
+ except Exception as e:
+ yield ErrorMessage(filename, f"Extraction error for {member}: {e}")
+ continue
+ finally:
+ zf.close()
if verbose:
print()
Index: nltk-3.9.1/nltk/tag/mapping.py
===================================================================
--- nltk-3.9.1.orig/nltk/tag/mapping.py
+++ nltk-3.9.1/nltk/tag/mapping.py
@@ -32,7 +32,7 @@ X - other: foreign words, typos, abbrevi
from collections import defaultdict
from os.path import join
-from nltk.data import load
+from nltk.data import load, normalize_resource_url
_UNIVERSAL_DATA = "taggers/universal_tagset"
_UNIVERSAL_TAGS = (
@@ -56,7 +56,10 @@ _MAPPINGS = defaultdict(lambda: defaultd
def _load_universal_map(fileid):
- contents = load(join(_UNIVERSAL_DATA, fileid + ".map"), format="text")
+ resource = normalize_resource_url(
+ f"nltk:{_UNIVERSAL_DATA.rstrip('/')}/{fileid.lstrip('/')}.map"
+ )
+ contents = load(resource, format="text")
# When mapping to the Universal Tagset,
# map unknown inputs to 'X' not 'UNK'
Index: nltk-3.9.1/nltk/test/unit/test_downloader_unzip.py
===================================================================
--- /dev/null
+++ nltk-3.9.1/nltk/test/unit/test_downloader_unzip.py
@@ -0,0 +1,233 @@
+import os
+import sys
+import zipfile
+from pathlib import Path
+
+import pytest
+
+from nltk.downloader import ErrorMessage, _unzip_iter
+
+
+def _make_zip(file_path: Path, members: dict[str, bytes]) -> None:
+ """
+ Create a ZIP file at file_path, with the given arcname->content mapping.
+ """
+ with zipfile.ZipFile(file_path, "w") as zf:
+ for arcname, content in members.items():
+ zf.writestr(arcname, content)
+
+
+def _run_unzip_iter(zip_path: Path, extract_root: Path, verbose: bool = False):
+ """
+ Convenience wrapper that runs _unzip_iter and returns the list of yielded
+ messages (if any).
+ """
+ return list(_unzip_iter(str(zip_path), str(extract_root), verbose=verbose))
+
+
+class TestSecureUnzip:
+ """
+ Tests for secure ZIP extraction behaviour in nltk.downloader._unzip_iter.
+
+ These tests are specifically designed so that:
+
+ * On the old implementation (using zf.extractall(root) without checks),
+ the "Zip-Slip" tests will fail because they rely on the new behaviour
+ (yielding ErrorMessage for escaping entries, and NOT writing outside
+ the extraction root).
+
+ * On the new implementation, the tests pass, because the new path
+ validation prevents escapes and emits the expected ErrorMessage.
+ """
+
+ def test_normal_relative_paths_are_extracted(self, tmp_path: Path) -> None:
+ """
+ A ZIP with only safe, relative paths should fully extract under the
+ given root, and should not yield any ErrorMessage.
+ """
+ zip_path = tmp_path / "safe.zip"
+ extract_root = tmp_path / "extract"
+
+ members = {
+ "pkg/file.txt": b"hello",
+ "pkg/subdir/other.txt": b"world",
+ }
+ _make_zip(zip_path, members)
+
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
+
+ # No ErrorMessage should be yielded for valid archives.
+ assert not any(isinstance(m, ErrorMessage) for m in messages)
+
+ assert (extract_root / "pkg" / "file.txt").read_bytes() == b"hello"
+ assert (extract_root / "pkg" / "subdir" / "other.txt").read_bytes() == b"world"
+
+ def test_zip_slip_with_parent_directory_component_is_blocked(
+ self, tmp_path: Path
+ ) -> None:
+ """
+ An entry containing ``..`` that would escape the target directory
+ must not be written outside the extraction root, and must cause
+ _unzip_iter to yield an ErrorMessage.
+
+ On the old implementation (extractall(root)), this test will fail
+ because the file outside the root is actually created and no
+ ErrorMessage is yielded.
+ """
+ zip_path = tmp_path / "zip_slip_parent.zip"
+ extract_root = tmp_path / "extract"
+
+ # This is the path that would be written if "../outside.txt" is
+ # extracted with extractall(root).
+ outside_target = (extract_root / ".." / "outside.txt").resolve()
+
+ members = {
+ "pkg/good.txt": b"ok",
+ # This would escape extract_root if not validated.
+ "../outside.txt": b"evil",
+ }
+ _make_zip(zip_path, members)
+
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
+
+ # With the secure implementation, the escaping entry must be blocked
+ # and reported as an ErrorMessage.
+ err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
+ assert (
+ err_msgs
+ ), "Expected an ErrorMessage for a Zip-Slip parent-directory attempt"
+
+ # Check that the message text indicates it was blocked. Adjust to the
+ # exact wording if needed.
+ combined_messages = " ".join(str(m.message) for m in err_msgs)
+ assert "Zip Slip" in combined_messages and "blocked" in combined_messages
+
+ # The escaping path must not have been written.
+ assert not outside_target.exists()
+
+ # The safe entry should still be extracted under the root.
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
+
+ @pytest.mark.skipif(
+ sys.platform.startswith("win"),
+ reason="Absolute POSIX paths are not meaningful on Windows",
+ )
+ def test_zip_slip_with_absolute_posix_path_is_blocked(self, tmp_path: Path) -> None:
+ """
+ An entry with an absolute POSIX path (e.g. ``/tmp/evil``) must not be
+ extracted as-is; it should not overwrite arbitrary filesystem paths,
+ and should result in an ErrorMessage.
+
+ On the old implementation (extractall(root)), this test will fail
+ because the absolute path gets created without any ErrorMessage.
+ """
+ zip_path = tmp_path / "zip_slip_abs_posix.zip"
+ extract_root = tmp_path / "extract"
+
+ # Choose a unique location in /tmp to avoid interfering with real files.
+ absolute_target = Path("/tmp") / f"nltk_zip_slip_test_{os.getpid()}"
+
+ try:
+ members = {
+ "pkg/good.txt": b"ok",
+ # This absolute path should never be created by the secure
+ # implementation.
+ str(absolute_target): b"evil",
+ }
+ _make_zip(zip_path, members)
+
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
+
+ err_msgs = [m for m in messages if isinstance(m, ErrorMessage)]
+ assert (
+ err_msgs
+ ), "Expected an ErrorMessage for absolute-path Zip-Slip attempt"
+
+ combined_messages = " ".join(str(m.message) for m in err_msgs)
+ assert "Zip Slip" in combined_messages and "blocked" in combined_messages
+
+ # Absolute path must not be created by the secure implementation.
+ assert not absolute_target.exists()
+
+ # Safe entry must still be extracted under extract_root.
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
+ finally:
+ # Best-effort cleanup if the implementation under test behaves
+ # incorrectly and creates the file.
+ if absolute_target.exists():
+ try:
+ absolute_target.unlink()
+ except OSError:
+ # Ignore cleanup errors: file may not exist or be locked.
+ pass
+
+ def test_entries_resolved_outside_root_are_blocked_via_symlink(
+ self, tmp_path: Path
+ ) -> None:
+ """
+ If there is a pre-existing symlink below the extraction root that
+ points outside the root, writing through that symlink should not
+ be allowed to escape the root.
+
+ This test documents the desired hardening to defend against this
+ class of attacks.
+ """
+ if not hasattr(os, "symlink"):
+ pytest.skip("Symlinks not supported on this platform")
+
+ zip_path = tmp_path / "zip_slip_symlink.zip"
+ extract_root = tmp_path / "extract"
+ outside_dir = tmp_path / "outside_dir"
+ outside_dir.mkdir()
+ outside_target = outside_dir / "evil.txt"
+
+ members = {
+ "pkg/good.txt": b"ok",
+ "dir_link/evil.txt": b"evil",
+ }
+ _make_zip(zip_path, members)
+
+ extract_root.mkdir()
+ # Pre-existing symlink inside extract_root pointing *outside* it.
+ os.symlink(outside_dir, extract_root / "dir_link")
+
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
+
+ # Desired property:
+ assert not outside_target.exists()
+ assert (extract_root / "pkg" / "good.txt").read_bytes() == b"ok"
+ # Should yield an ErrorMessage for the blocked symlink escape
+ assert any(isinstance(m, ErrorMessage) for m in messages)
+
+ def test_bad_zipfile_yields_errormessage(self, tmp_path: Path) -> None:
+ """
+ A corrupt or non-zip file should cause _unzip_iter to yield an
+ ErrorMessage instead of raising an unhandled exception.
+ """
+ zip_path = tmp_path / "not_a_zip.txt"
+ zip_path.write_bytes(b"this is not a zip archive")
+ extract_root = tmp_path / "extract"
+
+ messages = _run_unzip_iter(zip_path, extract_root, verbose=False)
+
+ assert any(isinstance(m, ErrorMessage) for m in messages)
+
+ # If the implementation chooses to create the root directory at all,
+ # it should not leave partially extracted content.
+ if extract_root.exists():
+ assert not any(extract_root.iterdir())
+
+ def test_unzip_iter_verbose_writes_to_stdout(self, capsys, tmp_path: Path) -> None:
+ """
+ When verbose=True, _unzip_iter should write a status line to stdout.
+ This checks that existing user-visible behaviour is preserved.
+ """
+ zip_path = tmp_path / "verbose.zip"
+ extract_root = tmp_path / "extract"
+
+ members = {"pkg/file.txt": b"data"}
+ _make_zip(zip_path, members)
+
+ _run_unzip_iter(zip_path, extract_root, verbose=True)
+ captured = capsys.readouterr()
+ assert "Unzipping" in captured.out