File CVE-2024-5206.patch of Package python-scikit-learn.34254
Index: scikit-learn-0.23.2/sklearn/feature_extraction/text.py
===================================================================
--- scikit-learn-0.23.2.orig/sklearn/feature_extraction/text.py
+++ scikit-learn-0.23.2/sklearn/feature_extraction/text.py
@@ -951,15 +951,6 @@ class CountVectorizer(_VectorizerMixin,
True if a fixed vocabulary of term to indices mapping
is provided by the user
- stop_words_ : set
- Terms that were ignored because they either:
-
- - occurred in too many documents (`max_df`)
- - occurred in too few documents (`min_df`)
- - were cut off by feature selection (`max_features`).
-
- This is only available if no vocabulary was given.
-
Examples
--------
>>> from sklearn.feature_extraction.text import CountVectorizer
@@ -994,11 +985,6 @@ class CountVectorizer(_VectorizerMixin,
--------
HashingVectorizer, TfidfVectorizer
- Notes
- -----
- The ``stop_words_`` attribute can get large and increase the model size
- when pickling. This attribute is provided only for introspection and can
- be safely removed using delattr or set to None before pickling.
"""
@_deprecate_positional_args
def __init__(self, *, input='content', encoding='utf-8',
@@ -1076,18 +1062,16 @@ class CountVectorizer(_VectorizerMixin,
mask = new_mask
new_indices = np.cumsum(mask) - 1 # maps old indices to new
- removed_terms = set()
for term, old_index in list(vocabulary.items()):
if mask[old_index]:
vocabulary[term] = new_indices[old_index]
else:
del vocabulary[term]
- removed_terms.add(term)
kept_indices = np.where(mask)[0]
if len(kept_indices) == 0:
raise ValueError("After pruning, no terms remain. Try a lower"
" min_df or a higher max_df.")
- return X[:, kept_indices], removed_terms
+ return X[:, kept_indices]
def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False
@@ -1214,10 +1198,10 @@ class CountVectorizer(_VectorizerMixin,
"max_df corresponds to < documents than min_df")
if max_features is not None:
X = self._sort_features(X, vocabulary)
- X, self.stop_words_ = self._limit_features(X, vocabulary,
- max_doc_count,
- min_doc_count,
- max_features)
+ X = self._limit_features(X, vocabulary,
+ max_doc_count,
+ min_doc_count,
+ max_features)
if max_features is None:
X = self._sort_features(X, vocabulary)
self.vocabulary_ = vocabulary
@@ -1681,15 +1665,6 @@ class TfidfVectorizer(CountVectorizer):
The inverse document frequency (IDF) vector; only defined
if ``use_idf`` is True.
- stop_words_ : set
- Terms that were ignored because they either:
-
- - occurred in too many documents (`max_df`)
- - occurred in too few documents (`min_df`)
- - were cut off by feature selection (`max_features`).
-
- This is only available if no vocabulary was given.
-
See Also
--------
CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
@@ -1697,12 +1672,6 @@ class TfidfVectorizer(CountVectorizer):
TfidfTransformer : Performs the TF-IDF transformation from a provided
matrix of counts.
- Notes
- -----
- The ``stop_words_`` attribute can get large and increase the model size
- when pickling. This attribute is provided only for introspection and can
- be safely removed using delattr or set to None before pickling.
-
Examples
--------
>>> from sklearn.feature_extraction.text import TfidfVectorizer
Index: scikit-learn-0.23.2/sklearn/feature_extraction/tests/test_text.py
===================================================================
--- scikit-learn-0.23.2.orig/sklearn/feature_extraction/tests/test_text.py
+++ scikit-learn-0.23.2/sklearn/feature_extraction/tests/test_text.py
@@ -612,14 +612,11 @@ def test_feature_names():
@pytest.mark.parametrize('Vectorizer', (CountVectorizer, TfidfVectorizer))
def test_vectorizer_max_features(Vectorizer):
expected_vocabulary = {'burger', 'beer', 'salad', 'pizza'}
- expected_stop_words = {'celeri', 'tomato', 'copyright', 'coke',
- 'sparkling', 'water', 'the'}
# test bounded number of extracted features
vectorizer = Vectorizer(max_df=0.6, max_features=4)
vectorizer.fit(ALL_FOOD_DOCS)
assert set(vectorizer.vocabulary_) == expected_vocabulary
- assert vectorizer.stop_words_ == expected_stop_words
def test_count_vectorizer_max_features():
@@ -654,21 +651,16 @@ def test_vectorizer_max_df():
vect.fit(test_data)
assert 'a' in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
- assert len(vect.stop_words_) == 0
vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
vect.fit(test_data)
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
- assert 'a' in vect.stop_words_
- assert len(vect.stop_words_) == 2
vect.max_df = 1
vect.fit(test_data)
assert 'a' not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
- assert 'a' in vect.stop_words_
- assert len(vect.stop_words_) == 2
def test_vectorizer_min_df():
@@ -677,21 +669,16 @@ def test_vectorizer_min_df():
vect.fit(test_data)
assert 'a' in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
- assert len(vect.stop_words_) == 0
vect.min_df = 2
vect.fit(test_data)
assert 'c' not in vect.vocabulary_.keys() # {bcdt} ignored
assert len(vect.vocabulary_.keys()) == 2 # {ae} remain
- assert 'c' in vect.stop_words_
- assert len(vect.stop_words_) == 4
vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
vect.fit(test_data)
assert 'c' not in vect.vocabulary_.keys() # {bcdet} ignored
assert len(vect.vocabulary_.keys()) == 1 # {a} remains
- assert 'c' in vect.stop_words_
- assert len(vect.stop_words_) == 5
def test_count_binary_occurrences():
@@ -965,28 +952,6 @@ def test_countvectorizer_vocab_dicts_whe
assert cv.get_feature_names() == unpickled_cv.get_feature_names()
-def test_stop_words_removal():
- # Ensure that deleting the stop_words_ attribute doesn't affect transform
-
- fitted_vectorizers = (
- TfidfVectorizer().fit(JUNK_FOOD_DOCS),
- CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
- CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
- )
-
- for vect in fitted_vectorizers:
- vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- vect.stop_words_ = None
- stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- delattr(vect, 'stop_words_')
- stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
-
- assert_array_equal(stop_None_transform, vect_transform)
- assert_array_equal(stop_del_transform, vect_transform)
-
-
def test_pickling_transformer():
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
orig = TfidfTransformer().fit(X)