From 3a6504d2740ef3892350ef074beffe4a1ac87a64 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Tue, 31 Dec 2024 18:36:01 +0100
Subject: [PATCH] Upgrade to PyO3 0.23 (#1708)
* Upgrade to PyO3 0.23
* Macos-12 deprecated?
* Clippy.
* Clippy auto elision.
---
.github/workflows/CI.yml | 2 +-
bindings/python/Cargo.toml | 8 +-
bindings/python/pyproject.toml | 1 +
bindings/python/src/decoders.rs | 73 ++++++++++++------
bindings/python/src/encoding.rs | 2 +-
bindings/python/src/error.rs | 5 +-
bindings/python/src/models.rs | 22 ++++--
bindings/python/src/normalizers.rs | 97 ++++++++++++++++--------
bindings/python/src/pre_tokenizers.rs | 99 ++++++++++++++++---------
bindings/python/src/processors.rs | 43 +++++++----
bindings/python/src/tokenizer.rs | 24 +++---
bindings/python/src/trainers.rs | 28 ++++---
bindings/python/src/utils/iterators.rs | 3 +-
bindings/python/src/utils/serde_pyo3.rs | 16 ++--
tokenizers/Cargo.toml | 6 +-
tokenizers/src/models/mod.rs | 2 +-
tokenizers/src/tokenizer/encoding.rs | 2 +-
tokenizers/src/tokenizer/mod.rs | 2 +-
tokenizers/src/utils/fancy.rs | 2 +-
19 files changed, 283 insertions(+), 154 deletions(-)
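
The bulk of this patch is one mechanical migration: PyO3 0.23 deprecates IntoPy::into_py and ToPyObject in favor of the fallible IntoPyObject trait, so every Py::new(py, ...)?.into_py(py) in the bindings becomes Py::new(py, ...)?.into_pyobject(py)?.into_any().into(). A minimal sketch of the pattern, with a hypothetical MyDecoder pyclass standing in for the real wrapper types:

    use pyo3::prelude::*;

    // Hypothetical pyclass standing in for PyDecoder, PyModel, etc.
    #[pyclass]
    struct MyDecoder {}

    fn as_object(py: Python<'_>) -> PyResult<PyObject> {
        // 0.22: Py::new(py, MyDecoder {})?.into_py(py)
        // 0.23: the conversion is fallible and yields a Bound, which is
        // widened to PyAny and then unbound into an owned PyObject.
        Ok(Py::new(py, MyDecoder {})?
            .into_pyobject(py)?
            .into_any()
            .into())
    }
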
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index 475b1fa23..e987716b8 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -14,9 +14,9 @@ serde = { version = "1.0", features = ["rc", "derive"] }
serde_json = "1.0"
libc = "0.2"
env_logger = "0.11"
-pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
-numpy = "0.22"
-ndarray = "0.15"
+pyo3 = { version = "0.23", features = ["abi3", "abi3-py39"] }
+numpy = "0.23"
+ndarray = "0.16"
itertools = "0.12"
[dependencies.tokenizers]
@@ -24,7 +24,7 @@ path = "../../tokenizers"
[dev-dependencies]
tempfile = "3.10"
-pyo3 = { version = "0.22", features = ["auto-initialize"] }
+pyo3 = { version = "0.23", features = ["auto-initialize"] }
[features]
defaut = ["pyo3/extension-module"]
diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml
index 681619a2f..234765f61 100644
--- a/bindings/python/pyproject.toml
+++ b/bindings/python/pyproject.toml
@@ -25,6 +25,7 @@ dynamic = [
'description',
'license',
'readme',
+ 'version',
]
dependencies = ["huggingface_hub>=0.16.4,<1.0"]
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 88e0a5398..44f33326b 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -43,22 +43,48 @@ impl PyDecoder {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match &self.decoder {
- PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+ PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_pyobject(py)?.into_any().into(),
PyDecoderWrapper::Wrapped(inner) => match &*inner.as_ref().read().unwrap() {
- DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?.into_py(py),
- DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?.into_py(py),
- DecoderWrapper::ByteFallback(_) => {
- Py::new(py, (PyByteFallbackDec {}, base))?.into_py(py)
- }
- DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?.into_py(py),
- DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?.into_py(py),
- DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?.into_py(py),
- DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?.into_py(py),
- DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?.into_py(py),
- DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?.into_py(py),
- DecoderWrapper::Sequence(_) => {
- Py::new(py, (PySequenceDecoder {}, base))?.into_py(py)
- }
+ DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::ByteFallback(_) => Py::new(py, (PyByteFallbackDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ DecoderWrapper::Sequence(_) => Py::new(py, (PySequenceDecoder {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
},
})
}
@@ -85,7 +111,7 @@ impl PyDecoder {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -484,8 +510,8 @@ impl PySequenceDecoder {
Ok((PySequenceDecoder {}, Sequence::new(decoders).into()))
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [PyList::empty_bound(py)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [PyList::empty(py)])
}
}
@@ -504,7 +530,7 @@ impl Decoder for CustomDecoder {
Python::with_gil(|py| {
let decoded = self
.inner
- .call_method_bound(py, "decode", (tokens,), None)?
+ .call_method(py, "decode", (tokens,), None)?
.extract(py)?;
Ok(decoded)
})
@@ -514,7 +540,7 @@ impl Decoder for CustomDecoder {
Python::with_gil(|py| {
let decoded = self
.inner
- .call_method_bound(py, "decode_chain", (tokens,), None)?
+ .call_method(py, "decode_chain", (tokens,), None)?
.extract(py)?;
Ok(decoded)
})
@@ -693,7 +719,12 @@ mod test {
let obj = Python::with_gil(|py| {
let py_msp = PyDecoder::new(Metaspace::default().into());
- let obj: PyObject = Py::new(py, py_msp).unwrap().into_py(py);
+ let obj: PyObject = Py::new(py, py_msp)
+ .unwrap()
+ .into_pyobject(py)
+ .unwrap()
+ .into_any()
+ .into();
obj
});
let py_seq = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj))));
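
Two other recurring changes are visible above: the *_bound constructors (PyBytes::new_bound, PyTuple::new_bound, PyList::empty_bound) lose their suffix, and PyTuple::new becomes fallible because element conversion now goes through IntoPyObject. That is why every __getnewargs__ pickling hook gains a PyResult return type. A compact sketch of the new shape (the helper name empty_args is illustrative only):

    use pyo3::prelude::*;
    use pyo3::types::{PyList, PyTuple};

    // PyTuple::new returns PyResult in 0.23, so a tuple-building
    // helper simply passes the result through.
    fn empty_args(py: Python<'_>) -> PyResult<Bound<'_, PyTuple>> {
        PyTuple::new(py, [PyList::empty(py)])
    }
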
diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs
index dcad1b037..e157b8006 100644
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -37,7 +37,7 @@ impl PyEncoding {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
diff --git a/bindings/python/src/error.rs b/bindings/python/src/error.rs
index 888c0d449..e6db1a6c1 100644
--- a/bindings/python/src/error.rs
+++ b/bindings/python/src/error.rs
@@ -1,6 +1,7 @@
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::type_object::PyTypeInfo;
+use std::ffi::CString;
use std::fmt::{Display, Formatter, Result as FmtResult};
use tokenizers::tokenizer::Result;
@@ -35,7 +36,7 @@ impl<T> ToPyResult<T> {
}
pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
- let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?;
+ let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
let full_message = format!("Deprecated in {}: {}", version, message);
- pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0)
+ pyo3::PyErr::warn(py, &deprecation_warning, &CString::new(full_message)?, 0)
}
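
PyErr::warn (formerly warn_bound) now takes its message as &CStr, which is what the new std::ffi::CString import above is for. A minimal sketch of the 0.23 call, mirroring the patched helper:

    use pyo3::prelude::*;
    use std::ffi::CString;

    fn warn_deprecated<'py>(
        py: Python<'py>,
        category: &Bound<'py, PyAny>,
        text: &str,
    ) -> PyResult<()> {
        // CString::new fails on interior NUL bytes; that error converts into a PyErr.
        PyErr::warn(py, category, &CString::new(text)?, 0)
    }
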
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 0d5c0ddcd..2f4dba825 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -35,10 +35,22 @@ impl PyModel {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match *self.model.as_ref().read().unwrap() {
- ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?.into_py(py),
- ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?.into_py(py),
- ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?.into_py(py),
- ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?.into_py(py),
+ ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
})
}
}
@@ -105,7 +117,7 @@ impl PyModel {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 38041fc94..d81596378 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -54,38 +54,73 @@ impl PyNormalizer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match self.normalizer {
- PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
+ PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PyNormalizerTypeWrapper::Single(ref inner) => match &*inner.as_ref().read().unwrap() {
- PyNormalizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+ PyNormalizerWrapper::Custom(_) => {
+ Py::new(py, base)?.into_pyobject(py)?.into_any().into()
+ }
PyNormalizerWrapper::Wrapped(ref inner) => match inner {
- NormalizerWrapper::Sequence(_) => {
- Py::new(py, (PySequence {}, base))?.into_py(py)
- }
+ NormalizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
NormalizerWrapper::BertNormalizer(_) => {
- Py::new(py, (PyBertNormalizer {}, base))?.into_py(py)
- }
- NormalizerWrapper::StripNormalizer(_) => {
- Py::new(py, (PyStrip {}, base))?.into_py(py)
- }
- NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?.into_py(py),
- NormalizerWrapper::ByteLevel(_) => {
- Py::new(py, (PyByteLevel {}, base))?.into_py(py)
- }
- NormalizerWrapper::StripAccents(_) => {
- Py::new(py, (PyStripAccents {}, base))?.into_py(py)
- }
- NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?.into_py(py),
- NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?.into_py(py),
- NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?.into_py(py),
- NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?.into_py(py),
- NormalizerWrapper::Lowercase(_) => {
- Py::new(py, (PyLowercase {}, base))?.into_py(py)
- }
- NormalizerWrapper::Precompiled(_) => {
- Py::new(py, (PyPrecompiled {}, base))?.into_py(py)
+ Py::new(py, (PyBertNormalizer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
- NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?.into_py(py),
- NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?.into_py(py),
+ NormalizerWrapper::StripNormalizer(_) => Py::new(py, (PyStrip {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::StripAccents(_) => Py::new(py, (PyStripAccents {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::Lowercase(_) => Py::new(py, (PyLowercase {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::Precompiled(_) => Py::new(py, (PyPrecompiled {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
},
},
})
@@ -114,7 +149,7 @@ impl PyNormalizer {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -371,8 +406,8 @@ impl PySequence {
))
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [PyList::empty_bound(py)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [PyList::empty(py)])
}
fn __len__(&self) -> usize {
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index e58d1bee6..fdc862302 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -49,45 +49,69 @@ impl PyPreTokenizer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match &self.pretok {
- PyPreTokenizerTypeWrapper::Sequence(_) => {
- Py::new(py, (PySequence {}, base))?.into_py(py)
- }
+ PyPreTokenizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PyPreTokenizerTypeWrapper::Single(ref inner) => {
match &*inner.as_ref().read().unwrap() {
- PyPreTokenizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+ PyPreTokenizerWrapper::Custom(_) => {
+ Py::new(py, base)?.into_pyobject(py)?.into_any().into()
+ }
PyPreTokenizerWrapper::Wrapped(inner) => match inner {
- PreTokenizerWrapper::Whitespace(_) => {
- Py::new(py, (PyWhitespace {}, base))?.into_py(py)
- }
- PreTokenizerWrapper::Split(_) => {
- Py::new(py, (PySplit {}, base))?.into_py(py)
- }
+ PreTokenizerWrapper::Whitespace(_) => Py::new(py, (PyWhitespace {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PreTokenizerWrapper::Punctuation(_) => {
- Py::new(py, (PyPunctuation {}, base))?.into_py(py)
- }
- PreTokenizerWrapper::Sequence(_) => {
- Py::new(py, (PySequence {}, base))?.into_py(py)
- }
- PreTokenizerWrapper::Metaspace(_) => {
- Py::new(py, (PyMetaspace {}, base))?.into_py(py)
+ Py::new(py, (PyPunctuation {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
+ PreTokenizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PreTokenizerWrapper::Metaspace(_) => Py::new(py, (PyMetaspace {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PreTokenizerWrapper::Delimiter(_) => {
- Py::new(py, (PyCharDelimiterSplit {}, base))?.into_py(py)
+ Py::new(py, (PyCharDelimiterSplit {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
PreTokenizerWrapper::WhitespaceSplit(_) => {
- Py::new(py, (PyWhitespaceSplit {}, base))?.into_py(py)
- }
- PreTokenizerWrapper::ByteLevel(_) => {
- Py::new(py, (PyByteLevel {}, base))?.into_py(py)
+ Py::new(py, (PyWhitespaceSplit {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
+ PreTokenizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PreTokenizerWrapper::BertPreTokenizer(_) => {
- Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py)
- }
- PreTokenizerWrapper::Digits(_) => {
- Py::new(py, (PyDigits {}, base))?.into_py(py)
+ Py::new(py, (PyBertPreTokenizer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
+ PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
PreTokenizerWrapper::UnicodeScripts(_) => {
- Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py)
+ Py::new(py, (PyUnicodeScripts {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into()
}
},
}
@@ -118,7 +142,7 @@ impl PyPreTokenizer {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -365,8 +389,8 @@ impl PySplit {
))
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [" ", "removed"])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [" ", "removed"])
}
}
@@ -398,8 +422,8 @@ impl PyCharDelimiterSplit {
))
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [" "])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [" "])
}
}
@@ -460,8 +484,8 @@ impl PySequence {
))
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [PyList::empty_bound(py)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [PyList::empty(py)])
}
fn __getitem__(self_: PyRef<'_, Self>, py: Python<'_>, index: usize) -> PyResult<Py<PyAny>> {
@@ -823,7 +847,12 @@ mod test {
let obj = Python::with_gil(|py| {
let py_wsp = PyPreTokenizer::new(Whitespace {}.into());
- let obj: PyObject = Py::new(py, py_wsp).unwrap().into_py(py);
+ let obj: PyObject = Py::new(py, py_wsp)
+ .unwrap()
+ .into_pyobject(py)
+ .unwrap()
+ .into_any()
+ .into();
obj
});
let py_seq: PyPreTokenizerWrapper =
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 1e7520aad..d558c40b0 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -41,15 +41,26 @@ impl PyPostProcessor {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match self.processor.as_ref() {
- PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?.into_py(py),
- PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?.into_py(py),
- PostProcessorWrapper::Roberta(_) => {
- Py::new(py, (PyRobertaProcessing {}, base))?.into_py(py)
- }
- PostProcessorWrapper::Template(_) => {
- Py::new(py, (PyTemplateProcessing {}, base))?.into_py(py)
- }
- PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
+ PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PostProcessorWrapper::Roberta(_) => Py::new(py, (PyRobertaProcessing {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PostProcessorWrapper::Template(_) => Py::new(py, (PyTemplateProcessing {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
})
}
}
@@ -78,7 +89,7 @@ impl PyPostProcessor {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -176,8 +187,8 @@ impl PyBertProcessing {
)
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [("", 0), ("", 0)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [("", 0), ("", 0)])
}
}
@@ -226,8 +237,8 @@ impl PyRobertaProcessing {
)
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [("", 0), ("", 0)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [("", 0), ("", 0)])
}
}
@@ -451,8 +462,8 @@ impl PySequence {
)
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- PyTuple::new_bound(py, [PyList::empty_bound(py)])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ PyTuple::new(py, [PyList::empty(py)])
}
}
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 52b86d975..73a0dbbe8 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -100,7 +100,7 @@ impl PyAddedToken {
}
pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
- let dict = PyDict::new_bound(py);
+ let dict = PyDict::new(py);
let token = self.get_token();
dict.set_item("content", token.content)?;
@@ -347,6 +347,7 @@ impl From<PyArrayUnicode> for tk::InputSequence<'_> {
}
struct PyArrayStr(Vec<String>);
+
impl FromPyObject<'_> for PyArrayStr {
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
let array = ob.downcast::<PyArray1<PyObject>>()?;
@@ -495,7 +496,7 @@ impl PyTokenizer {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -513,9 +514,12 @@ impl PyTokenizer {
}
}
- fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
- let model = PyModel::from(BPE::default()).into_py(py);
- PyTuple::new_bound(py, vec![model])
+ fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+ let model: PyObject = PyModel::from(BPE::default())
+ .into_pyobject(py)?
+ .into_any()
+ .into();
+ PyTuple::new(py, vec![model])
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
@@ -594,14 +598,14 @@ impl PyTokenizer {
token: Option<String>,
) -> PyResult<Self> {
let path = Python::with_gil(|py| -> PyResult<String> {
- let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?;
+ let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
let kwargs = [
(intern!(py, "repo_id"), identifier),
(intern!(py, "filename"), "tokenizer.json"),
(intern!(py, "revision"), &revision),
]
- .into_py_dict_bound(py);
+ .into_py_dict(py)?;
if let Some(token) = token {
kwargs.set_item(intern!(py, "token"), token)?;
}
@@ -796,7 +800,7 @@ impl PyTokenizer {
#[getter]
fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
self.tokenizer.get_truncation().map_or(Ok(None), |params| {
- let dict = PyDict::new_bound(py);
+ let dict = PyDict::new(py);
dict.set_item("max_length", params.max_length)?;
dict.set_item("stride", params.stride)?;
@@ -906,7 +910,7 @@ impl PyTokenizer {
#[getter]
fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
self.tokenizer.get_padding().map_or(Ok(None), |params| {
- let dict = PyDict::new_bound(py);
+ let dict = PyDict::new(py);
dict.set_item(
"length",
@@ -1342,7 +1346,7 @@ impl PyTokenizer {
if let Ok(s) = element.downcast::<PyString>() {
itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
} else {
- match element.iter() {
+ match element.try_iter() {
Ok(iter) => itertools::Either::Left(
iter.map(|i| i?.extract::<String>())
.collect::<Vec<_>>()
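
Two more 0.23 renames show up in tokenizer.rs: into_py_dict_bound becomes the fallible into_py_dict, and Bound::iter becomes try_iter, making the fallibility of iterating an arbitrary Python object explicit. A small sketch of the latter, assuming obj may be any iterable Python value:

    use pyo3::prelude::*;

    fn collect_strings(obj: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
        // 0.22: obj.iter()  ->  0.23: obj.try_iter()
        obj.try_iter()?
            .map(|item| item?.extract::<String>())
            .collect()
    }
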
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 45eabf0dd..a3d2d556d 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -29,16 +29,22 @@ impl PyTrainer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone();
Ok(match *self.trainer.as_ref().read().unwrap() {
- TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?.into_py(py),
- TrainerWrapper::WordPieceTrainer(_) => {
- Py::new(py, (PyWordPieceTrainer {}, base))?.into_py(py)
- }
- TrainerWrapper::WordLevelTrainer(_) => {
- Py::new(py, (PyWordLevelTrainer {}, base))?.into_py(py)
- }
- TrainerWrapper::UnigramTrainer(_) => {
- Py::new(py, (PyUnigramTrainer {}, base))?.into_py(py)
- }
+ TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ TrainerWrapper::WordPieceTrainer(_) => Py::new(py, (PyWordPieceTrainer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ TrainerWrapper::WordLevelTrainer(_) => Py::new(py, (PyWordLevelTrainer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
+ TrainerWrapper::UnigramTrainer(_) => Py::new(py, (PyUnigramTrainer {}, base))?
+ .into_pyobject(py)?
+ .into_any()
+ .into(),
})
}
}
@@ -51,7 +57,7 @@ impl PyTrainer {
e
))
})?;
- Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+ Ok(PyBytes::new(py, data.as_bytes()).into())
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
diff --git a/bindings/python/src/utils/iterators.rs b/bindings/python/src/utils/iterators.rs
index ebb9ac360..d619b93d2 100644
--- a/bindings/python/src/utils/iterators.rs
+++ b/bindings/python/src/utils/iterators.rs
@@ -65,8 +65,7 @@ where
pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> {
let py = iter.py();
let iter: Py<PyAny> = unsafe {
- Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
- .to_object(py)
+ Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?.into()
};
Ok(Self {
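
iterators.rs replaces the deprecated to_object(py) with a plain Into conversion: From<Bound<'_, PyAny>> for Py<PyAny> already does the job of detaching the reference from the GIL lifetime. A sketch, with a PyList as a stand-in for the object obtained from the FFI call:

    use pyo3::prelude::*;
    use pyo3::types::PyList;

    fn unbind(py: Python<'_>) -> Py<PyAny> {
        let bound: Bound<'_, PyAny> = PyList::empty(py).into_any();
        // 0.22: bound.to_object(py)  ->  0.23: an ordinary Into conversion
        bound.into()
    }
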
diff --git a/bindings/python/src/utils/serde_pyo3.rs b/bindings/python/src/utils/serde_pyo3.rs
index 471993614..d4f8f132e 100644
--- a/bindings/python/src/utils/serde_pyo3.rs
+++ b/bindings/python/src/utils/serde_pyo3.rs
@@ -57,7 +57,7 @@ where
Ok(serializer.output)
}
-impl<'a> ser::Serializer for &'a mut Serializer {
+impl ser::Serializer for &mut Serializer {
// The output type produced by this `Serializer` during successful
// serialization. Most serializers that produce text or binary output should
// set `Ok = ()` and serialize into an `io::Write` or buffer contained
@@ -355,7 +355,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
//
// This impl is SerializeSeq so these methods are called after `serialize_seq`
// is called on the Serializer.
-impl<'a> ser::SerializeSeq for &'a mut Serializer {
+impl ser::SerializeSeq for &mut Serializer {
// Must match the `Ok` type of the serializer.
type Ok = ();
// Must match the `Error` type of the serializer.
@@ -391,7 +391,7 @@ impl<'a> ser::SerializeSeq for &'a mut Serializer {
}
// Same thing but for tuples.
-impl<'a> ser::SerializeTuple for &'a mut Serializer {
+impl ser::SerializeTuple for &mut Serializer {
type Ok = ();
type Error = Error;
@@ -423,7 +423,7 @@ impl<'a> ser::SerializeTuple for &'a mut Serializer {
}
// Same thing but for tuple structs.
-impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
+impl ser::SerializeTupleStruct for &mut Serializer {
type Ok = ();
type Error = Error;
@@ -463,7 +463,7 @@ impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
//
// So the `end` method in this impl is responsible for closing both the `]` and
// the `}`.
-impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
+impl ser::SerializeTupleVariant for &mut Serializer {
type Ok = ();
type Error = Error;
@@ -502,7 +502,7 @@ impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
// `serialize_entry` method allows serializers to optimize for the case where
// key and value are both available simultaneously. In JSON it doesn't make a
// difference so the default behavior for `serialize_entry` is fine.
-impl<'a> ser::SerializeMap for &'a mut Serializer {
+impl ser::SerializeMap for &mut Serializer {
type Ok = ();
type Error = Error;
@@ -559,7 +559,7 @@ impl<'a> ser::SerializeMap for &'a mut Serializer {
// Structs are like maps in which the keys are constrained to be compile-time
// constant strings.
-impl<'a> ser::SerializeStruct for &'a mut Serializer {
+impl ser::SerializeStruct for &mut Serializer {
type Ok = ();
type Error = Error;
@@ -590,7 +590,7 @@ impl<'a> ser::SerializeStruct for &'a mut Serializer {
// Similar to `SerializeTupleVariant`, here the `end` method is responsible for
// closing both of the curly braces opened by `serialize_struct_variant`.
-impl<'a> ser::SerializeStructVariant for &'a mut Serializer {
+impl ser::SerializeStructVariant for &mut Serializer {
type Ok = ();
type Error = Error;
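
The serde_pyo3.rs hunks above, and the matching one-line impl rewrites in models/mod.rs, tokenizer/mod.rs and fancy.rs below, correspond to the "Clippy auto elision" bullet in the commit message: when an impl's lifetime parameter is used only in the implementing type, clippy's needless_lifetimes lint suggests dropping the explicit name. The same rewrite on a toy type, for illustration:

    struct Counter(u32);

    // Before: impl<'a> Iterator for &'a mut Counter { ... }
    // After the clippy fix the lifetime is fully elided:
    impl Iterator for &mut Counter {
        type Item = u32;
        fn next(&mut self) -> Option<u32> {
            self.0 += 1;
            Some(self.0)
        }
    }
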
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index cf3db78b4..dacb96298 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -55,7 +55,7 @@ unicode-normalization-alignments = "0.1"
unicode_categories = "0.1"
unicode-segmentation = "1.11"
indicatif = {version = "0.17", optional = true}
-itertools = "0.12"
+itertools = "0.13"
log = "0.4"
derive_builder = "0.20"
spm_precompiled = "0.1.3"
@@ -63,8 +63,8 @@ hf-hub = { version = "0.3.2", optional = true }
aho-corasick = "1.1"
paste = "1.0.14"
macro_rules_attribute = "0.2.0"
-thiserror = "1.0.49"
-fancy-regex = { version = "0.13", optional = true}
+thiserror = "2"
+fancy-regex = { version = "0.14", optional = true}
getrandom = { version = "0.2.10" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.12"
diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs
index 3ab3b495b..3a3a91adc 100644
--- a/tokenizers/src/models/mod.rs
+++ b/tokenizers/src/models/mod.rs
@@ -28,7 +28,7 @@ impl<'a> OrderedVocabIter<'a> {
}
}
-impl<'a> Serialize for OrderedVocabIter<'a> {
+impl Serialize for OrderedVocabIter<'_> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index 0693ad1e1..1732686e4 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -341,7 +341,7 @@ impl Encoding {
.step_by(offset)
.filter_map(|stop| {
let stop = stop + 1;
- let start = if stop < max_len { 0 } else { stop - max_len };
+ let start = stop.saturating_sub(max_len);
if start < stop && !end {
end = start == 0;
Some((start, stop))
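
The encoding.rs hunk is a clippy-style simplification rather than a PyO3 change: for unsigned integers, "if stop < max_len { 0 } else { stop - max_len }" is exactly stop.saturating_sub(max_len), which clamps at zero instead of underflowing. For instance:

    fn main() {
        let (stop, max_len): (usize, usize) = (3, 10);
        assert_eq!(stop.saturating_sub(max_len), 0); // plain `stop - max_len` would underflow
        assert_eq!(10usize.saturating_sub(3), 7);    // otherwise ordinary subtraction
    }
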
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index cc095c1f8..893d27430 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -233,7 +233,7 @@ impl<'s> From<&'s [String]> for InputSequence<'s> {
}
}
-impl<'s> From<Vec<String>> for InputSequence<'s> {
+impl From<Vec<String>> for InputSequence<'_> {
fn from(input: Vec<String>) -> Self {
Self::PreTokenizedOwned(Cow::Owned(input))
}
diff --git a/tokenizers/src/utils/fancy.rs b/tokenizers/src/utils/fancy.rs
index 9d44bc742..bbcf65311 100644
--- a/tokenizers/src/utils/fancy.rs
+++ b/tokenizers/src/utils/fancy.rs
@@ -22,7 +22,7 @@ impl SysRegex {
pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
-impl<'r, 't> Iterator for Matches<'r, 't> {
+impl Iterator for Matches<'_, '_> {
type Item = (usize, usize);
fn next(&mut self) -> Option<Self::Item> {