diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index 0876da773805..66cc3f86afb5 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -14,7 +14,6 @@
import os
-import pickle
import unittest
from transformers import AutoTokenizer
@@ -103,26 +102,6 @@ def test_full_tokenizer(self):
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
- def test_pickle_mecab_tokenizer(self):
- tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
- self.assertIsNotNone(tokenizer)
-
- text = "こんにちは、世界。\nこんばんは、世界。"
- tokens = tokenizer.tokenize(text)
- self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
- self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
- filename = os.path.join(self.tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(tokenizer, handle)
-
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
-
- tokens_loaded = tokenizer_new.tokenize(text)
-
- self.assertListEqual(tokens, tokens_loaded)
-
def test_mecab_full_tokenizer_with_mecab_kwargs(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
@@ -198,27 +177,6 @@ def test_mecab_tokenizer_no_normalize(self):
["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
)
- @require_sudachi_projection
- def test_pickle_sudachi_tokenizer(self):
- tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
- self.assertIsNotNone(tokenizer)
-
- text = "こんにちは、世界。\nこんばんは、世界。"
- tokens = tokenizer.tokenize(text)
- self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
- self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
- filename = os.path.join(self.tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(tokenizer, handle)
-
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
-
- tokens_loaded = tokenizer_new.tokenize(text)
-
- self.assertListEqual(tokens, tokens_loaded)
-
@require_sudachi_projection
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -293,27 +251,6 @@ def test_sudachi_tokenizer_trim_whitespace(self):
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
)
- @require_jumanpp
- def test_pickle_jumanpp_tokenizer(self):
- tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
- self.assertIsNotNone(tokenizer)
-
- text = "こんにちは、世界。\nこんばんは、世界。"
- tokens = tokenizer.tokenize(text)
- self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
- self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
- filename = os.path.join(self.tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(tokenizer, handle)
-
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
-
- tokens_loaded = tokenizer_new.tokenize(text)
-
- self.assertListEqual(tokens, tokens_loaded)
-
@require_jumanpp
def test_jumanpp_tokenizer(self):
tokenizer = JumanppTokenizer()
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 236ab21d2d2a..c0561165c8dc 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -13,7 +13,6 @@
# limitations under the License.
import os
-import pickle
import shutil
import tempfile
import unittest
@@ -293,17 +292,6 @@ def test_tokenizer_integration(self):
padding=False,
)
- def test_picklable(self):
- with tempfile.NamedTemporaryFile() as f:
- shutil.copyfile(SAMPLE_VOCAB, f.name)
- tokenizer = CodeLlamaTokenizer(f.name, keep_accents=True)
- pickled_tokenizer = pickle.dumps(tokenizer)
- pickle.loads(pickled_tokenizer)
-
- @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 91a5cebaed59..913f7546e84a 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -140,10 +140,6 @@ def test_tokenizer_integration(self):
padding=False,
)
- @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 927aa54fa084..58eb1f4e86e8 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -13,7 +13,6 @@
# limitations under the License.
import os
-import pickle
import shutil
import tempfile
import unittest
@@ -291,17 +290,6 @@ def test_tokenizer_integration(self):
padding=False,
)
- def test_picklable(self):
- with tempfile.NamedTemporaryFile() as f:
- shutil.copyfile(SAMPLE_VOCAB, f.name)
- tokenizer = LlamaTokenizer(f.name, keep_accents=True)
- pickled_tokenizer = pickle.dumps(tokenizer)
- pickle.loads(pickled_tokenizer)
-
- @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
diff --git a/tests/models/moshi/test_tokenization_moshi.py b/tests/models/moshi/test_tokenization_moshi.py
index 2b6030f6d792..5e1cdac9d65e 100644
--- a/tests/models/moshi/test_tokenization_moshi.py
+++ b/tests/models/moshi/test_tokenization_moshi.py
@@ -13,9 +13,6 @@
# limitations under the License.
import inspect
-import pickle
-import shutil
-import tempfile
import unittest
from transformers import (
@@ -171,18 +168,6 @@ def test_special_tokens_initialization(self):
self.assertTrue(special_token_id in r_output)
- def test_picklable(self):
- with tempfile.NamedTemporaryFile() as f:
- shutil.copyfile(SAMPLE_VOCAB, f.name)
- tokenizer = PreTrainedTokenizerFast(
- tokenizer_object=MoshiConverter(vocab_file=f.name).converted(),
- bos_token="",
- unk_token="",
- eos_token="",
- )
- pickled_tokenizer = pickle.dumps(tokenizer)
- pickle.loads(pickled_tokenizer)
-
def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
diff --git a/tests/models/pop2piano/test_tokenization_pop2piano.py b/tests/models/pop2piano/test_tokenization_pop2piano.py
index a023421cd6de..6dc433128f38 100644
--- a/tests/models/pop2piano/test_tokenization_pop2piano.py
+++ b/tests/models/pop2piano/test_tokenization_pop2piano.py
@@ -15,8 +15,6 @@
Please note that Pop2PianoTokenizer is too far from our usual tokenizers and thus cannot use the TokenizerTesterMixin class.
"""
-import os
-import pickle
import shutil
import tempfile
import unittest
@@ -224,23 +222,6 @@ def test_save_and_load_tokenizer(self):
shutil.rmtree(tmpdirname)
- def test_pickle_tokenizer(self):
- tmpdirname = tempfile.mkdtemp()
-
- notes = self.get_input_notes()
- subwords = self.tokenizer(notes)["token_ids"]
-
- filename = os.path.join(tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(self.tokenizer, handle)
-
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
-
- subwords_loaded = tokenizer_new(notes)["token_ids"]
-
- self.assertListEqual(subwords, subwords_loaded)
-
def test_padding_side_in_kwargs(self):
tokenizer_p = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano", padding_side="left")
self.assertEqual(tokenizer_p.padding_side, "left")
diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
index f55be02e172b..d395924da35d 100644
--- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
@@ -426,10 +426,6 @@ def test_training_new_tokenizer(self):
self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
- @unittest.skip(reason="Fails because of the hack of adding in _tokenize")
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
@unittest.skip(reason="Fails because of the hack of adding in _tokenize")
def test_subword_regularization_tokenizer(self):
pass
diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py
index 843058c8a019..af8eb8c4ba17 100644
--- a/tests/models/siglip/test_tokenization_siglip.py
+++ b/tests/models/siglip/test_tokenization_siglip.py
@@ -207,10 +207,6 @@ def test_eos_in_input(self):
def test_subword_regularization_tokenizer(self):
pass
- @unittest.skip(reason="SiglipTokenizer strips the punctuation")
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_special_tokens_initialization with T5->Siglip
def test_special_tokens_initialization(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py
index 76554b0ebdbc..7398820cb5a5 100644
--- a/tests/models/speecht5/test_tokenization_speecht5.py
+++ b/tests/models/speecht5/test_tokenization_speecht5.py
@@ -143,10 +143,6 @@ def test_add_tokens_tokenizer(self):
self.assertEqual(tokens[0], tokenizer.eos_token_id)
self.assertEqual(tokens[-3], tokenizer.pad_token_id)
- @unittest.skip
- def test_pickle_subword_regularization_tokenizer(self):
- pass
-
@unittest.skip
def test_subword_regularization_tokenizer(self):
pass
diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py
index c45f1747f78f..746d96e142d3 100644
--- a/tests/models/xglm/test_tokenization_xglm.py
+++ b/tests/models/xglm/test_tokenization_xglm.py
@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import pickle
-import shutil
-import tempfile
import unittest
from functools import cached_property
@@ -141,13 +138,6 @@ def test_full_tokenizer(self):
def big_tokenizer(self):
return XGLMTokenizer.from_pretrained("facebook/xglm-564M")
- def test_picklable_without_disk(self):
- with tempfile.NamedTemporaryFile() as f:
- shutil.copyfile(SAMPLE_VOCAB, f.name)
- tokenizer = XGLMTokenizer(f.name, keep_accents=True)
- pickled_tokenizer = pickle.dumps(tokenizer)
- pickle.loads(pickled_tokenizer)
-
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")
diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
index 3fe66d53a263..7c4a13ee3bb4 100644
--- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import pickle
import shutil
import tempfile
import unittest
@@ -215,13 +214,6 @@ def test_save_pretrained(self):
def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
- def test_picklable_without_disk(self):
- with tempfile.NamedTemporaryFile() as f:
- shutil.copyfile(SAMPLE_VOCAB, f.name)
- tokenizer = XLMRobertaTokenizer(f.name, keep_accents=True)
- pickled_tokenizer = pickle.dumps(tokenizer)
- pickle.loads(pickled_tokenizer)
-
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index fe8f3c2dccc2..583ebc6b0dca 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -18,7 +18,6 @@
import itertools
import json
import os
-import pickle
import re
import shutil
import tempfile
@@ -520,28 +519,6 @@ def test_subword_regularization_tokenizer(self) -> None:
},
)
- def test_pickle_subword_regularization_tokenizer(self) -> None:
- if not self.test_sentencepiece:
- self.skipTest(reason="test_sentencepiece is set to False")
-
- """Google pickle __getstate__ __setstate__ if you are struggling with this."""
- # Subword regularization is only available for the slow tokenizer.
- sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
- tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
- tokenizer_bin = pickle.dumps(tokenizer)
- del tokenizer
- tokenizer_new = pickle.loads(tokenizer_bin)
-
- run_test_in_subprocess(
- test_case=self,
- target_func=_test_subword_regularization_tokenizer,
- inputs={
- "tokenizer": tokenizer_new,
- "sp_model_kwargs": sp_model_kwargs,
- "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
- },
- )
-
def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
@@ -827,34 +804,6 @@ def test_save_and_load_tokenizer(self):
shutil.rmtree(tmpdirname)
- def test_pickle_tokenizer(self):
- """Google pickle __getstate__ __setstate__ if you are struggling with this."""
- tokenizers = self.get_tokenizers()
- for tokenizer in tokenizers:
- with self.subTest(f"{tokenizer.__class__.__name__}"):
- self.assertIsNotNone(tokenizer)
-
- text = "Munich and Berlin are nice cities"
- subwords = tokenizer.tokenize(text)
-
- filename = os.path.join(self.tmpdirname, "tokenizer.bin")
- with open(filename, "wb") as handle:
- pickle.dump(tokenizer, handle)
-
- with open(filename, "rb") as handle:
- tokenizer_new = pickle.load(handle)
-
- subwords_loaded = tokenizer_new.tokenize(text)
-
- self.assertListEqual(subwords, subwords_loaded)
-
- @require_tokenizers
- def test_pickle_added_tokens(self):
- tok1 = AddedToken("", rstrip=True, lstrip=True, normalized=False, single_word=True)
- tok2 = pickle.loads(pickle.dumps(tok1))
-
- self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
-
def test_added_tokens_do_lower_case(self):
tokenizers = self.get_tokenizers(do_lower_case=True)
for tokenizer in tokenizers:
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index 9cebc52a171c..24aac3719812 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -16,11 +16,8 @@
"""
import os
-import pickle
import tempfile
import unittest
-from collections.abc import Callable
-from typing import Optional
import numpy as np
@@ -66,28 +63,6 @@ def check_tokenizer_from_pretrained(self, tokenizer_class):
special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
self.assertIsInstance(special_tok_id, int)
- def assert_dump_and_restore(self, be_original: BatchEncoding, equal_op: Optional[Callable] = None):
- batch_encoding_str = pickle.dumps(be_original)
- self.assertIsNotNone(batch_encoding_str)
-
- be_restored = pickle.loads(batch_encoding_str)
-
- # Ensure is_fast is correctly restored
- self.assertEqual(be_restored.is_fast, be_original.is_fast)
-
- # Ensure encodings are potentially correctly restored
- if be_original.is_fast:
- self.assertIsNotNone(be_restored.encodings)
- else:
- self.assertIsNone(be_restored.encodings)
-
- # Ensure the keys are the same
- for original_v, restored_v in zip(be_original.values(), be_restored.values()):
- if equal_op:
- self.assertTrue(equal_op(restored_v, original_v))
- else:
- self.assertEqual(restored_v, original_v)
-
@slow
def test_pretrained_tokenizers(self):
self.check_tokenizer_from_pretrained(GPT2Tokenizer)
@@ -96,46 +71,6 @@ def test_tensor_type_from_str(self):
self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
self.assertEqual(TensorType("np"), TensorType.NUMPY)
- @require_tokenizers
- def test_batch_encoding_pickle(self):
- tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
- tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
-
- # Python no tensor
- with self.subTest("BatchEncoding (Python, return_tensors=None)"):
- self.assert_dump_and_restore(tokenizer_p("Small example to encode"))
-
- with self.subTest("BatchEncoding (Python, return_tensors=NUMPY)"):
- self.assert_dump_and_restore(
- tokenizer_p("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
- )
-
- with self.subTest("BatchEncoding (Rust, return_tensors=None)"):
- self.assert_dump_and_restore(tokenizer_r("Small example to encode"))
-
- with self.subTest("BatchEncoding (Rust, return_tensors=NUMPY)"):
- self.assert_dump_and_restore(
- tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
- )
-
- @require_torch
- @require_tokenizers
- def test_batch_encoding_pickle_pt(self):
- import torch
-
- tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
- tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
-
- with self.subTest("BatchEncoding (Python, return_tensors=PYTORCH)"):
- self.assert_dump_and_restore(
- tokenizer_p("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
- )
-
- with self.subTest("BatchEncoding (Rust, return_tensors=PYTORCH)"):
- self.assert_dump_and_restore(
- tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
- )
-
@require_tokenizers
def test_batch_encoding_is_fast(self):
tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")