diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index 0876da773805..66cc3f86afb5 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -14,7 +14,6 @@
 
 import os
-import pickle
 import unittest
 
 from transformers import AutoTokenizer
 
@@ -103,26 +102,6 @@ def test_full_tokenizer(self):
         self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
         self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
 
-    def test_pickle_mecab_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
-        self.assertIsNotNone(tokenizer)
-
-        text = "こんにちは、世界。\nこんばんは、世界。"
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
-        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
-        with open(filename, "wb") as handle:
-            pickle.dump(tokenizer, handle)
-
-        with open(filename, "rb") as handle:
-            tokenizer_new = pickle.load(handle)
-
-        tokens_loaded = tokenizer_new.tokenize(text)
-
-        self.assertListEqual(tokens, tokens_loaded)
-
     def test_mecab_full_tokenizer_with_mecab_kwargs(self):
         tokenizer = self.tokenizer_class(
             self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
@@ -198,27 +177,6 @@ def test_mecab_tokenizer_no_normalize(self):
             ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
         )
 
-    @require_sudachi_projection
-    def test_pickle_sudachi_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
-        self.assertIsNotNone(tokenizer)
-
-        text = "こんにちは、世界。\nこんばんは、世界。"
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
-        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
-        with open(filename, "wb") as handle:
-            pickle.dump(tokenizer, handle)
-
-        with open(filename, "rb") as handle:
-            tokenizer_new = pickle.load(handle)
-
-        tokens_loaded = tokenizer_new.tokenize(text)
-
-        self.assertListEqual(tokens, tokens_loaded)
-
     @require_sudachi_projection
     def test_sudachi_tokenizer_core(self):
         tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -293,27 +251,6 @@ def test_sudachi_tokenizer_trim_whitespace(self):
             ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
         )
 
-    @require_jumanpp
-    def test_pickle_jumanpp_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
-        self.assertIsNotNone(tokenizer)
-
-        text = "こんにちは、世界。\nこんばんは、世界。"
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])
-
-        filename = os.path.join(self.tmpdirname, "tokenizer.bin")
-        with open(filename, "wb") as handle:
-            pickle.dump(tokenizer, handle)
-
-        with open(filename, "rb") as handle:
-            tokenizer_new = pickle.load(handle)
-
-        tokens_loaded = tokenizer_new.tokenize(text)
-
-        self.assertListEqual(tokens, tokens_loaded)
-
     @require_jumanpp
     def test_jumanpp_tokenizer(self):
         tokenizer = JumanppTokenizer()
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 236ab21d2d2a..c0561165c8dc 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import pickle
 import shutil
 import tempfile
 import unittest
@@ -293,17 +292,6 @@ def test_tokenizer_integration(self):
             padding=False,
         )
 
-    def test_picklable(self):
-        with tempfile.NamedTemporaryFile() as f:
-            shutil.copyfile(SAMPLE_VOCAB, f.name)
-            tokenizer = CodeLlamaTokenizer(f.name, keep_accents=True)
-            pickled_tokenizer = pickle.dumps(tokenizer)
-            pickle.loads(pickled_tokenizer)
-
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
     def test_subword_regularization_tokenizer(self):
         pass
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 91a5cebaed59..913f7546e84a 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -140,10 +140,6 @@ def test_tokenizer_integration(self):
             padding=False,
         )
 
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
     def test_subword_regularization_tokenizer(self):
         pass
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 927aa54fa084..58eb1f4e86e8 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import os
-import pickle
 import shutil
 import tempfile
 import unittest
@@ -291,17 +290,6 @@ def test_tokenizer_integration(self):
             padding=False,
         )
 
-    def test_picklable(self):
-        with tempfile.NamedTemporaryFile() as f:
-            shutil.copyfile(SAMPLE_VOCAB, f.name)
-            tokenizer = LlamaTokenizer(f.name, keep_accents=True)
-            pickled_tokenizer = pickle.dumps(tokenizer)
-            pickle.loads(pickled_tokenizer)
-
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
     def test_subword_regularization_tokenizer(self):
         pass
diff --git a/tests/models/moshi/test_tokenization_moshi.py b/tests/models/moshi/test_tokenization_moshi.py
index 2b6030f6d792..5e1cdac9d65e 100644
--- a/tests/models/moshi/test_tokenization_moshi.py
+++ b/tests/models/moshi/test_tokenization_moshi.py
@@ -13,9 +13,6 @@
 # limitations under the License.
 
 import inspect
-import pickle
-import shutil
-import tempfile
 import unittest
 
 from transformers import (
@@ -171,18 +168,6 @@ def test_special_tokens_initialization(self):
 
                 self.assertTrue(special_token_id in r_output)
 
-    def test_picklable(self):
-        with tempfile.NamedTemporaryFile() as f:
-            shutil.copyfile(SAMPLE_VOCAB, f.name)
-            tokenizer = PreTrainedTokenizerFast(
-                tokenizer_object=MoshiConverter(vocab_file=f.name).converted(),
-                bos_token="<s>",
-                unk_token="<unk>",
-                eos_token="</s>",
-            )
-            pickled_tokenizer = pickle.dumps(tokenizer)
-            pickle.loads(pickled_tokenizer)
-
     def test_training_new_tokenizer(self):
         # This feature only exists for fast tokenizers
         if not self.test_rust_tokenizer:
diff --git a/tests/models/pop2piano/test_tokenization_pop2piano.py b/tests/models/pop2piano/test_tokenization_pop2piano.py
index a023421cd6de..6dc433128f38 100644
--- a/tests/models/pop2piano/test_tokenization_pop2piano.py
+++ b/tests/models/pop2piano/test_tokenization_pop2piano.py
@@ -15,8 +15,6 @@
 Please note that Pop2PianoTokenizer is too far from our usual tokenizers and thus cannot use the TokenizerTesterMixin class.
 """
 
-import os
-import pickle
 import shutil
 import tempfile
 import unittest
@@ -224,23 +222,6 @@ def test_save_and_load_tokenizer(self):
 
         shutil.rmtree(tmpdirname)
 
-    def test_pickle_tokenizer(self):
-        tmpdirname = tempfile.mkdtemp()
-
-        notes = self.get_input_notes()
-        subwords = self.tokenizer(notes)["token_ids"]
-
-        filename = os.path.join(tmpdirname, "tokenizer.bin")
-        with open(filename, "wb") as handle:
-            pickle.dump(self.tokenizer, handle)
-
-        with open(filename, "rb") as handle:
-            tokenizer_new = pickle.load(handle)
-
-        subwords_loaded = tokenizer_new(notes)["token_ids"]
-
-        self.assertListEqual(subwords, subwords_loaded)
-
     def test_padding_side_in_kwargs(self):
         tokenizer_p = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano", padding_side="left")
         self.assertEqual(tokenizer_p.padding_side, "left")
diff --git a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
index f55be02e172b..d395924da35d 100644
--- a/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
@@ -426,10 +426,6 @@ def test_training_new_tokenizer(self):
 
         self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
 
-    @unittest.skip(reason="Fails because of the hack of adding <unk> in _tokenize")
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     @unittest.skip(reason="Fails because of the hack of adding <unk> in _tokenize")
     def test_subword_regularization_tokenizer(self):
         pass
diff --git a/tests/models/siglip/test_tokenization_siglip.py b/tests/models/siglip/test_tokenization_siglip.py
index 843058c8a019..af8eb8c4ba17 100644
--- a/tests/models/siglip/test_tokenization_siglip.py
+++ b/tests/models/siglip/test_tokenization_siglip.py
@@ -207,10 +207,6 @@ def test_eos_in_input(self):
     def test_subword_regularization_tokenizer(self):
         pass
 
-    @unittest.skip(reason="SiglipTokenizer strips the punctuation")
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     # Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_special_tokens_initialization with T5->Siglip
     def test_special_tokens_initialization(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
diff --git a/tests/models/speecht5/test_tokenization_speecht5.py b/tests/models/speecht5/test_tokenization_speecht5.py
index 76554b0ebdbc..7398820cb5a5 100644
--- a/tests/models/speecht5/test_tokenization_speecht5.py
+++ b/tests/models/speecht5/test_tokenization_speecht5.py
@@ -143,10 +143,6 @@ def test_add_tokens_tokenizer(self):
         self.assertEqual(tokens[0], tokenizer.eos_token_id)
         self.assertEqual(tokens[-3], tokenizer.pad_token_id)
 
-    @unittest.skip
-    def test_pickle_subword_regularization_tokenizer(self):
-        pass
-
     @unittest.skip
     def test_subword_regularization_tokenizer(self):
         pass
diff --git a/tests/models/xglm/test_tokenization_xglm.py b/tests/models/xglm/test_tokenization_xglm.py
index c45f1747f78f..746d96e142d3 100644
--- a/tests/models/xglm/test_tokenization_xglm.py
+++ b/tests/models/xglm/test_tokenization_xglm.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
-import shutil
-import tempfile
 import unittest
 from functools import cached_property
 
@@ -141,13 +138,6 @@ def test_full_tokenizer(self):
     def big_tokenizer(self):
         return XGLMTokenizer.from_pretrained("facebook/xglm-564M")
 
-    def test_picklable_without_disk(self):
-        with tempfile.NamedTemporaryFile() as f:
-            shutil.copyfile(SAMPLE_VOCAB, f.name)
-            tokenizer = XGLMTokenizer(f.name, keep_accents=True)
-            pickled_tokenizer = pickle.dumps(tokenizer)
-            pickle.loads(pickled_tokenizer)
-
     def test_rust_and_python_full_tokenizers(self):
         if not self.test_rust_tokenizer:
             self.skipTest(reason="test_rust_tokenizer is set to False")
diff --git a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
index 3fe66d53a263..7c4a13ee3bb4 100644
--- a/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
+++ b/tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pickle
 import shutil
 import tempfile
 import unittest
@@ -215,13 +214,6 @@ def test_save_pretrained(self):
     def big_tokenizer(self):
         return XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
 
-    def test_picklable_without_disk(self):
-        with tempfile.NamedTemporaryFile() as f:
-            shutil.copyfile(SAMPLE_VOCAB, f.name)
-            tokenizer = XLMRobertaTokenizer(f.name, keep_accents=True)
-            pickled_tokenizer = pickle.dumps(tokenizer)
-            pickle.loads(pickled_tokenizer)
-
     def test_rust_and_python_full_tokenizers(self):
         if not self.test_rust_tokenizer:
             self.skipTest(reason="test_rust_tokenizer is set to False")
diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
index fe8f3c2dccc2..583ebc6b0dca 100644
--- a/tests/test_tokenization_common.py
+++ b/tests/test_tokenization_common.py
@@ -18,7 +18,6 @@
 import itertools
 import json
 import os
-import pickle
 import re
 import shutil
 import tempfile
@@ -520,28 +519,6 @@ def test_subword_regularization_tokenizer(self) -> None:
             },
         )
 
-    def test_pickle_subword_regularization_tokenizer(self) -> None:
-        if not self.test_sentencepiece:
-            self.skipTest(reason="test_sentencepiece is set to False")
-
-        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
-        # Subword regularization is only available for the slow tokenizer.
-        sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
-        tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
-        tokenizer_bin = pickle.dumps(tokenizer)
-        del tokenizer
-        tokenizer_new = pickle.loads(tokenizer_bin)
-
-        run_test_in_subprocess(
-            test_case=self,
-            target_func=_test_subword_regularization_tokenizer,
-            inputs={
-                "tokenizer": tokenizer_new,
-                "sp_model_kwargs": sp_model_kwargs,
-                "test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
-            },
-        )
-
     def test_save_sentencepiece_tokenizer(self) -> None:
         if not self.test_sentencepiece or not self.test_slow_tokenizer:
             self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
@@ -827,34 +804,6 @@ def test_save_and_load_tokenizer(self):
 
                 shutil.rmtree(tmpdirname)
 
-    def test_pickle_tokenizer(self):
-        """Google pickle __getstate__ __setstate__ if you are struggling with this."""
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                self.assertIsNotNone(tokenizer)
-
-                text = "Munich and Berlin are nice cities"
-                subwords = tokenizer.tokenize(text)
-
-                filename = os.path.join(self.tmpdirname, "tokenizer.bin")
-                with open(filename, "wb") as handle:
-                    pickle.dump(tokenizer, handle)
-
-                with open(filename, "rb") as handle:
-                    tokenizer_new = pickle.load(handle)
-
-                subwords_loaded = tokenizer_new.tokenize(text)
-
-                self.assertListEqual(subwords, subwords_loaded)
-
-    @require_tokenizers
-    def test_pickle_added_tokens(self):
-        tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
-        tok2 = pickle.loads(pickle.dumps(tok1))
-
-        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())
-
     def test_added_tokens_do_lower_case(self):
         tokenizers = self.get_tokenizers(do_lower_case=True)
         for tokenizer in tokenizers:
diff --git a/tests/tokenization/test_tokenization_utils.py b/tests/tokenization/test_tokenization_utils.py
index 9cebc52a171c..24aac3719812 100644
--- a/tests/tokenization/test_tokenization_utils.py
+++ b/tests/tokenization/test_tokenization_utils.py
@@ -16,11 +16,8 @@
 """
 
 import os
-import pickle
 import tempfile
 import unittest
-from collections.abc import Callable
-from typing import Optional
 
 import numpy as np
 
@@ -66,28 +63,6 @@ def check_tokenizer_from_pretrained(self, tokenizer_class):
             special_tok_id = tokenizer.convert_tokens_to_ids(special_tok)
             self.assertIsInstance(special_tok_id, int)
 
-    def assert_dump_and_restore(self, be_original: BatchEncoding, equal_op: Optional[Callable] = None):
-        batch_encoding_str = pickle.dumps(be_original)
-        self.assertIsNotNone(batch_encoding_str)
-
-        be_restored = pickle.loads(batch_encoding_str)
-
-        # Ensure is_fast is correctly restored
-        self.assertEqual(be_restored.is_fast, be_original.is_fast)
-
-        # Ensure encodings are potentially correctly restored
-        if be_original.is_fast:
-            self.assertIsNotNone(be_restored.encodings)
-        else:
-            self.assertIsNone(be_restored.encodings)
-
-        # Ensure the keys are the same
-        for original_v, restored_v in zip(be_original.values(), be_restored.values()):
-            if equal_op:
-                self.assertTrue(equal_op(restored_v, original_v))
-            else:
-                self.assertEqual(restored_v, original_v)
-
     @slow
     def test_pretrained_tokenizers(self):
         self.check_tokenizer_from_pretrained(GPT2Tokenizer)
@@ -96,46 +71,6 @@ def test_tensor_type_from_str(self):
         self.assertEqual(TensorType("pt"), TensorType.PYTORCH)
         self.assertEqual(TensorType("np"), TensorType.NUMPY)
 
-    @require_tokenizers
-    def test_batch_encoding_pickle(self):
-        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
-
-        # Python no tensor
-        with self.subTest("BatchEncoding (Python, return_tensors=None)"):
-            self.assert_dump_and_restore(tokenizer_p("Small example to encode"))
-
-        with self.subTest("BatchEncoding (Python, return_tensors=NUMPY)"):
-            self.assert_dump_and_restore(
-                tokenizer_p("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
-            )
-
-        with self.subTest("BatchEncoding (Rust, return_tensors=None)"):
-            self.assert_dump_and_restore(tokenizer_r("Small example to encode"))
-
-        with self.subTest("BatchEncoding (Rust, return_tensors=NUMPY)"):
-            self.assert_dump_and_restore(
-                tokenizer_r("Small example to encode", return_tensors=TensorType.NUMPY), np.array_equal
-            )
-
-    @require_torch
-    @require_tokenizers
-    def test_batch_encoding_pickle_pt(self):
-        import torch
-
-        tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
-        tokenizer_r = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased")
-
-        with self.subTest("BatchEncoding (Python, return_tensors=PYTORCH)"):
-            self.assert_dump_and_restore(
-                tokenizer_p("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
-            )
-
-        with self.subTest("BatchEncoding (Rust, return_tensors=PYTORCH)"):
-            self.assert_dump_and_restore(
-                tokenizer_r("Small example to encode", return_tensors=TensorType.PYTORCH), torch.equal
-            )
-
     @require_tokenizers
     def test_batch_encoding_is_fast(self):
         tokenizer_p = BertTokenizer.from_pretrained("google-bert/bert-base-cased")