63 changes: 0 additions & 63 deletions tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -14,7 +14,6 @@


import os
import pickle
import unittest

from transformers import AutoTokenizer
@@ -103,26 +102,6 @@ def test_full_tokenizer(self):
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

def test_pickle_mecab_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="mecab")
self.assertIsNotNone(tokenizer)

text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)

with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)

tokens_loaded = tokenizer_new.tokenize(text)

self.assertListEqual(tokens, tokens_loaded)

def test_mecab_full_tokenizer_with_mecab_kwargs(self):
tokenizer = self.tokenizer_class(
self.vocab_file, word_tokenizer_type="mecab", mecab_kwargs={"mecab_dic": "ipadic"}
@@ -198,27 +177,6 @@ def test_mecab_tokenizer_no_normalize(self):
["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"],
)

@require_sudachi_projection
def test_pickle_sudachi_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="sudachi")
self.assertIsNotNone(tokenizer)

text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)

with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)

tokens_loaded = tokenizer_new.tokenize(text)

self.assertListEqual(tokens, tokens_loaded)

@require_sudachi_projection
def test_sudachi_tokenizer_core(self):
tokenizer = SudachiTokenizer(sudachi_dict_type="core")
@@ -293,27 +251,6 @@ def test_sudachi_tokenizer_trim_whitespace(self):
["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"],
)

@require_jumanpp
def test_pickle_jumanpp_tokenizer(self):
tokenizer = self.tokenizer_class(self.vocab_file, word_tokenizer_type="jumanpp")
self.assertIsNotNone(tokenizer)

text = "こんにちは、世界。\nこんばんは、世界。"
tokens = tokenizer.tokenize(text)
self.assertListEqual(tokens, ["こんにちは", "、", "世界", "。", "こん", "##ばんは", "、", "世界", "。"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [3, 12, 10, 14, 4, 9, 12, 10, 14])

filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)

with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)

tokens_loaded = tokenizer_new.tokenize(text)

self.assertListEqual(tokens, tokens_loaded)

@require_jumanpp
def test_jumanpp_tokenizer(self):
tokenizer = JumanppTokenizer()
12 changes: 0 additions & 12 deletions tests/models/code_llama/test_tokenization_code_llama.py
@@ -13,7 +13,6 @@
# limitations under the License.

import os
import pickle
import shutil
import tempfile
import unittest
@@ -293,17 +292,6 @@ def test_tokenizer_integration(self):
padding=False,
)

def test_picklable(self):
with tempfile.NamedTemporaryFile() as f:
shutil.copyfile(SAMPLE_VOCAB, f.name)
tokenizer = CodeLlamaTokenizer(f.name, keep_accents=True)
pickled_tokenizer = pickle.dumps(tokenizer)
pickle.loads(pickled_tokenizer)

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_pickle_subword_regularization_tokenizer(self):
pass

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
4 changes: 0 additions & 4 deletions tests/models/gemma/test_tokenization_gemma.py
@@ -140,10 +140,6 @@ def test_tokenizer_integration(self):
padding=False,
)

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_pickle_subword_regularization_tokenizer(self):
pass

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
12 changes: 0 additions & 12 deletions tests/models/llama/test_tokenization_llama.py
@@ -13,7 +13,6 @@
# limitations under the License.

import os
import pickle
import shutil
import tempfile
import unittest
@@ -291,17 +290,6 @@ def test_tokenizer_integration(self):
padding=False,
)

def test_picklable(self):
with tempfile.NamedTemporaryFile() as f:
shutil.copyfile(SAMPLE_VOCAB, f.name)
tokenizer = LlamaTokenizer(f.name, keep_accents=True)
pickled_tokenizer = pickle.dumps(tokenizer)
pickle.loads(pickled_tokenizer)

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_pickle_subword_regularization_tokenizer(self):
pass

@unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
def test_subword_regularization_tokenizer(self):
pass
15 changes: 0 additions & 15 deletions tests/models/moshi/test_tokenization_moshi.py
@@ -13,9 +13,6 @@
# limitations under the License.

import inspect
import pickle
import shutil
import tempfile
import unittest

from transformers import (
@@ -171,18 +168,6 @@ def test_special_tokens_initialization(self):

self.assertTrue(special_token_id in r_output)

def test_picklable(self):
with tempfile.NamedTemporaryFile() as f:
shutil.copyfile(SAMPLE_VOCAB, f.name)
tokenizer = PreTrainedTokenizerFast(
tokenizer_object=MoshiConverter(vocab_file=f.name).converted(),
bos_token="<s>",
unk_token="<unk>",
eos_token="</s>",
)
pickled_tokenizer = pickle.dumps(tokenizer)
pickle.loads(pickled_tokenizer)

def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
19 changes: 0 additions & 19 deletions tests/models/pop2piano/test_tokenization_pop2piano.py
@@ -15,8 +15,6 @@
Please note that Pop2PianoTokenizer is too far from our usual tokenizers and thus cannot use the TokenizerTesterMixin class.
"""

import os
import pickle
import shutil
import tempfile
import unittest
@@ -224,23 +222,6 @@ def test_save_and_load_tokenizer(self):

shutil.rmtree(tmpdirname)

def test_pickle_tokenizer(self):
tmpdirname = tempfile.mkdtemp()

notes = self.get_input_notes()
subwords = self.tokenizer(notes)["token_ids"]

filename = os.path.join(tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(self.tokenizer, handle)

with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)

subwords_loaded = tokenizer_new(notes)["token_ids"]

self.assertListEqual(subwords, subwords_loaded)

def test_padding_side_in_kwargs(self):
tokenizer_p = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano", padding_side="left")
self.assertEqual(tokenizer_p.padding_side, "left")
4 changes: 0 additions & 4 deletions tests/models/seamless_m4t/test_tokenization_seamless_m4t.py
@@ -426,10 +426,6 @@ def test_training_new_tokenizer(self):

self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)

@unittest.skip(reason="Fails because of the hack of adding <unk> in _tokenize")
def test_pickle_subword_regularization_tokenizer(self):
pass

@unittest.skip(reason="Fails because of the hack of adding <unk> in _tokenize")
def test_subword_regularization_tokenizer(self):
pass
4 changes: 0 additions & 4 deletions tests/models/siglip/test_tokenization_siglip.py
@@ -207,10 +207,6 @@ def test_eos_in_input(self):
def test_subword_regularization_tokenizer(self):
pass

@unittest.skip(reason="SiglipTokenizer strips the punctuation")
def test_pickle_subword_regularization_tokenizer(self):
pass

# Copied from tests.models.t5.test_tokenization_t5.T5TokenizationTest.test_special_tokens_initialization with T5->Siglip
def test_special_tokens_initialization(self):
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
4 changes: 0 additions & 4 deletions tests/models/speecht5/test_tokenization_speecht5.py
@@ -143,10 +143,6 @@ def test_add_tokens_tokenizer(self):
self.assertEqual(tokens[0], tokenizer.eos_token_id)
self.assertEqual(tokens[-3], tokenizer.pad_token_id)

@unittest.skip
def test_pickle_subword_regularization_tokenizer(self):
pass

@unittest.skip
def test_subword_regularization_tokenizer(self):
pass
10 changes: 0 additions & 10 deletions tests/models/xglm/test_tokenization_xglm.py
@@ -12,9 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
import shutil
import tempfile
import unittest
from functools import cached_property

@@ -141,13 +138,6 @@ def test_full_tokenizer(self):
def big_tokenizer(self):
return XGLMTokenizer.from_pretrained("facebook/xglm-564M")

def test_picklable_without_disk(self):
with tempfile.NamedTemporaryFile() as f:
shutil.copyfile(SAMPLE_VOCAB, f.name)
tokenizer = XGLMTokenizer(f.name, keep_accents=True)
pickled_tokenizer = pickle.dumps(tokenizer)
pickle.loads(pickled_tokenizer)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")
8 changes: 0 additions & 8 deletions tests/models/xlm_roberta/test_tokenization_xlm_roberta.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle
import shutil
import tempfile
import unittest
@@ -215,13 +214,6 @@ def test_save_pretrained(self):
def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

def test_picklable_without_disk(self):
with tempfile.NamedTemporaryFile() as f:
shutil.copyfile(SAMPLE_VOCAB, f.name)
tokenizer = XLMRobertaTokenizer(f.name, keep_accents=True)
pickled_tokenizer = pickle.dumps(tokenizer)
pickle.loads(pickled_tokenizer)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")
51 changes: 0 additions & 51 deletions tests/test_tokenization_common.py
@@ -18,7 +18,6 @@
import itertools
import json
import os
import pickle
import re
import shutil
import tempfile
@@ -520,28 +519,6 @@ def test_subword_regularization_tokenizer(self) -> None:
},
)

def test_pickle_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
self.skipTest(reason="test_sentencepiece is set to False")

"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
# Subword regularization is only available for the slow tokenizer.
sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)
tokenizer_bin = pickle.dumps(tokenizer)
del tokenizer
tokenizer_new = pickle.loads(tokenizer_bin)

run_test_in_subprocess(
test_case=self,
target_func=_test_subword_regularization_tokenizer,
inputs={
"tokenizer": tokenizer_new,
"sp_model_kwargs": sp_model_kwargs,
"test_sentencepiece_ignore_case": self.test_sentencepiece_ignore_case,
},
)

def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
@@ -827,34 +804,6 @@ def test_save_and_load_tokenizer(self):

shutil.rmtree(tmpdirname)

def test_pickle_tokenizer(self):
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
self.assertIsNotNone(tokenizer)

text = "Munich and Berlin are nice cities"
subwords = tokenizer.tokenize(text)

filename = os.path.join(self.tmpdirname, "tokenizer.bin")
with open(filename, "wb") as handle:
pickle.dump(tokenizer, handle)

with open(filename, "rb") as handle:
tokenizer_new = pickle.load(handle)

subwords_loaded = tokenizer_new.tokenize(text)

self.assertListEqual(subwords, subwords_loaded)

@require_tokenizers
def test_pickle_added_tokens(self):
tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
tok2 = pickle.loads(pickle.dumps(tok1))

self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

def test_added_tokens_do_lower_case(self):
tokenizers = self.get_tokenizers(do_lower_case=True)
for tokenizer in tokenizers:
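Most of the tests removed across these files exercise a single pattern: serialize a tokenizer with pickle, load it back, and assert that tokenization is unchanged. Below is a minimal sketch of that round trip, kept in memory instead of writing a tokenizer.bin file; AutoTokenizer and the "bert-base-uncased" checkpoint are illustrative assumptions and do not come from this diff, while the sample sentence is the one used in the deleted common test.

import pickle
import unittest

from transformers import AutoTokenizer


class PickleRoundTripSketch(unittest.TestCase):
    # Sketch of the round-trip pattern the deleted tests covered; checkpoint chosen only for illustration.
    def test_pickle_round_trip(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        text = "Munich and Berlin are nice cities"

        # Serialize the tokenizer to bytes and restore it in memory.
        restored = pickle.loads(pickle.dumps(tokenizer))

        # The restored tokenizer should tokenize identically to the original.
        self.assertListEqual(tokenizer.tokenize(text), restored.tokenize(text))


if __name__ == "__main__":
    unittest.main()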