From 5b237b4745dabbd54d6793ce40b06d379535a90c Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Thu, 10 Oct 2024 12:14:02 +0300
Subject: [PATCH 1/3] done

---
 .../modular_tokenizer/modular_tokenizer.py   | 11 -----------
 fuse/data/tokenizers/modular_tokenizer/op.py | 12 +++++++++---
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
index 2d57c107b..304925e21 100644
--- a/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
+++ b/fuse/data/tokenizers/modular_tokenizer/modular_tokenizer.py
@@ -868,19 +868,12 @@ def add_single_tokenizer(
         # we update the special tokens but do not save here. remember to save yourself.
         self.update_special_tokens(
             special_tokens=new_tokenize_special_tokens,
-            # save_tokenizer_path=self.cfg_raw["data"]["tokenizer"]["out_path"],
         )
 
     def add_tokenizers(
         self,
     ) -> None:
         raise Exception("Not implemented")
-        # self.build_inner_decoder()
-        # if self._max_possible_token_id is not None:
-        #     if self._get_max_mapped_id() > self._max_possible_token_id:
-        #         raise Exception(
-        #             f"tokenizer remapping resulted in IDs greater (max_id={self._get_max_mapped_id()}) than max_possible_id ({self._max_possible_token_id}). Reinitialize the modular tokenizer with larger max_possible_id"
-        #         )
 
     def _encode_single_type(
         self,
@@ -1059,10 +1052,6 @@ def encode_list(
         merged_encoding = Encoding.merge(encoded_list)
 
         max_len = self.get_expected_max_len(override_max_len=max_len)
-        # if max_len is None:
-        #     if self.max_len is not None:
-        #         max_len = self.max_len
-
         if max_len is not None:
             if len(merged_encoding) > max_len:
                 overflow_info += f"OVERALL:{len(merged_encoding)}=>{max_len}|"
diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 9ccf6650a..9fdcd8648 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -30,6 +30,7 @@ def __init__(
         validate_ends_with_eos: Optional[bool] = True,
         eos: Optional[str] = "<EOS>",
         verbose: Optional[bool] = False,
+        on_unknown_default_value: str = "warn",
         **kwargs: Any,
     ) -> None:
         """
@@ -41,6 +42,7 @@ def __init__(
             validate_ends_with_eos: during encoder request (a _call_ to the op) will make sure that it ends with the provided eos token, and raise exception otherwise. having an eos (end of sentence) token in the end is useful for multiple scenarios, for example in a generative transformer (like T5 encoder-decoder)
             verbose:
+            on_unknown_default_value: User can define the default behavior of unknown token here in the constructor. In addition, this value can be overwritten in the __call__
         """
 
         super().__init__(**kwargs)
@@ -60,6 +62,7 @@ def __init__(
 
         self._validate_ends_with_eos = validate_ends_with_eos
         self._eos = eos
+        self._on_unknown_default_value = on_unknown_default_value
 
         if self._validate_ends_with_eos:
             eos_id = self._tokenizer.token_to_id(self._eos)
@@ -211,7 +214,7 @@ def __call__(
         key_out_attention_mask: Optional[str] = None,
         convert_attention_mask_to_bool: Optional[bool] = True,
         max_seq_len: Optional[int] = None,
-        on_unknown: Optional[str] = "warn",
+        on_unknown: Optional[str] = None,
         verbose: Optional[int] = 1,
         validate_ends_with_eos: Optional[bool] = None,
         additional_caller_info_text: Optional[str] = "",
@@ -230,7 +233,7 @@ def __call__(
             key_out_attention_mask (Optional[str], optional): _description_. Defaults to None.
             convert_attention_mask_to_bool (Optional[bool], optional): _description_. Defaults to True.
             max_seq_len (Optional[int], optional): set maximum sequence len dynamically, used for both padding and truncation.. Defaults to None.
-            on_unknown (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'. Defaults to "warn".
+            on_unknown (Optional[str], optional): What happens if unknown tokens (i.e. ones mapped to <UNK>) are encountered: 'raise' or 'warn'. Defaults to "warn". The default value can be determined in the constructor itself.
             verbose (Optional[int], optional): verbosity level. 0: no notification, 1: warning notification, 2: warning with partial data, 3: warning with full data. Defaults to 1.
             validate_ends_with_eos (Optional[bool], optional): if not None, overrides self._validate_ends_with_eos
@@ -243,7 +246,6 @@ def __call__(
         Returns:
             NDict: _description_
         """
-
         data = sample_dict[key_in]
         if not isinstance(data, (list, str)):
             # data is a list of named tuples of type collections.namedtuple("TypedInput", ["input_type", "input_string", "max_len"])
@@ -263,6 +265,10 @@ def __call__(
                     f"validate_ends_with_eos was set to {validate_ends_with_eos}, but about to encode a string that does not end with {self._eos}. The str end was: {last_seq}"
                 )
 
+        if on_unknown is None:
+            # Use tokenizer instance defautl value
+            on_unknown = self._on_unknown_default_value
+
         if isinstance(data, str):
             _ans = self._tokenizer.encode(
                 data,
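Note (patch 1): a minimal usage sketch of the new constructor-level default. The
class and import path are assumed to be fuse's ModularTokenizerOp (the class
name is not visible in these hunks), and the tokenizer path below is a
hypothetical placeholder:

    from fuse.data.tokenizers.modular_tokenizer.op import ModularTokenizerOp

    tokenizer_op = ModularTokenizerOp(
        tokenizer_path="/path/to/modular_tokenizer",  # hypothetical path to tokenizer assets
        on_unknown_default_value="raise",  # instance-wide default for unknown tokens
    )
    # A __call__ that leaves on_unknown=None now falls back to "raise"
    # instead of the previously hard-coded "warn".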
From 522a657409536bb4f2c6129eb5a662cdf8a71073 Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Thu, 10 Oct 2024 12:47:55 +0300
Subject: [PATCH 2/3] support from_pretrained

---
 fuse/data/tokenizers/modular_tokenizer/op.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 9fdcd8648..73f948838 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -30,7 +30,7 @@ def __init__(
         validate_ends_with_eos: Optional[bool] = True,
         eos: Optional[str] = "<EOS>",
         verbose: Optional[bool] = False,
-        on_unknown_default_value: str = "warn",
+        on_unknown_default_value: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
         """
@@ -62,7 +62,13 @@ def __init__(
 
         self._validate_ends_with_eos = validate_ends_with_eos
         self._eos = eos
-        self._on_unknown_default_value = on_unknown_default_value
+        if on_unknown_default_value is not None:
+            self._on_unknown_default_value = on_unknown_default_value
+        else:
+            self._on_unknown_default_value = "warn"
+
+        if on_unknown_default_value not in ["warn", "raise"]:
+            raise ValueError(f"Doesn't support {on_unknown_default_value=}!")
 
         if self._validate_ends_with_eos:
             eos_id = self._tokenizer.token_to_id(self._eos)
@@ -538,6 +544,7 @@ def from_pretrained(
         identifier: str,
         pad_token: str = "<PAD>",
         max_size: Optional[int] = None,
+        on_unknown_default_value: Optional[str] = None,
         force_download: bool = False,
         resume_download: Optional[bool] = None,
         proxies: Optional[Dict] = None,
@@ -577,7 +584,10 @@ def from_pretrained(
             ) from e
 
         tokenizer_op = cls(
-            tokenizer_path=identifier, pad_token=pad_token, max_size=max_size
+            tokenizer_path=identifier,
+            pad_token=pad_token,
+            max_size=max_size,
+            on_unknown_default_value=on_unknown_default_value,
         )
 
         return tokenizer_op
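Note (patch 2): the same default can now be set when loading from a remote
identifier. A sketch, assuming a hypothetical Hugging Face repo id packaged
for this op:

    tokenizer_op = ModularTokenizerOp.from_pretrained(
        "some-org/some-modular-tokenizer",  # hypothetical identifier
        on_unknown_default_value="raise",   # forwarded to the constructor
    )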
From 467e54c243ca636d42d835a06ae992840da83101 Mon Sep 17 00:00:00 2001
From: Sagi Polaczek
Date: Sun, 13 Oct 2024 11:52:27 +0300
Subject: [PATCH 3/3] switch to default value of 'warn' instead of 'None'

---
 fuse/data/tokenizers/modular_tokenizer/op.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fuse/data/tokenizers/modular_tokenizer/op.py b/fuse/data/tokenizers/modular_tokenizer/op.py
index 8d40b2478..67d5ff4d5 100644
--- a/fuse/data/tokenizers/modular_tokenizer/op.py
+++ b/fuse/data/tokenizers/modular_tokenizer/op.py
@@ -30,7 +30,7 @@ def __init__(
         validate_ends_with_eos: Optional[bool] = True,
         eos: Optional[str] = "<EOS>",
         verbose: Optional[bool] = False,
-        on_unknown_default_value: Optional[str] = None,
+        on_unknown_default_value: str = "warn",
         **kwargs: Any,
     ) -> None:
         """
@@ -62,10 +62,7 @@ def __init__(
 
         self._validate_ends_with_eos = validate_ends_with_eos
         self._eos = eos
-        if on_unknown_default_value is not None:
-            self._on_unknown_default_value = on_unknown_default_value
-        else:
-            self._on_unknown_default_value = "warn"
+        self._on_unknown_default_value = on_unknown_default_value
 
         if on_unknown_default_value not in ["warn", "raise"]:
             raise ValueError(f"Doesn't support {on_unknown_default_value=}!")
@@ -272,7 +269,7 @@ def __call__(
                 )
 
         if on_unknown is None:
-            # Use tokenizer instance defautl value
+            # Use tokenizer instance default value
             on_unknown = self._on_unknown_default_value
 
         if isinstance(data, str):
@@ -522,7 +519,7 @@ def from_pretrained(
         identifier: str,
         pad_token: str = "<PAD>",
         max_size: Optional[int] = None,
-        on_unknown_default_value: Optional[str] = None,
+        on_unknown_default_value: str = "warn",
         force_download: bool = False,
         resume_download: Optional[bool] = None,
         proxies: Optional[Dict] = None,
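Note (patch 3): on_unknown_default_value is a plain str again, and the retained
validation rejects anything outside {"warn", "raise"}. A sketch of the expected
failure mode, assuming valid tokenizer assets exist at the hypothetical path:

    try:
        ModularTokenizerOp(
            tokenizer_path="/path/to/modular_tokenizer",  # hypothetical path
            on_unknown_default_value="ignore",  # unsupported value
        )
    except ValueError as err:
        print(err)  # Doesn't support on_unknown_default_value='ignore'!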