From 65728ed30c7616b384f5f29a1b1617babb33c5e3 Mon Sep 17 00:00:00 2001 From: Florian Maas Date: Fri, 12 Jul 2024 09:47:15 +0200 Subject: [PATCH] [`chore`] Enable ruff rules `UP006` and `UP007` to improve type hints. (#2830) * enable isort improve ci/cd improve ci/cd improve ci/cd fix isort try fix * fix * Enable UP006 and UP007 * fix --- pyproject.toml | 16 +- sentence_transformers/LoggingHandler.py | 2 + sentence_transformers/SentenceTransformer.py | 188 +++++++++--------- sentence_transformers/__init__.py | 2 + .../cross_encoder/CrossEncoder.py | 76 +++---- .../cross_encoder/__init__.py | 2 + .../evaluation/CEBinaryAccuracyEvaluator.py | 9 +- .../CEBinaryClassificationEvaluator.py | 9 +- .../evaluation/CECorrelationEvaluator.py | 7 +- .../cross_encoder/evaluation/CEF1Evaluator.py | 9 +- .../evaluation/CERerankingEvaluator.py | 7 +- .../evaluation/CESoftmaxAccuracyEvaluator.py | 7 +- .../cross_encoder/evaluation/__init__.py | 2 + sentence_transformers/data_collator.py | 8 +- .../datasets/DenoisingAutoEncoderDataset.py | 4 +- .../datasets/NoDuplicatesDataLoader.py | 2 + .../datasets/ParallelSentencesDataset.py | 5 +- .../datasets/SentenceLabelDataset.py | 5 +- .../datasets/SentencesDataset.py | 4 +- sentence_transformers/datasets/__init__.py | 2 + .../BinaryClassificationEvaluator.py | 16 +- .../EmbeddingSimilarityEvaluator.py | 20 +- .../InformationRetrievalEvaluator.py | 32 +-- .../evaluation/LabelAccuracyEvaluator.py | 6 +- .../evaluation/MSEEvaluator.py | 12 +- .../evaluation/MSEEvaluatorFromDataFrame.py | 12 +- .../evaluation/ParaphraseMiningEvaluator.py | 14 +- .../evaluation/RerankingEvaluator.py | 10 +- .../evaluation/SentenceEvaluator.py | 10 +- .../evaluation/SequentialEvaluator.py | 6 +- .../evaluation/SimilarityFunction.py | 2 + .../evaluation/TranslationEvaluator.py | 12 +- .../evaluation/TripletEvaluator.py | 18 +- sentence_transformers/evaluation/__init__.py | 2 + sentence_transformers/fit_mixin.py | 24 ++- .../losses/AdaptiveLayerLoss.py | 22 +- sentence_transformers/losses/AnglELoss.py | 2 + .../losses/BatchAllTripletLoss.py | 6 +- .../losses/BatchHardSoftMarginTripletLoss.py | 6 +- .../losses/BatchHardTripletLoss.py | 6 +- .../losses/BatchSemiHardTripletLoss.py | 6 +- .../losses/CachedGISTEmbedLoss.py | 32 +-- .../CachedMultipleNegativesRankingLoss.py | 32 +-- sentence_transformers/losses/CoSENTLoss.py | 8 +- .../losses/ContrastiveLoss.py | 8 +- .../losses/ContrastiveTensionLoss.py | 8 +- .../losses/CosineSimilarityLoss.py | 8 +- .../losses/DenoisingAutoEncoderLoss.py | 10 +- sentence_transformers/losses/GISTEmbedLoss.py | 8 +- sentence_transformers/losses/MSELoss.py | 6 +- sentence_transformers/losses/MarginMSELoss.py | 6 +- .../losses/Matryoshka2dLoss.py | 10 +- .../losses/MatryoshkaLoss.py | 14 +- .../losses/MegaBatchMarginLoss.py | 8 +- .../losses/MultipleNegativesRankingLoss.py | 8 +- .../MultipleNegativesSymmetricRankingLoss.py | 8 +- .../losses/OnlineContrastiveLoss.py | 6 +- sentence_transformers/losses/SoftmaxLoss.py | 8 +- sentence_transformers/losses/TripletLoss.py | 8 +- sentence_transformers/losses/__init__.py | 2 + sentence_transformers/model_card.py | 96 ++++----- sentence_transformers/model_card_templates.py | 2 + sentence_transformers/models/Asym.py | 10 +- sentence_transformers/models/BoW.py | 16 +- sentence_transformers/models/CLIPModel.py | 6 +- sentence_transformers/models/CNN.py | 9 +- sentence_transformers/models/Dense.py | 5 +- sentence_transformers/models/Dropout.py | 5 +- sentence_transformers/models/LSTM.py | 5 +- 
sentence_transformers/models/LayerNorm.py | 5 +- sentence_transformers/models/Normalize.py | 4 +- sentence_transformers/models/Pooling.py | 8 +- sentence_transformers/models/Transformer.py | 22 +- .../models/WeightedLayerPooling.py | 5 +- .../models/WordEmbeddings.py | 5 +- sentence_transformers/models/WordWeights.py | 7 +- sentence_transformers/models/__init__.py | 2 + .../models/tokenizer/PhraseTokenizer.py | 6 +- .../models/tokenizer/WhitespaceTokenizer.py | 6 +- .../models/tokenizer/WordTokenizer.py | 6 +- .../models/tokenizer/__init__.py | 2 + sentence_transformers/quantization.py | 30 +-- sentence_transformers/readers/InputExample.py | 4 +- .../readers/LabelSentenceReader.py | 2 + .../readers/NLIDataReader.py | 2 + .../readers/PairedFilesReader.py | 2 + .../readers/STSDataReader.py | 2 + .../readers/TripletReader.py | 2 + sentence_transformers/readers/__init__.py | 2 + sentence_transformers/sampler.py | 22 +- sentence_transformers/similarity_functions.py | 14 +- sentence_transformers/trainer.py | 93 +++++---- sentence_transformers/training_args.py | 7 +- sentence_transformers/util.py | 76 +++---- tests/conftest.py | 2 + .../test_group_by_label_batch_sampler.py | 2 + .../test_no_duplicates_batch_sampler.py | 2 + .../test_round_robin_batch_sampler.py | 2 + tests/test_cmnrl.py | 7 +- tests/test_compute_embeddings.py | 2 + tests/test_cross_encoder.py | 14 +- tests/test_evaluator.py | 2 + tests/test_image_embeddings.py | 2 + tests/test_model_card_data.py | 2 + tests/test_multi_process.py | 4 +- tests/test_pretrained_stsb.py | 5 +- tests/test_sentence_transformer.py | 10 +- tests/test_train_stsb.py | 20 +- tests/test_trainer.py | 2 + tests/test_util.py | 2 + 110 files changed, 782 insertions(+), 596 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index ffa7f93bf..636930e17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ line-length = 119 fix = true [tool.ruff.lint] -select = ["E", "F", "W", "I"] +select = ["E", "F", "W", "I", "UP006", "UP007"] # Skip `E731` (do not assign a lambda expression, use a def) ignore = [ # LineTooLong @@ -72,11 +72,21 @@ ignore = [ ] [tool.ruff.lint.per-file-ignores] -# Ignore `E402` (import violations) in all examples -"examples/**" = ["E402"] +"examples/**" = [ + # Ignore `E402` (import violations) in all examples + "E402", + # Ignore missing required imports + "I002" + ] +"docs/**" = [ + # Ignore missing required imports + "I002" + ] [tool.ruff.lint.isort] known-third-party = ["datasets"] +required-imports = ["from __future__ import annotations"] + [tool.pytest.ini_options] testpaths = [ diff --git a/sentence_transformers/LoggingHandler.py b/sentence_transformers/LoggingHandler.py index bda6f3e00..29f2ac54a 100644 --- a/sentence_transformers/LoggingHandler.py +++ b/sentence_transformers/LoggingHandler.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import logging import tqdm diff --git a/sentence_transformers/SentenceTransformer.py b/sentence_transformers/SentenceTransformer.py index eea804139..6dc544e75 100644 --- a/sentence_transformers/SentenceTransformer.py +++ b/sentence_transformers/SentenceTransformer.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import importlib import json @@ -12,7 +14,7 @@ from contextlib import contextmanager from multiprocessing import Queue from pathlib import Path -from typing import Any, Callable, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union, overload +from typing import Any, Callable, Iterable, Iterator, Literal, overload import numpy as np import 
torch @@ -143,23 +145,23 @@ class SentenceTransformer(nn.Sequential, FitMixin): def __init__( self, - model_name_or_path: Optional[str] = None, - modules: Optional[Iterable[nn.Module]] = None, - device: Optional[str] = None, - prompts: Optional[Dict[str, str]] = None, - default_prompt_name: Optional[str] = None, - similarity_fn_name: Optional[Union[str, SimilarityFunction]] = None, - cache_folder: Optional[str] = None, + model_name_or_path: str | None = None, + modules: Iterable[nn.Module] | None = None, + device: str | None = None, + prompts: dict[str, str] | None = None, + default_prompt_name: str | None = None, + similarity_fn_name: str | SimilarityFunction | None = None, + cache_folder: str | None = None, trust_remote_code: bool = False, - revision: Optional[str] = None, + revision: str | None = None, local_files_only: bool = False, - token: Optional[Union[bool, str]] = None, - use_auth_token: Optional[Union[bool, str]] = None, - truncate_dim: Optional[int] = None, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, - model_card_data: Optional[SentenceTransformerModelCardData] = None, + token: bool | str | None = None, + use_auth_token: bool | str | None = None, + truncate_dim: int | None = None, + model_kwargs: dict[str, Any] | None = None, + tokenizer_kwargs: dict[str, Any] | None = None, + config_kwargs: dict[str, Any] | None = None, + model_card_data: SentenceTransformerModelCardData | None = None, ) -> None: # Note: self._load_sbert_model can also update `self.prompts` and `self.default_prompt_name` self.prompts = prompts or {} @@ -355,11 +357,11 @@ def __init__( def encode( self, sentences: str, - prompt_name: Optional[str] = ..., - prompt: Optional[str] = ..., + prompt_name: str | None = ..., + prompt: str | None = ..., batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = ..., + show_progress_bar: bool | None = ..., + output_value: Literal["sentence_embedding", "token_embeddings"] | None = ..., precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = ..., convert_to_numpy: Literal[False] = ..., convert_to_tensor: Literal[False] = ..., @@ -370,12 +372,12 @@ def encode( @overload def encode( self, - sentences: Union[str, List[str]], - prompt_name: Optional[str] = ..., - prompt: Optional[str] = ..., + sentences: str | list[str], + prompt_name: str | None = ..., + prompt: str | None = ..., batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = ..., + show_progress_bar: bool | None = ..., + output_value: Literal["sentence_embedding", "token_embeddings"] | None = ..., precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = ..., convert_to_numpy: Literal[True] = ..., convert_to_tensor: Literal[False] = ..., @@ -386,12 +388,12 @@ def encode( @overload def encode( self, - sentences: Union[str, List[str]], - prompt_name: Optional[str] = ..., - prompt: Optional[str] = ..., + sentences: str | list[str], + prompt_name: str | None = ..., + prompt: str | None = ..., batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = ..., + show_progress_bar: bool | None = ..., + output_value: Literal["sentence_embedding", "token_embeddings"] | None = ..., precision: Literal["float32", "int8", "uint8", "binary", 
"ubinary"] = ..., convert_to_numpy: bool = ..., convert_to_tensor: Literal[True] = ..., @@ -402,33 +404,33 @@ def encode( @overload def encode( self, - sentences: Union[List[str], np.ndarray], - prompt_name: Optional[str] = ..., - prompt: Optional[str] = ..., + sentences: list[str] | np.ndarray, + prompt_name: str | None = ..., + prompt: str | None = ..., batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = ..., + show_progress_bar: bool | None = ..., + output_value: Literal["sentence_embedding", "token_embeddings"] | None = ..., precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = ..., convert_to_numpy: Literal[False] = ..., convert_to_tensor: Literal[False] = ..., device: str = ..., normalize_embeddings: bool = ..., - ) -> List[Tensor]: ... + ) -> list[Tensor]: ... def encode( self, - sentences: Union[str, List[str]], - prompt_name: Optional[str] = None, - prompt: Optional[str] = None, + sentences: str | list[str], + prompt_name: str | None = None, + prompt: str | None = None, batch_size: int = 32, - show_progress_bar: Optional[bool] = None, - output_value: Optional[Literal["sentence_embedding", "token_embeddings"]] = "sentence_embedding", + show_progress_bar: bool | None = None, + output_value: Literal["sentence_embedding", "token_embeddings"] | None = "sentence_embedding", precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", convert_to_numpy: bool = True, convert_to_tensor: bool = False, device: str = None, normalize_embeddings: bool = False, - ) -> Union[List[Tensor], np.ndarray, Tensor]: + ) -> list[Tensor] | np.ndarray | Tensor: """ Computes sentence embeddings. @@ -637,7 +639,7 @@ def encode( return all_embeddings @property - def similarity_fn_name(self) -> Optional[str]: + def similarity_fn_name(self) -> str | None: """Return the name of the similarity function used by :meth:`SentenceTransformer.similarity` and :meth:`SentenceTransformer.similarity_pairwise`. Returns: @@ -652,7 +654,7 @@ def similarity_fn_name(self) -> Optional[str]: return self._similarity_fn_name @similarity_fn_name.setter - def similarity_fn_name(self, value: Union[str, SimilarityFunction]) -> None: + def similarity_fn_name(self, value: str | SimilarityFunction) -> None: if isinstance(value, SimilarityFunction): value = value.value self._similarity_fn_name = value @@ -668,7 +670,7 @@ def similarity(self, embeddings1: Tensor, embeddings2: Tensor) -> Tensor: ... def similarity(self, embeddings1: ndarray, embeddings2: ndarray) -> Tensor: ... @property - def similarity(self) -> Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: + def similarity(self) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: """ Compute the similarity between two collections of embeddings. The output will be a matrix with the similarity scores between all embeddings from the first parameter and all embeddings from the second parameter. This @@ -717,7 +719,7 @@ def similarity_pairwise(self, embeddings1: Tensor, embeddings2: Tensor) -> Tenso def similarity_pairwise(self, embeddings1: ndarray, embeddings2: ndarray) -> Tensor: ... @property - def similarity_pairwise(self) -> Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: + def similarity_pairwise(self) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: """ Compute the similarity between two collections of embeddings. The output will be a vector with the similarity scores between each pair of embeddings. 
@@ -753,8 +755,8 @@ def similarity_pairwise(self) -> Callable[[Union[Tensor, ndarray], Union[Tensor, return self._similarity_pairwise def start_multi_process_pool( - self, target_devices: List[str] = None - ) -> Dict[Literal["input", "output", "processes"], Any]: + self, target_devices: list[str] = None + ) -> dict[Literal["input", "output", "processes"], Any]: """ Starts a multi-process pool to process the encoding with several independent processes via :meth:`SentenceTransformer.encode_multi_process `. @@ -802,7 +804,7 @@ def start_multi_process_pool( return {"input": input_queue, "output": output_queue, "processes": processes} @staticmethod - def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], Any]) -> None: + def stop_multi_process_pool(pool: dict[Literal["input", "output", "processes"], Any]) -> None: """ Stops all processes started with start_multi_process_pool. @@ -824,13 +826,13 @@ def stop_multi_process_pool(pool: Dict[Literal["input", "output", "processes"], def encode_multi_process( self, - sentences: List[str], - pool: Dict[Literal["input", "output", "processes"], Any], - prompt_name: Optional[str] = None, - prompt: Optional[str] = None, + sentences: list[str], + pool: dict[Literal["input", "output", "processes"], Any], + prompt_name: str | None = None, + prompt: str | None = None, batch_size: int = 32, chunk_size: int = None, - show_progress_bar: Optional[bool] = None, + show_progress_bar: bool | None = None, precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] = "float32", normalize_embeddings: bool = False, ) -> np.ndarray: @@ -966,7 +968,7 @@ def set_pooling_include_prompt(self, include_prompt: bool) -> None: module.include_prompt = include_prompt break - def get_max_seq_length(self) -> Optional[int]: + def get_max_seq_length(self) -> int | None: """ Returns the maximal sequence length that the model accepts. Longer inputs will be truncated. @@ -978,7 +980,7 @@ def get_max_seq_length(self) -> Optional[int]: return None - def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]) -> Dict[str, Tensor]: + def tokenize(self, texts: list[str] | list[dict] | list[tuple[str, str]]) -> dict[str, Tensor]: """ Tokenizes the texts. @@ -991,10 +993,10 @@ def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]) - """ return self._first_module().tokenize(texts) - def get_sentence_features(self, *features) -> Dict[Literal["sentence_embedding"], torch.Tensor]: + def get_sentence_features(self, *features) -> dict[Literal["sentence_embedding"], torch.Tensor]: return self._first_module().get_sentence_features(*features) - def get_sentence_embedding_dimension(self) -> Optional[int]: + def get_sentence_embedding_dimension(self) -> int | None: """ Returns the number of dimensions in the output of :meth:`SentenceTransformer.encode `. @@ -1014,7 +1016,7 @@ def get_sentence_embedding_dimension(self) -> Optional[int]: return output_dim @contextmanager - def truncate_sentence_embeddings(self, truncate_dim: Optional[int]) -> Iterator[None]: + def truncate_sentence_embeddings(self, truncate_dim: int | None) -> Iterator[None]: """ In this context, :meth:`SentenceTransformer.encode ` outputs sentence embeddings truncated at dimension ``truncate_dim``. 
@@ -1054,9 +1056,9 @@ def _last_module(self) -> torch.nn.Module: def save( self, path: str, - model_name: Optional[str] = None, + model_name: str | None = None, create_model_card: bool = True, - train_datasets: Optional[List[str]] = None, + train_datasets: list[str] | None = None, safe_serialization: bool = True, ) -> None: """ @@ -1122,9 +1124,9 @@ def save( def save_pretrained( self, path: str, - model_name: Optional[str] = None, + model_name: str | None = None, create_model_card: bool = True, - train_datasets: Optional[List[str]] = None, + train_datasets: list[str] | None = None, safe_serialization: bool = True, ) -> None: """ @@ -1148,7 +1150,7 @@ def save_pretrained( ) def _create_model_card( - self, path: str, model_name: Optional[str] = None, train_datasets: Optional[List[str]] = "deprecated" + self, path: str, model_name: str | None = None, train_datasets: list[str] | None = "deprecated" ) -> None: """ Create an automatic model and stores it in the specified path. If no training was done and the loaded model @@ -1195,15 +1197,15 @@ def _create_model_card( def save_to_hub( self, repo_id: str, - organization: Optional[str] = None, - token: Optional[str] = None, - private: Optional[bool] = None, + organization: str | None = None, + token: str | None = None, + private: bool | None = None, safe_serialization: bool = True, commit_message: str = "Add new SentenceTransformer model.", - local_model_path: Optional[str] = None, + local_model_path: str | None = None, exist_ok: bool = False, replace_model_card: bool = False, - train_datasets: Optional[List[str]] = None, + train_datasets: list[str] | None = None, ) -> str: """ DEPRECATED, use `push_to_hub` instead. @@ -1259,14 +1261,14 @@ def save_to_hub( def push_to_hub( self, repo_id: str, - token: Optional[str] = None, - private: Optional[bool] = None, + token: str | None = None, + private: bool | None = None, safe_serialization: bool = True, commit_message: str = "Add new SentenceTransformer model.", - local_model_path: Optional[str] = None, + local_model_path: str | None = None, exist_ok: bool = False, replace_model_card: bool = False, - train_datasets: Optional[List[str]] = None, + train_datasets: list[str] | None = None, ) -> str: """ Uploads all elements of this Sentence Transformer to a new HuggingFace Hub repository. @@ -1317,7 +1319,7 @@ def push_to_hub( # This isn't expected to ever be reached. return folder_url - def _text_length(self, text: Union[List[int], List[List[int]]]) -> int: + def _text_length(self, text: list[int] | list[list[int]]) -> int: """ Help function to get the length for the input text. 
Text can be either a list of ints (which means a single text as input), or a tuple of list of ints @@ -1333,7 +1335,7 @@ def _text_length(self, text: Union[List[int], List[List[int]]]) -> int: else: return sum([len(t) for t in text]) # Sum of length of individual strings - def evaluate(self, evaluator: SentenceEvaluator, output_path: str = None) -> Union[Dict[str, float], float]: + def evaluate(self, evaluator: SentenceEvaluator, output_path: str = None) -> dict[str, float] | float: """ Evaluate the model based on an evaluator @@ -1351,15 +1353,15 @@ def evaluate(self, evaluator: SentenceEvaluator, output_path: str = None) -> Uni def _load_auto_model( self, model_name_or_path: str, - token: Optional[Union[bool, str]], - cache_folder: Optional[str], - revision: Optional[str] = None, + token: bool | str | None, + cache_folder: str | None, + revision: str | None = None, trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, - ) -> List[nn.Module]: + model_kwargs: dict[str, Any] | None = None, + tokenizer_kwargs: dict[str, Any] | None = None, + config_kwargs: dict[str, Any] | None = None, + ) -> list[nn.Module]: """ Creates a simple Transformer + Mean Pooling model and returns the modules @@ -1405,15 +1407,15 @@ def _load_auto_model( def _load_sbert_model( self, model_name_or_path: str, - token: Optional[Union[bool, str]], - cache_folder: Optional[str], - revision: Optional[str] = None, + token: bool | str | None, + cache_folder: str | None, + revision: str | None = None, trust_remote_code: bool = False, local_files_only: bool = False, - model_kwargs: Optional[Dict[str, Any]] = None, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - config_kwargs: Optional[Dict[str, Any]] = None, - ) -> Dict[str, nn.Module]: + model_kwargs: dict[str, Any] | None = None, + tokenizer_kwargs: dict[str, Any] | None = None, + config_kwargs: dict[str, Any] | None = None, + ) -> dict[str, nn.Module]: """ Loads a full SentenceTransformer model using the modules.json file. 
@@ -1595,7 +1597,7 @@ def device(self) -> device: except StopIteration: # For nn.DataParallel compatibility in PyTorch 1.5 - def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + def find_tensor_attributes(module: nn.Module) -> list[tuple[str, Tensor]]: tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] return tuples @@ -1654,18 +1656,18 @@ def _target_device(self) -> torch.device: return self.device @_target_device.setter - def _target_device(self, device: Optional[Union[int, str, torch.device]] = None) -> None: + def _target_device(self, device: int | str | torch.device | None = None) -> None: self.to(device) @property - def _no_split_modules(self) -> List[str]: + def _no_split_modules(self) -> list[str]: try: return self._first_module()._no_split_modules except AttributeError: return [] @property - def _keys_to_ignore_on_save(self) -> List[str]: + def _keys_to_ignore_on_save(self) -> list[str]: try: return self._first_module()._keys_to_ignore_on_save except AttributeError: diff --git a/sentence_transformers/__init__.py b/sentence_transformers/__init__.py index a31ad2657..79a390ca4 100644 --- a/sentence_transformers/__init__.py +++ b/sentence_transformers/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + __version__ = "3.1.0.dev0" __MODEL_HUB_ORGANIZATION__ = "sentence-transformers" diff --git a/sentence_transformers/cross_encoder/CrossEncoder.py b/sentence_transformers/cross_encoder/CrossEncoder.py index 31984a9e1..806d3a816 100644 --- a/sentence_transformers/cross_encoder/CrossEncoder.py +++ b/sentence_transformers/cross_encoder/CrossEncoder.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import logging import os from functools import wraps -from typing import Callable, Dict, List, Literal, Optional, Tuple, Type, Union, overload +from typing import Callable, Literal, overload import numpy as np import torch @@ -58,11 +60,11 @@ def __init__( model_name: str, num_labels: int = None, max_length: int = None, - device: Optional[str] = None, - tokenizer_args: Dict = None, - automodel_args: Dict = None, + device: str | None = None, + tokenizer_args: dict = None, + automodel_args: dict = None, trust_remote_code: bool = False, - revision: Optional[str] = None, + revision: str | None = None, local_files_only: bool = False, default_activation_function=None, classifier_dropout: float = None, @@ -127,7 +129,7 @@ def __init__( else: self.default_activation_function = nn.Sigmoid() if self.config.num_labels == 1 else nn.Identity() - def smart_batching_collate(self, batch: List[InputExample]) -> Tuple[BatchEncoding, Tensor]: + def smart_batching_collate(self, batch: list[InputExample]) -> tuple[BatchEncoding, Tensor]: texts = [[] for _ in range(len(batch[0].texts))] labels = [] @@ -149,7 +151,7 @@ def smart_batching_collate(self, batch: List[InputExample]) -> Tuple[BatchEncodi return tokenized, labels - def smart_batching_collate_text_only(self, batch: List[InputExample]) -> BatchEncoding: + def smart_batching_collate_text_only(self, batch: list[InputExample]) -> BatchEncoding: texts = [[] for _ in range(len(batch[0]))] for example in batch: @@ -174,8 +176,8 @@ def fit( activation_fct=nn.Identity(), scheduler: str = "WarmupLinear", warmup_steps: int = 10000, - optimizer_class: Type[Optimizer] = torch.optim.AdamW, - optimizer_params: Dict[str, object] = {"lr": 2e-5}, + optimizer_class: type[Optimizer] = torch.optim.AdamW, + optimizer_params: dict[str, object] = {"lr": 2e-5}, weight_decay: float = 0.01, evaluation_steps: int = 0, 
output_path: str = None, @@ -305,12 +307,12 @@ def fit( @overload def predict( self, - sentences: Union[Tuple[str, str], List[str]], + sentences: tuple[str, str] | list[str], batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., + show_progress_bar: bool | None = ..., num_workers: int = ..., - activation_fct: Optional[Callable] = ..., - apply_softmax: Optional[bool] = ..., + activation_fct: Callable | None = ..., + apply_softmax: bool | None = ..., convert_to_numpy: Literal[False] = ..., convert_to_tensor: Literal[False] = ..., ) -> torch.Tensor: ... @@ -318,12 +320,12 @@ def predict( @overload def predict( self, - sentences: Union[List[Tuple[str, str]], List[List[str]], Tuple[str, str], List[str]], + sentences: list[tuple[str, str]] | list[list[str]] | tuple[str, str] | list[str], batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., + show_progress_bar: bool | None = ..., num_workers: int = ..., - activation_fct: Optional[Callable] = ..., - apply_softmax: Optional[bool] = ..., + activation_fct: Callable | None = ..., + apply_softmax: bool | None = ..., convert_to_numpy: Literal[True] = True, convert_to_tensor: Literal[False] = False, ) -> np.ndarray: ... @@ -331,12 +333,12 @@ def predict( @overload def predict( self, - sentences: Union[List[Tuple[str, str]], List[List[str]], Tuple[str, str], List[str]], + sentences: list[tuple[str, str]] | list[list[str]] | tuple[str, str] | list[str], batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., + show_progress_bar: bool | None = ..., num_workers: int = ..., - activation_fct: Optional[Callable] = ..., - apply_softmax: Optional[bool] = ..., + activation_fct: Callable | None = ..., + apply_softmax: bool | None = ..., convert_to_numpy: bool = ..., convert_to_tensor: Literal[True] = ..., ) -> torch.Tensor: ... @@ -344,27 +346,27 @@ def predict( @overload def predict( self, - sentences: Union[List[Tuple[str, str]], List[List[str]]], + sentences: list[tuple[str, str]] | list[list[str]], batch_size: int = ..., - show_progress_bar: Optional[bool] = ..., + show_progress_bar: bool | None = ..., num_workers: int = ..., - activation_fct: Optional[Callable] = ..., - apply_softmax: Optional[bool] = ..., + activation_fct: Callable | None = ..., + apply_softmax: bool | None = ..., convert_to_numpy: Literal[False] = ..., convert_to_tensor: Literal[False] = ..., - ) -> List[torch.Tensor]: ... + ) -> list[torch.Tensor]: ... def predict( self, - sentences: Union[List[Tuple[str, str]], List[List[str]], Tuple[str, str], List[str]], + sentences: list[tuple[str, str]] | list[list[str]] | tuple[str, str] | list[str], batch_size: int = 32, - show_progress_bar: Optional[bool] = None, + show_progress_bar: bool | None = None, num_workers: int = 0, - activation_fct: Optional[Callable] = None, - apply_softmax: Optional[bool] = False, + activation_fct: Callable | None = None, + apply_softmax: bool | None = False, convert_to_numpy: bool = True, convert_to_tensor: bool = False, - ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]: + ) -> list[torch.Tensor] | np.ndarray | torch.Tensor: """ Performs predictions with the CrossEncoder on the given sentence pairs. 
@@ -451,8 +453,8 @@ def predict( def rank( self, query: str, - documents: List[str], - top_k: Optional[int] = None, + documents: list[str], + top_k: int | None = None, return_documents: bool = False, batch_size: int = 32, show_progress_bar: bool = None, @@ -461,7 +463,7 @@ def rank( apply_softmax=False, convert_to_numpy: bool = True, convert_to_tensor: bool = False, - ) -> List[Dict[Literal["corpus_id", "score", "text"], Union[int, float, str]]]: + ) -> list[dict[Literal["corpus_id", "score", "text"], int | float | str]]: """ Performs ranking with the CrossEncoder on the given query and documents. Returns a sorted list with the document indices and scores. @@ -572,10 +574,10 @@ def push_to_hub( self, repo_id: str, *, - commit_message: Optional[str] = None, - private: Optional[bool] = None, + commit_message: str | None = None, + private: bool | None = None, safe_serialization: bool = True, - tags: Optional[List[str]] = None, + tags: list[str] | None = None, **kwargs, ) -> str: if isinstance(tags, str): diff --git a/sentence_transformers/cross_encoder/__init__.py b/sentence_transformers/cross_encoder/__init__.py index ef2ec5fae..ebd40785a 100644 --- a/sentence_transformers/cross_encoder/__init__.py +++ b/sentence_transformers/cross_encoder/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .CrossEncoder import CrossEncoder __all__ = ["CrossEncoder"] diff --git a/sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py b/sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py index b0fb33723..7892d52b3 100644 --- a/sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import List import numpy as np @@ -22,8 +23,8 @@ class CEBinaryAccuracyEvaluator: def __init__( self, - sentence_pairs: List[List[str]], - labels: List[int], + sentence_pairs: list[list[str]], + labels: list[int], name: str = "", threshold: float = 0.5, write_csv: bool = True, @@ -38,7 +39,7 @@ def __init__( self.write_csv = write_csv @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentence_pairs = [] labels = [] diff --git a/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py b/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py index da6e3a36d..26b5ac4cf 100644 --- a/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import List import numpy as np from sklearn.metrics import average_precision_score @@ -20,8 +21,8 @@ class CEBinaryClassificationEvaluator: def __init__( self, - sentence_pairs: List[List[str]], - labels: List[int], + sentence_pairs: list[list[str]], + labels: list[int], name: str = "", show_progress_bar: bool = False, write_csv: bool = True, @@ -55,7 +56,7 @@ def __init__( self.write_csv = write_csv @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentence_pairs = [] labels = [] diff --git a/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py 
b/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py index 80f8f48df..90bbeaa08 100644 --- a/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import List from scipy.stats import pearsonr, spearmanr @@ -17,7 +18,7 @@ class CECorrelationEvaluator: and the gold score. """ - def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: str = "", write_csv: bool = True): + def __init__(self, sentence_pairs: list[list[str]], scores: list[float], name: str = "", write_csv: bool = True): self.sentence_pairs = sentence_pairs self.scores = scores self.name = name @@ -27,7 +28,7 @@ def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: s self.write_csv = write_csv @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentence_pairs = [] scores = [] diff --git a/sentence_transformers/cross_encoder/evaluation/CEF1Evaluator.py b/sentence_transformers/cross_encoder/evaluation/CEF1Evaluator.py index c28ab631e..5bdedc5c8 100644 --- a/sentence_transformers/cross_encoder/evaluation/CEF1Evaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CEF1Evaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import List import numpy as np from sklearn.metrics import f1_score @@ -31,8 +32,8 @@ class CEF1Evaluator: def __init__( self, - sentence_pairs: List[List[str]], - labels: List[int], + sentence_pairs: list[list[str]], + labels: list[int], *, batch_size: int = 32, show_progress_bar: bool = False, @@ -67,7 +68,7 @@ def __init__( self.csv_headers = ["epoch", "steps"] + [metric_name for metric_name, _ in self.f1_callables] @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): """ Create an instance of CEF1Evaluator from a list of InputExample objects. diff --git a/sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py b/sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py index 8552fc9d9..2b913c091 100644 --- a/sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import Optional import numpy as np from sklearn.metrics import ndcg_score @@ -22,9 +23,7 @@ class CERerankingEvaluator: of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 
""" - def __init__( - self, samples, at_k: int = 10, name: str = "", write_csv: bool = True, mrr_at_k: Optional[int] = None - ): + def __init__(self, samples, at_k: int = 10, name: str = "", write_csv: bool = True, mrr_at_k: int | None = None): self.samples = samples self.name = name if mrr_at_k is not None: diff --git a/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py b/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py index e3973d952..05aa1beac 100644 --- a/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py +++ b/sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import csv import logging import os -from typing import List import numpy as np @@ -18,7 +19,7 @@ class CESoftmaxAccuracyEvaluator: accuracy of the predict class vs. the gold labels. """ - def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str = "", write_csv: bool = True): + def __init__(self, sentence_pairs: list[list[str]], labels: list[int], name: str = "", write_csv: bool = True): self.sentence_pairs = sentence_pairs self.labels = labels self.name = name @@ -28,7 +29,7 @@ def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str self.write_csv = write_csv @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentence_pairs = [] labels = [] diff --git a/sentence_transformers/cross_encoder/evaluation/__init__.py b/sentence_transformers/cross_encoder/evaluation/__init__.py index ac176ff83..fc0a49e92 100644 --- a/sentence_transformers/cross_encoder/evaluation/__init__.py +++ b/sentence_transformers/cross_encoder/evaluation/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .CEBinaryAccuracyEvaluator import CEBinaryAccuracyEvaluator from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator from .CECorrelationEvaluator import CECorrelationEvaluator diff --git a/sentence_transformers/data_collator.py b/sentence_transformers/data_collator.py index 1f843b778..24314ec36 100644 --- a/sentence_transformers/data_collator.py +++ b/sentence_transformers/data_collator.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List +from typing import Any, Callable import torch @@ -13,9 +15,9 @@ class SentenceTransformerDataCollator: """ tokenize_fn: Callable - valid_label_columns: List[str] = field(default_factory=lambda: ["label", "score"]) + valid_label_columns: list[str] = field(default_factory=lambda: ["label", "score"]) - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]: + def __call__(self, features: list[dict[str, Any]]) -> dict[str, torch.Tensor]: columns = list(features[0].keys()) # We should always be able to return a loss, label or not: diff --git a/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py b/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py index 70b6bc0e4..340f4abd2 100644 --- a/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py +++ b/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py @@ -1,4 +1,4 @@ -from typing import List +from __future__ import annotations import numpy as np from torch.utils.data import Dataset @@ -19,7 +19,7 @@ class DenoisingAutoEncoderDataset(Dataset): with noise, e.g. 
deleted words """ - def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)): + def __init__(self, sentences: list[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)): if not is_nltk_available(): raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__)) diff --git a/sentence_transformers/datasets/NoDuplicatesDataLoader.py b/sentence_transformers/datasets/NoDuplicatesDataLoader.py index f910183c1..c8504a2be 100644 --- a/sentence_transformers/datasets/NoDuplicatesDataLoader.py +++ b/sentence_transformers/datasets/NoDuplicatesDataLoader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import math import random diff --git a/sentence_transformers/datasets/ParallelSentencesDataset.py b/sentence_transformers/datasets/ParallelSentencesDataset.py index 6b64179dc..397356f8e 100644 --- a/sentence_transformers/datasets/ParallelSentencesDataset.py +++ b/sentence_transformers/datasets/ParallelSentencesDataset.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import gzip import logging import random -from typing import List from torch.utils.data import Dataset @@ -99,7 +100,7 @@ def load_data( def add_dataset( self, - parallel_sentences: List[List[str]], + parallel_sentences: list[list[str]], weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128, diff --git a/sentence_transformers/datasets/SentenceLabelDataset.py b/sentence_transformers/datasets/SentenceLabelDataset.py index c716f82ed..1dcfb8a2a 100644 --- a/sentence_transformers/datasets/SentenceLabelDataset.py +++ b/sentence_transformers/datasets/SentenceLabelDataset.py @@ -1,5 +1,6 @@ +from __future__ import annotations + import logging -from typing import List import numpy as np from torch.utils.data import IterableDataset @@ -23,7 +24,7 @@ class SentenceLabelDataset(IterableDataset): by the samples drawn per label. """ - def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False): + def __init__(self, examples: list[InputExample], samples_per_label: int = 2, with_replacement: bool = False): """ Creates a LabelSampler for a SentenceLabelDataset. diff --git a/sentence_transformers/datasets/SentencesDataset.py b/sentence_transformers/datasets/SentencesDataset.py index f7795a8fc..57c10f508 100644 --- a/sentence_transformers/datasets/SentencesDataset.py +++ b/sentence_transformers/datasets/SentencesDataset.py @@ -1,4 +1,4 @@ -from typing import List +from __future__ import annotations from torch.utils.data import Dataset @@ -12,7 +12,7 @@ class SentencesDataset(Dataset): and then passing it to the DataLoader, you can pass the list of InputExamples directly to the dataset loader. 
""" - def __init__(self, examples: List[InputExample], model: SentenceTransformer): + def __init__(self, examples: list[InputExample], model: SentenceTransformer): self.examples = examples def __getitem__(self, item): diff --git a/sentence_transformers/datasets/__init__.py b/sentence_transformers/datasets/__init__.py index 33cc755d5..85091d137 100644 --- a/sentence_transformers/datasets/__init__.py +++ b/sentence_transformers/datasets/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset from .NoDuplicatesDataLoader import NoDuplicatesDataLoader from .ParallelSentencesDataset import ParallelSentencesDataset diff --git a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py index a4910c299..0596e23df 100644 --- a/sentence_transformers/evaluation/BinaryClassificationEvaluator.py +++ b/sentence_transformers/evaluation/BinaryClassificationEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING import numpy as np from sklearn.metrics import average_precision_score @@ -94,14 +96,14 @@ class BinaryClassificationEvaluator(SentenceEvaluator): def __init__( self, - sentences1: List[str], - sentences2: List[str], - labels: List[int], + sentences1: list[str], + sentences2: list[str], + labels: list[int], name: str = "", batch_size: int = 32, show_progress_bar: bool = False, write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): self.sentences1 = sentences1 self.sentences2 = sentences2 @@ -140,7 +142,7 @@ def __init__( self.csv_headers.append(f"{v}_{m}") @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentences1 = [] sentences2 = [] scores = [] @@ -153,7 +155,7 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs): def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: """ Compute the evaluation metrics for the given model. 
diff --git a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py index bf9729631..f5a036486 100644 --- a/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py +++ b/sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union +from typing import TYPE_CHECKING, Literal import numpy as np from scipy.stats import pearsonr, spearmanr @@ -59,16 +61,16 @@ class EmbeddingSimilarityEvaluator(SentenceEvaluator): def __init__( self, - sentences1: List[str], - sentences2: List[str], - scores: List[float], + sentences1: list[str], + sentences2: list[str], + scores: list[float], batch_size: int = 16, - main_similarity: Optional[Union[str, SimilarityFunction]] = None, + main_similarity: str | SimilarityFunction | None = None, name: str = "", show_progress_bar: bool = False, write_csv: bool = True, - precision: Optional[Literal["float32", "int8", "uint8", "binary", "ubinary"]] = None, - truncate_dim: Optional[int] = None, + precision: Literal["float32", "int8", "uint8", "binary", "ubinary"] | None = None, + truncate_dim: int | None = None, ): """ Constructs an evaluator based for the dataset. @@ -129,7 +131,7 @@ def __init__( ] @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): sentences1 = [] sentences2 = [] scores = [] @@ -142,7 +144,7 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs): def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" diff --git a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py index 522e8d7ac..b66b2df29 100644 --- a/sentence_transformers/evaluation/InformationRetrievalEvaluator.py +++ b/sentence_transformers/evaluation/InformationRetrievalEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import heapq import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Set, Union +from typing import TYPE_CHECKING, Callable import numpy as np import torch @@ -112,25 +114,25 @@ class InformationRetrievalEvaluator(SentenceEvaluator): def __init__( self, - queries: Dict[str, str], # qid => query - corpus: Dict[str, str], # cid => doc - relevant_docs: Dict[str, Set[str]], # qid => Set[cid] + queries: dict[str, str], # qid => query + corpus: dict[str, str], # cid => doc + relevant_docs: dict[str, set[str]], # qid => Set[cid] corpus_chunk_size: int = 50000, - mrr_at_k: List[int] = [10], - ndcg_at_k: List[int] = [10], - accuracy_at_k: List[int] = [1, 3, 5, 10], - precision_recall_at_k: List[int] = [1, 3, 5, 10], - map_at_k: List[int] = [100], + mrr_at_k: list[int] = [10], + ndcg_at_k: list[int] = [10], + accuracy_at_k: list[int] = [1, 3, 5, 10], + precision_recall_at_k: list[int] = [1, 3, 5, 10], + map_at_k: list[int] = [100], show_progress_bar: bool = False, batch_size: int = 32, name: str = "", write_csv: bool = True, - truncate_dim: Optional[int] = None, - score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = { + truncate_dim: int | None 
= None, + score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] = { SimilarityFunction.COSINE.value: cos_sim, SimilarityFunction.DOT_PRODUCT.value: dot_score, }, # Score function, higher=more similar - main_score_function: Optional[Union[str, SimilarityFunction]] = None, + main_score_function: str | SimilarityFunction | None = None, ) -> None: """ Initializes the InformationRetrievalEvaluator. @@ -206,7 +208,7 @@ def __init__( def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs - ) -> Dict[str, float]: + ) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" @@ -276,7 +278,7 @@ def __call__( def compute_metrices( self, model: "SentenceTransformer", corpus_model=None, corpus_embeddings: Tensor = None - ) -> Dict[str, float]: + ) -> dict[str, float]: if corpus_model is None: corpus_model = model @@ -363,7 +365,7 @@ def compute_metrices( return scores - def compute_metrics(self, queries_result_list: List[object]): + def compute_metrics(self, queries_result_list: list[object]): # Init score computation values num_hits_at_k = {k: 0 for k in self.accuracy_at_k} precisions_at_k = {k: [] for k in self.precision_recall_at_k} diff --git a/sentence_transformers/evaluation/LabelAccuracyEvaluator.py b/sentence_transformers/evaluation/LabelAccuracyEvaluator.py index 4031cef9d..714a57dda 100644 --- a/sentence_transformers/evaluation/LabelAccuracyEvaluator.py +++ b/sentence_transformers/evaluation/LabelAccuracyEvaluator.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import csv import logging import os -from typing import TYPE_CHECKING, Dict +from typing import TYPE_CHECKING import torch from torch.utils.data import DataLoader @@ -46,7 +48,7 @@ def __init__(self, dataloader: DataLoader, name: str = "", softmax_model=None, w def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: model.eval() total = 0 correct = 0 diff --git a/sentence_transformers/evaluation/MSEEvaluator.py b/sentence_transformers/evaluation/MSEEvaluator.py index 6fa300bb1..376205bc8 100644 --- a/sentence_transformers/evaluation/MSEEvaluator.py +++ b/sentence_transformers/evaluation/MSEEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator @@ -68,14 +70,14 @@ class MSEEvaluator(SentenceEvaluator): def __init__( self, - source_sentences: List[str], - target_sentences: List[str], + source_sentences: list[str], + target_sentences: list[str], teacher_model=None, show_progress_bar: bool = False, batch_size: int = 32, name: str = "", write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): super().__init__() self.truncate_dim = truncate_dim @@ -96,7 +98,7 @@ def __init__( self.write_csv = write_csv self.primary_metric = "negative_mse" - def __call__(self, model: "SentenceTransformer", output_path: str = None, epoch=-1, steps=-1) -> Dict[str, float]: + def __call__(self, model: "SentenceTransformer", output_path: str = None, epoch=-1, steps=-1) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" diff --git a/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py 
b/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py index 6147877ca..2cb788035 100644 --- a/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py +++ b/sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING import numpy as np @@ -38,13 +40,13 @@ class MSEEvaluatorFromDataFrame(SentenceEvaluator): def __init__( self, - dataframe: List[Dict[str, str]], + dataframe: list[dict[str, str]], teacher_model: "SentenceTransformer", - combinations: List[Tuple[str, str]], + combinations: list[tuple[str, str]], batch_size: int = 8, name: str = "", write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): super().__init__() self.combinations = combinations @@ -85,7 +87,7 @@ def __init__( def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: model.eval() mse_scores = [] diff --git a/sentence_transformers/evaluation/ParaphraseMiningEvaluator.py b/sentence_transformers/evaluation/ParaphraseMiningEvaluator.py index 7728e1393..1af7b196c 100644 --- a/sentence_transformers/evaluation/ParaphraseMiningEvaluator.py +++ b/sentence_transformers/evaluation/ParaphraseMiningEvaluator.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import csv import logging import os from collections import defaultdict from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple +from typing import TYPE_CHECKING from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator from sentence_transformers.util import paraphrase_mining @@ -62,9 +64,9 @@ class ParaphraseMiningEvaluator(SentenceEvaluator): def __init__( self, - sentences_map: Dict[str, str], - duplicates_list: List[Tuple[str, str]] = None, - duplicates_dict: Dict[str, Dict[str, bool]] = None, + sentences_map: dict[str, str], + duplicates_list: list[tuple[str, str]] = None, + duplicates_dict: dict[str, dict[str, bool]] = None, add_transitive_closure: bool = False, query_chunk_size: int = 5000, corpus_chunk_size: int = 100000, @@ -74,7 +76,7 @@ def __init__( batch_size: int = 16, name: str = "", write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): """ Initializes the ParaphraseMiningEvaluator. 
@@ -157,7 +159,7 @@ def __init__( def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" diff --git a/sentence_transformers/evaluation/RerankingEvaluator.py b/sentence_transformers/evaluation/RerankingEvaluator.py index 8fa6f93bd..7fd9dca04 100644 --- a/sentence_transformers/evaluation/RerankingEvaluator.py +++ b/sentence_transformers/evaluation/RerankingEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Callable, Dict, Optional +from typing import TYPE_CHECKING, Callable import numpy as np import torch @@ -51,8 +53,8 @@ def __init__( batch_size: int = 64, show_progress_bar: bool = False, use_batched_encoding: bool = True, - truncate_dim: Optional[int] = None, - mrr_at_k: Optional[int] = None, + truncate_dim: int | None = None, + mrr_at_k: int | None = None, ): super().__init__() self.samples = samples @@ -91,7 +93,7 @@ def __init__( def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: """ Evaluates the model on the dataset and returns the evaluation metrics. diff --git a/sentence_transformers/evaluation/SentenceEvaluator.py b/sentence_transformers/evaluation/SentenceEvaluator.py index 8744047b7..41f9752da 100644 --- a/sentence_transformers/evaluation/SentenceEvaluator.py +++ b/sentence_transformers/evaluation/SentenceEvaluator.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import re -from typing import TYPE_CHECKING, Any, Dict, Union +from typing import TYPE_CHECKING, Any if TYPE_CHECKING: from sentence_transformers.SentenceTransformer import SentenceTransformer @@ -27,7 +29,7 @@ def __init__(self): def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Union[float, Dict[str, float]]: + ) -> float | dict[str, float]: """ This is called during training to evaluate the model. It returns a score for the evaluation with a higher score indicating a better result. 
@@ -52,7 +54,7 @@ def __call__( """ pass - def prefix_name_to_metrics(self, metrics: Dict[str, float], name: str) -> Dict[str, float]: + def prefix_name_to_metrics(self, metrics: dict[str, float], name: str) -> dict[str, float]: if not name: return metrics metrics = {name + "_" + key: value for key, value in metrics.items()} @@ -60,7 +62,7 @@ def prefix_name_to_metrics(self, metrics: Dict[str, float], name: str) -> Dict[s self.primary_metric = name + "_" + self.primary_metric return metrics - def store_metrics_in_model_card_data(self, model: "SentenceTransformer", metrics: Dict[str, Any]) -> None: + def store_metrics_in_model_card_data(self, model: "SentenceTransformer", metrics: dict[str, Any]) -> None: model.model_card_data.set_evaluation_metrics(self, metrics) @property diff --git a/sentence_transformers/evaluation/SequentialEvaluator.py b/sentence_transformers/evaluation/SequentialEvaluator.py index 5613b55c8..87552a9d7 100644 --- a/sentence_transformers/evaluation/SequentialEvaluator.py +++ b/sentence_transformers/evaluation/SequentialEvaluator.py @@ -1,4 +1,6 @@ -from typing import TYPE_CHECKING, Dict, Iterable +from __future__ import annotations + +from typing import TYPE_CHECKING, Iterable from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator @@ -37,7 +39,7 @@ def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function= def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: evaluations = [] scores = [] for evaluator_idx, evaluator in enumerate(self.evaluators): diff --git a/sentence_transformers/evaluation/SimilarityFunction.py b/sentence_transformers/evaluation/SimilarityFunction.py index f149b30a3..cabaa6cbd 100644 --- a/sentence_transformers/evaluation/SimilarityFunction.py +++ b/sentence_transformers/evaluation/SimilarityFunction.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from sentence_transformers.similarity_functions import SimilarityFunction __all__ = ["SimilarityFunction"] diff --git a/sentence_transformers/evaluation/TranslationEvaluator.py b/sentence_transformers/evaluation/TranslationEvaluator.py index a344b3bb5..079bcd2dd 100644 --- a/sentence_transformers/evaluation/TranslationEvaluator.py +++ b/sentence_transformers/evaluation/TranslationEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional +from typing import TYPE_CHECKING import numpy as np import torch @@ -55,14 +57,14 @@ class TranslationEvaluator(SentenceEvaluator): def __init__( self, - source_sentences: List[str], - target_sentences: List[str], + source_sentences: list[str], + target_sentences: list[str], show_progress_bar: bool = False, batch_size: int = 16, name: str = "", print_wrong_matches: bool = False, write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): """ Constructs an evaluator based for the dataset @@ -101,7 +103,7 @@ def __init__( def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" diff --git a/sentence_transformers/evaluation/TripletEvaluator.py b/sentence_transformers/evaluation/TripletEvaluator.py index 7b26c4a27..49578bf77 100644 --- 
a/sentence_transformers/evaluation/TripletEvaluator.py +++ b/sentence_transformers/evaluation/TripletEvaluator.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import csv import logging import os from contextlib import nullcontext -from typing import TYPE_CHECKING, Dict, List, Optional, Union +from typing import TYPE_CHECKING import numpy as np from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances @@ -58,15 +60,15 @@ class TripletEvaluator(SentenceEvaluator): def __init__( self, - anchors: List[str], - positives: List[str], - negatives: List[str], - main_distance_function: Optional[Union[str, SimilarityFunction]] = None, + anchors: list[str], + positives: list[str], + negatives: list[str], + main_distance_function: str | SimilarityFunction | None = None, name: str = "", batch_size: int = 16, show_progress_bar: bool = False, write_csv: bool = True, - truncate_dim: Optional[int] = None, + truncate_dim: int | None = None, ): """ Initializes a TripletEvaluator object. @@ -109,7 +111,7 @@ def __init__( self.write_csv = write_csv @classmethod - def from_input_examples(cls, examples: List[InputExample], **kwargs): + def from_input_examples(cls, examples: list[InputExample], **kwargs): anchors = [] positives = [] negatives = [] @@ -122,7 +124,7 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs): def __call__( self, model: "SentenceTransformer", output_path: str = None, epoch: int = -1, steps: int = -1 - ) -> Dict[str, float]: + ) -> dict[str, float]: if epoch != -1: if steps == -1: out_txt = f" after epoch {epoch}" diff --git a/sentence_transformers/evaluation/__init__.py b/sentence_transformers/evaluation/__init__.py index 7a2568992..da05aa40e 100644 --- a/sentence_transformers/evaluation/__init__.py +++ b/sentence_transformers/evaluation/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .BinaryClassificationEvaluator import BinaryClassificationEvaluator from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator from .InformationRetrievalEvaluator import InformationRetrievalEvaluator diff --git a/sentence_transformers/fit_mixin.py b/sentence_transformers/fit_mixin.py index f54739d33..dc9ebd468 100644 --- a/sentence_transformers/fit_mixin.py +++ b/sentence_transformers/fit_mixin.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import json import logging import os import shutil from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Callable, Iterable import numpy as np import torch @@ -49,7 +51,7 @@ class SaveModelCallback(TrainerCallback): We save after the model has been trained. 
""" - def __init__(self, output_dir: str, evaluator: Optional[SentenceEvaluator], save_best_model: bool) -> None: + def __init__(self, output_dir: str, evaluator: SentenceEvaluator | None, save_best_model: bool) -> None: super().__init__() self.output_dir = output_dir self.evaluator = evaluator @@ -66,7 +68,7 @@ def on_evaluate( args: SentenceTransformerTrainingArguments, state: TrainerState, control: TrainerControl, - metrics: Dict[str, Any], + metrics: dict[str, Any], model: "SentenceTransformer", **kwargs, ) -> None: @@ -140,7 +142,7 @@ def on_evaluate( args: transformers.TrainingArguments, state: TrainerState, control: TrainerControl, - metrics: Dict[str, Any], + metrics: dict[str, Any], **kwargs, ) -> None: metric_key = getattr(self.evaluator, "primary_metric", "evaluator") @@ -154,14 +156,14 @@ class FitMixin: def fit( self, - train_objectives: Iterable[Tuple[DataLoader, nn.Module]], + train_objectives: Iterable[tuple[DataLoader, nn.Module]], evaluator: SentenceEvaluator = None, epochs: int = 1, steps_per_epoch=None, scheduler: str = "WarmupLinear", warmup_steps: int = 10000, - optimizer_class: Type[Optimizer] = torch.optim.AdamW, - optimizer_params: Dict[str, object] = {"lr": 2e-5}, + optimizer_class: type[Optimizer] = torch.optim.AdamW, + optimizer_params: dict[str, object] = {"lr": 2e-5}, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: str = None, @@ -402,7 +404,7 @@ def _get_scheduler(optimizer, scheduler: str, warmup_steps: int, t_total: int) - else: raise ValueError("Unknown scheduler {}".format(scheduler)) - def smart_batching_collate(self, batch: List["InputExample"]) -> Tuple[List[Dict[str, Tensor]], Tensor]: + def smart_batching_collate(self, batch: list["InputExample"]) -> tuple[list[dict[str, Tensor]], Tensor]: """ Transforms a batch from a SmartBatchingDataset to a batch of tensors for the model Here, batch is a list of InputExample instances: [InputExample(...), ...] 
@@ -432,14 +434,14 @@ def smart_batching_collate(self, batch: List["InputExample"]) -> Tuple[List[Dict def old_fit( self, - train_objectives: Iterable[Tuple[DataLoader, nn.Module]], + train_objectives: Iterable[tuple[DataLoader, nn.Module]], evaluator: SentenceEvaluator = None, epochs: int = 1, steps_per_epoch=None, scheduler: str = "WarmupLinear", warmup_steps: int = 10000, - optimizer_class: Type[Optimizer] = torch.optim.AdamW, - optimizer_params: Dict[str, object] = {"lr": 2e-5}, + optimizer_class: type[Optimizer] = torch.optim.AdamW, + optimizer_params: dict[str, object] = {"lr": 2e-5}, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: str = None, diff --git a/sentence_transformers/losses/AdaptiveLayerLoss.py b/sentence_transformers/losses/AdaptiveLayerLoss.py index be8d8c798..43fabbe91 100644 --- a/sentence_transformers/losses/AdaptiveLayerLoss.py +++ b/sentence_transformers/losses/AdaptiveLayerLoss.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import random import warnings -from typing import Any, Dict, Iterable, List, Tuple +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -23,9 +25,9 @@ class TransformerDecorator: def __init__(self, transformer: Transformer, original_forward) -> None: self.transformer = transformer self.original_forward = original_forward - self.embeddings: List[Tuple[Tensor]] = [] - self.last_embeddings: List[Tensor] = [] - self.features: List[Dict[str, Tensor]] = [] + self.embeddings: list[tuple[Tensor]] = [] + self.last_embeddings: list[Tensor] = [] + self.features: list[dict[str, Tensor]] = [] self.layer_idx = None self.call_idx = 0 @@ -36,7 +38,7 @@ def set_layer_idx(self, layer_idx) -> None: def get_layer_embeddings(self) -> Tensor: return torch.concat([embedding[self.layer_idx] for embedding in self.embeddings], dim=1) - def __call__(self, features) -> Dict[str, Tensor]: + def __call__(self, features) -> dict[str, Tensor]: if self.layer_idx is None: output = self.call_grow_cache(features) else: @@ -44,7 +46,7 @@ def __call__(self, features) -> Dict[str, Tensor]: self.call_idx += 1 return output - def call_grow_cache(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def call_grow_cache(self, features: dict[str, Tensor]) -> dict[str, Tensor]: """ Temporarily sets the output_hidden_states to True, runs the model, and then restores the original setting. Use the all_layer_embeddings to get the embeddings of all layers. 
@@ -70,7 +72,7 @@ def call_grow_cache(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: return output - def call_use_cache(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def call_use_cache(self, features: dict[str, Tensor]) -> dict[str, Tensor]: return {**self.features[self.call_idx], "token_embeddings": self.embeddings[self.call_idx][self.layer_idx]} @@ -86,7 +88,7 @@ def __init__(self, fn) -> None: self.fn = fn self.embeddings = [] - def __call__(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def __call__(self, features: dict[str, Tensor]) -> dict[str, Tensor]: output = self.fn(features) self.embeddings.append(output["sentence_embedding"]) return output @@ -194,7 +196,7 @@ def __init__( if isinstance(loss, CachedGISTEmbedLoss): warnings.warn("MatryoshkaLoss is not compatible with CachedGISTEmbedLoss.", stacklevel=2) - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: # Decorate the forward function of the transformer to cache the embeddings of all layers original_transformer_forward = self.model[0].forward transformer_decorator = TransformerDecorator(self.model[0], original_transformer_forward) @@ -241,7 +243,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor return loss - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return { "loss": self.loss.__class__.__name__, "n_layers_per_step": self.n_layers_per_step, diff --git a/sentence_transformers/losses/AnglELoss.py b/sentence_transformers/losses/AnglELoss.py index eaa8019e9..323224e29 100644 --- a/sentence_transformers/losses/AnglELoss.py +++ b/sentence_transformers/losses/AnglELoss.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from sentence_transformers import SentenceTransformer, losses, util diff --git a/sentence_transformers/losses/BatchAllTripletLoss.py b/sentence_transformers/losses/BatchAllTripletLoss.py index f1a1369ce..932150556 100644 --- a/sentence_transformers/losses/BatchAllTripletLoss.py +++ b/sentence_transformers/losses/BatchAllTripletLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable from torch import Tensor, nn @@ -83,7 +85,7 @@ def __init__( self.triplet_margin = margin self.distance_metric = distance_metric - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"] return self.batch_all_triplet_loss(labels, rep) diff --git a/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py b/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py index b74ffe2e4..e72d3c19a 100644 --- a/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py +++ b/sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable import torch from torch import Tensor @@ -83,7 +85,7 @@ def __init__( self.sentence_embedder = model self.distance_metric = distance_metric - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: rep = 
self.sentence_embedder(sentence_features[0])["sentence_embedding"] return self.batch_hard_triplet_soft_margin_loss(labels, rep) diff --git a/sentence_transformers/losses/BatchHardTripletLoss.py b/sentence_transformers/losses/BatchHardTripletLoss.py index 73ca06f39..ab06d400c 100644 --- a/sentence_transformers/losses/BatchHardTripletLoss.py +++ b/sentence_transformers/losses/BatchHardTripletLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable import torch from torch import Tensor, nn @@ -137,7 +139,7 @@ def __init__( self.triplet_margin = margin self.distance_metric = distance_metric - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor): rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"] return self.batch_hard_triplet_loss(labels, rep) diff --git a/sentence_transformers/losses/BatchSemiHardTripletLoss.py b/sentence_transformers/losses/BatchSemiHardTripletLoss.py index d2f3e52a3..0b40bc429 100644 --- a/sentence_transformers/losses/BatchSemiHardTripletLoss.py +++ b/sentence_transformers/losses/BatchSemiHardTripletLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable import torch from torch import Tensor, nn @@ -93,7 +95,7 @@ def __init__( self.margin = margin self.distance_metric = distance_metric - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"] return self.batch_semi_hard_triplet_loss(labels, rep) diff --git a/sentence_transformers/losses/CachedGISTEmbedLoss.py b/sentence_transformers/losses/CachedGISTEmbedLoss.py index 66d39ba00..aed90d88b 100644 --- a/sentence_transformers/losses/CachedGISTEmbedLoss.py +++ b/sentence_transformers/losses/CachedGISTEmbedLoss.py @@ -2,7 +2,7 @@ from contextlib import nullcontext from functools import partial -from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple +from typing import Any, Iterable, Iterator import torch import tqdm @@ -38,7 +38,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def _backward_hook( grad_output: Tensor, - sentence_features: Iterable[Dict[str, Tensor]], + sentence_features: Iterable[dict[str, Tensor]], loss_obj: CachedGISTEmbedLoss, ) -> None: """A backward hook to backpropagate the cached gradients mini-batch by mini-batch.""" @@ -143,8 +143,8 @@ def __init__( ) self.cross_entropy_loss = nn.CrossEntropyLoss() self.mini_batch_size = mini_batch_size - self.cache: Optional[List[List[Tensor]]] = None - self.random_states: Optional[List[List[RandContext]]] = None + self.cache: list[list[Tensor]] | None = None + self.random_states: list[list[RandContext]] | None = None self.show_progress_bar = show_progress_bar self.must_retokenize = ( model.tokenizer.vocab != guide.tokenizer.vocab or guide.max_seq_length < model.max_seq_length @@ -157,13 +157,13 @@ def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor: def embed_minibatch( self, - sentence_feature: Dict[str, Tensor], + sentence_feature: dict[str, Tensor], begin: int, end: int, with_grad: bool, copy_random_state: bool, - random_state: Optional[RandContext] = None, - ) -> Tuple[Tensor, Optional[RandContext]]: + random_state: RandContext | None = None, + ) -> 
tuple[Tensor, RandContext | None]: """Do forward pass on a minibatch of the input features and return corresponding embeddings.""" grad_context = nullcontext if with_grad else torch.no_grad random_state_context = nullcontext() if random_state is None else random_state @@ -187,11 +187,11 @@ def embed_minibatch( def embed_minibatch_iter( self, - sentence_feature: Dict[str, Tensor], + sentence_feature: dict[str, Tensor], with_grad: bool, copy_random_state: bool, - random_states: Optional[List[RandContext]] = None, - ) -> Iterator[Tuple[Tensor, Optional[RandContext]]]: + random_states: list[RandContext] | None = None, + ) -> Iterator[tuple[Tensor, RandContext | None]]: """Do forward pass on all the minibatches of the input features and yield corresponding embeddings.""" input_ids: Tensor = sentence_feature["input_ids"] bsz, _ = input_ids.shape @@ -215,7 +215,7 @@ def embed_minibatch_iter( ) yield reps, guide_reps, random_state # reps: (mbsz, hdim) - def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]], reps_guided: List[List[Tensor]]) -> Tensor: + def calculate_loss_and_cache_gradients(self, reps: list[list[Tensor]], reps_guided: list[list[Tensor]]) -> Tensor: """Calculate the cross-entropy loss and cache the gradients wrt. the embeddings.""" if len(reps) == 2: anchor, positive = reps @@ -240,7 +240,7 @@ def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]], reps_guid labels = torch.arange(anchor.size(0)).long().to(anchor.device) batch_size = anchor.shape[0] - losses: List[torch.Tensor] = [] + losses: list[torch.Tensor] = [] for b in tqdm.trange( 0, batch_size, @@ -289,7 +289,7 @@ def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]], reps_guid return loss - def calculate_loss(self, reps: List[List[Tensor]], reps_guided: List[List[Tensor]]) -> Tensor: + def calculate_loss(self, reps: list[list[Tensor]], reps_guided: list[list[Tensor]]) -> Tensor: """Calculate the cross-entropy loss. 
No need to cache the gradients.""" if len(reps) == 2: anchor, positive = reps @@ -314,7 +314,7 @@ def calculate_loss(self, reps: List[List[Tensor]], reps_guided: List[List[Tensor labels = torch.arange(anchor.size(0)).long().to(anchor.device) batch_size = anchor.shape[0] - losses: List[torch.Tensor] = [] + losses: list[torch.Tensor] = [] for b in tqdm.trange( 0, batch_size, @@ -359,7 +359,7 @@ def calculate_loss(self, reps: List[List[Tensor]], reps_guided: List[List[Tensor loss = sum(losses) return loss - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: # Step (1): A quick embedding step without gradients/computation graphs to get all the embeddings reps = [] reps_guided = [] @@ -391,7 +391,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor loss = self.calculate_loss(reps, reps_guided) return loss - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return { "guide": self.guide, "temperature": self.temperature, diff --git a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py index 36d642b78..7fabc87f2 100644 --- a/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py +++ b/sentence_transformers/losses/CachedMultipleNegativesRankingLoss.py @@ -2,7 +2,7 @@ from contextlib import nullcontext from functools import partial -from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple +from typing import Any, Iterable, Iterator import torch import tqdm @@ -37,7 +37,7 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> None: def _backward_hook( grad_output: Tensor, - sentence_features: Iterable[Dict[str, Tensor]], + sentence_features: Iterable[dict[str, Tensor]], loss_obj: CachedMultipleNegativesRankingLoss, ) -> None: """A backward hook to backpropagate the cached gradients mini-batch by mini-batch.""" @@ -144,19 +144,19 @@ def __init__( self.similarity_fct = similarity_fct self.cross_entropy_loss = nn.CrossEntropyLoss() self.mini_batch_size = mini_batch_size - self.cache: Optional[List[List[Tensor]]] = None - self.random_states: Optional[List[List[RandContext]]] = None + self.cache: list[list[Tensor]] | None = None + self.random_states: list[list[RandContext]] | None = None self.show_progress_bar = show_progress_bar def embed_minibatch( self, - sentence_feature: Dict[str, Tensor], + sentence_feature: dict[str, Tensor], begin: int, end: int, with_grad: bool, copy_random_state: bool, - random_state: Optional[RandContext] = None, - ) -> Tuple[Tensor, Optional[RandContext]]: + random_state: RandContext | None = None, + ) -> tuple[Tensor, RandContext | None]: """Do forward pass on a minibatch of the input features and return corresponding embeddings.""" grad_context = nullcontext if with_grad else torch.no_grad random_state_context = nullcontext() if random_state is None else random_state @@ -169,11 +169,11 @@ def embed_minibatch( def embed_minibatch_iter( self, - sentence_feature: Dict[str, Tensor], + sentence_feature: dict[str, Tensor], with_grad: bool, copy_random_state: bool, - random_states: Optional[List[RandContext]] = None, - ) -> Iterator[Tuple[Tensor, Optional[RandContext]]]: + random_states: list[RandContext] | None = None, + ) -> Iterator[tuple[Tensor, RandContext | None]]: """Do forward pass on all the minibatches of the input features and yield corresponding 
embeddings.""" input_ids: Tensor = sentence_feature["input_ids"] bsz, _ = input_ids.shape @@ -197,7 +197,7 @@ def embed_minibatch_iter( ) yield reps, random_state # reps: (mbsz, hdim) - def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]]) -> Tensor: + def calculate_loss_and_cache_gradients(self, reps: list[list[Tensor]]) -> Tensor: """Calculate the cross-entropy loss and cache the gradients wrt. the embeddings.""" embeddings_a = torch.cat(reps[0]) # (bsz, hdim) embeddings_b = torch.cat([torch.cat(r) for r in reps[1:]]) # ((1 + nneg) * bsz, hdim) @@ -206,7 +206,7 @@ def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]]) -> Tensor labels = torch.tensor( range(batch_size), dtype=torch.long, device=embeddings_a.device ) # (bsz, (1 + nneg) * bsz) Example a[i] should match with b[i] - losses: List[torch.Tensor] = [] + losses: list[torch.Tensor] = [] for b in tqdm.trange( 0, batch_size, @@ -226,7 +226,7 @@ def calculate_loss_and_cache_gradients(self, reps: List[List[Tensor]]) -> Tensor return loss - def calculate_loss(self, reps: List[List[Tensor]]) -> Tensor: + def calculate_loss(self, reps: list[list[Tensor]]) -> Tensor: """Calculate the cross-entropy loss. No need to cache the gradients.""" embeddings_a = torch.cat(reps[0]) # (bsz, hdim) embeddings_b = torch.cat([torch.cat(r) for r in reps[1:]]) # ((1 + nneg) * bsz, hdim) @@ -235,7 +235,7 @@ def calculate_loss(self, reps: List[List[Tensor]]) -> Tensor: labels = torch.tensor( range(batch_size), dtype=torch.long, device=embeddings_a.device ) # (bsz, (1 + nneg) * bsz) Example a[i] should match with b[i] - losses: List[torch.Tensor] = [] + losses: list[torch.Tensor] = [] for b in tqdm.trange( 0, batch_size, @@ -251,7 +251,7 @@ def calculate_loss(self, reps: List[List[Tensor]]) -> Tensor: loss = sum(losses) return loss - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: # Step (1): A quick embedding step without gradients/computation graphs to get all the embeddings reps = [] self.random_states = [] # Copy random states to guarantee exact reproduction of the embeddings during the second forward pass, i.e. 
step (3) @@ -280,7 +280,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor return loss - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} @property diff --git a/sentence_transformers/losses/CoSENTLoss.py b/sentence_transformers/losses/CoSENTLoss.py index 1a45d2fee..c4f003b13 100644 --- a/sentence_transformers/losses/CoSENTLoss.py +++ b/sentence_transformers/losses/CoSENTLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, Iterable +from __future__ import annotations + +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -75,7 +77,7 @@ def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_f self.similarity_fct = similarity_fct self.scale = scale - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] scores = self.similarity_fct(embeddings[0], embeddings[1]) @@ -95,7 +97,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor return loss - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} @property diff --git a/sentence_transformers/losses/ContrastiveLoss.py b/sentence_transformers/losses/ContrastiveLoss.py index 9e206bfc7..3e1f4b58a 100644 --- a/sentence_transformers/losses/ContrastiveLoss.py +++ b/sentence_transformers/losses/ContrastiveLoss.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from enum import Enum -from typing import Any, Dict, Iterable +from typing import Any, Iterable import torch.nn.functional as F from torch import Tensor, nn @@ -81,7 +83,7 @@ def __init__( self.model = model self.size_average = size_average - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: distance_metric_name = self.distance_metric.__name__ for name, value in vars(SiameseDistanceMetric).items(): if value == self.distance_metric: @@ -90,7 +92,7 @@ def get_config_dict(self) -> Dict[str, Any]: return {"distance_metric": distance_metric_name, "margin": self.margin, "size_average": self.size_average} - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] assert len(reps) == 2 rep_anchor, rep_other = reps diff --git a/sentence_transformers/losses/ContrastiveTensionLoss.py b/sentence_transformers/losses/ContrastiveTensionLoss.py index 15d05b9e1..93b61d654 100644 --- a/sentence_transformers/losses/ContrastiveTensionLoss.py +++ b/sentence_transformers/losses/ContrastiveTensionLoss.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import copy import math import random -from typing import Dict, Iterable +from typing import Iterable import numpy as np import torch @@ -76,7 +78,7 @@ def __init__(self, model: SentenceTransformer) -> None: self.model1 = copy.deepcopy(model) self.criterion = nn.BCEWithLogitsLoss(reduction="sum") - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def 
forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: sentence_features1, sentence_features2 = tuple(sentence_features) reps_1 = self.model1(sentence_features1)["sentence_embedding"] # (bsz, hdim) reps_2 = self.model2(sentence_features2)["sentence_embedding"] @@ -170,7 +172,7 @@ def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_f self.cross_entropy_loss = nn.CrossEntropyLoss() self.logit_scale = nn.Parameter(torch.ones([]) * np.log(scale)) - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: sentence_features1, sentence_features2 = tuple(sentence_features) embeddings_a = self.model1(sentence_features1)["sentence_embedding"] # (bsz, hdim) embeddings_b = self.model2(sentence_features2)["sentence_embedding"] diff --git a/sentence_transformers/losses/CosineSimilarityLoss.py b/sentence_transformers/losses/CosineSimilarityLoss.py index d9dcf7240..d62e25c1a 100644 --- a/sentence_transformers/losses/CosineSimilarityLoss.py +++ b/sentence_transformers/losses/CosineSimilarityLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, Iterable +from __future__ import annotations + +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -72,10 +74,10 @@ def __init__( self.loss_fct = loss_fct self.cos_score_transformation = cos_score_transformation - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) return self.loss_fct(output, labels.float().view(-1)) - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {"loss_fct": fullname(self.loss_fct)} diff --git a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py index 015483cbe..d3e1a155c 100644 --- a/sentence_transformers/losses/DenoisingAutoEncoderLoss.py +++ b/sentence_transformers/losses/DenoisingAutoEncoderLoss.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import logging -from typing import Dict, Iterable, Optional +from typing import Iterable from torch import Tensor, nn from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, PreTrainedModel @@ -11,7 +13,7 @@ class DenoisingAutoEncoderLoss(nn.Module): def __init__( - self, model: SentenceTransformer, decoder_name_or_path: Optional[str] = None, tie_encoder_decoder: bool = True + self, model: SentenceTransformer, decoder_name_or_path: str | None = None, tie_encoder_decoder: bool = True ) -> None: r""" This loss expects as input a pairs of damaged sentences and the corresponding original ones. 
@@ -134,7 +136,7 @@ def __init__( encoder_name_or_path, ) - def retokenize(self, sentence_features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def retokenize(self, sentence_features: dict[str, Tensor]) -> dict[str, Tensor]: input_ids = sentence_features["input_ids"] device = input_ids.device sentences_decoded = self.tokenizer_encoder.batch_decode( @@ -145,7 +147,7 @@ def retokenize(self, sentence_features: Dict[str, Tensor]) -> Dict[str, Tensor]: ).to(device) return retokenized - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: source_features, target_features = tuple(sentence_features) if self.need_retokenization: # since the sentence_features here are all tokenized by encoder's tokenizer, diff --git a/sentence_transformers/losses/GISTEmbedLoss.py b/sentence_transformers/losses/GISTEmbedLoss.py index d9de70317..2819dfa6c 100644 --- a/sentence_transformers/losses/GISTEmbedLoss.py +++ b/sentence_transformers/losses/GISTEmbedLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, Iterable +from __future__ import annotations + +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -88,7 +90,7 @@ def __init__( def sim_matrix(self, embed1: Tensor, embed2: Tensor) -> Tensor: return self.similarity_fct(embed1.unsqueeze(1), embed2.unsqueeze(0)) - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] with torch.no_grad(): if self.must_retokenize: @@ -157,7 +159,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor return nn.CrossEntropyLoss()(scores, labels) - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return { "guide": self.guide, "temperature": self.temperature, diff --git a/sentence_transformers/losses/MSELoss.py b/sentence_transformers/losses/MSELoss.py index 528ffa2ba..c8568cad6 100644 --- a/sentence_transformers/losses/MSELoss.py +++ b/sentence_transformers/losses/MSELoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable import torch from torch import Tensor, nn @@ -70,7 +72,7 @@ def compute_labels(batch): self.model = model self.loss_fct = nn.MSELoss() - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: # Concatenate multiple inputs on the batch dimension if len(sentence_features) > 1: embeddings = torch.cat([self.model(inputs)["sentence_embedding"] for inputs in sentence_features], dim=0) diff --git a/sentence_transformers/losses/MarginMSELoss.py b/sentence_transformers/losses/MarginMSELoss.py index c322d484d..5ebb220d1 100644 --- a/sentence_transformers/losses/MarginMSELoss.py +++ b/sentence_transformers/losses/MarginMSELoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable from torch import Tensor, nn @@ -113,7 +115,7 @@ def compute_labels(batch): self.similarity_fct = similarity_fct self.loss_fct = nn.MSELoss() - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, 
sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: # sentence_features: query, positive passage, negative passage reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] embeddings_query = reps[0] diff --git a/sentence_transformers/losses/Matryoshka2dLoss.py b/sentence_transformers/losses/Matryoshka2dLoss.py index 5e90e8c9f..e730a7ec9 100644 --- a/sentence_transformers/losses/Matryoshka2dLoss.py +++ b/sentence_transformers/losses/Matryoshka2dLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, List, Optional, Union +from __future__ import annotations + +from typing import Any from torch.nn import Module @@ -11,8 +13,8 @@ def __init__( self, model: SentenceTransformer, loss: Module, - matryoshka_dims: List[int], - matryoshka_weights: Optional[List[Union[float, int]]] = None, + matryoshka_dims: list[int], + matryoshka_weights: list[float | int] | None = None, n_layers_per_step: int = 1, n_dims_per_step: int = 1, last_layer_weight: float = 1.0, @@ -124,7 +126,7 @@ def __init__( kl_temperature=kl_temperature, ) - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return { **super().get_config_dict(), **self.loss.get_config_dict(), diff --git a/sentence_transformers/losses/MatryoshkaLoss.py b/sentence_transformers/losses/MatryoshkaLoss.py index cf38b629d..9964c2425 100644 --- a/sentence_transformers/losses/MatryoshkaLoss.py +++ b/sentence_transformers/losses/MatryoshkaLoss.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import random import warnings -from typing import Any, Dict, Iterable, List, Optional, Union +from typing import Any, Iterable import torch.nn.functional as F from torch import Tensor, nn @@ -33,7 +35,7 @@ def shrink(self, tensor: Tensor) -> Tensor: tensor = F.normalize(tensor, p=2, dim=-1) return tensor - def __call__(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def __call__(self, features: dict[str, Tensor]) -> dict[str, Tensor]: # Growing cache: if self.cache_dim is None or self.cache_dim == self.dim: output = self.fn(features) @@ -53,8 +55,8 @@ def __init__( self, model: SentenceTransformer, loss: nn.Module, - matryoshka_dims: List[int], - matryoshka_weights: Optional[List[Union[float, int]]] = None, + matryoshka_dims: list[int], + matryoshka_weights: list[float | int] | None = None, n_dims_per_step: int = -1, ) -> None: """ @@ -129,7 +131,7 @@ def __init__( self.matryoshka_dims, self.matryoshka_weights = zip(*sorted(dims_weights, key=lambda x: x[0], reverse=True)) self.n_dims_per_step = n_dims_per_step - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: original_forward = self.model.forward try: decorated_forward = ForwardDecorator(original_forward) @@ -149,7 +151,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor self.model.forward = original_forward return loss - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return { "loss": self.loss.__class__.__name__, "matryoshka_dims": self.matryoshka_dims, diff --git a/sentence_transformers/losses/MegaBatchMarginLoss.py b/sentence_transformers/losses/MegaBatchMarginLoss.py index 3657c14b6..ed0c6a965 100644 --- a/sentence_transformers/losses/MegaBatchMarginLoss.py +++ b/sentence_transformers/losses/MegaBatchMarginLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from 
__future__ import annotations + +from typing import Iterable import torch import torch.nn.functional as F @@ -80,7 +82,7 @@ def __init__( self.mini_batch_size = mini_batch_size self.forward = self.forward_mini_batched if use_mini_batched_version else self.forward_non_mini_batched - def forward_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward_mini_batched(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: anchor, positive = sentence_features feature_names = list(anchor.keys()) @@ -137,7 +139,7 @@ def forward_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], l return losses ##### Non mini-batched version ### - def forward_non_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward_non_mini_batched(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] embeddings_a, embeddings_b = reps diff --git a/sentence_transformers/losses/MultipleNegativesRankingLoss.py b/sentence_transformers/losses/MultipleNegativesRankingLoss.py index a9fde8e6e..06f17c5c4 100644 --- a/sentence_transformers/losses/MultipleNegativesRankingLoss.py +++ b/sentence_transformers/losses/MultipleNegativesRankingLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, Iterable +from __future__ import annotations + +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -89,7 +91,7 @@ def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_f self.similarity_fct = similarity_fct self.cross_entropy_loss = nn.CrossEntropyLoss() - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] embeddings_a = reps[0] embeddings_b = torch.cat(reps[1:]) @@ -99,7 +101,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor range_labels = torch.arange(0, scores.size(0), device=scores.device) return self.cross_entropy_loss(scores, range_labels) - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} @property diff --git a/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py b/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py index d6349424d..957dc5e19 100644 --- a/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py +++ b/sentence_transformers/losses/MultipleNegativesSymmetricRankingLoss.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, Iterable +from __future__ import annotations + +from typing import Any, Iterable import torch from torch import Tensor, nn @@ -69,7 +71,7 @@ def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_f self.similarity_fct = similarity_fct self.cross_entropy_loss = nn.CrossEntropyLoss() - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] anchor = reps[0] candidates = torch.cat(reps[1:]) @@ -84,5 +86,5 @@ def 
forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor backward_loss = self.cross_entropy_loss(anchor_positive_scores.transpose(0, 1), labels) return (forward_loss + backward_loss) / 2 - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {"scale": self.scale, "similarity_fct": self.similarity_fct.__name__} diff --git a/sentence_transformers/losses/OnlineContrastiveLoss.py b/sentence_transformers/losses/OnlineContrastiveLoss.py index 5f1e68a0a..aee8e7b2f 100644 --- a/sentence_transformers/losses/OnlineContrastiveLoss.py +++ b/sentence_transformers/losses/OnlineContrastiveLoss.py @@ -1,4 +1,6 @@ -from typing import Dict, Iterable +from __future__ import annotations + +from typing import Iterable import torch.nn.functional as F from torch import Tensor, nn @@ -69,7 +71,7 @@ def __init__( self.margin = margin self.distance_metric = distance_metric - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor, size_average=False) -> Tensor: embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] distance_matrix = self.distance_metric(embeddings[0], embeddings[1]) diff --git a/sentence_transformers/losses/SoftmaxLoss.py b/sentence_transformers/losses/SoftmaxLoss.py index 8b675de9d..f506bfe9f 100644 --- a/sentence_transformers/losses/SoftmaxLoss.py +++ b/sentence_transformers/losses/SoftmaxLoss.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import logging -from typing import Callable, Dict, Iterable, Tuple, Union +from typing import Callable, Iterable import torch from torch import Tensor, nn @@ -102,8 +104,8 @@ def __init__( self.loss_fct = loss_fct def forward( - self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor + ) -> Tensor | tuple[Tensor, Tensor]: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] rep_a, rep_b = reps diff --git a/sentence_transformers/losses/TripletLoss.py b/sentence_transformers/losses/TripletLoss.py index 9b340b904..23c62d0fb 100644 --- a/sentence_transformers/losses/TripletLoss.py +++ b/sentence_transformers/losses/TripletLoss.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from enum import Enum -from typing import Any, Dict, Iterable +from typing import Any, Iterable import torch.nn.functional as F from torch import Tensor, nn @@ -75,7 +77,7 @@ def __init__( self.distance_metric = distance_metric self.triplet_margin = triplet_margin - def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor) -> Tensor: + def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor: reps = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features] rep_anchor, rep_pos, rep_neg = reps @@ -85,7 +87,7 @@ def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor losses = F.relu(distance_pos - distance_neg + self.triplet_margin) return losses.mean() - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: distance_metric_name = self.distance_metric.__name__ for name, value in vars(TripletDistanceMetric).items(): if value == self.distance_metric: diff --git 
a/sentence_transformers/losses/__init__.py b/sentence_transformers/losses/__init__.py index fdac35735..9f2af6079 100644 --- a/sentence_transformers/losses/__init__.py +++ b/sentence_transformers/losses/__init__.py @@ -1,4 +1,6 @@ # CoSENTLoss must be imported before AnglELoss +from __future__ import annotations + from .CoSENTLoss import CoSENTLoss # isort: skip from .AdaptiveLayerLoss import AdaptiveLayerLoss diff --git a/sentence_transformers/model_card.py b/sentence_transformers/model_card.py index ea7e66181..fb4c13c2c 100644 --- a/sentence_transformers/model_card.py +++ b/sentence_transformers/model_card.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import logging import random @@ -8,7 +10,7 @@ from pathlib import Path from platform import python_version from textwrap import indent -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Literal import torch import transformers @@ -41,7 +43,7 @@ class ModelCardCallback(TrainerCallback): - def __init__(self, trainer: "SentenceTransformerTrainer", default_args_dict: Dict[str, Any]) -> None: + def __init__(self, trainer: "SentenceTransformerTrainer", default_args_dict: dict[str, Any]) -> None: super().__init__() self.trainer = trainer self.default_args_dict = default_args_dict @@ -146,7 +148,7 @@ def on_evaluate( state: TrainerState, control: TrainerControl, model: "SentenceTransformer", - metrics: Dict[str, float], + metrics: dict[str, float], **kwargs, ) -> None: loss_dict = {" ".join(key.split("_")[1:]): metrics[key] for key in metrics if key.endswith("_loss")} @@ -170,7 +172,7 @@ def on_log( state: TrainerState, control: TrainerControl, model: "SentenceTransformer", - logs: Dict[str, float], + logs: dict[str, float], **kwargs, ) -> None: keys = {"loss"} & set(logs) @@ -206,7 +208,7 @@ def on_log( IGNORED_FIELDS = ["model", "trainer", "eval_results_dict"] -def get_versions() -> Dict[str, Any]: +def get_versions() -> dict[str, Any]: versions = { "python": python_version(), "sentence_transformers": sentence_transformers_version, @@ -269,16 +271,16 @@ class SentenceTransformerModelCardData(CardData): """ # Potentially provided by the user - language: Optional[Union[str, List[str]]] = field(default_factory=list) - license: Optional[str] = None - model_name: Optional[str] = None - model_id: Optional[str] = None - train_datasets: List[Dict[str, str]] = field(default_factory=list) - eval_datasets: List[Dict[str, str]] = field(default_factory=list) + language: str | list[str] | None = field(default_factory=list) + license: str | None = None + model_name: str | None = None + model_id: str | None = None + train_datasets: list[dict[str, str]] = field(default_factory=list) + eval_datasets: list[dict[str, str]] = field(default_factory=list) task_name: str = ( "semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more" ) - tags: Optional[List[str]] = field( + tags: list[str] | None = field( default_factory=lambda: [ "sentence-transformers", "sentence-similarity", @@ -288,20 +290,20 @@ class SentenceTransformerModelCardData(CardData): generate_widget_examples: Literal["deprecated"] = "deprecated" # Automatically filled by `ModelCardCallback` and the Trainer directly - base_model: Optional[str] = field(default=None, init=False) - base_model_revision: Optional[str] = field(default=None, init=False) - non_default_hyperparameters: Dict[str, Any] = field(default_factory=dict, init=False) - all_hyperparameters: Dict[str, 
Any] = field(default_factory=dict, init=False) - eval_results_dict: Optional[Dict["SentenceEvaluator", Dict[str, Any]]] = field(default_factory=dict, init=False) - training_logs: List[Dict[str, float]] = field(default_factory=list, init=False) - widget: List[Dict[str, str]] = field(default_factory=list, init=False) - predict_example: Optional[str] = field(default=None, init=False) - label_example_list: List[Dict[str, str]] = field(default_factory=list, init=False) - code_carbon_callback: Optional[CodeCarbonCallback] = field(default=None, init=False) - citations: Dict[str, str] = field(default_factory=dict, init=False) - best_model_step: Optional[int] = field(default=None, init=False) - trainer: Optional["SentenceTransformerTrainer"] = field(default=None, init=False, repr=False) - datasets: List[str] = field(default_factory=list, init=False, repr=False) + base_model: str | None = field(default=None, init=False) + base_model_revision: str | None = field(default=None, init=False) + non_default_hyperparameters: dict[str, Any] = field(default_factory=dict, init=False) + all_hyperparameters: dict[str, Any] = field(default_factory=dict, init=False) + eval_results_dict: dict["SentenceEvaluator", dict[str, Any]] | None = field(default_factory=dict, init=False) + training_logs: list[dict[str, float]] = field(default_factory=list, init=False) + widget: list[dict[str, str]] = field(default_factory=list, init=False) + predict_example: str | None = field(default=None, init=False) + label_example_list: list[dict[str, str]] = field(default_factory=list, init=False) + code_carbon_callback: CodeCarbonCallback | None = field(default=None, init=False) + citations: dict[str, str] = field(default_factory=dict, init=False) + best_model_step: int | None = field(default=None, init=False) + trainer: "SentenceTransformerTrainer" | None = field(default=None, init=False, repr=False) + datasets: list[str] = field(default_factory=list, init=False, repr=False) # Utility fields first_save: bool = field(default=True, init=False) @@ -310,10 +312,10 @@ class SentenceTransformerModelCardData(CardData): # Computed once, always unchanged pipeline_tag: str = field(default="sentence-similarity", init=False) library_name: str = field(default="sentence-transformers", init=False) - version: Dict[str, str] = field(default_factory=get_versions, init=False) + version: dict[str, str] = field(default_factory=get_versions, init=False) # Passed via `register_model` only - model: Optional["SentenceTransformer"] = field(default=None, init=False, repr=False) + model: "SentenceTransformer" | None = field(default=None, init=False, repr=False) def __post_init__(self) -> None: # We don't want to save "ignore_metadata_errors" in our Model Card @@ -365,7 +367,7 @@ def validate_datasets(self, dataset_list, infer_languages: bool = True) -> None: output_dataset_list.append(dataset) return output_dataset_list - def set_losses(self, losses: List[nn.Module]) -> None: + def set_losses(self, losses: list[nn.Module]) -> None: citations = { "Sentence Transformers": """ @inproceedings{reimers-2019-sentence-bert, @@ -388,7 +390,7 @@ def set_losses(self, losses: List[nn.Module]) -> None: for loss, citation in citations.items(): inverted_citations[citation].append(loss) - def join_list(losses: List[str]) -> str: + def join_list(losses: list[str]) -> str: if len(losses) > 1: return ", ".join(losses[:-1]) + " and " + losses[-1] return losses[0] @@ -399,7 +401,7 @@ def join_list(losses: List[str]) -> str: def set_best_model_step(self, step: int) -> None: 
self.best_model_step = step - def set_widget_examples(self, dataset: Union["Dataset", "DatasetDict"]) -> None: + def set_widget_examples(self, dataset: "Dataset" | "DatasetDict") -> None: if isinstance(dataset, Dataset): dataset = DatasetDict(dataset=dataset) @@ -450,7 +452,7 @@ def set_widget_examples(self, dataset: Union["Dataset", "DatasetDict"]) -> None: ) self.predict_example = sentences[:3] - def set_evaluation_metrics(self, evaluator: "SentenceEvaluator", metrics: Dict[str, Any]) -> None: + def set_evaluation_metrics(self, evaluator: "SentenceEvaluator", metrics: dict[str, Any]) -> None: from sentence_transformers.evaluation import SequentialEvaluator self.eval_results_dict[evaluator] = copy(metrics) @@ -503,8 +505,8 @@ def set_label_examples(self, dataset: "Dataset") -> None: ] def infer_datasets( - self, dataset: Union["Dataset", "DatasetDict"], dataset_name: Optional[str] = None - ) -> List[Dict[str, str]]: + self, dataset: "Dataset" | "DatasetDict", dataset_name: str | None = None + ) -> list[dict[str, str]]: if isinstance(dataset, DatasetDict): return [ dataset @@ -512,7 +514,7 @@ def infer_datasets( for dataset in self.infer_datasets(sub_dataset, dataset_name=dataset_name) ] - def subtuple_finder(tuple: Tuple[str], subtuple: Tuple[str]) -> int: + def subtuple_finder(tuple: tuple[str], subtuple: tuple[str]) -> int: for i, element in enumerate(tuple): if element == subtuple[0] and tuple[i : i + len(subtuple)] == subtuple: return i @@ -558,10 +560,10 @@ def subtuple_finder(tuple: Tuple[str], subtuple: Tuple[str]) -> int: def compute_dataset_metrics( self, - dataset: Dict[str, str], - dataset_info: Dict[str, Any], - loss: Optional[Union[Dict[str, nn.Module], nn.Module]], - ) -> Dict[str, str]: + dataset: dict[str, str], + dataset_info: dict[str, Any], + loss: dict[str, nn.Module] | nn.Module | None, + ) -> dict[str, str]: """ Given a dataset, compute the following: * Dataset Size @@ -677,8 +679,8 @@ def to_html_list(data: dict): return dataset_info def extract_dataset_metadata( - self, dataset: Union["Dataset", "DatasetDict"], dataset_metadata, dataset_type: Literal["train", "eval"] - ) -> Dict[str, Any]: + self, dataset: "Dataset" | "DatasetDict", dataset_metadata, dataset_type: Literal["train", "eval"] + ) -> dict[str, Any]: if dataset: if dataset_metadata and ( (isinstance(dataset, DatasetDict) and len(dataset_metadata) != len(dataset)) @@ -721,7 +723,7 @@ def register_model(self, model: "SentenceTransformer") -> None: def set_model_id(self, model_id: str) -> None: self.model_id = model_id - def set_base_model(self, model_id: str, revision: Optional[str] = None) -> None: + def set_base_model(self, model_id: str, revision: str | None = None) -> None: try: model_info = get_model_info(model_id) except Exception: @@ -733,7 +735,7 @@ def set_base_model(self, model_id: str, revision: Optional[str] = None) -> None: self.base_model_revision = revision return True - def set_language(self, language: Union[str, List[str]]) -> None: + def set_language(self, language: str | list[str]) -> None: if isinstance(language, str): language = [language] self.language = language @@ -741,7 +743,7 @@ def set_language(self, language: Union[str, List[str]]) -> None: def set_license(self, license: str) -> None: self.license = license - def add_tags(self, tags: Union[str, List[str]]) -> None: + def add_tags(self, tags: str | list[str]) -> None: if isinstance(tags, str): tags = [tags] for tag in tags: @@ -767,7 +769,7 @@ def try_to_set_base_model(self) -> None: if self.set_base_model(model_id): break - def 
format_eval_metrics(self) -> Dict[str, Any]: + def format_eval_metrics(self) -> dict[str, Any]: """Format the evaluation metrics for the model card. The following keys will be returned: @@ -875,7 +877,7 @@ def sort_metrics(key: str) -> str: "explain_bold_in_eval": "**" in eval_lines, } - def get_codecarbon_data(self) -> Dict[Literal["co2_eq_emissions"], Dict[str, Any]]: + def get_codecarbon_data(self) -> dict[Literal["co2_eq_emissions"], dict[str, Any]]: emissions_data = self.code_carbon_callback.tracker._prepare_emissions_data() results = { "co2_eq_emissions": { @@ -894,7 +896,7 @@ def get_codecarbon_data(self) -> Dict[Literal["co2_eq_emissions"], Dict[str, Any results["co2_eq_emissions"]["hardware_used"] = emissions_data.gpu_model return results - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> dict[str, Any]: # Extract some meaningful examples from the evaluation or training dataset to showcase the performance if ( not self.widget diff --git a/sentence_transformers/model_card_templates.py b/sentence_transformers/model_card_templates.py index ec81d008a..9adfeb594 100644 --- a/sentence_transformers/model_card_templates.py +++ b/sentence_transformers/model_card_templates.py @@ -3,6 +3,8 @@ SentenceTransformer.old_fit for backwards compatibility, but will be removed in a future release. """ +from __future__ import annotations + import logging from .util import fullname diff --git a/sentence_transformers/models/Asym.py b/sentence_transformers/models/Asym.py index c4ca6ace8..f507d09b1 100644 --- a/sentence_transformers/models/Asym.py +++ b/sentence_transformers/models/Asym.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import json import os from collections import OrderedDict -from typing import Dict, List, Tuple, Union +from typing import List from torch import Tensor, nn @@ -9,7 +11,7 @@ class Asym(nn.Sequential): - def __init__(self, sub_modules: Dict[str, List[nn.Module]], allow_empty_key: bool = True): + def __init__(self, sub_modules: dict[str, list[nn.Module]], allow_empty_key: bool = True): """ This model allows to create asymmetric SentenceTransformer models, that apply different models depending on the specified input key. @@ -50,7 +52,7 @@ def __init__(self, sub_modules: Dict[str, List[nn.Module]], allow_empty_key: boo ordered_dict[name + "-" + str(idx)] = model super(Asym, self).__init__(ordered_dict) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): if "text_keys" in features and len(features["text_keys"]) > 0: text_key = features["text_keys"][0] for model in self.sub_modules[text_key]: @@ -95,7 +97,7 @@ def save(self, output_path): indent=2, ) - def tokenize(self, texts: Union[List[str], List[Tuple[str, str]]], **kwargs): + def tokenize(self, texts: list[str] | list[tuple[str, str]], **kwargs): """Tokenizes a text and maps tokens to token-ids""" if not isinstance(texts[0], dict): raise AttributeError("Asym. 
model requires that texts are passed as dicts: {'key': 'text'}") diff --git a/sentence_transformers/models/BoW.py b/sentence_transformers/models/BoW.py index c65015c9c..4ec34bb4c 100644 --- a/sentence_transformers/models/BoW.py +++ b/sentence_transformers/models/BoW.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import json import logging import os -from typing import Dict, List, Literal +from typing import Literal import torch from torch import Tensor, nn @@ -19,8 +21,8 @@ class BoW(nn.Module): def __init__( self, - vocab: List[str], - word_weights: Dict[str, float] = {}, + vocab: list[str], + word_weights: dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True, ): @@ -54,11 +56,11 @@ def __init__( self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) self.sentence_embedding_dimension = len(vocab) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): # Nothing to do, everything is done in get_sentence_features return features - def tokenize(self, texts: List[str], **kwargs) -> List[int]: + def tokenize(self, texts: list[str], **kwargs) -> list[int]: tokenized = [self.tokenizer.tokenize(text, **kwargs) for text in texts] return self.get_sentence_features(tokenized) @@ -66,8 +68,8 @@ def get_sentence_embedding_dimension(self): return self.sentence_embedding_dimension def get_sentence_features( - self, tokenized_texts: List[List[int]], pad_seq_length: int = 0 - ) -> Dict[Literal["sentence_embedding"], torch.Tensor]: + self, tokenized_texts: list[list[int]], pad_seq_length: int = 0 + ) -> dict[Literal["sentence_embedding"], torch.Tensor]: vectors = [] for tokens in tokenized_texts: diff --git a/sentence_transformers/models/CLIPModel.py b/sentence_transformers/models/CLIPModel.py index 17a85d4eb..9f510c09f 100644 --- a/sentence_transformers/models/CLIPModel.py +++ b/sentence_transformers/models/CLIPModel.py @@ -1,4 +1,4 @@ -from typing import Dict, Union +from __future__ import annotations import torch import transformers @@ -19,7 +19,7 @@ def __init__(self, model_name: str = "openai/clip-vit-base-patch32", processor_n def __repr__(self) -> str: return "CLIPModel()" - def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: image_embeds = [] text_embeds = [] @@ -51,7 +51,7 @@ def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: return features - def tokenize(self, texts, padding: Union[str, bool] = True) -> Dict[str, torch.Tensor]: + def tokenize(self, texts, padding: str | bool = True) -> dict[str, torch.Tensor]: images = [] texts_values = [] image_text_info = [] diff --git a/sentence_transformers/models/CNN.py b/sentence_transformers/models/CNN.py index 02e0c5898..6d46ea962 100644 --- a/sentence_transformers/models/CNN.py +++ b/sentence_transformers/models/CNN.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import List import torch from safetensors.torch import load_model as load_safetensors_model @@ -15,8 +16,8 @@ def __init__( self, in_word_embedding_dimension: int, out_channels: int = 256, - kernel_sizes: List[int] = [1, 3, 5], - stride_sizes: List[int] = None, + kernel_sizes: list[int] = [1, 3, 5], + stride_sizes: list[int] = None, ): nn.Module.__init__(self) self.config_keys = ["in_word_embedding_dimension", "out_channels", "kernel_sizes"] @@ -55,7 +56,7 @@ def forward(self, features): def 
get_word_embedding_dimension(self) -> int: return self.embeddings_dimension - def tokenize(self, text: str, **kwargs) -> List[int]: + def tokenize(self, text: str, **kwargs) -> list[int]: raise NotImplementedError() def save(self, output_path: str, safe_serialization: bool = True): diff --git a/sentence_transformers/models/Dense.py b/sentence_transformers/models/Dense.py index 7b77b9a3f..809a34377 100644 --- a/sentence_transformers/models/Dense.py +++ b/sentence_transformers/models/Dense.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import Dict import torch from safetensors.torch import load_model as load_safetensors_model @@ -48,7 +49,7 @@ def __init__( if init_bias is not None: self.linear.bias = nn.Parameter(init_bias) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): features.update({"sentence_embedding": self.activation_function(self.linear(features["sentence_embedding"]))}) return features diff --git a/sentence_transformers/models/Dropout.py b/sentence_transformers/models/Dropout.py index f909e609b..a96083043 100644 --- a/sentence_transformers/models/Dropout.py +++ b/sentence_transformers/models/Dropout.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import Dict from torch import Tensor, nn @@ -17,7 +18,7 @@ def __init__(self, dropout: float = 0.2): self.dropout = dropout self.dropout_layer = nn.Dropout(self.dropout) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): features.update({"sentence_embedding": self.dropout_layer(features["sentence_embedding"])}) return features diff --git a/sentence_transformers/models/LSTM.py b/sentence_transformers/models/LSTM.py index 1c3866eeb..a9983bc1a 100644 --- a/sentence_transformers/models/LSTM.py +++ b/sentence_transformers/models/LSTM.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import List import torch from safetensors.torch import load_model as load_safetensors_model @@ -55,7 +56,7 @@ def forward(self, features): def get_word_embedding_dimension(self) -> int: return self.embeddings_dimension - def tokenize(self, text: str, **kwargs) -> List[int]: + def tokenize(self, text: str, **kwargs) -> list[int]: raise NotImplementedError() def save(self, output_path: str, safe_serialization: bool = True): diff --git a/sentence_transformers/models/LayerNorm.py b/sentence_transformers/models/LayerNorm.py index 1dc7b8198..c414902df 100644 --- a/sentence_transformers/models/LayerNorm.py +++ b/sentence_transformers/models/LayerNorm.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import Dict import torch from safetensors.torch import load_model as load_safetensors_model @@ -14,7 +15,7 @@ def __init__(self, dimension: int): self.dimension = dimension self.norm = nn.LayerNorm(dimension) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): features["sentence_embedding"] = self.norm(features["sentence_embedding"]) return features diff --git a/sentence_transformers/models/Normalize.py b/sentence_transformers/models/Normalize.py index 263f0887c..07edf3c4f 100644 --- a/sentence_transformers/models/Normalize.py +++ b/sentence_transformers/models/Normalize.py @@ -1,4 +1,4 @@ -from typing import Dict +from __future__ import annotations import torch.nn.functional as F from torch import Tensor, nn @@ -10,7 +10,7 @@ class Normalize(nn.Module): def 
__init__(self) -> None: super(Normalize, self).__init__() - def forward(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def forward(self, features: dict[str, Tensor]) -> dict[str, Tensor]: features.update({"sentence_embedding": F.normalize(features["sentence_embedding"], p=2, dim=1)}) return features diff --git a/sentence_transformers/models/Pooling.py b/sentence_transformers/models/Pooling.py index a8695b244..e96e06ef6 100644 --- a/sentence_transformers/models/Pooling.py +++ b/sentence_transformers/models/Pooling.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import json import os -from typing import Any, Dict +from typing import Any import torch from torch import Tensor, nn @@ -128,7 +130,7 @@ def get_pooling_mode_str(self) -> str: return "+".join(modes) - def forward(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: + def forward(self, features: dict[str, Tensor]) -> dict[str, Tensor]: token_embeddings = features["token_embeddings"] attention_mask = features["attention_mask"] if not self.include_prompt and "prompt_length" in features: @@ -226,7 +228,7 @@ def forward(self, features: Dict[str, Tensor]) -> Dict[str, Tensor]: def get_sentence_embedding_dimension(self) -> int: return self.pooling_output_dimension - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return {key: self.__dict__[key] for key in self.config_keys} def save(self, output_path) -> None: diff --git a/sentence_transformers/models/Transformer.py b/sentence_transformers/models/Transformer.py index 70edb16a2..ad0e0950e 100644 --- a/sentence_transformers/models/Transformer.py +++ b/sentence_transformers/models/Transformer.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import json import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any import torch from torch import nn @@ -32,11 +34,11 @@ class Transformer(nn.Module): def __init__( self, model_name_or_path: str, - max_seq_length: Optional[int] = None, - model_args: Optional[Dict[str, Any]] = None, - tokenizer_args: Optional[Dict[str, Any]] = None, - config_args: Optional[Dict[str, Any]] = None, - cache_dir: Optional[str] = None, + max_seq_length: int | None = None, + model_args: dict[str, Any] | None = None, + tokenizer_args: dict[str, Any] | None = None, + config_args: dict[str, Any] | None = None, + cache_dir: str | None = None, do_lower_case: bool = False, tokenizer_name_or_path: str = None, ) -> None: @@ -109,7 +111,7 @@ def __repr__(self) -> str: self.get_config_dict(), self.auto_model.__class__.__name__ ) - def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + def forward(self, features: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: """Returns token_embeddings, cls_token""" trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]} if "token_type_ids" in features: @@ -134,8 +136,8 @@ def get_word_embedding_dimension(self) -> int: return self.auto_model.config.hidden_size def tokenize( - self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]], padding: Union[str, bool] = True - ) -> Dict[str, torch.Tensor]: + self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True + ) -> dict[str, torch.Tensor]: """Tokenizes a text and maps tokens to token-ids""" output = {} if isinstance(texts[0], str): @@ -173,7 +175,7 @@ def tokenize( ) return output - def get_config_dict(self) -> Dict[str, Any]: + def get_config_dict(self) -> dict[str, Any]: return 
{key: self.__dict__[key] for key in self.config_keys} def save(self, output_path: str, safe_serialization: bool = True) -> None: diff --git a/sentence_transformers/models/WeightedLayerPooling.py b/sentence_transformers/models/WeightedLayerPooling.py index 8693eaab0..2a3b4e8c8 100644 --- a/sentence_transformers/models/WeightedLayerPooling.py +++ b/sentence_transformers/models/WeightedLayerPooling.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import json import os -from typing import Dict import torch from safetensors.torch import load_model as load_safetensors_model @@ -25,7 +26,7 @@ def __init__( else nn.Parameter(torch.tensor([1] * (num_hidden_layers + 1 - layer_start), dtype=torch.float)) ) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): ft_all_layers = features["all_layer_embeddings"] all_layer_embedding = torch.stack(ft_all_layers) diff --git a/sentence_transformers/models/WordEmbeddings.py b/sentence_transformers/models/WordEmbeddings.py index 3f40d581e..36e0d4b67 100644 --- a/sentence_transformers/models/WordEmbeddings.py +++ b/sentence_transformers/models/WordEmbeddings.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import gzip import json import logging import os -from typing import List import numpy as np import torch @@ -54,7 +55,7 @@ def forward(self, features): ) return features - def tokenize(self, texts: List[str], **kwargs): + def tokenize(self, texts: list[str], **kwargs): tokenized_texts = [self.tokenizer.tokenize(text, **kwargs) for text in texts] sentence_lengths = [len(tokens) for tokens in tokenized_texts] max_len = max(sentence_lengths) diff --git a/sentence_transformers/models/WordWeights.py b/sentence_transformers/models/WordWeights.py index d545fab38..3ed6adbfe 100644 --- a/sentence_transformers/models/WordWeights.py +++ b/sentence_transformers/models/WordWeights.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import json import logging import os -from typing import Dict, List import torch from torch import Tensor, nn @@ -12,7 +13,7 @@ class WordWeights(nn.Module): """This model can weight word embeddings, for example, with idf-values.""" - def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): + def __init__(self, vocab: list[str], word_weights: dict[str, float], unknown_word_weight: float = 1): """ Initializes the WordWeights class. 
@@ -50,7 +51,7 @@ def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_wor self.emb_layer = nn.Embedding(len(vocab), 1) self.emb_layer.load_state_dict({"weight": torch.FloatTensor(weights).unsqueeze(1)}) - def forward(self, features: Dict[str, Tensor]): + def forward(self, features: dict[str, Tensor]): attention_mask = features["attention_mask"] token_embeddings = features["token_embeddings"] diff --git a/sentence_transformers/models/__init__.py b/sentence_transformers/models/__init__.py index c238101ed..d9684310a 100644 --- a/sentence_transformers/models/__init__.py +++ b/sentence_transformers/models/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .Asym import Asym from .BoW import BoW from .CLIPModel import CLIPModel diff --git a/sentence_transformers/models/tokenizer/PhraseTokenizer.py b/sentence_transformers/models/tokenizer/PhraseTokenizer.py index 834154e0d..466835022 100644 --- a/sentence_transformers/models/tokenizer/PhraseTokenizer.py +++ b/sentence_transformers/models/tokenizer/PhraseTokenizer.py @@ -1,9 +1,11 @@ +from __future__ import annotations + import collections import json import logging import os import string -from typing import Iterable, List +from typing import Iterable from transformers.utils.import_utils import NLTK_IMPORT_ERROR, is_nltk_available @@ -58,7 +60,7 @@ def set_vocab(self, vocab: Iterable[str]): logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) - def tokenize(self, text: str, **kwargs) -> List[int]: + def tokenize(self, text: str, **kwargs) -> list[int]: from nltk import word_tokenize tokens = word_tokenize(text, preserve_line=True) diff --git a/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py b/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py index 7a6a39473..74c47b02f 100644 --- a/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py +++ b/sentence_transformers/models/tokenizer/WhitespaceTokenizer.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import collections import json import os import string -from typing import Iterable, List +from typing import Iterable from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer @@ -27,7 +29,7 @@ def set_vocab(self, vocab: Iterable[str]): self.vocab = vocab self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) - def tokenize(self, text: str, **kwargs) -> List[int]: + def tokenize(self, text: str, **kwargs) -> list[int]: if self.do_lower_case: text = text.lower() diff --git a/sentence_transformers/models/tokenizer/WordTokenizer.py b/sentence_transformers/models/tokenizer/WordTokenizer.py index d732f49cb..15796ddd5 100644 --- a/sentence_transformers/models/tokenizer/WordTokenizer.py +++ b/sentence_transformers/models/tokenizer/WordTokenizer.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from abc import ABC, abstractmethod -from typing import Iterable, List +from typing import Iterable ENGLISH_STOP_WORDS = [ "!", @@ -401,7 +403,7 @@ def get_vocab(self, vocab: Iterable[str]): pass @abstractmethod - def tokenize(self, text: str, **kwargs) -> List[int]: + def tokenize(self, text: str, **kwargs) -> list[int]: pass @abstractmethod diff --git a/sentence_transformers/models/tokenizer/__init__.py b/sentence_transformers/models/tokenizer/__init__.py index b09bed73a..9dcab525f 100644 --- a/sentence_transformers/models/tokenizer/__init__.py +++ 
b/sentence_transformers/models/tokenizer/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .PhraseTokenizer import PhraseTokenizer from .WhitespaceTokenizer import WhitespaceTokenizer from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer diff --git a/sentence_transformers/quantization.py b/sentence_transformers/quantization.py index 8750b974b..0c4d9df23 100644 --- a/sentence_transformers/quantization.py +++ b/sentence_transformers/quantization.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import logging import time -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union +from typing import TYPE_CHECKING, Literal import numpy as np from torch import Tensor @@ -15,17 +17,17 @@ def semantic_search_faiss( query_embeddings: np.ndarray, - corpus_embeddings: Optional[np.ndarray] = None, - corpus_index: Optional["faiss.Index"] = None, + corpus_embeddings: np.ndarray | None = None, + corpus_index: "faiss.Index" | None = None, corpus_precision: Literal["float32", "uint8", "ubinary"] = "float32", top_k: int = 10, - ranges: Optional[np.ndarray] = None, - calibration_embeddings: Optional[np.ndarray] = None, + ranges: np.ndarray | None = None, + calibration_embeddings: np.ndarray | None = None, rescore: bool = True, rescore_multiplier: int = 2, exact: bool = True, output_index: bool = False, -) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "faiss.Index"]: +) -> tuple[list[list[dict[str, int | float]]], float, "faiss.Index"]: """ Performs semantic search using the FAISS library. @@ -182,17 +184,17 @@ def semantic_search_faiss( def semantic_search_usearch( query_embeddings: np.ndarray, - corpus_embeddings: Optional[np.ndarray] = None, - corpus_index: Optional["usearch.index.Index"] = None, + corpus_embeddings: np.ndarray | None = None, + corpus_index: "usearch.index.Index" | None = None, corpus_precision: Literal["float32", "int8", "binary"] = "float32", top_k: int = 10, - ranges: Optional[np.ndarray] = None, - calibration_embeddings: Optional[np.ndarray] = None, + ranges: np.ndarray | None = None, + calibration_embeddings: np.ndarray | None = None, rescore: bool = True, rescore_multiplier: int = 2, exact: bool = True, output_index: bool = False, -) -> Tuple[List[List[Dict[str, Union[int, float]]]], float, "usearch.index.Index"]: +) -> tuple[list[list[dict[str, int | float]]], float, "usearch.index.Index"]: """ Performs semantic search using the usearch library. @@ -361,10 +363,10 @@ def semantic_search_usearch( def quantize_embeddings( - embeddings: Union[Tensor, np.ndarray], + embeddings: Tensor | np.ndarray, precision: Literal["float32", "int8", "uint8", "binary", "ubinary"], - ranges: Optional[np.ndarray] = None, - calibration_embeddings: Optional[np.ndarray] = None, + ranges: np.ndarray | None = None, + calibration_embeddings: np.ndarray | None = None, ) -> np.ndarray: """ Quantizes embeddings to a lower precision. 
This can be used to reduce the memory footprint and increase the diff --git a/sentence_transformers/readers/InputExample.py b/sentence_transformers/readers/InputExample.py index 7266159e3..852a4a411 100644 --- a/sentence_transformers/readers/InputExample.py +++ b/sentence_transformers/readers/InputExample.py @@ -1,10 +1,10 @@ -from typing import List, Union +from __future__ import annotations class InputExample: """Structure for one input example with texts, the label and a unique id""" - def __init__(self, guid: str = "", texts: List[str] = None, label: Union[int, float] = 0): + def __init__(self, guid: str = "", texts: list[str] = None, label: int | float = 0): """ Creates one InputExample with the given texts, guid and label diff --git a/sentence_transformers/readers/LabelSentenceReader.py b/sentence_transformers/readers/LabelSentenceReader.py index 82aefedb7..3571e2c27 100644 --- a/sentence_transformers/readers/LabelSentenceReader.py +++ b/sentence_transformers/readers/LabelSentenceReader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from . import InputExample diff --git a/sentence_transformers/readers/NLIDataReader.py b/sentence_transformers/readers/NLIDataReader.py index ce359d6f5..6198dd71a 100644 --- a/sentence_transformers/readers/NLIDataReader.py +++ b/sentence_transformers/readers/NLIDataReader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import gzip import os diff --git a/sentence_transformers/readers/PairedFilesReader.py b/sentence_transformers/readers/PairedFilesReader.py index 157ac5cbe..b6f1953a1 100644 --- a/sentence_transformers/readers/PairedFilesReader.py +++ b/sentence_transformers/readers/PairedFilesReader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import gzip from . import InputExample diff --git a/sentence_transformers/readers/STSDataReader.py b/sentence_transformers/readers/STSDataReader.py index 6c9533989..61a0011f1 100644 --- a/sentence_transformers/readers/STSDataReader.py +++ b/sentence_transformers/readers/STSDataReader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import csv import gzip import os diff --git a/sentence_transformers/readers/TripletReader.py b/sentence_transformers/readers/TripletReader.py index be32ebd9b..2fd3ed0aa 100644 --- a/sentence_transformers/readers/TripletReader.py +++ b/sentence_transformers/readers/TripletReader.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import csv import os diff --git a/sentence_transformers/readers/__init__.py b/sentence_transformers/readers/__init__.py index fb2add55a..6a905d28e 100644 --- a/sentence_transformers/readers/__init__.py +++ b/sentence_transformers/readers/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from .InputExample import InputExample from .LabelSentenceReader import LabelSentenceReader from .NLIDataReader import NLIDataReader diff --git a/sentence_transformers/sampler.py b/sentence_transformers/sampler.py index 80ecc1c88..4569b759e 100644 --- a/sentence_transformers/sampler.py +++ b/sentence_transformers/sampler.py @@ -1,7 +1,9 @@ +from __future__ import annotations + import logging from collections import defaultdict from itertools import accumulate, cycle -from typing import Any, Iterator, List +from typing import Any, Iterator import torch from torch.utils.data import BatchSampler, ConcatDataset, SubsetRandomSampler @@ -58,7 +60,7 @@ def __init__( dataset: "Dataset", batch_size: int, drop_last: bool, - valid_label_columns: List[str] = None, + valid_label_columns: list[str] = None, generator: 
torch.Generator = None, seed: int = 0, ) -> None: @@ -84,7 +86,7 @@ def __init__( } @staticmethod - def _determine_labels_to_use(dataset: "Dataset", valid_label_columns: List[str]) -> List[Any]: + def _determine_labels_to_use(dataset: "Dataset", valid_label_columns: list[str]) -> list[Any]: for column_name in valid_label_columns or []: if column_name in dataset.column_names: return dataset[column_name] @@ -93,7 +95,7 @@ def _determine_labels_to_use(dataset: "Dataset", valid_label_columns: List[str]) f"which only has these columns: {dataset.column_names}." ) - def __iter__(self) -> Iterator[List[int]]: + def __iter__(self) -> Iterator[list[int]]: if self.generator and self.seed: self.generator.manual_seed(self.seed + self.epoch) @@ -117,7 +119,7 @@ def __init__( dataset: "Dataset", batch_size: int, drop_last: bool, - valid_label_columns: List[str] = [], + valid_label_columns: list[str] = [], generator: torch.Generator = None, seed: int = 0, ) -> None: @@ -130,7 +132,7 @@ def __init__( self.generator = generator self.seed = seed - def __iter__(self) -> Iterator[List[int]]: + def __iter__(self) -> Iterator[list[int]]: """ Iterate over the remaining non-yielded indices. For each index, check if the sample values are already in the batch. If not, add the sample values to the batch keep going until the batch is full. If the batch is full, yield @@ -185,7 +187,7 @@ class RoundRobinBatchSampler(SetEpochMixin, BatchSampler): def __init__( self, dataset: ConcatDataset, - batch_samplers: List[BatchSampler], + batch_samplers: list[BatchSampler], generator: torch.Generator = None, seed: int = None, ) -> None: @@ -197,7 +199,7 @@ def __init__( self.generator = generator self.seed = seed - def __iter__(self) -> Iterator[List[int]]: + def __iter__(self) -> Iterator[list[int]]: if self.generator and self.seed: self.generator.manual_seed(self.seed + self.epoch) @@ -221,7 +223,7 @@ class ProportionalBatchSampler(SetEpochMixin, BatchSampler): def __init__( self, dataset: ConcatDataset, - batch_samplers: List[BatchSampler], + batch_samplers: list[BatchSampler], generator: torch.Generator, seed: int, ) -> None: @@ -231,7 +233,7 @@ def __init__( self.generator = generator self.seed = seed - def __iter__(self) -> Iterator[List[int]]: + def __iter__(self) -> Iterator[list[int]]: self.generator.manual_seed(self.seed + self.epoch) num_samples = [len(dataset) for dataset in self.dataset.datasets] diff --git a/sentence_transformers/similarity_functions.py b/sentence_transformers/similarity_functions.py index 188b12a74..e05681703 100644 --- a/sentence_transformers/similarity_functions.py +++ b/sentence_transformers/similarity_functions.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from enum import Enum -from typing import Callable, List, Union +from typing import Callable from numpy import ndarray from torch import Tensor @@ -34,8 +36,8 @@ class SimilarityFunction(Enum): @staticmethod def to_similarity_fn( - similarity_function: Union[str, "SimilarityFunction"], - ) -> Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: + similarity_function: str | "SimilarityFunction", + ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: """ Converts a similarity function name or enum value to the corresponding similarity function. 
@@ -74,8 +76,8 @@ def to_similarity_fn( @staticmethod def to_similarity_pairwise_fn( - similarity_function: Union[str, "SimilarityFunction"], - ) -> Callable[[Union[Tensor, ndarray], Union[Tensor, ndarray]], Tensor]: + similarity_function: str | "SimilarityFunction", + ) -> Callable[[Tensor | ndarray, Tensor | ndarray], Tensor]: """ Converts a similarity function into a pairwise similarity function. @@ -116,7 +118,7 @@ def to_similarity_pairwise_fn( ) @staticmethod - def possible_values() -> List[str]: + def possible_values() -> list[str]: """ Returns a list of possible values for the SimilarityFunction enum. diff --git a/sentence_transformers/trainer.py b/sentence_transformers/trainer.py index 85b70070b..afb913889 100644 --- a/sentence_transformers/trainer.py +++ b/sentence_transformers/trainer.py @@ -1,8 +1,10 @@ +from __future__ import annotations + import logging import os import warnings from contextlib import nullcontext -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable import torch from torch import nn @@ -114,26 +116,23 @@ class SentenceTransformerTrainer(Trainer): def __init__( self, - model: Optional["SentenceTransformer"] = None, + model: "SentenceTransformer" | None = None, args: SentenceTransformerTrainingArguments = None, - train_dataset: Optional[Union["Dataset", "DatasetDict", Dict[str, "Dataset"]]] = None, - eval_dataset: Optional[Union["Dataset", "DatasetDict", Dict[str, "Dataset"]]] = None, - loss: Optional[ - Union[ - nn.Module, - Dict[str, nn.Module], - Callable[["SentenceTransformer"], torch.nn.Module], - Dict[str, Callable[["SentenceTransformer"], torch.nn.Module]], - ] - ] = None, - evaluator: Optional[Union[SentenceEvaluator, List[SentenceEvaluator]]] = None, - data_collator: Optional[DataCollator] = None, - tokenizer: Optional[Union[PreTrainedTokenizerBase, Callable]] = None, - model_init: Optional[Callable[[], "SentenceTransformer"]] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + train_dataset: "Dataset" | "DatasetDict" | dict[str, "Dataset"] | None = None, + eval_dataset: "Dataset" | "DatasetDict" | dict[str, "Dataset"] | None = None, + loss: nn.Module + | dict[str, nn.Module] + | Callable[["SentenceTransformer"], torch.nn.Module] + | dict[str, Callable[["SentenceTransformer"], torch.nn.Module]] + | None = None, + evaluator: SentenceEvaluator | list[SentenceEvaluator] | None = None, + data_collator: DataCollator | None = None, + tokenizer: PreTrainedTokenizerBase | Callable | None = None, + model_init: Callable[[], "SentenceTransformer"] | None = None, + compute_metrics: Callable[[EvalPrediction], dict] | None = None, + callbacks: list[TrainerCallback] | None = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] | None = None, ) -> None: if not is_training_available(): raise RuntimeError( @@ -275,7 +274,7 @@ def override_model_in_loss(self, loss: torch.nn.Module, model: "SentenceTransfor def prepare_loss( self, - loss: Union[Callable[["SentenceTransformer"], torch.nn.Module], torch.nn.Module], + loss: Callable[["SentenceTransformer"], torch.nn.Module] | torch.nn.Module, 
model: "SentenceTransformer", ) -> torch.nn.Module: if isinstance(loss, torch.nn.Module): @@ -291,9 +290,9 @@ def add_dataset_name_column(self, dataset_dict: "DatasetDict") -> "DatasetDict": def compute_loss( self, model: "SentenceTransformer", - inputs: Dict[str, Union[torch.Tensor, Any]], + inputs: dict[str, torch.Tensor | Any], return_outputs: bool = False, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, Any]]]: + ) -> torch.Tensor | tuple[torch.Tensor, dict[str, Any]]: """ Computes the loss for the SentenceTransformer model. @@ -336,8 +335,8 @@ def compute_loss( return loss def collect_features( - self, inputs: Dict[str, Union[torch.Tensor, Any]] - ) -> Tuple[List[Dict[str, torch.Tensor]], Optional[torch.Tensor]]: + self, inputs: dict[str, torch.Tensor | Any] + ) -> tuple[list[dict[str, torch.Tensor]], torch.Tensor | None]: """Turn the inputs from the dataloader into the separate model inputs & the labels. Example:: @@ -372,10 +371,10 @@ def collect_features( def evaluate( self, - eval_dataset: Optional[Union["Dataset", Dict[str, "Dataset"]]] = None, - ignore_keys: Optional[List[str]] = None, + eval_dataset: "Dataset" | dict[str, "Dataset"] | None = None, + ignore_keys: list[str] | None = None, metric_key_prefix: str = "eval", - ) -> Dict[str, float]: + ) -> dict[str, float]: eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset if isinstance(eval_dataset, DatasetDict) and isinstance(self.loss, dict): eval_dataset = self.add_dataset_name_column(eval_dataset) @@ -385,8 +384,8 @@ def evaluation_loop( self, dataloader: DataLoader, description: str, - prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, + prediction_loss_only: bool | None = None, + ignore_keys: list[str] | None = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: output = super().evaluation_loop( @@ -449,7 +448,7 @@ def _load_best_model(self) -> None: self.model = full_model self.model[0].auto_model = loaded_auto_model - def validate_column_names(self, dataset: "Dataset", dataset_name: Optional[str] = None) -> bool: + def validate_column_names(self, dataset: "Dataset", dataset_name: str | None = None) -> bool: if overlap := set(dataset.column_names) & {"return_loss", "dataset_name"}: raise ValueError( f"The following column names are invalid in your {dataset_name + ' ' if dataset_name else ''}dataset: {list(overlap)}." 
@@ -461,8 +460,8 @@ def get_batch_sampler( dataset: "Dataset", batch_size: int, drop_last: bool, - valid_label_columns: Optional[List[str]] = None, - generator: Optional[torch.Generator] = None, + valid_label_columns: list[str] | None = None, + generator: torch.Generator | None = None, ) -> BatchSampler: if self.args.batch_sampler == BatchSamplers.NO_DUPLICATES: return NoDuplicatesBatchSampler( @@ -491,9 +490,9 @@ def get_batch_sampler( def get_multi_dataset_batch_sampler( self, dataset: ConcatDataset, - batch_samplers: List[BatchSampler], - generator: Optional[torch.Generator] = None, - seed: Optional[int] = 0, + batch_samplers: list[BatchSampler], + generator: torch.Generator | None = None, + seed: int | None = 0, ) -> BatchSampler: if self.args.multi_dataset_batch_sampler == MultiDatasetBatchSamplers.ROUND_ROBIN: return RoundRobinBatchSampler( @@ -581,7 +580,7 @@ def get_train_dataloader(self) -> DataLoader: self._train_dataloader = self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) return self._train_dataloader - def get_eval_dataloader(self, eval_dataset: Union["Dataset", None] = None) -> DataLoader: + def get_eval_dataloader(self, eval_dataset: "Dataset" | None = None) -> DataLoader: """ Returns the evaluation [`~torch.utils.data.DataLoader`]. @@ -718,7 +717,7 @@ def get_test_dataloader(self, test_dataset: "Dataset") -> DataLoader: self._train_dataloader = self.accelerator.prepare(DataLoader(test_dataset, **dataloader_params)) return self._train_dataloader - def _save(self, output_dir: Optional[str] = None, state_dict=None) -> None: + def _save(self, output_dir: str | None = None, state_dict=None) -> None: # If we are executing this function, we are the process zero, so we don't check for that. output_dir = output_dir if output_dir is not None else self.args.output_dir os.makedirs(output_dir, exist_ok=True) @@ -740,15 +739,15 @@ def _load_from_checkpoint(self, checkpoint_path: str) -> None: def create_model_card( self, - language: Optional[str] = None, - license: Optional[str] = None, - tags: Union[str, List[str], None] = None, - model_name: Optional[str] = None, - finetuned_from: Optional[str] = None, - tasks: Union[str, List[str], None] = None, - dataset_tags: Union[str, List[str], None] = None, - dataset: Union[str, List[str], None] = None, - dataset_args: Union[str, List[str], None] = None, + language: str | None = None, + license: str | None = None, + tags: str | list[str] | None = None, + model_name: str | None = None, + finetuned_from: str | None = None, + tasks: str | list[str] | None = None, + dataset_tags: str | list[str] | None = None, + dataset: str | list[str] | None = None, + dataset_args: str | list[str] | None = None, **kwargs, ) -> None: if not self.is_world_process_zero(): diff --git a/sentence_transformers/training_args.py b/sentence_transformers/training_args.py index 4aefcd426..39ba4ac04 100644 --- a/sentence_transformers/training_args.py +++ b/sentence_transformers/training_args.py @@ -1,6 +1,7 @@ +from __future__ import annotations + import logging from dataclasses import dataclass, field -from typing import Union from transformers import TrainingArguments as TransformersTrainingArguments from transformers.training_args import ParallelMode @@ -62,10 +63,10 @@ class SentenceTransformerTrainingArguments(TransformersTrainingArguments): for valid options. Defaults to ``MultiDatasetBatchSamplers.PROPORTIONAL``. 
""" - batch_sampler: Union[BatchSamplers, str] = field( + batch_sampler: BatchSamplers | str = field( default=BatchSamplers.BATCH_SAMPLER, metadata={"help": "The batch sampler to use."} ) - multi_dataset_batch_sampler: Union[MultiDatasetBatchSamplers, str] = field( + multi_dataset_batch_sampler: MultiDatasetBatchSamplers | str = field( default=MultiDatasetBatchSamplers.PROPORTIONAL, metadata={"help": "The multi-dataset batch sampler to use."} ) diff --git a/sentence_transformers/util.py b/sentence_transformers/util.py index 5288d9e08..aebf6a3d6 100644 --- a/sentence_transformers/util.py +++ b/sentence_transformers/util.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import functools import heapq import importlib @@ -7,7 +9,7 @@ import random import sys from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Type, Union, overload +from typing import TYPE_CHECKING, Any, Callable, Literal, overload import numpy as np import requests @@ -27,7 +29,7 @@ from sentence_transformers.SentenceTransformer import SentenceTransformer -def _convert_to_tensor(a: Union[list, np.ndarray, Tensor]) -> Tensor: +def _convert_to_tensor(a: list | np.ndarray | Tensor) -> Tensor: """ Converts the input `a` to a PyTorch tensor if it is not already a tensor. @@ -57,7 +59,7 @@ def _convert_to_batch(a: Tensor) -> Tensor: return a -def _convert_to_batch_tensor(a: Union[list, np.ndarray, Tensor]) -> Tensor: +def _convert_to_batch_tensor(a: list | np.ndarray | Tensor) -> Tensor: """ Converts the input data to a tensor with a batch dimension. @@ -86,7 +88,7 @@ def pytorch_cos_sim(a: Tensor, b: Tensor) -> Tensor: return cos_sim(a, b) -def cos_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]) -> Tensor: +def cos_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: """ Computes the cosine similarity between two tensors. @@ -122,7 +124,7 @@ def pairwise_cos_sim(a: Tensor, b: Tensor) -> Tensor: return pairwise_dot_score(normalize_embeddings(a), normalize_embeddings(b)) -def dot_score(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]) -> Tensor: +def dot_score(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: """ Computes the dot-product dot_prod(a[i], b[j]) for all i and j. @@ -156,7 +158,7 @@ def pairwise_dot_score(a: Tensor, b: Tensor) -> Tensor: return (a * b).sum(dim=-1) -def manhattan_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]) -> Tensor: +def manhattan_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: """ Computes the manhattan similarity (i.e., negative distance) between two tensors. @@ -173,7 +175,7 @@ def manhattan_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, return -torch.cdist(a, b, p=1.0) -def pairwise_manhattan_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]): +def pairwise_manhattan_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor): """ Computes the manhattan similarity (i.e., negative distance) between pairs of tensors. 
@@ -190,7 +192,7 @@ def pairwise_manhattan_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np return -torch.sum(torch.abs(a - b), dim=-1) -def euclidean_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]) -> Tensor: +def euclidean_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor) -> Tensor: """ Computes the euclidean similarity (i.e., negative distance) between two tensors. @@ -207,7 +209,7 @@ def euclidean_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, return -torch.cdist(a, b, p=2.0) -def pairwise_euclidean_sim(a: Union[list, np.ndarray, Tensor], b: Union[list, np.ndarray, Tensor]): +def pairwise_euclidean_sim(a: list | np.ndarray | Tensor, b: list | np.ndarray | Tensor): """ Computes the euclidean distance (i.e., negative distance) between pairs of tensors. @@ -272,16 +274,14 @@ def normalize_embeddings(embeddings: Tensor) -> Tensor: @overload -def truncate_embeddings(embeddings: np.ndarray, truncate_dim: Optional[int]) -> np.ndarray: ... +def truncate_embeddings(embeddings: np.ndarray, truncate_dim: int | None) -> np.ndarray: ... @overload -def truncate_embeddings(embeddings: torch.Tensor, truncate_dim: Optional[int]) -> torch.Tensor: ... +def truncate_embeddings(embeddings: torch.Tensor, truncate_dim: int | None) -> torch.Tensor: ... -def truncate_embeddings( - embeddings: Union[np.ndarray, torch.Tensor], truncate_dim: Optional[int] -) -> Union[np.ndarray, torch.Tensor]: +def truncate_embeddings(embeddings: np.ndarray | torch.Tensor, truncate_dim: int | None) -> np.ndarray | torch.Tensor: """ Truncates the embeddings matrix. @@ -315,7 +315,7 @@ def truncate_embeddings( def paraphrase_mining( model, - sentences: List[str], + sentences: list[str], show_progress_bar: bool = False, batch_size: int = 32, query_chunk_size: int = 5000, @@ -323,7 +323,7 @@ def paraphrase_mining( max_pairs: int = 500000, top_k: int = 100, score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> List[List[Union[float, int]]]: +) -> list[list[float | int]]: """ Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all other sentences and returns a list with the pairs that have the highest cosine similarity score. @@ -365,7 +365,7 @@ def paraphrase_mining_embeddings( max_pairs: int = 500000, top_k: int = 100, score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> List[List[Union[float, int]]]: +) -> list[list[float | int]]: """ Given a list of sentences / texts, this function performs paraphrase mining. It compares all sentences against all other sentences and returns a list with the pairs that have the highest cosine similarity score. @@ -431,7 +431,7 @@ def paraphrase_mining_embeddings( return pairs_list -def information_retrieval(*args, **kwargs) -> List[List[Dict[str, Union[int, float]]]]: +def information_retrieval(*args, **kwargs) -> list[list[dict[str, int | float]]]: """This function is deprecated. Use semantic_search instead""" return semantic_search(*args, **kwargs) @@ -443,7 +443,7 @@ def semantic_search( corpus_chunk_size: int = 500000, top_k: int = 10, score_function: Callable[[Tensor, Tensor], Tensor] = cos_sim, -) -> List[List[Dict[str, Union[int, float]]]]: +) -> list[list[dict[str, int | float]]]: """ This function performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings. It can be used for Information Retrieval / Semantic Search for corpora up to about 1 Million entries. 
@@ -519,11 +519,11 @@ def semantic_search( def mine_hard_negatives( dataset: "Dataset", model: "SentenceTransformer", - cross_encoder: Optional["CrossEncoder"] = None, + cross_encoder: "CrossEncoder" | None = None, range_min: int = 0, - range_max: Optional[int] = None, - max_score: Optional[float] = None, - margin: Optional[float] = None, + range_max: int | None = None, + max_score: float | None = None, + margin: float | None = None, num_negatives: int = 3, sampling_strategy: Literal["random", "top"] = "top", as_triplets: bool = True, @@ -911,7 +911,7 @@ def http_get(url: str, path: str) -> None: progress.close() -def batch_to_device(batch: Dict[str, Any], target_device: device) -> Dict[str, Any]: +def batch_to_device(batch: dict[str, Any], target_device: device) -> dict[str, Any]: """ Send a PyTorch batch (i.e., a dictionary of string keys to Tensors) to a device (e.g. "cpu", "cuda", "mps"). @@ -956,7 +956,7 @@ def fullname(o) -> str: return module + "." + o.__class__.__name__ -def import_from_string(dotted_path: str) -> Type: +def import_from_string(dotted_path: str) -> type: """ Import a dotted module path and return the attribute/class designated by the last name in the path. Raise ImportError if the import failed. @@ -993,12 +993,12 @@ def import_from_string(dotted_path: str) -> Type: def community_detection( - embeddings: Union[torch.Tensor, np.ndarray], + embeddings: torch.Tensor | np.ndarray, threshold: float = 0.75, min_community_size: int = 10, batch_size: int = 1024, show_progress_bar: bool = False, -) -> List[List[int]]: +) -> list[list[int]]: """ Function for Fast Community Detection. @@ -1141,9 +1141,9 @@ def disable_logging(highest_level=logging.CRITICAL): def is_sentence_transformer_model( model_name_or_path: str, - token: Optional[Union[bool, str]] = None, - cache_folder: Optional[str] = None, - revision: Optional[str] = None, + token: bool | str | None = None, + cache_folder: str | None = None, + revision: str | None = None, local_files_only: bool = False, ) -> bool: """ @@ -1174,11 +1174,11 @@ def is_sentence_transformer_model( def load_file_path( model_name_or_path: str, filename: str, - token: Optional[Union[bool, str]], - cache_folder: Optional[str], - revision: Optional[str] = None, + token: bool | str | None, + cache_folder: str | None, + revision: str | None = None, local_files_only: bool = False, -) -> Optional[str]: +) -> str | None: """ Loads a file from a local or remote location. @@ -1216,11 +1216,11 @@ def load_file_path( def load_dir_path( model_name_or_path: str, directory: str, - token: Optional[Union[bool, str]], - cache_folder: Optional[str], - revision: Optional[str] = None, + token: bool | str | None, + cache_folder: str | None, + revision: str | None = None, local_files_only: bool = False, -) -> Optional[str]: +) -> str | None: """ Loads the directory path for a given model name or path. 
diff --git a/tests/conftest.py b/tests/conftest.py index 5a83759ad..9632334a0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import platform import tempfile diff --git a/tests/samplers/test_group_by_label_batch_sampler.py b/tests/samplers/test_group_by_label_batch_sampler.py index 93fe6b93c..c3f213acc 100644 --- a/tests/samplers/test_group_by_label_batch_sampler.py +++ b/tests/samplers/test_group_by_label_batch_sampler.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections import Counter import pytest diff --git a/tests/samplers/test_no_duplicates_batch_sampler.py b/tests/samplers/test_no_duplicates_batch_sampler.py index d0323421c..ad93ff08c 100644 --- a/tests/samplers/test_no_duplicates_batch_sampler.py +++ b/tests/samplers/test_no_duplicates_batch_sampler.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import random import pytest diff --git a/tests/samplers/test_round_robin_batch_sampler.py b/tests/samplers/test_round_robin_batch_sampler.py index 37682525a..a8883c649 100644 --- a/tests/samplers/test_round_robin_batch_sampler.py +++ b/tests/samplers/test_round_robin_batch_sampler.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from datasets import Dataset from torch.utils.data import BatchSampler, ConcatDataset, SequentialSampler diff --git a/tests/test_cmnrl.py b/tests/test_cmnrl.py index 3d47b4b02..8fa352829 100644 --- a/tests/test_cmnrl.py +++ b/tests/test_cmnrl.py @@ -1,5 +1,6 @@ +from __future__ import annotations + from contextlib import nullcontext -from typing import List import pytest import torch @@ -79,8 +80,8 @@ ], ) def test_cmnrl_same_grad( - train_samples_mnrl: List[InputExample], - train_samples_cmnrl: List[InputExample], + train_samples_mnrl: list[InputExample], + train_samples_cmnrl: list[InputExample], same_grad: bool, scaler: float, precision: float, diff --git a/tests/test_compute_embeddings.py b/tests/test_compute_embeddings.py index d301367b7..5b0bf6aaa 100644 --- a/tests/test_compute_embeddings.py +++ b/tests/test_compute_embeddings.py @@ -2,6 +2,8 @@ Computes embeddings """ +from __future__ import annotations + import numpy as np from sentence_transformers import SentenceTransformer diff --git a/tests/test_cross_encoder.py b/tests/test_cross_encoder.py index c2c0c3e65..4431bc727 100644 --- a/tests/test_cross_encoder.py +++ b/tests/test_cross_encoder.py @@ -2,12 +2,14 @@ Tests that the pretrained models produce the correct scores on the STSbenchmark dataset """ +from __future__ import annotations + import csv import gzip import os import tempfile from pathlib import Path -from typing import Generator, List, Tuple +from typing import Generator import numpy as np import pytest @@ -20,7 +22,7 @@ @pytest.fixture() -def sts_resource() -> Generator[Tuple[List[InputExample], List[InputExample]], None, None]: +def sts_resource() -> Generator[tuple[list[InputExample], list[InputExample]], None, None]: sts_dataset_path = "datasets/stsbenchmark.tsv.gz" if not os.path.exists(sts_dataset_path): util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path) @@ -43,7 +45,7 @@ def sts_resource() -> Generator[Tuple[List[InputExample], List[InputExample]], N def evaluate_stsb_test( distilroberta_base_ce_model: CrossEncoder, expected_score: float, - test_samples: List[InputExample], + test_samples: list[InputExample], num_test_samples: int = -1, ) -> None: model = distilroberta_base_ce_model @@ -53,7 +55,7 @@ def evaluate_stsb_test( assert 
score > expected_score or abs(score - expected_score) < 0.1 -def test_pretrained_stsb(sts_resource: Tuple[List[InputExample], List[InputExample]]): +def test_pretrained_stsb(sts_resource: tuple[list[InputExample], list[InputExample]]): _, sts_test_samples = sts_resource model = CrossEncoder("cross-encoder/stsb-distilroberta-base") evaluate_stsb_test(model, 87.92, sts_test_samples) @@ -61,7 +63,7 @@ def test_pretrained_stsb(sts_resource: Tuple[List[InputExample], List[InputExamp @pytest.mark.slow def test_train_stsb_slow( - distilroberta_base_ce_model: CrossEncoder, sts_resource: Tuple[List[InputExample], List[InputExample]] + distilroberta_base_ce_model: CrossEncoder, sts_resource: tuple[list[InputExample], list[InputExample]] ) -> None: model = distilroberta_base_ce_model sts_train_samples, sts_test_samples = sts_resource @@ -75,7 +77,7 @@ def test_train_stsb_slow( def test_train_stsb( - distilroberta_base_ce_model: CrossEncoder, sts_resource: Tuple[List[InputExample], List[InputExample]] + distilroberta_base_ce_model: CrossEncoder, sts_resource: tuple[list[InputExample], list[InputExample]] ) -> None: model = distilroberta_base_ce_model sts_train_samples, sts_test_samples = sts_resource diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index ee6eb3409..844776869 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -2,6 +2,8 @@ Tests the correct computation of evaluation scores from BinaryClassificationEvaluator """ +from __future__ import annotations + import csv import gzip import os diff --git a/tests/test_image_embeddings.py b/tests/test_image_embeddings.py index d684e258f..0a4c8cb4b 100644 --- a/tests/test_image_embeddings.py +++ b/tests/test_image_embeddings.py @@ -2,6 +2,8 @@ Compute image embeddings """ +from __future__ import annotations + import os from PIL import Image diff --git a/tests/test_model_card_data.py b/tests/test_model_card_data.py index 8138b3ba6..9e358f2b4 100644 --- a/tests/test_model_card_data.py +++ b/tests/test_model_card_data.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import pytest from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer diff --git a/tests/test_multi_process.py b/tests/test_multi_process.py index a1deef2f9..ea19c9b5e 100644 --- a/tests/test_multi_process.py +++ b/tests/test_multi_process.py @@ -2,7 +2,7 @@ Computes embeddings """ -from typing import Optional +from __future__ import annotations import numpy as np import pytest @@ -13,7 +13,7 @@ @pytest.mark.parametrize("normalize_embeddings", (False, True)) @pytest.mark.parametrize("prompt_name", (None, "retrieval")) def test_encode_multi_process( - stsb_bert_tiny_model: SentenceTransformer, normalize_embeddings: bool, prompt_name: Optional[str] + stsb_bert_tiny_model: SentenceTransformer, normalize_embeddings: bool, prompt_name: str | None ) -> None: model = stsb_bert_tiny_model model.prompts = {"retrieval": "Represent this sentence for searching relevant passages: "} diff --git a/tests/test_pretrained_stsb.py b/tests/test_pretrained_stsb.py index 4a98a337d..0e20ecfc7 100644 --- a/tests/test_pretrained_stsb.py +++ b/tests/test_pretrained_stsb.py @@ -2,11 +2,12 @@ Tests that the pretrained models produce the correct scores on the STSbenchmark dataset """ +from __future__ import annotations + import csv import gzip import os from functools import partial -from typing import Optional import pytest @@ -15,7 +16,7 @@ def pretrained_model_score( - model_name, expected_score: float, max_test_samples: int = 100, cache_dir: 
Optional[str] = None + model_name, expected_score: float, max_test_samples: int = 100, cache_dir: str | None = None ) -> None: model = SentenceTransformer(model_name, cache_folder=cache_dir) sts_dataset_path = "datasets/stsbenchmark.tsv.gz" diff --git a/tests/test_sentence_transformer.py b/tests/test_sentence_transformer.py index 0b173b45b..b37e70cc0 100644 --- a/tests/test_sentence_transformer.py +++ b/tests/test_sentence_transformer.py @@ -2,6 +2,8 @@ Tests general behaviour of the SentenceTransformer class """ +from __future__ import annotations + import json import logging import os @@ -9,7 +11,7 @@ import tempfile from functools import partial from pathlib import Path -from typing import Dict, List, Literal, Optional, Union, cast +from typing import Dict, List, Literal, cast import numpy as np import pytest @@ -444,11 +446,11 @@ def test_encode_quantization( @pytest.mark.parametrize("normalize_embeddings", [True, False]) @pytest.mark.parametrize("output_value", ["sentence_embedding", None]) def test_encode_truncate( - sentences: Union[str, List[str]], + sentences: str | list[str], convert_to_tensor: bool, convert_to_numpy: bool, normalize_embeddings: bool, - output_value: Optional[Literal["sentence_embedding"]], + output_value: Literal["sentence_embedding"] | None, ) -> None: model = SentenceTransformer("sentence-transformers-testing/stsb-bert-tiny-safetensors") embeddings_full_unnormalized: torch.Tensor = model.encode( @@ -636,7 +638,7 @@ def test_override_config_versions(stsb_bert_tiny_model: SentenceTransformer) -> SentenceTransformer("sentence-transformers/average_word_embeddings_levy_dependency"), ], ) -def test_safetensors(modules: Union[List[nn.Module], SentenceTransformer]) -> None: +def test_safetensors(modules: list[nn.Module] | SentenceTransformer) -> None: if isinstance(modules, SentenceTransformer): model = modules else: diff --git a/tests/test_train_stsb.py b/tests/test_train_stsb.py index e2ac0171a..97efe3a37 100644 --- a/tests/test_train_stsb.py +++ b/tests/test_train_stsb.py @@ -2,10 +2,12 @@ Tests that the pretrained models produce the correct scores on the STSbenchmark dataset """ +from __future__ import annotations + import csv import gzip import os -from typing import Generator, List, Tuple +from typing import Generator import pytest import torch @@ -23,7 +25,7 @@ @pytest.fixture() -def sts_resource() -> Generator[Tuple[List[InputExample], List[InputExample]], None, None]: +def sts_resource() -> Generator[tuple[list[InputExample], list[InputExample]], None, None]: sts_dataset_path = "datasets/stsbenchmark.tsv.gz" if not os.path.exists(sts_dataset_path): util.http_get("https://sbert.net/datasets/stsbenchmark.tsv.gz", sts_dataset_path) @@ -44,7 +46,7 @@ def sts_resource() -> Generator[Tuple[List[InputExample], List[InputExample]], N @pytest.fixture() -def nli_resource() -> Generator[List[InputExample], None, None]: +def nli_resource() -> Generator[list[InputExample], None, None]: nli_dataset_path = "datasets/AllNLI.tsv.gz" if not os.path.exists(nli_dataset_path): util.http_get("https://sbert.net/datasets/AllNLI.tsv.gz", nli_dataset_path) @@ -77,7 +79,7 @@ def evaluate_stsb_test(model, expected_score, test_samples) -> None: reason='Sentence Transformers was not installed with the `["train"]` extra.', ) def test_train_stsb_slow( - distilbert_base_uncased_model: SentenceTransformer, sts_resource: Tuple[List[InputExample], List[InputExample]] + distilbert_base_uncased_model: SentenceTransformer, sts_resource: tuple[list[InputExample], list[InputExample]] ) -> None: 
model = distilbert_base_uncased_model sts_train_samples, sts_test_samples = sts_resource @@ -102,7 +104,7 @@ def test_train_stsb_slow( reason='Sentence Transformers was not installed with the `["train"]` extra.', ) def test_train_stsb( - distilbert_base_uncased_model: SentenceTransformer, sts_resource: Tuple[List[InputExample], List[InputExample]] + distilbert_base_uncased_model: SentenceTransformer, sts_resource: tuple[list[InputExample], list[InputExample]] ) -> None: model = distilbert_base_uncased_model sts_train_samples, sts_test_samples = sts_resource @@ -128,8 +130,8 @@ def test_train_stsb( ) def test_train_nli_slow( distilbert_base_uncased_model: SentenceTransformer, - nli_resource: List[InputExample], - sts_resource: Tuple[List[InputExample], List[InputExample]], + nli_resource: list[InputExample], + sts_resource: tuple[list[InputExample], list[InputExample]], ): model = distilbert_base_uncased_model _, sts_test_samples = sts_resource @@ -158,8 +160,8 @@ def test_train_nli_slow( ) def test_train_nli( distilbert_base_uncased_model: SentenceTransformer, - nli_resource: List[InputExample], - sts_resource: Tuple[List[InputExample], List[InputExample]], + nli_resource: list[InputExample], + sts_resource: tuple[list[InputExample], list[InputExample]], ): model = distilbert_base_uncased_model _, sts_test_samples = sts_resource diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 83e524f2a..33c48b11e 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import re import tempfile from pathlib import Path diff --git a/tests/test_util.py b/tests/test_util.py index 82cc1f5fb..71d194e54 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import numpy as np import sklearn import torch
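Editor's note: a minimal sketch (not part of the patch) of the typing pattern the hunks above apply, i.e. ruff UP006/UP007 plus the newly required `from __future__ import annotations`. The `search` helper, its parameters, and the guarded `faiss` import are illustrative assumptions; only the `"faiss.Index" | None` shape mirrors the real `quantization.py` change. The future import stores annotations as strings (PEP 563), which is what keeps the PEP 585 built-ins (`list`, `dict`) and PEP 604 unions (`X | None`) importable on older Python versions and makes the quoted forward reference inside a union legal at runtime.

    from __future__ import annotations  # PEP 563: annotations stay unevaluated strings

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        import faiss  # hypothetical optional dependency, mirroring the quantization.py hunk


    def search(  # illustrative helper, not from the patch
        corpus_index: "faiss.Index" | None = None,  # forward reference inside a PEP 604 union;
        top_k: int = 10,                            # never evaluated thanks to the future import
    ) -> list[dict[str, int | float]]:              # PEP 585 built-ins instead of List/Dict
        """Return at most ``top_k`` placeholder hits (illustrative only)."""
        if corpus_index is None:
            return []
        return [{"corpus_id": i, "score": 0.0} for i in range(top_k)]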