
Commit

[chore] Enable ruff's pyupgrade (UP) ruleset (UKPLab#2834)
* enable isort

improve ci/cd

improve ci/cd

improve ci/cd

fix isort

try

fix

* fix

* fix

* Enable ruff's UP ruleset

* unsafe fixes
fpgmaas authored Jul 12, 2024
1 parent 65728ed commit c0fc0e8
Showing 127 changed files with 357 additions and 407 deletions.
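The pyupgrade (UP) rules rewrite older Python idioms into their modern equivalents, for example str.format() calls become f-strings and redundant open() modes are dropped. Enabling the ruleset is typically a small pyproject.toml change; the lines below are only a sketch and may not match this repository's exact configuration:

    [tool.ruff.lint]
    extend-select = ["UP"]  # pyupgrade rules

The "unsafe fixes" mentioned in the commit message are usually applied with a command along the lines of:

    ruff check . --fix --unsafe-fixes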
4 changes: 2 additions & 2 deletions examples/applications/clustering/fast_clustering.py
@@ -57,11 +57,11 @@
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)

print("Clustering done after {:.2f} sec".format(time.time() - start_time))
print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
print("\nCluster {}, #{} Elements ".format(i + 1, len(cluster)))
print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
for sentence_id in cluster[0:3]:
print("\t", corpus_sentences[sentence_id])
print("\t", "...")
@@ -15,7 +15,7 @@
# Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
# Create a large list of 100k sentences
sentences = ["This is sentence {}".format(i) for i in range(100000)]
sentences = [f"This is sentence {i}" for i in range(100000)]

# Define the model
model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -68,7 +68,7 @@
corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -80,7 +80,7 @@
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates)
hits = hits[0] # Get the hits for the first query

print("Cosine-Similarity search took {:.3f} seconds".format(time.time() - start_time))
print(f"Cosine-Similarity search took {time.time() - start_time:.3f} seconds")
print("Top 5 hits with cosine-similarity:")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))
@@ -95,7 +95,7 @@

# Sort list by CrossEncoder scores
hits = sorted(hits, key=lambda x: x["cross-encoder_score"], reverse=True)
print("\nRe-ranking with CrossEncoder took {:.3f} seconds".format(time.time() - start_time))
print(f"\nRe-ranking with CrossEncoder took {time.time() - start_time:.3f} seconds")
print("Top 5 hits with CrossEncoder:")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["cross-encoder_score"], corpus_sentences[hit["corpus_id"]]))
@@ -177,4 +177,4 @@
)
sentences_written += 1

print("Done. {} sentences written".format(sentences_written))
print(f"Done. {sentences_written} sentences written")
@@ -45,7 +45,7 @@ def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768, ann_num_cluster_p
idx.add(y)
sim, ind = idx.search(x, k)

print("Done: {:.2f} sec".format(time.time() - start_time))
print(f"Done: {time.time() - start_time:.2f} sec")
return sim, ind


@@ -56,4 +56,4 @@ def file_open(filepath):
elif filepath.endswith("xz"):
return lzma.open(filepath, "rt", encoding="utf8")
else:
return open(filepath, "r", encoding="utf8")
return open(filepath, encoding="utf8")
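The open() change above is ruff rule UP015: "r" is the default mode, so passing it explicitly is redundant and the autofix removes it. A minimal sketch with a hypothetical file name:

    # fIn = open("corpus.txt", "r", encoding="utf8")  # before: UP015 flags the redundant "r"
    fIn = open("corpus.txt", encoding="utf8")         # after: "r" is the default mode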
8 changes: 2 additions & 6 deletions examples/applications/parallel-sentence-mining/bucc2018.py
@@ -54,12 +54,8 @@
pca_dimensions = 128

# We store the embeddings on disc, so that they can later be loaded from disc
source_embedding_file = "{}_{}_{}.emb".format(
model_name, os.path.basename(source_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension()
)
target_embedding_file = "{}_{}_{}.emb".format(
model_name, os.path.basename(target_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension()
)
source_embedding_file = f"{model_name}_{os.path.basename(source_file)}_{pca_dimensions if use_pca else model.get_sentence_embedding_dimension()}.emb"
target_embedding_file = f"{model_name}_{os.path.basename(target_file)}_{pca_dimensions if use_pca else model.get_sentence_embedding_dimension()}.emb"


# Use PCA to reduce the dimensionality of the sentence embedding model
@@ -88,7 +88,7 @@
results = sorted(results, key=lambda x: x["score"], reverse=True)

print("Query:", query)
print("Search took {:.2f} seconds".format(time.time() - start_time))
print(f"Search took {time.time() - start_time:.2f} seconds")
for hit in results[0:5]:
print("Score: {:.2f}".format(hit["score"]), "\t", hit["input"][1])

2 changes: 1 addition & 1 deletion examples/applications/semantic-search/semantic_search.py
@@ -48,7 +48,7 @@
print("Top 5 most similar sentences in corpus:")

for score, idx in zip(scores, indices):
print(corpus[idx], "(Score: {:.4f})".format(score))
print(corpus[idx], f"(Score: {score:.4f})")

"""
# Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
@@ -90,7 +90,7 @@

if not os.path.exists(annoy_index_path):
# Create Annoy Index
print("Create Annoy index with {} trees. This can take some time.".format(n_trees))
print(f"Create Annoy index with {n_trees} trees. This can take some time.")
annoy_index = AnnoyIndex(embedding_size, "angular")

for i in range(len(corpus_embeddings)):
@@ -108,7 +108,7 @@

######### Search in the index ###########

print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -124,7 +124,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -139,7 +139,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -107,7 +107,7 @@
######### Search in the index ###########


print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -128,7 +128,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -142,7 +142,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -102,7 +102,7 @@

######### Search in the index ###########

print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -119,7 +119,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -133,7 +133,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -67,7 +67,7 @@
corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

# Move embeddings to the target device of the model
corpus_embeddings = corpus_embeddings.to(model.device)
@@ -82,7 +82,7 @@
hits = hits[0] # Get the hits for the first query

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -73,7 +73,7 @@

# Output of top-k hits
print("Input question:", query)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits:
print("\t{:.3f}\t{}".format(hit["score"], passages[hit["corpus_id"]]))

4 changes: 2 additions & 2 deletions examples/evaluation/evaluation_inference_speed.py
@@ -37,6 +37,6 @@
emb = model.encode(sentences, batch_size=32)
end_time = time.time()
diff_time = end_time - start_time
print("Done after {:.2f} seconds".format(diff_time))
print("Speed: {:.2f} sentences / second".format(len(sentences) / diff_time))
print(f"Done after {diff_time:.2f} seconds")
print(f"Speed: {len(sentences) / diff_time:.2f} sentences / second")
print("=====")
2 changes: 1 addition & 1 deletion examples/training/cross-encoder/training_nli.py
@@ -71,7 +71,7 @@
evaluator = SequentialEvaluator([accuracy_evaluator, f1_evaluator])

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
@@ -45,7 +45,7 @@
# Read the quora dataset split for classification
logger.info("Read train dataset")
train_samples = []
with open(os.path.join(dataset_path, "classification", "train_pairs.tsv"), "r", encoding="utf8") as fIn:
with open(os.path.join(dataset_path, "classification", "train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
train_samples.append(InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"])))
@@ -54,7 +54,7 @@

logger.info("Read dev dataset")
dev_samples = []
with open(os.path.join(dataset_path, "classification", "dev_pairs.tsv"), "r", encoding="utf8") as fIn:
with open(os.path.join(dataset_path, "classification", "dev_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_samples.append(InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"])))
@@ -79,7 +79,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
2 changes: 1 addition & 1 deletion examples/training/cross-encoder/training_stsbenchmark.py
@@ -76,7 +76,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
16 changes: 8 additions & 8 deletions examples/training/data_augmentation/train_sts_indomain_bm25.py
@@ -87,7 +87,7 @@
#
#####################################################################

logging.info("Step 1: Train cross-encoder: ({}) with STSbenchmark".format(model_name))
logging.info(f"Step 1: Train cross-encoder: ({model_name}) with STSbenchmark")

# Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
train_dataset = load_dataset("sentence-transformers/stsb", split="train")
@@ -113,7 +113,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info(f"Warmup-steps: {warmup_steps}")

# Train the cross-encoder model
cross_encoder.fit(
@@ -134,7 +134,7 @@
#### Larger the k, bigger the silver dataset ####

index_name = "stsb" # index-name should be in lowercase
logging.info("Step 2.1: Generate STSbenchmark (silver dataset) using top-{} bm25 combinations".format(top_k))
logging.info(f"Step 2.1: Generate STSbenchmark (silver dataset) using top-{top_k} bm25 combinations")

unique_sentences = set()

@@ -148,15 +148,15 @@
) # not to include gold pairs of sentences again

# Ignore 400 cause by IndexAlreadyExistsException when creating an index
logging.info("Creating elastic-search index - {}".format(index_name))
logging.info(f"Creating elastic-search index - {index_name}")
es.indices.create(index=index_name, ignore=[400])

# indexing all sentences
logging.info("Starting to index....")
for sent in unique_sentences:
response = es.index(index=index_name, id=sent2idx[sent], body={"sent": sent})

logging.info("Indexing complete for {} unique sentences".format(len(unique_sentences)))
logging.info(f"Indexing complete for {len(unique_sentences)} unique sentences")

silver_data = []
progress = tqdm.tqdm(unit="docs", total=len(sent2idx))
@@ -173,8 +173,8 @@
progress.reset()
progress.close()

logging.info("Number of silver pairs generated for STSbenchmark: {}".format(len(silver_data)))
logging.info("Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {}".format(model_name))
logging.info(f"Number of silver pairs generated for STSbenchmark: {len(silver_data)}")
logging.info(f"Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {model_name}")

cross_encoder = CrossEncoder(cross_encoder_path)
silver_scores = cross_encoder.predict(silver_data)
@@ -188,7 +188,7 @@
#
#################################################################################################

logging.info("Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".format(model_name))
logging.info(f"Step 3: Train bi-encoder: {model_name} with STSbenchmark (gold + silver dataset)")

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
@@ -111,7 +111,7 @@
progress.reset()
progress.close()
logging.info("Textual augmentation completed....")
logging.info("Number of silver pairs generated: {}".format(len(silver_samples)))
logging.info(f"Number of silver pairs generated: {len(silver_samples)}")

###################################################################
#
(Remaining changed files not shown.)
