
Commit

[chore] Enable ruff's pyupgrade (UP) ruleset (UKPLab#2834)
* enable isort

improve ci/cd

improve ci/cd

improve ci/cd

fix isort

try

fix

* fix

* fix

* Enable ruff's UP ruleset

* unsafe fixes
fpgmaas authored Jul 12, 2024
1 parent 65728ed commit c0fc0e8
Showing 127 changed files with 357 additions and 407 deletions.
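The pyupgrade (UP) rules rewrite older Python idioms into their modern equivalents, for example str.format() calls become f-strings and redundant open() modes are dropped. Enabling the ruleset is typically a small pyproject.toml change; the lines below are only a sketch and may not match this repository's exact configuration:

    [tool.ruff.lint]
    extend-select = ["UP"]  # pyupgrade rules

The "unsafe fixes" mentioned in the commit message are usually applied with a command along the lines of:

    ruff check . --fix --unsafe-fixes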
4 changes: 2 additions & 2 deletions examples/applications/clustering/fast_clustering.py
@@ -57,11 +57,11 @@
# threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)

print("Clustering done after {:.2f} sec".format(time.time() - start_time))
print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
print("\nCluster {}, #{} Elements ".format(i + 1, len(cluster)))
print(f"\nCluster {i + 1}, #{len(cluster)} Elements ")
for sentence_id in cluster[0:3]:
print("\t", corpus_sentences[sentence_id])
print("\t", "...")
@@ -15,7 +15,7 @@
# Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
if __name__ == "__main__":
# Create a large list of 100k sentences
sentences = ["This is sentence {}".format(i) for i in range(100000)]
sentences = [f"This is sentence {i}" for i in range(100000)]

# Define the model
model = SentenceTransformer("all-MiniLM-L6-v2")
@@ -68,7 +68,7 @@
corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -80,7 +80,7 @@
hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=num_candidates)
hits = hits[0] # Get the hits for the first query

print("Cosine-Similarity search took {:.3f} seconds".format(time.time() - start_time))
print(f"Cosine-Similarity search took {time.time() - start_time:.3f} seconds")
print("Top 5 hits with cosine-similarity:")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))
@@ -95,7 +95,7 @@

# Sort list by CrossEncoder scores
hits = sorted(hits, key=lambda x: x["cross-encoder_score"], reverse=True)
print("\nRe-ranking with CrossEncoder took {:.3f} seconds".format(time.time() - start_time))
print(f"\nRe-ranking with CrossEncoder took {time.time() - start_time:.3f} seconds")
print("Top 5 hits with CrossEncoder:")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["cross-encoder_score"], corpus_sentences[hit["corpus_id"]]))
@@ -177,4 +177,4 @@
)
sentences_written += 1

print("Done. {} sentences written".format(sentences_written))
print(f"Done. {sentences_written} sentences written")
@@ -45,7 +45,7 @@ def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768, ann_num_cluster_p
idx.add(y)
sim, ind = idx.search(x, k)

print("Done: {:.2f} sec".format(time.time() - start_time))
print(f"Done: {time.time() - start_time:.2f} sec")
return sim, ind


@@ -56,4 +56,4 @@ def file_open(filepath):
elif filepath.endswith("xz"):
return lzma.open(filepath, "rt", encoding="utf8")
else:
return open(filepath, "r", encoding="utf8")
return open(filepath, encoding="utf8")
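The open() change above is ruff rule UP015: "r" is the default mode, so passing it explicitly is redundant and the autofix removes it. A minimal sketch with a hypothetical file name:

    # fIn = open("corpus.txt", "r", encoding="utf8")  # before: UP015 flags the redundant "r"
    fIn = open("corpus.txt", encoding="utf8")         # after: "r" is the default mode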
8 changes: 2 additions & 6 deletions examples/applications/parallel-sentence-mining/bucc2018.py
@@ -54,12 +54,8 @@
pca_dimensions = 128

# We store the embeddings on disc, so that they can later be loaded from disc
source_embedding_file = "{}_{}_{}.emb".format(
model_name, os.path.basename(source_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension()
)
target_embedding_file = "{}_{}_{}.emb".format(
model_name, os.path.basename(target_file), pca_dimensions if use_pca else model.get_sentence_embedding_dimension()
)
source_embedding_file = f"{model_name}_{os.path.basename(source_file)}_{pca_dimensions if use_pca else model.get_sentence_embedding_dimension()}.emb"
target_embedding_file = f"{model_name}_{os.path.basename(target_file)}_{pca_dimensions if use_pca else model.get_sentence_embedding_dimension()}.emb"


# Use PCA to reduce the dimensionality of the sentence embedding model
@@ -88,7 +88,7 @@
results = sorted(results, key=lambda x: x["score"], reverse=True)

print("Query:", query)
print("Search took {:.2f} seconds".format(time.time() - start_time))
print(f"Search took {time.time() - start_time:.2f} seconds")
for hit in results[0:5]:
print("Score: {:.2f}".format(hit["score"]), "\t", hit["input"][1])

2 changes: 1 addition & 1 deletion examples/applications/semantic-search/semantic_search.py
@@ -48,7 +48,7 @@
print("Top 5 most similar sentences in corpus:")

for score, idx in zip(scores, indices):
print(corpus[idx], "(Score: {:.4f})".format(score))
print(corpus[idx], f"(Score: {score:.4f})")

"""
# Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
@@ -90,7 +90,7 @@

if not os.path.exists(annoy_index_path):
# Create Annoy Index
print("Create Annoy index with {} trees. This can take some time.".format(n_trees))
print(f"Create Annoy index with {n_trees} trees. This can take some time.")
annoy_index = AnnoyIndex(embedding_size, "angular")

for i in range(len(corpus_embeddings)):
@@ -108,7 +108,7 @@

######### Search in the index ###########

print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -124,7 +124,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -139,7 +139,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -107,7 +107,7 @@
######### Search in the index ###########


print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -128,7 +128,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -142,7 +142,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -102,7 +102,7 @@

######### Search in the index ###########

print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

while True:
inp_question = input("Please enter a question: ")
@@ -119,7 +119,7 @@
end_time = time.time()

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:top_k_hits]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -133,7 +133,7 @@
print("Approximate Nearest Neighbor returned a different number of results than expected")

recall = len(ann_corpus_ids.intersection(correct_hits_ids)) / len(correct_hits_ids)
print("\nApproximate Nearest Neighbor Recall@{}: {:.2f}".format(top_k_hits, recall * 100))
print(f"\nApproximate Nearest Neighbor Recall@{top_k_hits}: {recall * 100:.2f}")

if recall < 1:
print("Missing results:")
@@ -67,7 +67,7 @@
corpus_embeddings = cache_data["embeddings"][0:max_corpus_size]

###############################
print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))
print(f"Corpus loaded with {len(corpus_sentences)} sentences / embeddings")

# Move embeddings to the target device of the model
corpus_embeddings = corpus_embeddings.to(model.device)
@@ -82,7 +82,7 @@
hits = hits[0] # Get the hits for the first query

print("Input question:", inp_question)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits[0:5]:
print("\t{:.3f}\t{}".format(hit["score"], corpus_sentences[hit["corpus_id"]]))

@@ -73,7 +73,7 @@

# Output of top-k hits
print("Input question:", query)
print("Results (after {:.3f} seconds):".format(end_time - start_time))
print(f"Results (after {end_time - start_time:.3f} seconds):")
for hit in hits:
print("\t{:.3f}\t{}".format(hit["score"], passages[hit["corpus_id"]]))

4 changes: 2 additions & 2 deletions examples/evaluation/evaluation_inference_speed.py
@@ -37,6 +37,6 @@
emb = model.encode(sentences, batch_size=32)
end_time = time.time()
diff_time = end_time - start_time
print("Done after {:.2f} seconds".format(diff_time))
print("Speed: {:.2f} sentences / second".format(len(sentences) / diff_time))
print(f"Done after {diff_time:.2f} seconds")
print(f"Speed: {len(sentences) / diff_time:.2f} sentences / second")
print("=====")
2 changes: 1 addition & 1 deletion examples/training/cross-encoder/training_nli.py
@@ -71,7 +71,7 @@
evaluator = SequentialEvaluator([accuracy_evaluator, f1_evaluator])

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
@@ -45,7 +45,7 @@
# Read the quora dataset split for classification
logger.info("Read train dataset")
train_samples = []
with open(os.path.join(dataset_path, "classification", "train_pairs.tsv"), "r", encoding="utf8") as fIn:
with open(os.path.join(dataset_path, "classification", "train_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
train_samples.append(InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"])))
@@ -54,7 +54,7 @@

logger.info("Read dev dataset")
dev_samples = []
with open(os.path.join(dataset_path, "classification", "dev_pairs.tsv"), "r", encoding="utf8") as fIn:
with open(os.path.join(dataset_path, "classification", "dev_pairs.tsv"), encoding="utf8") as fIn:
reader = csv.DictReader(fIn, delimiter="\t", quoting=csv.QUOTE_NONE)
for row in reader:
dev_samples.append(InputExample(texts=[row["question1"], row["question2"]], label=int(row["is_duplicate"])))
@@ -79,7 +79,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
2 changes: 1 addition & 1 deletion examples/training/cross-encoder/training_stsbenchmark.py
@@ -76,7 +76,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))
logger.info(f"Warmup-steps: {warmup_steps}")


# Train the model
16 changes: 8 additions & 8 deletions examples/training/data_augmentation/train_sts_indomain_bm25.py
@@ -87,7 +87,7 @@
#
#####################################################################

logging.info("Step 1: Train cross-encoder: ({}) with STSbenchmark".format(model_name))
logging.info(f"Step 1: Train cross-encoder: ({model_name}) with STSbenchmark")

# Load the STSB dataset: https://huggingface.co/datasets/sentence-transformers/stsb
train_dataset = load_dataset("sentence-transformers/stsb", split="train")
@@ -113,7 +113,7 @@

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info(f"Warmup-steps: {warmup_steps}")

# Train the cross-encoder model
cross_encoder.fit(
@@ -134,7 +134,7 @@
#### Larger the k, bigger the silver dataset ####

index_name = "stsb" # index-name should be in lowercase
logging.info("Step 2.1: Generate STSbenchmark (silver dataset) using top-{} bm25 combinations".format(top_k))
logging.info(f"Step 2.1: Generate STSbenchmark (silver dataset) using top-{top_k} bm25 combinations")

unique_sentences = set()

@@ -148,15 +148,15 @@
) # not to include gold pairs of sentences again

# Ignore 400 cause by IndexAlreadyExistsException when creating an index
logging.info("Creating elastic-search index - {}".format(index_name))
logging.info(f"Creating elastic-search index - {index_name}")
es.indices.create(index=index_name, ignore=[400])

# indexing all sentences
logging.info("Starting to index....")
for sent in unique_sentences:
response = es.index(index=index_name, id=sent2idx[sent], body={"sent": sent})

logging.info("Indexing complete for {} unique sentences".format(len(unique_sentences)))
logging.info(f"Indexing complete for {len(unique_sentences)} unique sentences")

silver_data = []
progress = tqdm.tqdm(unit="docs", total=len(sent2idx))
@@ -173,8 +173,8 @@
progress.reset()
progress.close()

logging.info("Number of silver pairs generated for STSbenchmark: {}".format(len(silver_data)))
logging.info("Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {}".format(model_name))
logging.info(f"Number of silver pairs generated for STSbenchmark: {len(silver_data)}")
logging.info(f"Step 2.2: Label STSbenchmark (silver dataset) with cross-encoder: {model_name}")

cross_encoder = CrossEncoder(cross_encoder_path)
silver_scores = cross_encoder.predict(silver_data)
@@ -188,7 +188,7 @@
#
#################################################################################################

logging.info("Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".format(model_name))
logging.info(f"Step 3: Train bi-encoder: {model_name} with STSbenchmark (gold + silver dataset)")

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
@@ -111,7 +111,7 @@
progress.reset()
progress.close()
logging.info("Textual augmentation completed....")
logging.info("Number of silver pairs generated: {}".format(len(silver_samples)))
logging.info(f"Number of silver pairs generated: {len(silver_samples)}")

###################################################################
#
(Remaining changed files not shown.)
