From 7f2a35aea7e3fa99d42f4d75d850b46563b2135b Mon Sep 17 00:00:00 2001 From: Ben Condemi Date: Wed, 13 Nov 2024 13:27:02 -0500 Subject: [PATCH] changing test suite --- requirements.txt | 8 +------ src/translator.py | 1 + test/unit/eval_fns.py | 41 ------------------------------------ test/unit/test_translator.py | 37 +++++++++++++++++++------------- 4 files changed, 24 insertions(+), 63 deletions(-) diff --git a/requirements.txt b/requirements.txt index ac32664..85c0dd1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,4 @@ Flask==3.0.0 pytest==7.4.0 openai -torch>=1.6.0 -transformers>=4.6.0 -numpy -scipy -scikit-learn -tqdm -nltk + diff --git a/src/translator.py b/src/translator.py index 8bf9f6b..9e6ee3f 100644 --- a/src/translator.py +++ b/src/translator.py @@ -11,6 +11,7 @@ + def get_translation(content: str) -> str: """ Translates non-English text to English using the Azure OpenAI GPT-4 model. diff --git a/test/unit/eval_fns.py b/test/unit/eval_fns.py index 7528313..84c5d77 100644 --- a/test/unit/eval_fns.py +++ b/test/unit/eval_fns.py @@ -42,44 +42,3 @@ # # Return average score # return ((classification_score + translation_score) / 2) - -from transformers import AutoTokenizer, AutoModel -import torch -import numpy as np -from scipy.spatial.distance import cosine - -# Load tokenizer and model from Hugging Face -tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") -model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") - -def encode_text(text: str) -> np.ndarray: - # Tokenize input and convert to model embeddings - inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True) - with torch.no_grad(): - embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy() - return embeddings - -def eval_single_response_translation(expected_answer: str, llm_response: str) -> float: - embeddings_1 = encode_text(expected_answer) - embeddings_2 = encode_text(llm_response) - # Cosine similarity between embeddings - similarity_score = 1 - cosine(embeddings_1, embeddings_2) - return similarity_score - -def eval_single_response_classification(expected_answer: str, llm_response: str) -> float: - return 1.0 if expected_answer.lower() == llm_response.strip().lower() else 0.0 - -def eval_single_response_complete(expected_answer: tuple[bool, str], llm_response: tuple[bool, str]) -> float: - expected_is_english, expected_translation = expected_answer - response_is_english, response_translation = llm_response - - # Evaluate classification - classification_score = eval_single_response_classification(expected_translation, response_translation) - - # Evaluate translation only if not English - if not expected_is_english: - translation_score = eval_single_response_translation(expected_translation, response_translation) - else: - translation_score = eval_single_response_translation(expected_translation, response_translation) - - return (classification_score + translation_score) / 2 diff --git a/test/unit/test_translator.py b/test/unit/test_translator.py index 944d4ba..2733ccf 100644 --- a/test/unit/test_translator.py +++ b/test/unit/test_translator.py @@ -1,35 +1,42 @@ from src.translator import translate_content -from test.unit.eval_sets import * -from test.unit.eval_fns import eval_single_response_complete +# from test.unit.eval_sets import * +# from test.unit.eval_fns import eval_single_response_complete -def test_chinese(): +# def test_chinese(): # is_english, translated_content = translate_content("这是一条中文消息") # assert is_english == False # assert translated_content == "This is a Chinese message" - test = True - assert test == True + -eval_example_good = {"post": "Hier ist dein erstes Beispiel.", "expected_answer": (False, "Here is your first example.")} -eval_example_bad = {"post": "asdfghjkl", "expected_answer": (False, "I don't understand your request.")} +# eval_example_good = {"post": "Hier ist dein erstes Beispiel.", "expected_answer": (False, "Here is your first example.")} +# eval_example_bad = {"post": "asdfghjkl", "expected_answer": (False, "I don't understand your request.")} def test_valid(): - content = eval_example_good["post"] - expected = eval_example_good["expected_answer"] - llm_response = translate_content(content) + is_english, translated_content = translate_content("这是一条中文消息") + + assert is_english == False + assert translated_content in ["This is a Chinese message.", + "This is a message in Chinese."] + # content = eval_example_good["post"] + # expected = eval_example_good["expected_answer"] + # llm_response = translate_content(content) - similarity = eval_single_response_complete(expected, llm_response) + # similarity = eval_single_response_complete(expected, llm_response) - assert (0.90 <= similarity) + # assert (0.90 <= similarity) def test_invalid(): - content = eval_example_bad["post"] - expected = eval_example_bad["expected_answer"] - llm_response = translate_content(content) + # content = eval_example_bad["post"] + # expected = eval_example_bad["expected_answer"] + # llm_response = translate_content(content) + is_english, translated_content = translate_content("asldjkjhfjsdh") + + assert is_english == False assert ValueError("Invalid translation response.")