diff --git a/README.md b/README.md
index 21dc08a..aa17e0c 100644
--- a/README.md
+++ b/README.md
@@ -19,12 +19,19 @@ This release introduces a number of exciting updates and improvements:
 These updates are part of our ongoing commitment to improve performance and usability.
 
 ## Evaluation
+**English**
+
+| wandbot version | Comment | response accuracy |
+|---|---|---|
+| 1.0.0 | our baseline wandbot | 53.8 % |
+| 1.1.0 | improvement over baseline; in production for the longest time | 72.5 % |
+| 1.2.0 | our new enhanced wandbot | 81.6 % |
+
+**Japanese**
+
 | wandbot version | Comment | response accuracy |
 |---|---|---|
-| 1.0.0 | our baseline wandbot | 53.78 % |
-| 1.1.0 | improvement over baseline; in production for the longest | 72.45 % |
-| 1.2.0 | our new enhanced wandbot | 81.63 % |
+| 1.2.0 | our new enhanced wandbot | 56.3 % |
+| 1.2.1 | added a translation step | 71.9 % |
 
 ## Features
@@ -118,8 +125,20 @@ Launch the wandbot with 8 workers. This speeds up evaluation
 WANDBOT_EVALUATION=1 gunicorn wandbot.api.app:app --bind 0.0.0.0:8000 --timeout=200 --workers=8 --worker-class uvicorn.workers.UvicornWorker
 ```
-Launch W&B Weave evaluation
+
+Set up the evaluation
+
+Configure the following fields in `wandbot/src/wandbot/evaluation/config.py`:
+- `evaluation_strategy_name`: display name shown in the Weave Evaluation dashboard (for visibility only)
+- `eval_dataset`: Weave reference of the dataset to evaluate against
+  - [Latest English evaluation dataset](https://wandb.ai/wandbot/wandbot-eval/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval%2Fobjects%2Fwandbot_eval_data%2Fversions%2FeCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU%3F%26): "weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU"
+  - [Latest Japanese evaluation dataset](https://wandb.ai/wandbot/wandbot-eval-jp/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval-jp%2Fobjects%2Fwandbot_eval_data_jp%2Fversions%2FoCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA%3F%26): "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA"
+- `language`: application language (`en` or `ja`)
+- `eval_judge_model`: model used as the evaluation judge
+- `wandb_entity`: W&B entity the evaluation is logged to
+- `wandb_project`: W&B project the evaluation is logged to
+
+Launch the W&B Weave evaluation
 ```
 python src/wandbot/evaluation/weave_eval/main.py
 ```
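If you drive the evaluation from your own script instead of editing the defaults in `config.py`, constructing the config might look like the sketch below. The import path is assumed from this repo's layout, and the dataset URI is the English one listed above; adjust both to your setup.

```python
# A minimal sketch of pointing the evaluation at the English dataset
# (assumes EvalConfig from src/wandbot/evaluation/config.py in this diff).
from wandbot.evaluation.config import EvalConfig

config = EvalConfig(
    evaluation_strategy_name="en v1.2.0",
    eval_dataset="weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU",
    language="en",
)
print(config.eval_judge_model)  # falls back to the default unless overridden
```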
diff --git a/src/wandbot/chat/chat.py b/src/wandbot/chat/chat.py
index 311c49d..5876415 100644
--- a/src/wandbot/chat/chat.py
+++ b/src/wandbot/chat/chat.py
@@ -36,6 +36,8 @@
 from wandbot.retriever import VectorStore
 from wandbot.utils import Timer, get_logger
 
+from openai import OpenAI
+
 logger = get_logger(__name__)
 
 
@@ -84,6 +86,79 @@ def _get_answer(
         result = self.rag_pipeline(question, history)
         return result
 
+    @weave.op()
+    def _translate_ja_to_en(self, text: str) -> str:
+        """
+        Translates Japanese text to English using OpenAI's GPT-4o.
+
+        Args:
+            text: The Japanese text to be translated.
+
+        Returns:
+            The translated text in English.
+        """
+        client = OpenAI()
+        response = client.chat.completions.create(
+            model="gpt-4o-2024-08-06",
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are a professional translator.\n\n"
+                        "Translate the user's question about Weights & Biases into English according to the following rules:\n"
+                        "- Maintain the original nuance\n"
+                        "- Keep code unchanged\n"
+                        "- Only return the English translation without any additional explanation"
+                    )
+                },
+                {
+                    "role": "user",
+                    "content": text
+                }
+            ],
+            temperature=0,
+            max_tokens=1000,
+            top_p=1
+        )
+        return response.choices[0].message.content
+
+    @weave.op()
+    def _translate_en_to_ja(self, text: str) -> str:
+        """
+        Translates English text to Japanese using OpenAI's GPT-4o.
+
+        Args:
+            text: The English text to be translated.
+
+        Returns:
+            The translated text in Japanese.
+        """
+        client = OpenAI()
+        response = client.chat.completions.create(
+            model="gpt-4o-2024-08-06",
+            messages=[
+                {
+                    "role": "system",
+                    "content": (
+                        "You are a professional translator.\n\n"
+                        "Translate the user's text into Japanese according to the following rules:\n"
+                        "- Maintain the original nuance\n"
+                        "- Use 'run' in English where appropriate, as it is a W&B term\n"
+                        "- Translate the terms 'reference artifacts' and 'lineage' into Katakana\n"
+                        "- Include specific terms in English or Katakana where appropriate\n"
+                        "- Keep code unchanged\n"
+                        "- Only return the Japanese translation without any additional explanation"
+                    )
+                },
+                {
+                    "role": "user",
+                    "content": text
+                }
+            ],
+            temperature=0,
+            max_tokens=1000,
+            top_p=1
+        )
+        return response.choices[0].message.content
 
     @weave.op()
     def __call__(self, chat_request: ChatRequest) -> ChatResponse:
@@ -95,13 +170,27 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
 
         Returns:
             An instance of `ChatResponse` representing the chat response.
         """
+        original_language = chat_request.language
         try:
+            if original_language == "ja":
+                translated_question = self._translate_ja_to_en(chat_request.question)
+                chat_request = ChatRequest(
+                    question=translated_question,
+                    chat_history=chat_request.chat_history,
+                    application=chat_request.application,
+                    language="en",
+                )
+
             result = self._get_answer(
                 chat_request.question, chat_request.chat_history or []
             )
 
             result_dict = result.model_dump()
+            if original_language == "ja":
+                result_dict["answer"] = self._translate_en_to_ja(
+                    result_dict["answer"]
+                )
+
             usage_stats = {
                 "total_tokens": result.total_tokens,
                 "prompt_tokens": result.prompt_tokens,
@@ -135,4 +224,4 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
             }
         )
 
-        return ChatResponse(**result)
+        return ChatResponse(**result)
\ No newline at end of file
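The change above reduces to a simple round-trip: translate the Japanese question to English, answer with the existing English RAG pipeline, then translate only the final answer back. Here is a minimal control-flow sketch with stand-ins for the real `_translate_*` methods and pipeline (the stand-in functions are hypothetical, not part of the diff):

```python
def translate_ja_to_en(text):  # stand-in for Chat._translate_ja_to_en
    return text

def translate_en_to_ja(text):  # stand-in for Chat._translate_en_to_ja
    return text

def rag_pipeline(question, history):  # stand-in for the English RAG pipeline
    return {"answer": f"echo: {question}"}

def answer(question, language, history=None):
    original_language = language
    if original_language == "ja":
        # The pipeline only ever sees English.
        question = translate_ja_to_en(question)
    result = rag_pipeline(question, history or [])
    if original_language == "ja":
        # Only the answer is translated back; retrieved sources stay in English.
        result["answer"] = translate_en_to_ja(result["answer"])
    return result

print(answer("W&Bのrunとは何ですか？", language="ja"))
```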
diff --git a/src/wandbot/evaluation/config.py b/src/wandbot/evaluation/config.py
index fafb0ab..680a872 100644
--- a/src/wandbot/evaluation/config.py
+++ b/src/wandbot/evaluation/config.py
@@ -3,34 +3,17 @@
 
 
 class EvalConfig(BaseSettings):
-    model_config = SettingsConfigDict(
-        env_file=".env", env_file_encoding="utf-8", extra="allow"
-    )
-    eval_artifact: str = Field(
-        "wandbot/wandbot-eval/autoeval_dataset:v3",
-        env="EVAL_ARTIFACT",
-        validation_alias="eval_artifact",
-    )
-    eval_artifact_root: str = Field(
-        "data/eval",
-        env="EVAL_ARTIFACT_ROOT",
-        validation_alias="eval_artifact_root",
+    evaluation_strategy_name: str = Field(
+        "jp v1.2.0-beta",
+        description="Shown on the Weave Evaluation dashboard; used for visibility only",
+    )
+    eval_dataset: str = Field(
+        "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA",
+        description="Weave reference of the dataset to evaluate against"
     )
+    language: str = Field("ja", description="Application language (en or ja)")
 
-    eval_annotations_file: str = Field(
-        "wandbot_cleaned_annotated_dataset_11-12-2023.jsonl",
-        env="EVAL_ANNOTATIONS_FILE",
-        validation_alias="eval_annotations_file",
-    )
-    eval_output_file: str = Field(
-        "eval.jsonl",
-        env="EVAL_OUTPUT_FILE",
-        validation_alias="eval_output_file",
-    )
     eval_judge_model: str = Field(
         "gpt-4-1106-preview",
         env="EVAL_JUDGE_MODEL",
         validation_alias="eval_judge_model",
     )
     wandb_entity: str = Field("wandbot", env="WANDB_ENTITY")
-    wandb_project: str = Field("wandbot-eval", env="WANDB_PROJECT")
+    wandb_project: str = Field("wandbot-eval", env="WANDB_PROJECT")
\ No newline at end of file
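Since `EvalConfig` still subclasses pydantic's `BaseSettings`, the fields that keep an env name (`eval_judge_model`, `wandb_entity`, `wandb_project`) can also be overridden from the environment rather than by editing the file. A sketch, assuming pydantic-settings' default case-insensitive env matching:

```python
import os

# Set before the config is constructed so BaseSettings picks the values up.
os.environ["EVAL_JUDGE_MODEL"] = "gpt-4o-2024-08-06"
os.environ["WANDB_PROJECT"] = "wandbot-eval-jp"

from wandbot.evaluation.config import EvalConfig

config = EvalConfig()
assert config.eval_judge_model == "gpt-4o-2024-08-06"
assert config.wandb_project == "wandbot-eval-jp"
```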
diff --git a/src/wandbot/evaluation/jp_evaluation_dataprep.py b/src/wandbot/evaluation/jp_evaluation_dataprep.py
new file mode 100644
index 0000000..89010c3
--- /dev/null
+++ b/src/wandbot/evaluation/jp_evaluation_dataprep.py
@@ -0,0 +1,103 @@
+import weave
+import json
+import requests
+import os
+from typing import List, Dict, Any
+from tqdm import tqdm
+
+dataset_ref = weave.ref("weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU").get()
+question_rows = dataset_ref.rows
+question_rows = [
+    {
+        "question": row["question"],
+        "ground_truth": row["answer"],
+        "notes": row["notes"],
+        "context": row["context"],
+        "correctness": row["correctness"],
+        "is_wandb_query": row["is_wandb_query"]
+    } for row in question_rows
+]
+
+def translate_with_openai(text: str) -> str:
+    # Get the OpenAI API key from environment variables
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY environment variable is not set")
+
+    # Set headers for the OpenAI API request
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+
+    # Request payload for the GPT-4o chat completions API.
+    # It might be worth adding an instruction telling the model not to answer the question.
+    data = {
+        "model": "gpt-4o-2024-08-06",
+        "max_tokens": 4000,
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are a professional translator.\n\n"
+                    "Translate the user's text into Japanese according to the following rules:\n"
+                    "- Maintain the original nuance\n"
+                    "- Use 'run' in English where appropriate, as it is a W&B term\n"
+                    "- Translate the terms 'reference artifacts' and 'lineage' into Katakana\n"
+                    "- Include specific terms in English or Katakana where appropriate\n"
+                    "- Keep code unchanged\n"
+                    "- Keep URLs that start with 'Source:\thttps:' unchanged, but translate the text that follows them\n"
+                    "- Only return the Japanese translation without any additional explanation"
+                )
+            },
+            {
+                "role": "user",
+                "content": text
+            }
+        ]
+    }
+
+    # Make the API request to OpenAI
+    response = requests.post(
+        "https://api.openai.com/v1/chat/completions",
+        headers=headers,
+        json=data
+    )
+
+    # Check if the request was successful
+    if response.status_code == 200:
+        # Return the translated text
+        return response.json()["choices"][0]["message"]["content"].strip()
+    else:
+        raise Exception(f"API request failed with status code {response.status_code}: {response.text}")
+
+def translate_data(data: List[Dict[str, Any]], output_file: str) -> None:
+    total_items = len(data)
+
+    # Check if the file exists and resume from the last processed index
+    if os.path.exists(output_file):
+        with open(output_file, "r", encoding="utf-8") as file:
+            processed_data = json.load(file)
+        start_index = len(processed_data)
+    else:
+        processed_data = []
+        start_index = 0
+
+    for i in tqdm(range(start_index, total_items), initial=start_index, total=total_items):
+        item = data[i]
+        translated_item = item.copy()
+        for key in ["question", "ground_truth", "notes", "context"]:
+            if key in item:
+                translated_item[key] = translate_with_openai(item[key])
+
+        processed_data.append(translated_item)
+
+        # Save progress after each item
+        with open(output_file, "w", encoding="utf-8") as file:
+            json.dump(processed_data, file, ensure_ascii=False, indent=2)
+
+    print(f"Translation completed. Results saved in '{output_file}'")
+
+output_file = "translated_data.json"
+translate_data(question_rows, output_file)
\ No newline at end of file
diff --git a/src/wandbot/evaluation/jp_evaluation_dataupload.py b/src/wandbot/evaluation/jp_evaluation_dataupload.py
new file mode 100644
index 0000000..195e01a
--- /dev/null
+++ b/src/wandbot/evaluation/jp_evaluation_dataupload.py
@@ -0,0 +1,50 @@
+import json
+import weave
+from weave import Dataset
+
+def rename_key(item):
+    if 'ground_truth' in item:
+        item['answer'] = item.pop('ground_truth')
+    return item
+
+def create_test_file(json_file_path, test_file_path, num_lines=5):
+    with open(json_file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    test_data = data[:num_lines]
+
+    with open(test_file_path, 'w', encoding='utf-8') as file:
+        json.dump(test_data, file, indent=2, ensure_ascii=False)
+
+    print(f"Test file with {num_lines} items has been created at {test_file_path}")
+
+def publish_json_to_weave(json_file_path, dataset_name, project_name):
+    # Initialize Weave
+    weave.init(project_name)
+
+    # Read the JSON file
+    with open(json_file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+
+    # Rename 'ground_truth' to 'answer' in each item
+    processed_data = [rename_key(item) for item in data]
+
+    # Create a dataset
+    dataset = Dataset(name=dataset_name, rows=processed_data)
+
+    # Publish the dataset
+    weave.publish(dataset)
+
+    print(f"Dataset '{dataset_name}' has been published to project '{project_name}'.")
+
+# Usage example
+json_file_path = 'translated_data.json'
+test_file_path = 'test_translated_data.json'
+dataset_name = 'wandbot_eval_data_jp'
+project_name = 'wandbot/wandbot-eval-jp'
+
+# Create a small test file first if needed
+#create_test_file(json_file_path, test_file_path)
+
+# Publish the full dataset to Weave
+publish_json_to_weave(json_file_path, dataset_name, project_name)
\ No newline at end of file
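After publishing, a quick way to verify the upload is to read the dataset back from Weave and check that the `ground_truth` → `answer` rename survived. A sketch; it assumes the object name resolves to the latest version within the initialized project — otherwise paste the full `weave:///` URI printed by `weave.publish`:

```python
import weave

weave.init("wandbot/wandbot-eval-jp")
ds = weave.ref("wandbot_eval_data_jp").get()  # assumed to resolve to the latest version
rows = list(ds.rows)
print(f"published {len(rows)} rows")
assert rows and "answer" in rows[0]
```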
diff --git a/src/wandbot/evaluation/weave_eval/main.py b/src/wandbot/evaluation/weave_eval/main.py
index c4da450..1dc33e0 100644
--- a/src/wandbot/evaluation/weave_eval/main.py
+++ b/src/wandbot/evaluation/weave_eval/main.py
@@ -1,11 +1,8 @@
-import os
-os.environ["WANDB_ENTITY"] = "wandbot"
-
 import json
 import httpx
 import weave
 import asyncio
-import requests
+import re
 from weave import Evaluation
 from weave import Model
 from llama_index.llms.openai import OpenAI
@@ -38,13 +35,26 @@ async def get_answer(question: str, application: str = "api-eval") -> str:
     payload = {
         "question": question,
         "application": application,
-        "language": "en",
+        "language": config.language,
     }
-    async with httpx.AsyncClient(timeout=200.0) as client:
+    async with httpx.AsyncClient(timeout=900.0) as client:
         response = await client.post(url, json=payload)
         response_json = response.json()
     return json.dumps(response_json)
 
+def parse_text_to_json(text):
+    # Split the text into documents, keeping each document's URL scheme
+    chunks = re.split(r'source: (https?)://', text)[1:]
+    result = []
+    for scheme, doc in zip(chunks[0::2], chunks[1::2]):
+        lines = doc.split('\n')
+        result.append({
+            'source': f"{scheme}://{lines[0].strip()}",
+            'content': '\n'.join(lines[1:]).strip()
+        })
+    return result
 
 @weave.op()
 async def get_eval_record(
@@ -55,7 +65,7 @@
     return {
         "system_prompt": response["system_prompt"],
         "generated_answer": response["answer"],
-        "retrieved_contexts": response["source_documents"],
+        "retrieved_contexts_individual": parse_text_to_json(response["source_documents"]),
         "model": response["model"],
        "total_tokens": response["total_tokens"],
         "prompt_tokens": response["prompt_tokens"],
@@ -92,10 +102,7 @@ async def get_answer_correctness(
         "answer_correctness": result.dict()["passing"]
     }
 
-
-dataset_ref = weave.ref(
-    "weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU"
-).get()
+dataset_ref = weave.ref(config.eval_dataset).get()
 question_rows = dataset_ref.rows
 question_rows = [
     {
@@ -109,6 +116,6 @@ async def get_answer_correctness(
 evaluation = Evaluation(
     dataset=question_rows, scorers=[get_answer_correctness]
 )
-
 if __name__ == "__main__":
-    asyncio.run(evaluation.evaluate(EvaluatorModel()))
+    with weave.attributes({'evaluation_strategy_name': config.evaluation_strategy_name}):
+        asyncio.run(evaluation.evaluate(EvaluatorModel()))
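For reference, here is what `parse_text_to_json` produces on a small synthetic input (the sample text is made up to match the `source: <url>` format the function expects; run it wherever the function is in scope):

```python
sample = (
    "source: https://docs.wandb.ai/guides/artifacts\n"
    "Artifacts track datasets and models.\n"
    "source: http://docs.wandb.ai/guides/runs\n"
    "A run is a single unit of computation."
)
print(parse_text_to_json(sample))
# [{'source': 'https://docs.wandb.ai/guides/artifacts',
#   'content': 'Artifacts track datasets and models.'},
#  {'source': 'http://docs.wandb.ai/guides/runs',
#   'content': 'A run is a single unit of computation.'}]
```

Splitting on a captured `(https?)` keeps each document's original scheme, so `http://` sources are no longer silently rewritten to `https://`.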