Merge pull request #83 from wandb/weave-eval-jp
Weave eval jp
morganmcg1 authored Oct 29, 2024
2 parents 8ec557e + 226be68 commit 757cfba
Showing 6 changed files with 292 additions and 41 deletions.
27 changes: 23 additions & 4 deletions README.md
@@ -19,12 +19,19 @@ This release introduces a number of exciting updates and improvements:
These updates are part of our ongoing commitment to improve performance and usability.

## Evaluation
English
| wandbot version | Comment | response accuracy |
|---|---|---|
| 1.0.0 | our baseline wandbot | 53.8 % |
| 1.1.0 | improvement over baseline; in production for the longest | 72.5 % |
| 1.2.0 | our new enhanced wandbot | 81.6 % |


Japanese
| wandbot version | Comment | response accuracy |
|---|---|---|
| 1.2.0 | our new enhanced wandbot | 56.3 % |
| 1.2.1 | adds a translation step for Japanese queries | 71.9 % |

## Features

@@ -118,8 +125,20 @@ Launch the wandbot with 8 workers. This speeds up evaluation
```
WANDBOT_EVALUATION=1 gunicorn wandbot.api.app:app --bind 0.0.0.0:8000 --timeout=200 --workers=8 --worker-class uvicorn.workers.UvicornWorker
```

Set up the evaluation configuration in `wandbot/src/wandbot/evaluation/config.py` (see the example after this list):

- `evaluation_strategy_name` : display name shown in the Weave Evaluation dashboard
- `eval_dataset` : Weave dataset reference to evaluate against
  - [Latest English evaluation dataset](https://wandb.ai/wandbot/wandbot-eval/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval%2Fobjects%2Fwandbot_eval_data%2Fversions%2FeCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU%3F%26): "weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU"
  - [Latest Japanese evaluation dataset](https://wandb.ai/wandbot/wandbot-eval-jp/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval-jp%2Fobjects%2Fwandbot_eval_data_jp%2Fversions%2FoCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA%3F%26): "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA"
- `eval_judge_model` : model used as the evaluation judge
- `wandb_entity` : W&B entity the evaluation run is logged to
- `wandb_project` : W&B project the evaluation run is logged to
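
Since `EvalConfig` is a Pydantic `BaseSettings` class (see the `config.py` diff further down), these values can be supplied through a `.env` file, environment variables, or keyword overrides. Below is a minimal sketch of configuring an English-language run in code, reusing the defaults and the English dataset reference above; the strategy name is a hypothetical label.

```
from wandbot.evaluation.config import EvalConfig

# Minimal sketch: override the Japanese defaults for an English evaluation run.
config = EvalConfig(
    evaluation_strategy_name="en v1.2.0",  # hypothetical label shown in the Weave Evaluation dashboard
    eval_dataset="weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU",
    language="en",
    eval_judge_model="gpt-4-1106-preview",
    wandb_entity="wandbot",
    wandb_project="wandbot-eval",
)
print(config.eval_dataset)
```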

Launch W&B Weave evaluation
```
python src/wandbot/evaluation/weave_eval/main.py
```
91 changes: 90 additions & 1 deletion src/wandbot/chat/chat.py
@@ -36,6 +36,8 @@
from wandbot.retriever import VectorStore
from wandbot.utils import Timer, get_logger

from openai import OpenAI

logger = get_logger(__name__)


@@ -84,6 +86,79 @@ def _get_answer(
result = self.rag_pipeline(question, history)

return result

@weave.op()
def _translate_ja_to_en(self, text: str) -> str:
"""
        Translates Japanese text to English using OpenAI's GPT-4o.
Args:
text: The Japanese text to be translated.
Returns:
The translated text in English.
"""
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages=[
{
"role": "system",
"content": f"You are a professional translator. \n\n\
Translate the user's question about Weights & Biases into English according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Keep code unchanged.\n\
- Only return the English translation without any additional explanation"
},
{
"role": "user",
"content": text
}
],
temperature=0,
max_tokens=1000,
top_p=1
)
return response.choices[0].message.content

@weave.op()
def _translate_en_to_ja(self, text: str) -> str:
"""
        Translates English text to Japanese using OpenAI's GPT-4o.
Args:
text: The English text to be translated.
Returns:
The translated text in Japanese.
"""
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages=[
{
"role": "system",
"content": f"You are a professional translator. \n\n\
Translate the user's text into Japanese according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Use 'run' in English where appropriate, as it's a term used in Wandb.\n\
- Translate the terms 'reference artifacts' and 'lineage' into Katakana. \n\
- Include specific terms in English or Katakana where appropriate\n\
- Keep code unchanged.\n\
- Only return the Japanese translation without any additional explanation"
},
{
"role": "user",
"content": text
}
],
temperature=0,
max_tokens=1000,
top_p=1
)
return response.choices[0].message.content

@weave.op()
def __call__(self, chat_request: ChatRequest) -> ChatResponse:
@@ -95,13 +170,27 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
Returns:
An instance of `ChatResponse` representing the chat response.
"""
original_language = chat_request.language
try:
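            # For Japanese requests, translate the question to English so the
            # existing English RAG pipeline can answer it; the answer is
            # translated back to Japanese after _get_answer returns below.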
if original_language == "ja":
translated_question = self._translate_ja_to_en(chat_request.question)
chat_request.language = "en"
chat_request = ChatRequest(
question=translated_question,
chat_history=chat_request.chat_history,
application=chat_request.application,
language="en"
)

result = self._get_answer(
chat_request.question, chat_request.chat_history or []
)

result_dict = result.model_dump()

if original_language == "ja":
result_dict["answer"] = self._translate_en_to_ja(result_dict["answer"])

usage_stats = {
"total_tokens": result.total_tokens,
"prompt_tokens": result.prompt_tokens,
@@ -135,4 +224,4 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
}
)

        return ChatResponse(**result)
29 changes: 6 additions & 23 deletions src/wandbot/evaluation/config.py
@@ -3,34 +3,17 @@


class EvalConfig(BaseSettings):
    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="allow"
    )
    evaluation_strategy_name: str = Field(
        "jp v1.2.0-beta",
        description="Shown on the Weave evaluation page; used for visibility only",
    )
    eval_dataset: str = Field(
        "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA",
        description="Dataset reference for evaluation",
    )
    language: str = Field("ja", description="Language for the application (en or ja)")

    eval_judge_model: str = Field(
        "gpt-4-1106-preview",
        env="EVAL_JUDGE_MODEL",
        validation_alias="eval_judge_model",
    )
    wandb_entity: str = Field("wandbot", env="WANDB_ENTITY")
    wandb_project: str = Field("wandbot-eval", env="WANDB_PROJECT")
103 changes: 103 additions & 0 deletions src/wandbot/evaluation/jp_evaluation_dataprep.py
@@ -0,0 +1,103 @@
import weave
import json
import requests
import os
from typing import List, Dict, Any
from tqdm import tqdm

dataset_ref = weave.ref("weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU").get()
question_rows = dataset_ref.rows
question_rows = [
{
"question": row["question"],
"ground_truth": row["answer"],
"notes": row["notes"],
"context": row["context"],
"correctness": row["correctness"],
"is_wandb_query": row["is_wandb_query"]
} for row in question_rows
]

def translate_with_openai(text: str) -> str:
# Get the OpenAI API key from environment variables
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY environment variable is not set")

# Set headers for the OpenAI API request
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}

    # Data payload for the GPT-4o chat completions request.

    # Note: it may be worth adding an instruction telling the model not to
    # answer the question itself, only translate it.

    data = {
        "model": "gpt-4o-2024-08-06",
"max_tokens": 4000,
"messages": [
{
"role": "system",
"content": "You are a professional translator. \n\n\
Translate the user's text into Japanese according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Use 'run' in English where appropriate, as it's a term used in Wandb.\n\
- Translate the terms 'reference artifacts' and 'lineage' into Katakana. \n\
- Include specific terms in English or Katakana where appropriate\n\
- Keep code unchanged.\n\
- Keep URL starting from 'Source:\thttps:', but translate texts after 'Source:\thttps:'\n\
- Only return the Japanese translation without any additional explanation"
},
{
"role": "user",
"content": text
}
]
}

# Make the API request to OpenAI
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=headers,
json=data
)

# Check if the request was successful
if response.status_code == 200:
# Return the translated text
return response.json()["choices"][0]["message"]["content"].strip()
else:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

def translate_data(data: List[Dict[str, Any]], output_file: str) -> None:
total_items = len(data)

# Check if the file exists and get the last processed index
if os.path.exists(output_file):
with open(output_file, "r", encoding="utf-8") as file:
processed_data = json.load(file)
start_index = len(processed_data)
else:
processed_data = []
start_index = 0

for i in tqdm(range(start_index, total_items), initial=start_index, total=total_items):
item = data[i]
translated_item = item.copy()
for key in ["question", "ground_truth", "notes", "context"]:
if key in item:
                translated_item[key] = translate_with_openai(item[key])

processed_data.append(translated_item)

# Save progress after each item
with open(output_file, "w", encoding="utf-8") as file:
json.dump(processed_data, file, ensure_ascii=False, indent=2)

print(f"Translation completed. Results saved in '{output_file}'")

output_file = "translated_data.json"
translate_data(question_rows, output_file)
50 changes: 50 additions & 0 deletions src/wandbot/evaluation/jp_evaluation_dataupload.py
@@ -0,0 +1,50 @@
import json
import weave
from weave import Dataset

def rename_key(item):
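    # The published eval dataset uses 'answer' for the reference answer
    # (matching the English dataset's schema), so rename 'ground_truth' in place.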
if 'ground_truth' in item:
item['answer'] = item.pop('ground_truth')
return item

def create_test_file(json_file_path, test_file_path, num_lines=5):
with open(json_file_path, 'r') as file:
data = json.load(file)

test_data = data[:num_lines]

with open(test_file_path, 'w') as file:
json.dump(test_data, file, indent=2, ensure_ascii=False)

print(f"Test file with {num_lines} lines has been created at {test_file_path}")

def publish_json_to_weave(json_file_path, dataset_name, project_name):
# Initialize Weave
weave.init(project_name)

# Read JSON file
with open(json_file_path, 'r') as file:
data = json.load(file)

# Rename 'ground_truth' to 'answer' in each item
processed_data = [rename_key(item) for item in data]

# Create a dataset
dataset = Dataset(name=dataset_name, rows=processed_data)

# Publish the dataset
weave.publish(dataset)

print(f"Dataset '{dataset_name}' has been published to project '{project_name}'.")

# Usage example
json_file_path = 'translated_data.json'
test_file_path = 'test_translated_data.json'
dataset_name = 'wandbot_eval_data_jp'
project_name = 'wandbot/wandbot-eval-jp'

# Create test file
#create_test_file(json_file_path, test_file_path)

# Publish full dataset to Weave
publish_json_to_weave(json_file_path, dataset_name, project_name)
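
Once published, the dataset can be fetched back by reference, the same way `jp_evaluation_dataprep.py` pulls the English dataset. A minimal sketch, assuming the reference below (the one used in `EvalConfig` and the README) points at the published version:

```
import weave

weave.init("wandbot/wandbot-eval-jp")

# Fetch the published Japanese evaluation dataset by its Weave reference.
dataset = weave.ref(
    "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA"
).get()

rows = list(dataset.rows)
print(f"{len(rows)} Japanese eval examples")
print(rows[0]["question"])
```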