Merge pull request #83 from wandb/weave-eval-jp
Weave eval jp
morganmcg1 authored Oct 29, 2024
2 parents 8ec557e + 226be68 commit 757cfba
Showing 6 changed files with 292 additions and 41 deletions.
27 changes: 23 additions & 4 deletions README.md
@@ -19,12 +19,19 @@ This release introduces a number of exciting updates and improvements:
These updates are part of our ongoing commitment to improve performance and usability.

## Evaluation
English
| wandbot version | Comment | response accuracy |
|---|---|---|
| 1.0.0 | our baseline wandbot | 53.8 % |
| 1.1.0 | improvement over baseline; in production for the longest | 72.5 % |
| 1.2.0 | our new enhanced wandbot | 81.6 % |


Japanese
| wandbot version | Comment | response accuracy |
|---|---|---|
| 1.2.0 | our new enhanced wandbot | 56.3 % |
| 1.2.1 | adds a translation step for Japanese queries | 71.9 % |

## Features

@@ -118,8 +125,20 @@ Launch the wandbot with 8 workers. This speeds up evaluation
```
WANDBOT_EVALUATION=1 gunicorn wandbot.api.app:app --bind 0.0.0.0:8000 --timeout=200 --workers=8 --worker-class uvicorn.workers.UvicornWorker
```

Set up the evaluation configuration in `wandbot/src/wandbot/evaluation/config.py` (see the example after this list):

- `evaluation_strategy_name` : display name shown in the Weave Evaluation dashboard
- `eval_dataset` : Weave dataset reference to evaluate against
  - [Latest English evaluation dataset](https://wandb.ai/wandbot/wandbot-eval/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval%2Fobjects%2Fwandbot_eval_data%2Fversions%2FeCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU%3F%26): "weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU"
  - [Latest Japanese evaluation dataset](https://wandb.ai/wandbot/wandbot-eval-jp/weave/datasets?peekPath=%2Fwandbot%2Fwandbot-eval-jp%2Fobjects%2Fwandbot_eval_data_jp%2Fversions%2FoCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA%3F%26): "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA"
- `eval_judge_model` : model used as the evaluation judge
- `wandb_entity` : W&B entity the evaluation run is logged to
- `wandb_project` : W&B project the evaluation run is logged to
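
Since `EvalConfig` is a Pydantic `BaseSettings` class (see the `config.py` diff further down), these values can be supplied through a `.env` file, environment variables, or keyword overrides. Below is a minimal sketch of configuring an English-language run in code, reusing the defaults and the English dataset reference above; the strategy name is a hypothetical label.

```
from wandbot.evaluation.config import EvalConfig

# Minimal sketch: override the Japanese defaults for an English evaluation run.
config = EvalConfig(
    evaluation_strategy_name="en v1.2.0",  # hypothetical label shown in the Weave Evaluation dashboard
    eval_dataset="weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU",
    language="en",
    eval_judge_model="gpt-4-1106-preview",
    wandb_entity="wandbot",
    wandb_project="wandbot-eval",
)
print(config.eval_dataset)
```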

Launch W&B Weave evaluation
```
python src/wandbot/evaluation/weave_eval/main.py
```
91 changes: 90 additions & 1 deletion src/wandbot/chat/chat.py
@@ -36,6 +36,8 @@
from wandbot.retriever import VectorStore
from wandbot.utils import Timer, get_logger

from openai import OpenAI

logger = get_logger(__name__)


@@ -84,6 +86,79 @@ def _get_answer(
result = self.rag_pipeline(question, history)

return result

@weave.op()
def _translate_ja_to_en(self, text: str) -> str:
"""
        Translates Japanese text to English using OpenAI's GPT-4o.
Args:
text: The Japanese text to be translated.
Returns:
The translated text in English.
"""
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages=[
{
"role": "system",
"content": f"You are a professional translator. \n\n\
Translate the user's question about Weights & Biases into English according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Keep code unchanged.\n\
- Only return the English translation without any additional explanation"
},
{
"role": "user",
"content": text
}
],
temperature=0,
max_tokens=1000,
top_p=1
)
return response.choices[0].message.content

@weave.op()
def _translate_en_to_ja(self, text: str) -> str:
"""
        Translates English text to Japanese using OpenAI's GPT-4o.
Args:
text: The English text to be translated.
Returns:
The translated text in Japanese.
"""
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o-2024-08-06",
messages=[
{
"role": "system",
"content": f"You are a professional translator. \n\n\
Translate the user's text into Japanese according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Use 'run' in English where appropriate, as it's a term used in Wandb.\n\
- Translate the terms 'reference artifacts' and 'lineage' into Katakana. \n\
- Include specific terms in English or Katakana where appropriate\n\
- Keep code unchanged.\n\
- Only return the Japanese translation without any additional explanation"
},
{
"role": "user",
"content": text
}
],
temperature=0,
max_tokens=1000,
top_p=1
)
return response.choices[0].message.content

@weave.op()
def __call__(self, chat_request: ChatRequest) -> ChatResponse:
@@ -95,13 +170,27 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
Returns:
An instance of `ChatResponse` representing the chat response.
"""
original_language = chat_request.language
try:
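            # For Japanese requests, translate the question to English so the
            # existing English RAG pipeline can answer it; the answer is
            # translated back to Japanese after _get_answer returns below.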
if original_language == "ja":
translated_question = self._translate_ja_to_en(chat_request.question)
chat_request.language = "en"
chat_request = ChatRequest(
question=translated_question,
chat_history=chat_request.chat_history,
application=chat_request.application,
language="en"
)

result = self._get_answer(
chat_request.question, chat_request.chat_history or []
)

result_dict = result.model_dump()

if original_language == "ja":
result_dict["answer"] = self._translate_en_to_ja(result_dict["answer"])

usage_stats = {
"total_tokens": result.total_tokens,
"prompt_tokens": result.prompt_tokens,
@@ -135,4 +224,4 @@ def __call__(self, chat_request: ChatRequest) -> ChatResponse:
}
)

        return ChatResponse(**result)
29 changes: 6 additions & 23 deletions src/wandbot/evaluation/config.py
@@ -3,34 +3,17 @@


class EvalConfig(BaseSettings):
    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="allow"
    )
    evaluation_strategy_name: str = Field(
        "jp v1.2.0-beta",
        description="Shown on the Weave evaluation page; used for visibility only",
    )
    eval_dataset: str = Field(
        "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA",
        description="Dataset reference for evaluation",
    )
    language: str = Field("ja", description="Language for the application (en or ja)")

    eval_judge_model: str = Field(
        "gpt-4-1106-preview",
        env="EVAL_JUDGE_MODEL",
        validation_alias="eval_judge_model",
    )
    wandb_entity: str = Field("wandbot", env="WANDB_ENTITY")
    wandb_project: str = Field("wandbot-eval", env="WANDB_PROJECT")
103 changes: 103 additions & 0 deletions src/wandbot/evaluation/jp_evaluation_dataprep.py
@@ -0,0 +1,103 @@
import weave
import json
import requests
import os
from typing import List, Dict, Any
from tqdm import tqdm

dataset_ref = weave.ref("weave:///wandbot/wandbot-eval/object/wandbot_eval_data:eCQQ0GjM077wi4ykTWYhLPRpuGIaXbMwUGEB7IyHlFU").get()
question_rows = dataset_ref.rows
question_rows = [
{
"question": row["question"],
"ground_truth": row["answer"],
"notes": row["notes"],
"context": row["context"],
"correctness": row["correctness"],
"is_wandb_query": row["is_wandb_query"]
} for row in question_rows
]

def translate_with_openai(text: str) -> str:
# Get the OpenAI API key from environment variables
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise ValueError("OPENAI_API_KEY environment variable is not set")

# Set headers for the OpenAI API request
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}

    # Data payload for the GPT-4o chat completions request.

    # Note: it may be worth adding an instruction telling the model not to
    # answer the question itself, only translate it.

    data = {
        "model": "gpt-4o-2024-08-06",
"max_tokens": 4000,
"messages": [
{
"role": "system",
"content": "You are a professional translator. \n\n\
Translate the user's text into Japanese according to the specified rules. \n\
Rule of translation. \n\
- Maintain the original nuance\n\
- Use 'run' in English where appropriate, as it's a term used in Wandb.\n\
- Translate the terms 'reference artifacts' and 'lineage' into Katakana. \n\
- Include specific terms in English or Katakana where appropriate\n\
- Keep code unchanged.\n\
- Keep URL starting from 'Source:\thttps:', but translate texts after 'Source:\thttps:'\n\
- Only return the Japanese translation without any additional explanation"
},
{
"role": "user",
"content": text
}
]
}

# Make the API request to OpenAI
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=headers,
json=data
)

# Check if the request was successful
if response.status_code == 200:
# Return the translated text
return response.json()["choices"][0]["message"]["content"].strip()
else:
raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

def translate_data(data: List[Dict[str, Any]], output_file: str) -> None:
total_items = len(data)

# Check if the file exists and get the last processed index
if os.path.exists(output_file):
with open(output_file, "r", encoding="utf-8") as file:
processed_data = json.load(file)
start_index = len(processed_data)
else:
processed_data = []
start_index = 0

for i in tqdm(range(start_index, total_items), initial=start_index, total=total_items):
item = data[i]
translated_item = item.copy()
for key in ["question", "ground_truth", "notes", "context"]:
if key in item:
                translated_item[key] = translate_with_openai(item[key])

processed_data.append(translated_item)

# Save progress after each item
with open(output_file, "w", encoding="utf-8") as file:
json.dump(processed_data, file, ensure_ascii=False, indent=2)

print(f"Translation completed. Results saved in '{output_file}'")

output_file = "translated_data.json"
translate_data(question_rows, output_file)
50 changes: 50 additions & 0 deletions src/wandbot/evaluation/jp_evaluation_dataupload.py
@@ -0,0 +1,50 @@
import json
import weave
from weave import Dataset

def rename_key(item):
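    # The published eval dataset uses 'answer' for the reference answer
    # (matching the English dataset's schema), so rename 'ground_truth' in place.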
if 'ground_truth' in item:
item['answer'] = item.pop('ground_truth')
return item

def create_test_file(json_file_path, test_file_path, num_lines=5):
with open(json_file_path, 'r') as file:
data = json.load(file)

test_data = data[:num_lines]

with open(test_file_path, 'w') as file:
json.dump(test_data, file, indent=2, ensure_ascii=False)

print(f"Test file with {num_lines} lines has been created at {test_file_path}")

def publish_json_to_weave(json_file_path, dataset_name, project_name):
# Initialize Weave
weave.init(project_name)

# Read JSON file
with open(json_file_path, 'r') as file:
data = json.load(file)

# Rename 'ground_truth' to 'answer' in each item
processed_data = [rename_key(item) for item in data]

# Create a dataset
dataset = Dataset(name=dataset_name, rows=processed_data)

# Publish the dataset
weave.publish(dataset)

print(f"Dataset '{dataset_name}' has been published to project '{project_name}'.")

# Usage example
json_file_path = 'translated_data.json'
test_file_path = 'test_translated_data.json'
dataset_name = 'wandbot_eval_data_jp'
project_name = 'wandbot/wandbot-eval-jp'

# Create test file
#create_test_file(json_file_path, test_file_path)

# Publish full dataset to Weave
publish_json_to_weave(json_file_path, dataset_name, project_name)
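
Once published, the dataset can be fetched back by reference, the same way `jp_evaluation_dataprep.py` pulls the English dataset. A minimal sketch, assuming the reference below (the one used in `EvalConfig` and the README) points at the published version:

```
import weave

weave.init("wandbot/wandbot-eval-jp")

# Fetch the published Japanese evaluation dataset by its Weave reference.
dataset = weave.ref(
    "weave:///wandbot/wandbot-eval-jp/object/wandbot_eval_data_jp:oCWifIAtEVCkSjushP0bOEc5GnhsMUYXURwQznBeKLA"
).get()

rows = list(dataset.rows)
print(f"{len(rows)} Japanese eval examples")
print(rows[0]["question"])
```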