Skip to content

Commit a1f4589

Browse files
authored
Adds wandb to eval files (ShishirPatil#114)
Add Weights & Biases logging to: - log the llm responses in a file and in a W&B Table to explore - keep track of the progress of llm responses (helpful during long llm response queries) - log the ast evaluation accuracy of the logged responses
1 parent 2d5d6e1 commit a1f4589

File tree

2 files changed

+60
-1
lines changed

2 files changed

+60
-1
lines changed

eval/eval-scripts/ast_eval_hf.py

+13
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ def main(args):
161161
else:
162162
pass
163163

164+
    if args.use_wandb:
165+
if args.wandb_run_id is not None:
166+
wandb.init(project=args.wandb_project, entity=args.wandb_entity, id=args.wandb_run_id, resume="must")
167+
else:
168+
wandb.init(project=args.wandb_project, entity=args.wandb_entity)
169+
170+
        wandb.summary['final_functionality_accuracy'] = total_correct / len(llm_responses)
171+
        wandb.summary['final_hallucination'] = total_hallucination/len(llm_responses)
172+
164173
print('Final Functionality accuracy: ', total_correct / len(llm_responses))
165174
print('Final hallucination: ', total_hallucination/len(llm_responses))
166175

@@ -169,5 +178,9 @@ def main(args):
169178
parser.add_argument("--api_dataset", type=str, default=None, help="path to your api dataset")
170179
parser.add_argument("--apibench", type=str, default=None, help="path to your apibench dataset including the question and answer pairs")
171180
parser.add_argument("--llm_responses", type=str, default=None, help="path to the language model responses")
181+
parser.add_argument("--use_wandb", action='store_true', help="pass this argument to turn on Weights & Biases logging of the LLM responses")
182+
parser.add_argument("--wandb_project", type=str, default="gorilla-api", help="Weights & Biases project name")
183+
parser.add_argument("--wandb_entity", type=str, default=None, help="Weights & Biases entity name")
184+
parser.add_argument("--wandb_run_id", type=str, default=None, help="pass W&B run id to append results to that run, otherwise a new W&B run is logged")
172185
args = parser.parse_args()
173186
main(args)

eval/get_llm_responses.py

+47-1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
import anthropic
2020
import multiprocessing as mp
2121
import time
22+
import wandb
23+
from tenacity import retry, wait_exponential
2224

2325
def encode_question(question, api_name):
2426
"""Encode multiple prompt instructions into a single string."""
@@ -47,6 +49,7 @@ def encode_question(question, api_name):
4749
prompts.append({"role": "user", "content": prompt})
4850
return prompts
4951

52+
@retry(wait=wait_exponential(multiplier=1, min=10, max=120), reraise=True)
5053
def get_response(get_response_input, api_key):
5154
question, question_id, api_name, model = get_response_input
5255
question = encode_question(question, api_name)
@@ -82,6 +85,7 @@ def get_response(get_response_input, api_key):
8285
def process_entry(entry, api_key):
8386
question, question_id, api_name, model = entry
8487
result = get_response((question, question_id, api_name, model), api_key)
88+
wandb.log({"question_id_completed":question_id})
8589
return result
8690

8791
def write_result_to_file(result, output_file):
@@ -102,8 +106,23 @@ def callback_with_lock(result, output_file):
102106
parser.add_argument("--output_file", type=str, default=None, help="the output file this script writes to")
103107
parser.add_argument("--question_data", type=str, default=None, help="path to the questions data file")
104108
parser.add_argument("--api_name", type=str, default=None, help="this will be the api dataset name you are testing, only support ['torchhub', 'tensorhub', 'huggingface'] now")
109+
parser.add_argument("--use_wandb", action='store_true', help="pass this argument to turn on Weights & Biases logging of the LLM responses")
110+
parser.add_argument("--wandb_project", type=str, default="gorilla-api", help="Weights & Biases project name")
111+
parser.add_argument("--wandb_entity", type=str, default=None, help="Weights & Biases entity name")
105112
args = parser.parse_args()
106113

114+
if args.use_wandb:
115+
wandb.init(
116+
project=args.wandb_project,
117+
entity=args.wandb_entity,
118+
config={
119+
"api_name":args.api_name,
120+
"model":args.model,
121+
"question_data":args.question_data,
122+
"output_file": args.output_file
123+
}
124+
)
125+
107126
start_time = time.time()
108127
# Read the question file
109128
questions = []
@@ -127,4 +146,31 @@ def callback_with_lock(result, output_file):
127146
pool.join()
128147

129148
end_time = time.time()
130-
print("Total time used: ", end_time - start_time)
149+
elapsed_time = end_time - start_time
150+
print("Total time used: ", elapsed_time)
151+
152+
if args.use_wandb:
153+
print("\nSaving all responses to Weights & Biases...\n")
154+
155+
wandb.summary["elapsed_time_s"] = elapsed_time
156+
157+
line_count = 0
158+
with open(args.output_file, 'r') as file:
159+
for i,line in enumerate(file):
160+
data = json.loads(line.strip())
161+
162+
if i == 0:
163+
tbl = wandb.Table(columns=list(data.keys()))
164+
tbl.add_data(*list(data.values()))
165+
line_count+=1
166+
167+
        # Log the Table to W&B
168+
wandb.log({"llm_eval_responses": tbl})
169+
wandb.summary["response_count"] = line_count
170+
171+
# Also log results file as W&B Artifact
172+
wandb.log_artifact(args.output_file,
173+
name=f"{args.api_name}-{args.model}-eval-results",
174+
type="eval-results",
175+
aliases=[f"{line_count}-responses"]
176+
)

0 commit comments

Comments
 (0)