Skip to content

Commit

Permalink
save o1 results with hidden reasoning token numbers and two more runs…
Browse files Browse the repository at this point in the history
… of o1 mini
  • Loading branch information
(Bill) Yuchen Lin committed Oct 23, 2024
1 parent 4ceb7ca commit effd564
Show file tree
Hide file tree
Showing 6 changed files with 48,058 additions and 16 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ This repository aims to evaluate instruction-tuned LLMs for their zero-shot perf

## Todo

- [ ] Support new tasks (GPQA, AIME, etc.)
- [ ] Add private tests
- [ ] Support new tasks (GPQA, AIME, etc.)
- [ ] Prefix-prefill for open models such that the parsing is easier
- [ ] Add other formatting options (e.g. markup language instead of json, etc.)

Expand Down Expand Up @@ -104,12 +103,12 @@ python src/evaluation/summarize.py
python src/evaluation/math_eval.py gsm
-->


<!--
### Changelogs
- 08/02/2024: added Gemini 1.5 Pro Exp 0801 and CRUX results
- 07/31/2024: added Meta-Llama-3.1-70B-Instruct and gemma-2-2b-it
- 07/29/2024: added Llama-3.1-8B, Mistral-Large-2, and deepseek-coder-v2-0724
- 07/29/2024: added Llama-3.1-8B, Mistral-Large-2, and deepseek-coder-v2-0724 -->

## Citation
If you find ZeroEval useful, please cite it as follows in your publication:
Expand All @@ -127,3 +126,15 @@ If you find ZeroEval useful, please cite it as follows in your publication:
## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=WildEval/ZeroEval&type=Date)](https://star-history.com/#WildEval/ZeroEval&Date)


<!--
bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-mini-2024-09-12 -p o1-mini-2024-09-12-v2 -s 4
wait
bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12-v2 -s 4
wait
-->
24,002 changes: 24,002 additions & 0 deletions result_dirs/zebra-grid/o1-mini-2024-09-12-v2.json

Large diffs are not rendered by default.

24,002 changes: 24,002 additions & 0 deletions result_dirs/zebra-grid/o1-mini-2024-09-12-v3.json

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions scripts/local/gpt-4o-new.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@
# wait


# bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 8
# wait
bash zero_eval_api.sh -f openai -d math-l5 -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
bash zero_eval_api.sh -f openai -d crux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
bash zero_eval_api.sh -f openai -d mmlu-redux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
# bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12-v2 -s 1
# wait
# bash zero_eval_api.sh -f openai -d math-l5 -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait
# bash zero_eval_api.sh -f openai -d crux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait
# bash zero_eval_api.sh -f openai -d mmlu-redux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait

24 changes: 21 additions & 3 deletions src/unified_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,16 @@ def sanitize_args(args):
formatted_outputs = json.load(f)
for output_item in formatted_outputs:
outputs.append([output_item["output"]] if type(output_item["output"]) == str else output_item["output"])
if args.model_name.startswith("openai/o1-"):
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(output_item["hidden_reasoning_token"])
num_skipped = len(outputs)
print(f"We skipped the first {num_skipped} examples")


# Load the existing data from the cache_filepath
cache_outputs = {}
cache_outputs = {}
if args.cache_filepath is not None:
if os.path.exists(args.cache_filepath):
with open(args.cache_filepath) as f:
Expand All @@ -220,6 +224,7 @@ def sanitize_args(args):
# if output_item["output"] is a list and the first string is not empty
if type(output_item["output"]) == list and len(output_item["output"]) > 0 and len(output_item["output"][0]) > 0:
cache_outputs[output_item["session_id"]] = output_item

print(f"Loaded {len(cache_outputs)} non-empty outputs from the cache file: {args.cache_filepath}")

todo_inputs = model_inputs[num_skipped:]
Expand Down Expand Up @@ -267,7 +272,11 @@ def api(**kwargs):
# check if in the cache
if current_id_str in cache_outputs:
print(f"Using cache from {args.cache_filepath} for {current_id_str}")
outputs.append(cache_outputs[current_id_str]["output"])
cache_item = cache_outputs[current_id_str]
outputs.append(cache_item["output"])
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(cache_item["hidden_reasoning_token"])
else:
openai_msg = [{"role":"system", "content":"You are a helpful AI assistant."}]
for i, chat_item in enumerate(chat):
Expand All @@ -285,7 +294,16 @@ def api(**kwargs):
"stop": stop_words,
}
result = api(**openai_args)
outputs.append(result)
# for o1
if args.model_name.startswith("openai/o1-"):
content, hidden_reasoning_token = result
# print(f"hidden_reasoning_token: {hidden_reasoning_token}")
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(hidden_reasoning_token)
else:
content = result
outputs.append(content)
save_outputs(args, id_strs, outputs, chat_history, metadata, model_inputs, filepath)

elif args.engine == "together":
Expand Down
11 changes: 10 additions & 1 deletion src/unified_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ def save_outputs(args, id_strs, outputs, chat_history, metadata, model_inputs, f
for key in metadata:
if key in output_item:
continue
output_item[key] = metadata[key][ind]
if ind < len(metadata[key]):
output_item[key] = metadata[key][ind]
output_item = result_format(output_item, args)
formatted_outputs.append(output_item)
with open(filepath, "w") as f:
Expand Down Expand Up @@ -165,6 +166,8 @@ def wrapper(*args, **kwargs):
if "invalid" in str(e).lower():
print("Invalid request, returning.")
retried = retry_limit
if kwargs["model"].startswith("o1-"):
return ['API Error: this query is blocked by APIs. ' + str(e)], -1
return ['API Error: this query is blocked by APIs. ' + str(e)]
else:
err_msg = str(e)
Expand Down Expand Up @@ -260,6 +263,7 @@ def openai_chat_request(
contents.append(choice['message']['content'])
else:
nvidia_mode = False
o1_mode = False
# for version > 1.0
if "deepseek" in model:
assert os.environ.get("DEEPSEEK_API_KEY") is not None, "Please set DEEPSEEK_API_KEY in the environment variables."
Expand Down Expand Up @@ -307,6 +311,7 @@ def openai_chat_request(
else:
# print(f"Requesting chat completion from OpenAI API with model {model}")
if model.startswith("o1-"):
o1_mode = True
if messages[0]["role"] == "system":
messages = messages[1:]
response = client.chat.completions.create(
Expand All @@ -315,10 +320,12 @@ def openai_chat_request(
messages=messages,
top_p=top_p,
n=n,
# temperature=temperature,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
**kwargs,
)
hidden_reasoning_tokens = response.usage.completion_tokens_details.reasoning_tokens
else:
response = client.chat.completions.create(
model=model,
Expand All @@ -343,6 +350,8 @@ def openai_chat_request(
else:
raise ValueError(f"OpenAI Finish Reason Error: {choice.finish_reason}")
contents.append(choice.message.content.strip())
if o1_mode:
return contents, hidden_reasoning_tokens
return contents

def together_chat_request(
Expand Down

0 comments on commit effd564

Please sign in to comment.