Skip to content

Commit

Permalink
save o1 results with hidden reasoning token numbers and two more runs…
Browse files Browse the repository at this point in the history
… of o1 mini
  • Loading branch information
(Bill) Yuchen Lin committed Oct 23, 2024
1 parent 4ceb7ca commit effd564
Show file tree
Hide file tree
Showing 6 changed files with 48,058 additions and 16 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ This repository aims to evaluate instruction-tuned LLMs for their zero-shot perf

## Todo

- [ ] Support new tasks (GPQA, AIME, etc.)
- [ ] Add private tests
- [ ] Support new tasks (GPQA, AIME, etc.)
- [ ] Prefix-prefill for open models such that the parsing is easier
- [ ] Add other formatting options (e.g. markup language instead of json, etc.)

Expand Down Expand Up @@ -104,12 +103,12 @@ python src/evaluation/summarize.py
python src/evaluation/math_eval.py gsm
-->


<!--
### Changelogs
- 08/02/2024: added Gemini 1.5 Pro Exp 0801 and CRUX results
- 07/31/2024: added Meta-Llama-3.1-70B-Instruct and gemma-2-2b-it
- 07/29/2024: added Llama-3.1-8B, Mistral-Large-2, and deepseek-coder-v2-0724
- 07/29/2024: added Llama-3.1-8B, Mistral-Large-2, and deepseek-coder-v2-0724 -->

## Citation
If you find ZeroEval useful, please cite it as follows in your publication:
Expand All @@ -127,3 +126,15 @@ If you find ZeroEval useful, please cite it as follows in your publication:
## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=WildEval/ZeroEval&type=Date)](https://star-history.com/#WildEval/ZeroEval&Date)


<!--
bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-mini-2024-09-12 -p o1-mini-2024-09-12-v2 -s 4
wait
bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12-v2 -s 4
wait
-->
24,002 changes: 24,002 additions & 0 deletions result_dirs/zebra-grid/o1-mini-2024-09-12-v2.json

Large diffs are not rendered by default.

24,002 changes: 24,002 additions & 0 deletions result_dirs/zebra-grid/o1-mini-2024-09-12-v3.json

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions scripts/local/gpt-4o-new.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@
# wait


# bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 8
# wait
bash zero_eval_api.sh -f openai -d math-l5 -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
bash zero_eval_api.sh -f openai -d crux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
bash zero_eval_api.sh -f openai -d mmlu-redux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
wait
# bash zero_eval_api.sh -f openai -d zebra-grid -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12-v2 -s 1
# wait
# bash zero_eval_api.sh -f openai -d math-l5 -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait
# bash zero_eval_api.sh -f openai -d crux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait
# bash zero_eval_api.sh -f openai -d mmlu-redux -m openai/o1-preview-2024-09-12 -p o1-preview-2024-09-12 -s 4
# wait

24 changes: 21 additions & 3 deletions src/unified_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,12 +206,16 @@ def sanitize_args(args):
formatted_outputs = json.load(f)
for output_item in formatted_outputs:
outputs.append([output_item["output"]] if type(output_item["output"]) == str else output_item["output"])
if args.model_name.startswith("openai/o1-"):
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(output_item["hidden_reasoning_token"])
num_skipped = len(outputs)
print(f"We skipped the first {num_skipped} examples")


# Load the existing data from the cache_filepath
cache_outputs = {}
cache_outputs = {}
if args.cache_filepath is not None:
if os.path.exists(args.cache_filepath):
with open(args.cache_filepath) as f:
Expand All @@ -220,6 +224,7 @@ def sanitize_args(args):
# if output_item["output"] is a list and the first string is not empty
if type(output_item["output"]) == list and len(output_item["output"]) > 0 and len(output_item["output"][0]) > 0:
cache_outputs[output_item["session_id"]] = output_item

print(f"Loaded {len(cache_outputs)} non-empty outputs from the cache file: {args.cache_filepath}")

todo_inputs = model_inputs[num_skipped:]
Expand Down Expand Up @@ -267,7 +272,11 @@ def api(**kwargs):
# check if in the cache
if current_id_str in cache_outputs:
print(f"Using cache from {args.cache_filepath} for {current_id_str}")
outputs.append(cache_outputs[current_id_str]["output"])
cache_item = cache_outputs[current_id_str]
outputs.append(cache_item["output"])
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(cache_item["hidden_reasoning_token"])
else:
openai_msg = [{"role":"system", "content":"You are a helpful AI assistant."}]
for i, chat_item in enumerate(chat):
Expand All @@ -285,7 +294,16 @@ def api(**kwargs):
"stop": stop_words,
}
result = api(**openai_args)
outputs.append(result)
# for o1
if args.model_name.startswith("openai/o1-"):
content, hidden_reasoning_token = result
# print(f"hidden_reasoning_token: {hidden_reasoning_token}")
if "hidden_reasoning_token" not in metadata:
metadata["hidden_reasoning_token"] = []
metadata["hidden_reasoning_token"].append(hidden_reasoning_token)
else:
content = result
outputs.append(content)
save_outputs(args, id_strs, outputs, chat_history, metadata, model_inputs, filepath)

elif args.engine == "together":
Expand Down
11 changes: 10 additions & 1 deletion src/unified_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ def save_outputs(args, id_strs, outputs, chat_history, metadata, model_inputs, f
for key in metadata:
if key in output_item:
continue
output_item[key] = metadata[key][ind]
if ind < len(metadata[key]):
output_item[key] = metadata[key][ind]
output_item = result_format(output_item, args)
formatted_outputs.append(output_item)
with open(filepath, "w") as f:
Expand Down Expand Up @@ -165,6 +166,8 @@ def wrapper(*args, **kwargs):
if "invalid" in str(e).lower():
print("Invalid request, returning.")
retried = retry_limit
if kwargs["model"].startswith("o1-"):
return ['API Error: this query is blocked by APIs. ' + str(e)], -1
return ['API Error: this query is blocked by APIs. ' + str(e)]
else:
err_msg = str(e)
Expand Down Expand Up @@ -260,6 +263,7 @@ def openai_chat_request(
contents.append(choice['message']['content'])
else:
nvidia_mode = False
o1_mode = False
# for version > 1.0
if "deepseek" in model:
assert os.environ.get("DEEPSEEK_API_KEY") is not None, "Please set DEEPSEEK_API_KEY in the environment variables."
Expand Down Expand Up @@ -307,6 +311,7 @@ def openai_chat_request(
else:
# print(f"Requesting chat completion from OpenAI API with model {model}")
if model.startswith("o1-"):
o1_mode = True
if messages[0]["role"] == "system":
messages = messages[1:]
response = client.chat.completions.create(
Expand All @@ -315,10 +320,12 @@ def openai_chat_request(
messages=messages,
top_p=top_p,
n=n,
# temperature=temperature,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
**kwargs,
)
hidden_reasoning_tokens = response.usage.completion_tokens_details.reasoning_tokens
else:
response = client.chat.completions.create(
model=model,
Expand All @@ -343,6 +350,8 @@ def openai_chat_request(
else:
raise ValueError(f"OpenAI Finish Reason Error: {choice.finish_reason}")
contents.append(choice.message.content.strip())
if o1_mode:
return contents, hidden_reasoning_tokens
return contents

def together_chat_request(
Expand Down

0 comments on commit effd564

Please sign in to comment.