Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions eureka_ml_insights/configs/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,15 @@
},
)

# Azure OpenAI deployment of gpt-4o (2024-11-20 snapshot).
# NOTE(review): "AUZRE" is a typo for "AZURE" in the constant name; kept
# as-is for backward compatibility with existing importers.
OAI_GPT4O_2024_11_20_AUZRE_CONFIG = ModelConfig(
    AzureOpenAIModel,
    {
        "model_name": "gpt-4o",
        # Placeholder endpoint — replace with your own Azure OpenAI resource URL.
        "url": "https://your-azure-openai-endpoint.openai.azure.com/",
        "api_version": "2025-01-01-preview",
    },
)

# Gemini models
GEMINI_SECRET_KEY_PARAMS = {
"key_name": "your_gemini_secret_key_name",
Expand Down Expand Up @@ -343,6 +352,16 @@
},
)

# Locally served DeepSeek-R1 via vLLM; assumes the vLLM server is (or will be)
# reachable on the listed port(s).
DEEPSEEK_R1_LOCAL_CONFIG = ModelConfig(
    LocalVLLMModel,
    {
        # this name must match the vllm deployment name/path
        "model_name": "Deepseek-R1",
        # specify ports in case the model is already deployed
        "ports": ["5001"],
    },
)

# DeepSeek R1 Endpoints on Azure
DEEPSEEK_R1_CONFIG = ModelConfig(
DeepseekR1ServerlessAzureRestEndpointModel,
Expand Down
69 changes: 69 additions & 0 deletions eureka_ml_insights/data_utils/arc_agi_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import re
from dataclasses import dataclass

import pandas as pd

from .transform import DFTransformBase


@dataclass
@dataclass
class ARCAGI_ExtractAnswer(DFTransformBase):
    """Extract the final answer from model output wrapped in <output>...</output> tags.

    Attributes:
        model_output_column: Name of the column holding raw model output strings.
        model_answer_column: Name of the column to write extracted answers into.
    """

    model_output_column: str
    model_answer_column: str

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply answer extraction to every row of *df* and return it."""
        df[self.model_answer_column] = df[self.model_output_column].apply(self.parse_output_answer)
        return df

    @staticmethod
    def parse_output_answer(response):
        """
        Parse the input string to extract answer of a given ARCAGI question.
        Parameters:
            response (str): Input string containing answer X in the form of "<output>final answer string</output>".
        Returns:
            answer (str): The final answer string with leading and trailing spaces stripped.
        """
        # Missing or untagged responses yield an empty answer.
        if response is None:
            return ""
        if response.find("<output>") == -1 or response.find("</output>") == -1:
            return ""

        start_index = response.find("<output>") + len("<output>")
        end_index = response.find("</output>")

        return response[start_index:end_index].strip()


@dataclass
@dataclass
class ARCAGI_CleanCOTAnswer(DFTransformBase):
    """Strip a chain-of-thought block (everything up to and including "</think>")
    from model output, and replace None responses with an empty string.

    NOTE(review): cleaning CoT output is not ARC-AGI specific; consider moving
    this into the general transforms so other benchmarks can reuse it.

    Attributes:
        model_output_column: Name of the column holding raw model output strings.
        model_answer_column: Name of the column to write cleaned output into.
    """

    model_output_column: str
    model_answer_column: str

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply CoT cleaning to every row of *df* and return it."""
        df[self.model_answer_column] = df[self.model_output_column].apply(self.parse_output_answer)
        return df

    @staticmethod
    def parse_output_answer(response):
        """
        Remove the chain-of-thought prefix from a response.
        Parameters:
            response (str): Possibly None response string, optionally containing
                a "</think>" marker that ends the chain-of-thought block.
        Returns:
            answer (str): Text after the first "</think>" marker; the original
                string if the marker is absent; "" if response is None.
        """
        if response is None:
            return ""

        # Bug fix: check find()'s result BEFORE adding the tag length.
        # The original added len("</think>") first, so a missing marker
        # (find() == -1) produced index 7 and silently truncated the
        # first 7 characters of the response instead of returning it intact.
        marker_index = response.find("</think>")
        if marker_index == -1:
            return response

        return response[marker_index + len("</think>"):]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
You are an intelligent assistant who is very good at answering test questions accurately.

{{ prompt }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
You are an intelligent assistant who is very good at answering test questions accurately.
In the examples that follow you will be shown grids of numbers.
The numbers in the grids range from 0 through 9.
Each grid can be rendered as a grid of squares.
Each square in the grid is rendered as a colored square where the color of the square is derived from the number.
The colors are decided as follows:

0 - black
1 - blue
2 - red
3 - green
4 - yellow
5 - grey
6 - magenta
7 - brown
8 - cyan
9 - maroon

With that in mind, do your best to solve the question below.

{{ prompt }}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the prompt itself does not ask the model to format the answer in . However the answer extraction sort of assumes this. Is this because reasoning models are expected to have this format? What if the model uses other tags or no tags at all (e.g. if it is a conventional model with no thinking block)? For other reasoning tasks, we ask the model to clearly mark the final answer according to some format. For example:

Final Answer:

and then extract what comes after that.

6 changes: 6 additions & 0 deletions eureka_ml_insights/user_configs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@
AIME_PIPELINE,
)
from .aime_seq import AIME_SEQ_PIPELINE
from .arc_agi import (
ARC_AGI_v1_PIPELINE,
ARC_AGI_v1_PIPELINE_5Run,
COT_ARC_AGI_v1_PIPELINE,
COT_ARC_AGI_v1_PIPELINE_5Run,
)
from .ba_calendar import (
BA_Calendar_Parallel_PIPELINE,
BA_Calendar_PIPELINE,
Expand Down
Loading
Loading