Lucien eval #45

Open · wants to merge 4 commits into main

64 changes: 64 additions & 0 deletions GEMINI.md
@@ -0,0 +1,64 @@
# Gemini Report: SciCode

This document provides an overview of the SciCode project, focusing on the supported models and how to run them.

## Supported Models

The SciCode project uses [OpenRouter](https://openrouter.ai/) to support a wide variety of Large Language Models. The core logic for model integration is located in `src/scicode/gen/models.py`.

Any model available through the OpenRouter API can be used by providing its model identifier (e.g., `openai/gpt-4o`, `anthropic/claude-3-opus`). For a list of available models, please refer to the [OpenRouter documentation](https://openrouter.ai/docs#models).

A `dummy` model is also available for testing purposes.
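
A quick way to exercise this plumbing without spending API credits is to call the helper added in `src/scicode/gen/models.py` with the `dummy` model. A minimal sketch, assuming the package is installed and that `get_model_response` and `extract_python_script` keep the signatures shown in this diff:

```python
from scicode.gen.models import extract_python_script, get_model_response

# "dummy" short-circuits the OpenRouter call and returns a canned reply,
# which is handy for smoke-testing the generate-then-extract pipeline.
reply = get_model_response("Write a hello-world script.", model="dummy")
code = extract_python_script(reply)
print(code)  # the extracted script body is: print('Hello, World!')
```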

## Running the Models

The recommended way to evaluate a new model is by using `inspect_ai`, as suggested in the `README.md`.

### Using `inspect_ai` (Recommended)

1. **Clone the repository:**
```bash
git clone git@github.com:scicode-bench/SciCode.git
cd SciCode
```

2. **Install the package:**
```bash
pip install -e .
```

3. **Download test data:**
Download the numeric test results from the link provided in the `README.md` and save them as `./eval/data/test_data.h5`.

4. **Configure API Keys:**
Set your OpenRouter API key as an environment variable. See the "Configuration" section below for more details.

5. **Run the evaluation:**
You can run the evaluation using the `inspect` command. The model name should be a valid OpenRouter model identifier.

```bash
inspect eval eval/inspect_ai/scicode.py --model openai/gpt-4o --temperature 0
```
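
Before launching a full run, a small check that steps 3 and 4 took effect can save a failed evaluation. A rough sketch, assuming the repository root is the working directory and the paths used above:

```python
import os
from pathlib import Path

# Step 3: the numeric test data must sit at eval/data/test_data.h5.
assert Path("eval/data/test_data.h5").exists(), "test data missing (step 3)"

# Step 4: the OpenRouter key must be exported in the environment.
assert os.getenv("OPENROUTER_KEY"), "OPENROUTER_KEY is not set (step 4)"

print("Setup looks complete; ready for `inspect eval`.")
```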

### Deprecated Method

A deprecated two-step process is also available:

1. **Generate code:**
```bash
python eval/scripts/gencode.py --model <openrouter_model_name>
```
2. **Test the generated code:**
```bash
python eval/scripts/test_generated_code.py
```
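
For example, to run the deprecated pipeline against GPT-4o via OpenRouter (any other OpenRouter model identifier can be substituted):

```bash
python eval/scripts/gencode.py --model openai/gpt-4o
python eval/scripts/test_generated_code.py
```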

## Configuration

To use the models, you need to set your OpenRouter API key as an environment variable.

```bash
export OPENROUTER_KEY="your-openrouter-api-key"
```

You can add this line to your shell's startup file (e.g., `~/.bashrc` or `~/.zshrc`) to make it permanent.
4 changes: 3 additions & 1 deletion README.md
@@ -68,10 +68,12 @@ Scicode has been integrated with `inspect_ai` for easier and faster model evalua

```bash
cd eval/inspect_ai
export OPENAI_API_KEY=your-openai-api-key
export OPENROUTER_KEY=your-openrouter-api-key
inspect eval scicode.py --model openai/gpt-4o --temperature 0
```

The model name should be a valid [OpenRouter](https://openrouter.ai/docs#models) model identifier (e.g., `openai/gpt-4o`, `anthropic/claude-3-opus`, `google/gemini-pro`).

💡 For more detailed information on using `inspect_ai`, see [`eval/inspect_ai` readme](eval/inspect_ai/)

## Instructions to evaluate a new model in two steps (deprecated)
4 changes: 2 additions & 2 deletions eval/inspect_ai/scicode.py
@@ -343,8 +343,8 @@ async def solve(state: TaskState, generate: Generate) -> TaskState:
result = await generate(state=state_copy)
response_from_llm = result.output.completion
# ===Model Generation===
except:
print(f"Failed to generate response for problem {prob_id} step {idx+1}.")
except Exception as e:
print(f"Failed to generate response for problem {prob_id} step {idx+1}. Error: {e}")
response_from_llm = generate_dummy_response(prompt)
prompt_assistant.register_previous_response(
prob_data=state.metadata,
34 changes: 11 additions & 23 deletions eval/scripts/gencode.py
@@ -6,20 +6,19 @@
get_function_from_code,
read_from_hf_dataset,
)
from scicode.gen.models import extract_python_script, get_model_function
from scicode.gen.models import extract_python_script, get_model_response

DEFAULT_PROMPT_TEMPLATE = Path("eval", "data", "background_comment_template.txt").read_text()
BACKGOUND_PROMPT_TEMPLATE = Path("eval", "data", "multistep_template.txt").read_text()


class Gencode:
def __init__(self, model: str, output_dir: Path,
prompt_dir: Path, with_background: bool, temperature: float):
prompt_dir: Path, with_background: bool):
self.model = model
self.output_dir = output_dir
self.prompt_dir = prompt_dir
self.with_background = with_background
self.temperature = temperature
self.previous_llm_code = []

def _get_background_dir(self):
@@ -90,13 +89,8 @@ def generate_response_with_steps(
if save:
self.save_prompt_with_steps(prob_data, prompt, num_steps)

model_kwargs = {}
if "claude" in model:
model_kwargs["max_tokens"] = 4096
model_kwargs["temperature"] = self.temperature
# write the response to a file if it doesn't exist
model_fct = get_model_function(model, **model_kwargs)
response_from_llm = model_fct(prompt)
response_from_llm = get_model_response(prompt, model=model)
self.previous_llm_code[num_steps - 1] = extract_python_script(response_from_llm)
self.save_response_with_steps(prob_data, response_from_llm, previous_code, num_steps)

@@ -174,25 +168,19 @@ def get_cli() -> argparse.ArgumentParser:
action="store_true",
help="Include problem background if enabled",
)
parser.add_argument(
"--temperature",
type=float,
default=0,
help="Generation temperature",
)
return parser


def main(model: str,
split: str,
output_dir: Path,
prompt_dir: Path,
with_background: bool,
temperature: float
def main(
model: str = "gpt-4o",
split: str = "test",
output_dir: Path = Path("eval_results", "generated_code"),
prompt_dir: Path = Path("eval_results", "prompt"),
with_background: bool = False,
) -> None:
gcode = Gencode(
model=model, output_dir=output_dir,
prompt_dir=prompt_dir, with_background=with_background, temperature=temperature
prompt_dir=prompt_dir, with_background=with_background
)
prompt_template = BACKGOUND_PROMPT_TEMPLATE if with_background else DEFAULT_PROMPT_TEMPLATE
data = read_from_hf_dataset(split)
@@ -209,4 +197,4 @@ def main(model: str,

if __name__ == "__main__":
args = get_cli().parse_args()
main(**vars(args))
main(**vars(args))
162 changes: 37 additions & 125 deletions src/scicode/gen/models.py
@@ -1,149 +1,61 @@
from functools import partial
from openai import OpenAI
import anthropic
import google.generativeai as genai
import config
import re
import os
import litellm
from litellm.utils import validate_environment as litellm_validate_environment

from scicode import keys_cfg_path
from scicode.utils.log import get_logger

logger = get_logger("models")


def get_config():
if not keys_cfg_path.exists():
raise FileNotFoundError(f"Config file not found: {keys_cfg_path}")
return config.Config(str(keys_cfg_path))
def get_model_response(prompt: str, *, model: str) -> str:
"""Call the OpenRouter api to generate a response, or use a dummy for testing."""
if model == "dummy":
return generate_dummy_response(prompt)

def generate_litellm_response(prompt: str, *, model: str, **kwargs) -> str:
"""Call the litellm api to generate a response"""
# litellm expects all keys as env variables
config = get_config()
for key, value in config.as_dict().items():
if key in os.environ and os.environ[key] != value:
logger.warning(f"Overwriting {key} from config with environment variable")
else:
os.environ[key] = value
# Let's validate that we have everythong for this model
env_validation = litellm_validate_environment(model)
if not env_validation.get("keys_in_environment") or env_validation.get("missing_keys", []):
msg = f"Environment validation for litellm failed for model {model}: {env_validation}"
raise ValueError(msg)
response = litellm.completion(
model=model,
messages = [
{"role": "user", "content": prompt},
],
**kwargs,
)
return response.choices[0].message.content
key = os.getenv("OPENROUTER_KEY")
if not key:
raise ValueError(
"OPENROUTER_KEY environment variable not found. Please set it."
)

def generate_openai_response(prompt: str, *, model="gpt-4-turbo-2024-04-09",
temperature: float = 0) -> str:
"""call the openai api to generate a response"""
key: str = get_config()["OPENAI_KEY"] # type: ignore
client = OpenAI(api_key=key)
completion = client.chat.completions.create(
model=model,
temperature=temperature,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
client = OpenAI(
base_url="https://openrouter.ai/api/v1",
api_key=key,
)
return completion.choices[0].message.content


def generate_anthropic_response(prompt, *, model="claude-3-opus-20240229",
max_tokens: int = 4096, temperature: float = 0) -> str:
"""call the anthropic api to generate a response"""
key: str = get_config()["ANTHROPIC_KEY"] # type: ignore
client = anthropic.Anthropic(api_key=key)
message = client.messages.create(
model=model,
temperature=temperature,
max_tokens=max_tokens,
messages=[
{"role": "user", "content": prompt},
],
)
return message.content[0].text


def generate_google_response(prompt: str, *, model: str = "gemini-pro",
temperature: float = 0) -> str:
"""call the api to generate a response"""
key: str = get_config()["GOOGLE_KEY"] # type: ignore
genai.configure(api_key=key)
model = genai.GenerativeModel(model_name=model)
response = model.generate_content(prompt,
generation_config=genai.GenerationConfig(temperature=temperature),
# safety_settings=[
# {
# "category": "HARM_CATEGORY_HARASSMENT",
# "threshold": "BLOCK_NONE",
# },
# {
# "category": "HARM_CATEGORY_HATE_SPEECH",
# "threshold": "BLOCK_NONE",
# },
# {
# "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
# "threshold": "BLOCK_NONE",
# },
# {
# "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
# "threshold": "BLOCK_NONE"
# }
# ]
)
try:
return response.text
except ValueError:
print(f'prompt:\n{prompt}')
# If the response doesn't contain text, check if the prompt was blocked.
print(f'prompt feedback:\n{response.prompt_feedback}')
# Also check the finish reason to see if the response was blocked.
print(f'finish reason:\n{response.candidates[0].finish_reason.name}')
# If the finish reason was SAFETY, the safety ratings have more details.
print(f'safety rating:\n{response.candidates[0].safety_ratings}')
raise ValueError("Generate response failed.")


def get_model_function(model: str, **kwargs):
"""Return the appropriate function to generate a response based on the model"""
if model.startswith("litellm/"):
model = model.removeprefix("litellm/")
fct = generate_litellm_response
elif "gpt" in model:
fct = generate_openai_response
elif "claude" in model:
fct = generate_anthropic_response
elif "gemini" in model:
fct = generate_google_response
elif model == "dummy":
fct = generate_dummy_response
else:
raise ValueError(f"Model {model} not supported")
return partial(fct, model=model, **kwargs)
try:
completion = client.chat.completions.create(
model=model,
temperature=0,
max_tokens=4096,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt},
],
)
except Exception as e:
logger.error(f"Error calling OpenRouter API: {e}")
raise

return completion.choices[0].message.content


def generate_dummy_response(prompt: str, **kwargs) -> str:
def generate_dummy_response(prompt: str) -> str:
"""Used for testing as a substitute for actual models"""
return "Blah blah\n```python\nprint('Hello, World!')\n```\n"


def extract_python_script(response: str):
# We will extract the python script from the response
if '```' in response:
python_script = response.split("```python")[1].split("```")[0] if '```python' in response else response.split('```')[1].split('```')[0]
if "```" in response:
python_script = (
response.split("```python")[1].split("```")[0]
if "```python" in response
else response.split("```")[1].split("```")[0]
)
else:
print("Fail to extract python code from specific format.")
python_script = response
python_script = re.sub(r'^\s*(import .*|from .*\s+import\s+.*)', '', python_script, flags=re.MULTILINE)
return python_script

python_script = re.sub(
r"^\s*(import .*|from .*\s+import\s+.*)", "", python_script, flags=re.MULTILINE
)
return python_script
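
The rewritten `extract_python_script` behaves roughly as in the sketch below; the sample reply string is invented for illustration, but the helper and its import path are the ones added in this diff:

```python
from scicode.gen.models import extract_python_script

fence = "```"  # avoid nesting a literal fence inside this example
reply = (
    "Sure, here is the solution.\n"
    f"{fence}python\n"
    "import numpy as np\n"
    "def f(x):\n"
    "    return np.sqrt(x)\n"
    f"{fence}\n"
)

# The helper pulls out the fenced block and blanks top-level import lines,
# so only the function definition is kept.
print(extract_python_script(reply))
```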