Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/working fifth q code #913

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/param.yaml
Original file line number Diff line number Diff line change
@@ -9,5 +9,5 @@ temperature: 0.6
top_p: null
write_proposal_strategy: default
max_env_run_num: 1
proposal_num: 2
proposal_num: 1
use_rag: True
51 changes: 51 additions & 0 deletions research_bench/eval_only.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Aggregate pre-computed proposal-similarity metrics from a results JSONL file.

Each record in the results file is expected to already contain the
openai/voyageai similarity scores; this script averages them over the first
100 records and prints a weighted combination that emphasises question 5.
"""
import jsonlines  # NOTE(review): unused here; kept in case siblings import via this module
from research_bench.eval import compute_proposal_metrics  # only needed if metrics are recomputed
from tqdm import tqdm
import json

RESULT_PATH = './results/mlbench_result_4o_mini_fake_research_town_first_author_only.jsonl'

# Load the JSONL results, skipping (and reporting) any unparseable lines.
dataset = []
with open(RESULT_PATH, 'r') as f:
    for line_num, line in enumerate(f, 1):
        try:
            dataset.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON on line {line_num}: {e}")

METRIC_KEYS = [
    'openai_sim',
    'voyageai_sim',
    'openai_sim_q1',
    'openai_sim_q2',
    'openai_sim_q3',
    'openai_sim_q4',
    'openai_sim_q5',
    'voyageai_sim_q1',
    'voyageai_sim_q2',
    'voyageai_sim_q3',
    'voyageai_sim_q4',
    'voyageai_sim_q5',
]
overall_metrics = {key: [] for key in METRIC_KEYS}

# Evaluate only the first 100 records.
dataset = dataset[:100]
for data in tqdm(dataset):
    if 'openai_sim' not in data:
        # Bug fix: records without pre-computed metrics previously fell
        # through and raised KeyError on data[key]; skip them instead.
        print(data['paper_id'])
        continue
    for key in overall_metrics:
        overall_metrics[key].append(data[key])

# Average each metric; guard against an empty list (e.g. all records skipped).
final_metrics = {}
for key, values in overall_metrics.items():
    mean = sum(values) / len(values) if values else 0.0
    print(f'{key}: {mean}')
    final_metrics[key] = mean

# Weighted aggregate: q1-q4 contribute 0.1 each, q5 contributes 0.6.
Q_WEIGHTS = {'q1': 0.1, 'q2': 0.1, 'q3': 0.1, 'q4': 0.1, 'q5': 0.6}
openai_metric = sum(w * final_metrics[f'openai_sim_{q}'] for q, w in Q_WEIGHTS.items())
voyageai_metric = sum(w * final_metrics[f'voyageai_sim_{q}'] for q, w in Q_WEIGHTS.items())
print(f'openai_metric: {openai_metric}')
print(f'voyageai_metric: {voyageai_metric}')
19 changes: 19 additions & 0 deletions research_bench/nv_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Smoke-test the NVIDIA OpenAI-compatible endpoint with a streamed chat completion."""
import os

from openai import OpenAI

# SECURITY: the original file committed a live `nvapi-...` key to source
# control. That key must be revoked/rotated; credentials are now read from
# the environment instead of being hard-coded.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_API_KEY"],  # raises KeyError early if unset
)

completion = client.chat.completions.create(
    model="nvidia/nv-embed-v1",
    messages=[{"role": "user", "content": "Write a limerick about the wonders of GPU computing."}],
    temperature=0.2,
    top_p=0.7,
    max_tokens=1024,
    stream=True,
)

# Print streamed tokens as they arrive (delta.content is None on some chunks).
for chunk in completion:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")
265 changes: 254 additions & 11 deletions research_bench/proposal_writing.py

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions research_bench/run_eval.py
Original file line number Diff line number Diff line change
@@ -25,7 +25,8 @@ def inference(
profiles = [Profile(**data) for data in author_data.values()]
ref_abstracts = [ref['abstract'] for ref in paper_data.get('references', [])]

gen_proposal = write_proposal(mode, profiles, ref_abstracts, config)
paper_title = paper_data['title']
gen_proposal = write_proposal(mode, profiles, ref_abstracts, config, paper_title)

metrics = compute_proposal_metrics(ref_proposal, gen_proposal)
results = {
@@ -79,8 +80,10 @@ def main() -> None:
'author_only',
'citation_only',
'author_citation',
'textgnn',
'research_town',
'sakana_ai_scientist',
'debug',
'fake_research_town',
],
help='Processing mode',
)
4 changes: 2 additions & 2 deletions research_bench/run_eval.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/bin/bash

# Define the input and output paths, along with the modes to test
INPUT_PATH="./mlbench/mlbench.json"
INPUT_PATH="./mlbench/mlbench_full.json"
OUTPUT_DIR="./results"
MODES=("citation_only")
MODES=("fake_research_town")
NUM_PROCESSES=4

# Loop through each mode and run the evaluation
10 changes: 3 additions & 7 deletions research_town/envs/env_proposal_writing_without_rag.py
Original file line number Diff line number Diff line change
@@ -62,17 +62,13 @@ def run(self) -> Generator[Tuple[Progress, Agent], None, None]:
contexts=self.contexts,
config=self.config,
)

yield insight, researcher
insights.append(insight)

# Step 3: Researchers brainstorm ideas based on their insights
for researcher in researchers:
idea = researcher.brainstorm_idea(insights=insights, config=self.config)
idea = researcher.brainstorm_idea(insights=[insight], config=self.config)
yield idea, researcher
insights.append(insight)
ideas.append(idea)

# Step 4: Leader summarizes ideas and writes proposals
# Step 2: Leader summarizes ideas and writes proposals
idea_combos = sample_ideas(ideas, self.config.param.proposal_num)
for idea_combo in idea_combos:
summarized_idea = self.leader.summarize_idea(
2 changes: 2 additions & 0 deletions research_town/utils/sampler.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,8 @@

def sample_ideas(lst: List[Idea], n: int) -> List[List[Idea]]:
total_subsets = 2 ** len(lst) - (len(lst) + 1)
if len(lst) == 1:
return [lst]
if n > total_subsets:
raise ValueError(f'n cannot be greater than {total_subsets}')

Loading