Create synthetic MMLU via GPT-4 #367

Open
wants to merge 55 commits into base: main
Changes from 16 commits
Commits
55 commits
4254fc5  debug (nouhadziri, Sep 23, 2024)
5553564  mmlu (nouhadziri, Sep 23, 2024)
80d9fb8  remove debug (nouhadziri, Sep 23, 2024)
b441630  remove debug (nouhadziri, Sep 23, 2024)
0709bbf  debug mmlu (nouhadziri, Sep 23, 2024)
b04a3b0  debug mmlu (nouhadziri, Sep 23, 2024)
c3c9687  debug mmlu (nouhadziri, Sep 23, 2024)
2de0131  debug mmlu (nouhadziri, Sep 23, 2024)
b87f8ef  debug mmlu (nouhadziri, Sep 23, 2024)
f543d3c  debug mmlu (nouhadziri, Sep 23, 2024)
2baac78  debug mmlu (nouhadziri, Sep 23, 2024)
fabcd1c  gpt-4 generation mmlu (nouhadziri, Sep 23, 2024)
1acb070  gpt-4 generation mmlu (nouhadziri, Sep 23, 2024)
5356975  gpt-4 generation mmlu (nouhadziri, Sep 23, 2024)
2634a49  gpt-4 generation mmlu (nouhadziri, Sep 23, 2024)
2dae754  fix style (nouhadziri, Sep 24, 2024)
da1974f  generate mmlu (nouhadziri, Oct 9, 2024)
94995fe  generate mmlu (nouhadziri, Oct 9, 2024)
39f4144  generate mmlu (nouhadziri, Oct 9, 2024)
5d1b8ef  generate mmlu (nouhadziri, Oct 9, 2024)
a2a841a  generate mmlu (nouhadziri, Oct 9, 2024)
8246ae2  generate mmlu (nouhadziri, Oct 9, 2024)
c80f803  generate mmlu (nouhadziri, Oct 9, 2024)
f4fd23e  generate mmlu (nouhadziri, Oct 9, 2024)
bcbaf64  generate mmlu (nouhadziri, Oct 9, 2024)
cdacdbf  generate mmlu (nouhadziri, Oct 9, 2024)
7cc8212  debug (nouhadziri, Oct 9, 2024)
16a7bee  debug (nouhadziri, Oct 9, 2024)
e14d27c  debug (nouhadziri, Oct 9, 2024)
980971c  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
6496930  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
dde2bab  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
651d5ee  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
e555729  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
d84af1c  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
9cb6f04  avoid reusing the few shot examples (nouhadziri, Oct 9, 2024)
00dbba9  change the number of completions (nouhadziri, Oct 9, 2024)
3fe7fa6  change the number of completions (nouhadziri, Oct 9, 2024)
f0360a6  change the number of completions (nouhadziri, Oct 9, 2024)
1ad21bb  change the number of completions (nouhadziri, Oct 9, 2024)
7d08886  change the number of completions (nouhadziri, Oct 9, 2024)
b8eca65  change the number of completions (nouhadziri, Oct 9, 2024)
05bc55e  change the number of completions (nouhadziri, Oct 9, 2024)
104f96a  change the number of completions (nouhadziri, Oct 9, 2024)
9f1ebb7  change the number of completions (nouhadziri, Oct 9, 2024)
2f406a5  change the number of completions (nouhadziri, Oct 9, 2024)
9f38f97  change the number of completions (nouhadziri, Oct 9, 2024)
09a49e3  change the number of completions (nouhadziri, Oct 9, 2024)
eb94e4f  remove filtering (nouhadziri, Oct 9, 2024)
af3882a  remove filtering (nouhadziri, Oct 9, 2024)
117a43c  update the code to check similarity by subject (nouhadziri, Oct 9, 2024)
fcd51df  update the code to check similarity by subject (nouhadziri, Oct 9, 2024)
2991c83  update the code to check similarity by subject (nouhadziri, Oct 9, 2024)
bfef4f2  update the code to check similarity by subject (nouhadziri, Oct 9, 2024)
644f942  update the code to check similarity by subject (nouhadziri, Oct 9, 2024)
183 changes: 183 additions & 0 deletions open_instruct/mmlu_synthetic/generate_mmlu.py
@@ -0,0 +1,183 @@
import asyncio
import json
import re
from typing import List, Dict, Optional
from collections import defaultdict
from dataclasses import dataclass
from datasets import load_dataset, Dataset
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm


@dataclass
class LLMGenerationConfig:
    num_completions: int = 100
    # model: str = "gpt-3.5-turbo-0125"
    model: str = "gpt-4"
    max_parallel_requests: int = 50  # Adjust based on your API rate limits


@dataclass
class GenerationArgs:
    temperature: float = 0.7
    max_tokens: int = 500
    top_p: float = 0.95
    examples_per_subject: int = 5
    few_shot_examples: int = 3


class LLMProcessor:
    def __init__(self, config: LLMGenerationConfig):
        self.config = config
        self.async_client = AsyncOpenAI()

    def format_example(self, sample: dict) -> str:
        return f"""
Question: {sample['question']}
A: {sample['choices'][0]}
B: {sample['choices'][1]}
C: {sample['choices'][2]}
D: {sample['choices'][3]}
Correct answer: {sample['choices'][sample['answer']]}
"""

    async def generate_synthetic_question(self, subject: str, examples: List[dict], gen_args: GenerationArgs):
        examples_str = "\n".join([self.format_example(example) for example in examples[: gen_args.few_shot_examples]])

        prompt = f"""
Generate a new multiple-choice question similar to the following MMLU (Massive Multitask Language Understanding) questions on the subject of {subject}:

{examples_str}

Create a new question on {subject} with four options and indicate the correct answer. Format your response as follows:
Question: [Your new question]
A: [Option A]
B: [Option B]
C: [Option C]
D: [Option D]
Correct answer: [Letter of correct option]
"""

        try:
            response = await self.async_client.chat.completions.create(
                model=self.config.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are an AI assistant tasked with generating synthetic data for the MMLU dataset.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=gen_args.temperature,
                max_tokens=gen_args.max_tokens,
                top_p=gen_args.top_p,
                n=self.config.num_completions,
            )
            # Note: `n=num_completions` requests multiple completions from the API, but only the first one is returned here.
            return response.choices[0].message.content
        except Exception as e:
            print(f"Error generating question for {subject}: {e}")
            return None

    async def process_subject(self, subject: str, samples: List[dict], gen_args: GenerationArgs):
        semaphore = asyncio.Semaphore(self.config.max_parallel_requests)

        async def process_with_semaphore():
            async with semaphore:
                return await self.generate_synthetic_question(subject, samples, gen_args)

        tasks = [process_with_semaphore() for _ in range(gen_args.examples_per_subject)]
        return await tqdm.gather(*tasks, desc=f"Generating {subject}")


def get_mmlu_samples_by_subject(num_samples_per_subject: int = 10) -> Dict[str, List[dict]]:
    dataset = load_dataset("cais/mmlu", "all")
    samples_by_subject = defaultdict(list)

    for sample in dataset["test"]:
        if len(samples_by_subject[sample["subject"]]) < num_samples_per_subject:
            samples_by_subject[sample["subject"]].append(sample)

    return dict(samples_by_subject)
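

# Illustrative shape of a single record collected above from the cais/mmlu test split,
# as consumed by format_example (the values below are made up for illustration only):
#   {
#       "question": "What is the output of 2 ** 3 in Python?",
#       "subject": "high_school_computer_science",
#       "choices": ["6", "8", "9", "16"],
#       "answer": 1,
#   }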


def parse_generated_question(text: str, subject: str) -> Optional[dict]:
    # Define regex patterns
    question_pattern = re.compile(r"Question:\s*(.+)")
    choice_pattern = re.compile(r"([A-D]):\s*(.+)")
    answer_pattern = re.compile(r"Correct answer:\s*([A-D])")

    # Extract question
    question_match = question_pattern.search(text)
    if not question_match:
        return None
    question = question_match.group(1).strip()

    # Extract choices
    choices = {}
    for match in choice_pattern.finditer(text):
        choices[match.group(1)] = match.group(2).strip()

    # Check if we have exactly 4 choices
    if len(choices) != 4 or set(choices.keys()) != set("ABCD"):
        return None

    # Extract answer
    answer_match = answer_pattern.search(text)
    if not answer_match or answer_match.group(1) not in "ABCD":
        return None
    answer = ord(answer_match.group(1)) - ord("A")

    return {
        "question": question,
        "subject": subject,
        "choices": [choices[letter] for letter in "ABCD"],
        "answer": answer,
    }
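
# Example (illustrative): given a completion such as
#   Question: What is the capital of France?
#   A: Berlin
#   B: Madrid
#   C: Paris
#   D: Rome
#   Correct answer: C
# parse_generated_question returns
#   {"question": "What is the capital of France?", "subject": subject,
#    "choices": ["Berlin", "Madrid", "Paris", "Rome"], "answer": 2}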


def upload_to_huggingface(data: List[dict], dataset_name: str):
    dataset = Dataset.from_dict(
        {
            "question": [item["question"] for item in data],
            "subject": [item["subject"] for item in data],
            "choices": [item["choices"] for item in data],
            "answer": [item["answer"] for item in data],
        }
    )

    # Note: the hardcoded name below overrides the dataset_name argument passed in by main().
    dataset_name = "ai2-adapt-dev/synth-mmlu-mini-sample-new"
    dataset.push_to_hub(dataset_name)
    print(f"Dataset uploaded to Hugging Face: https://huggingface.co/datasets/{dataset_name}")


async def main():
    config = LLMGenerationConfig()
    processor = LLMProcessor(config)
    gen_args = GenerationArgs()

    samples_by_subject = get_mmlu_samples_by_subject(gen_args.few_shot_examples + gen_args.examples_per_subject)

    all_synthetic_data = []
    for subject, samples in samples_by_subject.items():
        raw_synthetic_data = await processor.process_subject(subject, samples, gen_args)
        synthetic_data = [
            parsed_question
            for data in raw_synthetic_data
            if data is not None
            if (parsed_question := parse_generated_question(data, subject)) is not None
        ]
        all_synthetic_data.extend(synthetic_data)

    with open(f"synthetic_mmlu_data_{config.model}.json", "w") as f:
        json.dump(all_synthetic_data, f, indent=2)

    print(
        f"Generated {len(all_synthetic_data)} valid synthetic MMLU questions across {len(samples_by_subject)} subjects, saved to 'synthetic_mmlu_data_{config.model}.json'"
    )

    # Upload to Hugging Face
    upload_to_huggingface(all_synthetic_data, "ai2-adapt-dev/synthetic-mmlu-dataset")


if __name__ == "__main__":
    asyncio.run(main())
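
For reference, a minimal sketch of how the generated data could be consumed after running the script (assuming OPENAI_API_KEY is set in the environment when the script executes, and that the hardcoded repo name in upload_to_huggingface is the intended destination):

from datasets import load_dataset

# Load the dataset pushed by upload_to_huggingface (repo name taken from the script above).
synth = load_dataset("ai2-adapt-dev/synth-mmlu-mini-sample-new", split="train")
print(synth[0]["question"], synth[0]["choices"], synth[0]["answer"])

The same records are also written locally to synthetic_mmlu_data_gpt-4.json by main(), so the Hugging Face upload is optional for inspection.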