Update generate_data API calls to pass num_instructions_to_generate to flows

npalaska · npalaska · commit 1ac689a9a5c0 · 2024-07-02T17:47:16.000-04:00
diff --git a/scripts/test_freeform_skills.py b/scripts/test_freeform_skills.py
@@ -49,7 +49,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model).get_flow()
+skills_flow = SynthSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_grounded_skills.py b/scripts/test_grounded_skills.py
@@ -97,7 +97,7 @@
 
 ds = Dataset.from_list(samples)
 
-skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model).get_flow()
+skills_flow = SynthGroundedSkillsFlow(client, "mixtral", teacher_model, 30).get_flow()
 skills_pipe = Pipeline(skills_flow)
 
 sdg = SDG([skills_pipe])
diff --git a/scripts/test_knowledge.py b/scripts/test_knowledge.py
@@ -38,8 +38,8 @@
 
 ds = Dataset.from_list(samples)
 
-mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model).get_flow()
-knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model).get_flow()
+mmlu_flow = MMLUBenchFlow(client, "mixtral", teacher_model, 30).get_flow()
+knowledge_flow = SynthKnowledgeFlow(client, "mixtral", teacher_model, 30).get_flow()
 knowledge_pipe = Pipeline(knowledge_flow)
 mmlu_pipe = Pipeline(mmlu_flow)
 
diff --git a/src/instructlab/sdg/default_flows.py b/src/instructlab/sdg/default_flows.py
@@ -29,10 +29,11 @@ def _get_model_prompt(model_family):
 
 
 class Flow(ABC):
-    def __init__(self, client, model_family, model_id, batched=True) -> None:
+    def __init__(self, client, model_family, model_id, num_instructions_to_generate, batched=True) -> None:
         self.client = client
         self.model_family = model_family
         self.model_id = model_id
+        self.num_instructions_to_generate = num_instructions_to_generate
         self.batched = batched
 
     @abstractmethod
@@ -60,7 +61,7 @@ def get_flow(self) -> list:
                 "gen_kwargs": {
                     "max_tokens": 2048,
                     "temperature": 0.7,
-                    "n": 1
+                    "n": self.num_instructions_to_generate
                 },
                 "drop_duplicates": ["output"],
             }
@@ -280,7 +281,7 @@ def get_flow(self) -> list:
                     "output_cols": ["question"],
                     "batch_kwargs": {
                         "num_procs": 8,
-                        "num_samples": 30,
+                        "num_samples": self.num_instructions_to_generate,
                         "batched": self.batched,
                     },
                 },
@@ -375,16 +376,16 @@ def get_flow(self) -> list:
                     "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["context"],
                     "batch_kwargs": {
-                        "num_samples": 30,
                         "num_procs": 8,
                         "batched": self.batched,
                     }
                 },
                 "gen_kwargs": {
                     "temperature": 0.7,
                     "max_tokens": 2048,
-                    "n": 10
+                    "n": self.num_instructions_to_generate
                 },
+                "drop_duplicates": ["context"],
             },
             {
                 "block_type": LLMBlock,
@@ -396,6 +397,7 @@ def get_flow(self) -> list:
                     "model_prompt": _get_model_prompt(self.model_family),
                     "output_cols": ["question"],
                     "batch_kwargs": {
+                        "num_samples": 3,
                         "num_procs": 8,
                         "batched": self.batched,
                     },
@@ -414,7 +416,6 @@ def get_flow(self) -> list:
                     "batch_kwargs": {
                         "num_procs": 8,
                         "batched": self.batched,
-                        "num_samples": 10,
                     },
                 },
             },
diff --git a/src/instructlab/sdg/generate_data.py b/src/instructlab/sdg/generate_data.py
@@ -124,7 +124,7 @@ def _gen_test_data(
             outfile.write("\n")
 
 
-def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
+def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched):
     knowledge_flow_types = []
     freeform_skill_flow_types = []
     grounded_skill_flow_types = []
@@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in knowledge_flow_types
@@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in freeform_skill_flow_types
@@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):
         [
             Pipeline(
                 flow_type(
-                    client, model_family, model_name, num_iters, batched
+                    client, model_family, model_name, num_instructions_to_generate, batched
                 ).get_flow()
             )
             for flow_type in grounded_skill_flow_types

Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,7 @@ def _gen_test_data(`
`124`	`124`	`outfile.write("\n")`
`125`	`125`
`126`	`126`
`127`		`-def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):`
	`127`	`+def _sdg_init(pipeline, client, model_family, model_name, num_instructions_to_generate, batched):`
`128`	`128`	`knowledge_flow_types = []`
`129`	`129`	`freeform_skill_flow_types = []`
`130`	`130`	`grounded_skill_flow_types = []`
`@@ -144,7 +144,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):`
`144`	`144`	`[`
`145`	`145`	`Pipeline(`
`146`	`146`	`flow_type(`
`147`		`- client, model_family, model_name, num_iters, batched`
	`147`	`+ client, model_family, model_name, num_instructions_to_generate, batched`
`148`	`148`	`).get_flow()`
`149`	`149`	`)`
`150`	`150`	`for flow_type in knowledge_flow_types`
`@@ -154,7 +154,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):`
`154`	`154`	`[`
`155`	`155`	`Pipeline(`
`156`	`156`	`flow_type(`
`157`		`- client, model_family, model_name, num_iters, batched`
	`157`	`+ client, model_family, model_name, num_instructions_to_generate, batched`
`158`	`158`	`).get_flow()`
`159`	`159`	`)`
`160`	`160`	`for flow_type in freeform_skill_flow_types`
`@@ -164,7 +164,7 @@ def _sdg_init(pipeline, client, model_family, model_name, num_iters, batched):`
`164`	`164`	`[`
`165`	`165`	`Pipeline(`
`166`	`166`	`flow_type(`
`167`		`- client, model_family, model_name, num_iters, batched`
	`167`	`+ client, model_family, model_name, num_instructions_to_generate, batched`
`168`	`168`	`).get_flow()`
`169`	`169`	`)`
`170`	`170`	`for flow_type in grounded_skill_flow_types`