-
Notifications
You must be signed in to change notification settings - Fork 254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[BE] add integration test for the generation script #741
Changes from 2 commits
5ccfaf6
90089e5
7c2cd54
0174783
25227cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -368,6 +368,19 @@ def build_test_list(): | |
"fsdp+tp+cp", | ||
ngpu=8, | ||
), | ||
OverrideDefinitions( | ||
[ | ||
[ | ||
"--checkpoint.enable_checkpoint", | ||
"--experimental.pipeline_parallel_degree 2", | ||
"--training.enable_cpu_offload True", | ||
"--optimizer.early_step_in_backward", | ||
], | ||
], | ||
"Enable CPU Offload with PP", | ||
"enable_cpu_offload+PP", | ||
ngpu=4, | ||
), | ||
OverrideDefinitions( | ||
[ | ||
[ | ||
|
@@ -382,14 +395,14 @@ def build_test_list(): | |
[ | ||
[ | ||
"--checkpoint.enable_checkpoint", | ||
"--experimental.pipeline_parallel_degree 2", | ||
"--training.enable_cpu_offload True", | ||
"--optimizer.early_step_in_backward", | ||
], | ||
[ | ||
# placeholder for the generation script's generate step | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this part WIP? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, I wrote it this way because for generation, the first step is to call the training script (`run_llama_train.sh`) to save a checkpoint, which the generation step then loads. To make it less hacky, I think we need some surgery to the file & classes. |
||
], | ||
], | ||
"Enable CPU Offload with PP", | ||
"enable_cpu_offload+PP", | ||
ngpu=4, | ||
"Generation script test", | ||
"test_generate", | ||
ngpu=2, | ||
), | ||
] | ||
return integration_tests_flavors | ||
|
@@ -412,7 +425,7 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): | |
model_flavor_arg = f"--model.flavor {test_flavor.model_flavor}" | ||
all_ranks = ",".join(map(str, range(test_flavor.ngpu))) | ||
|
||
for override_arg in test_flavor.override_args: | ||
for idx, override_arg in enumerate(test_flavor.override_args): | ||
cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh" | ||
# dump compile trace for debugging purpose | ||
cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd | ||
|
@@ -428,6 +441,16 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str): | |
logger.info( | ||
f"=====Integration test, flavor : {test_flavor.test_descr}, command : {cmd}=====" | ||
) | ||
|
||
# save checkpoint (idx == 0) and load it for generation (idx == 1) | ||
if test_name == "test_generate" and idx == 1: | ||
cmd = ( | ||
f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} " | ||
f"CHECKPOINT_DIR={output_dir}/{test_name}/checkpoint/step-10 " | ||
"PROMPT='What is the meaning of life?' " | ||
f"./scripts/generate/run_llama_generate.sh --out > {output_dir}/{test_name}/generated_output.json" | ||
) | ||
|
||
result = _run_cmd(cmd) | ||
logger.info(result.stdout) | ||
if result.returncode != 0: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is the `set_determinism()` call removed from the test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This call of `set_determinism()` was added before @wconstab's PR to make RNG right, which broke this line. Given how `seed` is explicitly used here, I'm not sure what the code over there would indicate (would it still be correct?). Need @XilunWu's help on understanding more.
This PR tries to restore the behavior before the "BC-breaking change" and guard on its running.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is the `set_determinism` change for correct RNG affecting this code? I would hope that calling `set_determinism(seed, deterministic=True)` would be equivalent to some manual stuff done here. What is the issue?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Discussed offline; refactored to use `set_determinism`.