Merge pull request #143 from Dousia/main
add process sync in evaluation metric computation via a temp file in lmms_eval/evaluator.py
Luodian authored Jul 12, 2024
2 parents f628f08 + b9edfcb commit a60e4e0
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions lmms_eval/evaluator.py
@@ -1,3 +1,5 @@
+import os
+import time
 import random
 import itertools
 import json
@@ -428,6 +430,12 @@ def evaluate(
         # Ensure all ranks wait for rank 0 to finish aggregation
         torch.distributed.barrier()
 
+    # Synchronize processes with a temp file in case the evaluation metric requires GPUs
+    # TODO: fix barriers taking up GPU computation
+    os.makedirs(cli_args.output_path, exist_ok=True)
+    if os.path.exists(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt"):
+        os.remove(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt")
+
     if lm.rank == 0:
         ### Get task ordering for correct sample-wide aggregation
         group_to_task = {}
@@ -628,8 +636,12 @@ def print_tasks(task_hierarchy, task_order, task_version, task_group_alias):
         }
         if log_samples:
             results_dict["samples"] = dict(samples)
-
-        return results_dict
-
     else:
-        return None
+        results_dict = None
+
+    with open(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt", 'w') as f:
+        f.write(f"rank {int(os.environ.get('RANK', 0))} eval done")
+    while len([file for file in os.listdir(cli_args.output_path) if file.endswith('metric_eval_done.txt')]) < lm.accelerator.num_processes:
+        time.sleep(1)
+
+    return results_dict
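
For reference, here is a minimal standalone sketch of the file-based barrier the diff adds: each rank clears any stale marker file, writes its own marker once its metric computation finishes, and then polls the shared output directory until all ranks have reported in. The names `output_path`, `rank`, and `world_size` below are illustrative stand-ins rather than lmms_eval identifiers (in evaluator.py they correspond to `cli_args.output_path`, the `RANK` environment variable, and `lm.accelerator.num_processes`).

```python
import os
import time

# Minimal sketch of the temp-file barrier pattern from the commit above.
# output_path, rank, and world_size are illustrative placeholders.
output_path = "./logs"
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))

marker = os.path.join(output_path, f"rank{rank}_metric_eval_done.txt")
os.makedirs(output_path, exist_ok=True)
if os.path.exists(marker):
    os.remove(marker)  # clear a stale marker left over from a previous run

# ... per-rank metric computation (which may itself need the GPU) runs here ...

with open(marker, "w") as f:
    f.write(f"rank {rank} eval done")  # announce that this rank has finished

# Poll the shared output directory until every rank has written its marker,
# instead of holding a torch.distributed.barrier() during metric computation.
while len([name for name in os.listdir(output_path) if name.endswith("metric_eval_done.txt")]) < world_size:
    time.sleep(1)
```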
