Skip to content

Commit

Permalink
upgrade pydftracer package (#242)
Browse files Browse the repository at this point in the history
* upgrade pydftracer package

* upgrade to dftracer 1.0.8

* attempt to fix the broken `test-subset` testcase

* add more timeout to test_subset

* add more timeout

* add cleanup

* fix the core dump issue where dftrace initialized/finalized twice

* fix test_computation_time
  • Loading branch information
rayandrew authored Feb 19, 2025
1 parent a034690 commit 7055ff0
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 6 deletions.
22 changes: 20 additions & 2 deletions dlio_benchmark/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@
# To make sure the output folder is the same in all the nodes. We have to do this.
import hydra

dftracer_initialize = True
dftracer_finalize = True
dtracer = None

class DLIOBenchmark(object):
"""
The Benchmark represents the I/O behavior of deep learning applications.
Expand All @@ -66,6 +70,8 @@ def __init__(self, cfg):
<li> local variables </li>
</ul>
"""
global dftracer, dftracer_initialize, dftracer_finalize

t0 = time()
self.args = ConfigArguments.get_instance()
LoadConfig(self.args, cfg)
Expand All @@ -91,7 +97,8 @@ def __init__(self, cfg):
self.comm.barrier()
# Configure the logging library
self.args.configure_dlio_logging(is_child=False)
self.dftracer = self.args.configure_dftracer(is_child=False, use_pid=False)
if dftracer_initialize:
dftracer = self.args.configure_dftracer(is_child=False, use_pid=False)
with Profile(name=f"{self.__init__.__qualname__}", cat=MODULE_DLIO_BENCHMARK):
if self.args.my_rank == 0:
logging.info(f"{utcnow()} Running DLIO with {self.args.comm_size} process(es)")
Expand Down Expand Up @@ -343,6 +350,9 @@ def finalize(self):
"""
It finalizes the dataset once training is completed.
"""

global dftracer, dftracer_initialize, dftracer_finalize

self.comm.barrier()
self.checkpointing_mechanism.finalize()
if not self.generate_only:
Expand All @@ -364,7 +374,8 @@ def finalize(self):
self.stats.finalize()
self.stats.save_data()
self.comm.barrier()
self.args.finalize_dftracer(self.dftracer)
if dftracer_finalize and dftracer:
self.args.finalize_dftracer(dftracer)

@hydra.main(version_base=None, config_path="configs", config_name="config")
def run_benchmark(cfg: DictConfig):
Expand All @@ -374,6 +385,13 @@ def run_benchmark(cfg: DictConfig):
benchmark.run()
benchmark.finalize()

def set_dftracer_initialize(status):
global dftracer, dftracer_initialize, dftracer_finalize
dftracer_initialize = status

def set_dftracer_finalize(status):
global dftracer, dftracer_initialize, dftracer_finalize
dftracer_finalize = status

def main() -> None:
"""
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ nvidia-dali-cuda110>=1.34.0
omegaconf~=2.2.0
pandas~=1.5.1
psutil~=5.9.8
pydftracer==1.0.2
pydftracer==1.0.8
pytest
tensorflow>=2.11.0
torch>=2.2.0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"omegaconf>=2.2.0",
"pandas>=1.5.1",
"psutil>=5.9.8",
"pydftracer==1.0.2",
"pydftracer==1.0.8",
]
x86_deps = [
f"hydra-core>={HYDRA_VERSION}",
Expand Down
3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# HACK: to fix the reinitialization problem
def pytest_configure(config):
config.is_dftracer_initialized = False
13 changes: 11 additions & 2 deletions tests/dlio_benchmark_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
# logging's max timestamp resolution is msecs, we will pass in usecs in the message
)

from dlio_benchmark.main import DLIOBenchmark
from dlio_benchmark.main import DLIOBenchmark, set_dftracer_initialize, set_dftracer_finalize
import glob

def init():
Expand Down Expand Up @@ -127,9 +127,11 @@ def test_subset() -> None:
logging.info(f" DLIO training test for subset")
logging.info("=" * 80)
with initialize_config_dir(version_base=None, config_dir=config_dir):
set_dftracer_finalize(False)
cfg = compose(config_name='config', overrides=['++workload.workflow.train=False', \
'++workload.workflow.generate_data=True'])
benchmark=run_benchmark(cfg, verify=False)
set_dftracer_initialize(False)
cfg = compose(config_name='config', overrides=['++workload.workflow.train=True', \
'++workload.workflow.generate_data=False', \
'++workload.dataset.num_files_train=8', \
Expand Down Expand Up @@ -506,7 +508,7 @@ def test_custom_storage_root_train(fmt, framework) -> None:

@pytest.mark.timeout(60, method="thread")
@pytest.mark.parametrize("dist", list(compute_time_distributions.keys()))
def test_computation_time_distribution(dist) -> None:
def test_computation_time_distribution(request, dist) -> None:
init()
clean()
compute_time_overrides = []
Expand All @@ -523,11 +525,18 @@ def test_computation_time_distribution(dist) -> None:
logging.info(f" DLIO test for computation time distribution")
logging.info("=" * 80)
with initialize_config_dir(version_base=None, config_dir=config_dir):
if request.config.is_dftracer_initialized:
set_dftracer_initialize(False)
else:
set_dftracer_finalize(False)

cfg = compose(config_name='config',
overrides=['++workload.workflow.train=True', \
'++workload.workflow.generate_data=True', \
'++workload.train.epochs=4'] + compute_time_overrides)
benchmark = run_benchmark(cfg)
if not request.config.is_dftracer_initialized:
request.config.is_dftracer_initialized = True
clean()
finalize()

Expand Down

0 comments on commit 7055ff0

Please sign in to comment.