diff --git a/examples/tests/dlio-ml/README.md b/examples/tests/dlio-ml/README.md new file mode 100644 index 0000000..029ac45 --- /dev/null +++ b/examples/tests/dlio-ml/README.md @@ -0,0 +1,72 @@ +# DLIO ML Example + +This is an example of using the I/O tool [DLIO](https://dlio-profiler.readthedocs.io/en/latest/build.html#build-dlio-profiler-with-pip-recommended), which can +be installed on the fly with pip. + +## Usage + +Create a cluster and install JobSet to it. + +```bash +kind create cluster +VERSION=v0.2.0 +kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/$VERSION/manifests.yaml +``` + +Install the operator (from the development manifest here): + +```bash +$ kubectl apply -f ../../dist/metrics-operator-dev.yaml +``` + +How to see metrics operator logs: + +```bash +$ kubectl logs -n metrics-system metrics-controller-manager-859c66464c-7rpbw +``` + +Then create the metrics set. This is going to run a single run of the DLIO benchmark (the resnet50 workload) over MPI, +with the DLIO profiler recording a trace as the benchmark runs. + +```bash +kubectl apply -f metrics.yaml +``` + +Wait until you see the pod created by the job and then running (there should be one, since `metrics.yaml` requests `pods: 1`): + +```bash +kubectl get pods +``` + +```diff +NAME READY STATUS RESTARTS AGE +metricset-sample-m-0-0-xfg6r 1/1 Running 0 3s +``` + +In the above, "m" is the pod that runs the benchmark (the suffix of your pod name will differ). +If you inspect its log you'll see the dependencies being installed and the dataset being generated, +and then the DLIO benchmark running, with the trace printed to the console at the end. + +```bash +kubectl logs metricset-sample-m-0-0-xfg6r -f +``` + +There is purposefully a `sleep infinity` at the end to give you a chance to copy over data. 
+ +```bash +mkdir -p ./data ./output +# Only if you are interested in the ML data +kubectl cp metricset-sample-m-0-0-xfg6r:/dlio/data ./data/ +kubectl cp metricset-sample-m-0-0-xfg6r:/dlio/output ./output +``` + +You can open the tiny trace file (the `.trace*.pfw` file in the output directory) in [https://ui.perfetto.dev/](https://ui.perfetto.dev/). + +![img/trace.png](img/trace.png) + +Other AI/ML workloads might also be of interest - we will try more soon! +Clean up when you are done. + +```bash +kubectl delete -f metrics.yaml +``` diff --git a/examples/tests/dlio-ml/img/trace.png b/examples/tests/dlio-ml/img/trace.png new file mode 100644 index 0000000..4bed0e3 Binary files /dev/null and b/examples/tests/dlio-ml/img/trace.png differ diff --git a/examples/tests/dlio-ml/metrics.yaml b/examples/tests/dlio-ml/metrics.yaml new file mode 100644 index 0000000..0860b0b --- /dev/null +++ b/examples/tests/dlio-ml/metrics.yaml @@ -0,0 +1,37 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MetricSet +metadata: + labels: + app.kubernetes.io/name: metricset + app.kubernetes.io/instance: metricset-sample + name: metricset-sample +spec: + # kubectl apply -f metrics.yaml + # kubectl logs -f + pods: 1 + + metrics: + - name: io-ior + options: + command: mpirun --allow-run-as-root -np 10 dlio_benchmark workload=resnet50 ++workload.dataset.data_folder=/dlio/data ++workload.output.folder=/dlio/output + workdir: /dlio/data + addons: + - name: commands + options: + preBlock: | + apt-get update && apt-get install -y python3 python3-pip openmpi-bin openmpi-common libopenmpi-dev hwloc libhwloc-dev default-jre + #python3 -m pip install git+https://github.com/hariharan-devarajan/dlio-profiler.git + #python3 -m pip install git+https://github.com/argonne-lcf/dlio_benchmark.git + python3 -m pip install "dlio_benchmark[dlio_profiler] @ git+https://github.com/argonne-lcf/dlio_benchmark.git" + mkdir -p /dlio/data /dlio/output /dlio/logs + export DLIO_PROFILER_ENABLE=0 + mpirun -np 10 --allow-run-as-root dlio_benchmark workload=resnet50 
++workload.dataset.data_folder=/dlio/data ++workload.output.folder=/dlio/output ++workload.workflow.generate_data=True ++workload.workflow.train=False + export DLIO_PROFILER_LOG_LEVEL=ERROR + export DLIO_PROFILER_ENABLE=1 + export DLIO_PROFILER_INC_METADATA=1 + cd /dlio/data + postBlock: | + gzip -d /dlio/output/.trace*.pfw.gz + cat /dlio/output/.trace*.pfw + sleep infinity +