interTwin-eu · jarlsondre · Dec 2, 2024 · Nov 11, 2024 · Nov 11, 2024 · Nov 11, 2024
diff --git a/README.md b/README.md
@@ -128,10 +128,83 @@ git clone [--recurse-submodules] [email protected]:interTwin-eu/itwinai.git
 
 ### Install itwinai environment
 
-You can create the
-Python virtual environments using our predefined Makefile targets.
+In this project, we are using `uv` as a project-wide package manager. Therefore, if
+you are a developer, you should see the [uv tutorial](./uv-tutorial.md) after reading
+the following `pip` tutorial.
 
-#### PyTorch (+ Lightning) virtual environment
+#### Installation using pip
+
+##### Creating a venv
+
+You can install the `itwinai` environment for development using `pip`. First, however,
+you should create a Python virtual environment (venv) if you haven't already. Make sure
+you have Python installed (on HPC you have to load it with `module load Python`), and
+then you can create a venv with the following command:
+
+```bash
+python -m venv <name-of-venv>
+```
+
+For example, if I wanted to create a venv in the directory `.venv` (which is useful if
+you use e.g. `uv`), then I would do:
+
+```bash
+python -m venv .venv
+```
+
+After this you can activate your venv using the following command:
+
+```bash
+source .venv/bin/activate
+```
+
+Now anything you install with `pip` will be installed in your venv, and any `python`
+commands you run will use the interpreter from your venv.
+
+##### Installation of packages
+
+We provide some _extras_ that can be activated depending on which platform you are
+using.
+
+- `macos` or `linux`, depending on which OS you use. This changes the version of `prov4ML` that is installed.
+- `dev` for development purposes. Includes libraries for testing and tensorboard etc.
+- `torch` for installation with PyTorch.
+
+If you want to install PyTorch using CUDA then you also have to add an
+`--extra-index-url` to the CUDA version that you want. Since you are developing the
+library, you also want to enable the editable flag, `-e`, so that you don't have to
+reinstall everything every time you make a change. If you are on HPC, then you will
+usually want to add the `--no-cache-dir` flag to avoid filling up your `~/.cache`
+directory, as you can very easily reach your disk quota otherwise. An example of a
+complete command for installing as a developer on HPC with CUDA thus becomes:
+
+```bash
+pip install -e ".[torch,dev,linux]" \
+    --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu121
+```
+
+If you wanted to install this locally on macOS (i.e. without CUDA) with PyTorch, you
+would do the following instead:
+
+```bash
+pip install -e ".[torch,dev,macos]"
+```
+
+<!-- You can create the Python virtual environments using our predefined Makefile targets. -->
+
+#### Horovod and DeepSpeed
+
+However, the above does not install `Horovod` or `DeepSpeed`, as they require a
+specialized [script](env-files/torch/install-horovod-deepspeed-cuda.sh). If you do not
+require CUDA, then you can install them using `pip` as follows:
+
+```bash
+pip install --no-cache-dir --no-build-isolation git+https://github.com/horovod/horovod.git
+pip install --no-cache-dir --no-build-isolation deepspeed
+```
+
+#### PyTorch (+ Lightning) virtual environment with makefiles
 
 Makefile targets for environment installation:
 

diff --git a/env-files/torch/generic_torch.sh b/env-files/torch/generic_torch.sh
@@ -127,34 +127,15 @@ else
     export HOROVOD_GPU_ALLREDUCE=NCCL
     export HOROVOD_NCCL_LINK=SHARED
     export HOROVOD_NCCL_HOME=$EBROOTNCCL
-
-    # Host language vars
-    export HOROVOD_WITH_PYTORCH=1
-    export HOROVOD_WITHOUT_TENSORFLOW=1
-    export HOROVOD_WITHOUT_MXNET=1
-  else
-    # CPU only installation
-    export HOROVOD_WITH_PYTORCH=1
-    export HOROVOD_WITHOUT_TENSORFLOW=1
-    export HOROVOD_WITHOUT_MXNET=1
   fi
-  pip install --no-cache-dir git+https://github.com/horovod/horovod.git@3a31d93 || exit 1
+  # Host language vars
+  export HOROVOD_WITH_PYTORCH=1
+  export HOROVOD_WITHOUT_TENSORFLOW=1
+  export HOROVOD_WITHOUT_MXNET=1
+
+  pip install --no-cache-dir git+https://github.com/horovod/horovod.git || exit 1
 fi
 
-# get required libraries in reqs.txt
-# if [ -f "${cDir}/$ENV_NAME/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py" ]; then
-   # echo 'required libs already exist'
-# else
-#   pip install -r Scripts/reqs.txt --no-cache-dir
-
-  # fix int bug: modify l.4 of /torchnlp/_third_party/weighted_random_sampler.py
-  # var='int_classes = int'
-  # sed -i .backup_file "4s|.*|$var|" \
-    # ${cDir}/$ENV_NAME/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py || exit 1
-  # Deleting unnecessary backup file
-  # rm ${cDir}/$ENV_NAME/lib/python${pver}/site-packages/torchnlp/_third_party/weighted_random_sampler.py.backup_file
-# fi
-
 # Install Prov4ML
 if [[ "$OSTYPE" =~ ^darwin ]] ; then
   pip install --no-cache-dir "prov4ml[apple,nvidia]@git+https://github.com/matbun/ProvML@new-main" || exit 1

diff --git a/env-files/torch/install-horovod-deepspeed-cuda.sh b/env-files/torch/install-horovod-deepspeed-cuda.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Job configuration
+#SBATCH --job-name=setup_venv
+#SBATCH --account=intertwin
+#SBATCH --output=horovod_ds_installation.out
+#SBATCH --error=horovod_ds_installation.err
+#SBATCH --time=00:30:00
+
+# Resources allocation
+#SBATCH --partition=develbooster
+#SBATCH --nodes=1
+#SBATCH --gres=gpu
+
+ml --force purge
+ml Stages/2024 GCC OpenMPI CUDA/12 cuDNN MPI-settings/CUDA
+ml Python/3.11.3 CMake HDF5 PnetCDF libaio mpi4py git Clang
+
+source .venv/bin/activate
+
+# DeepSpeed variables
+export DS_BUILD_CCL_COMM=1
+export DS_BUILD_UTILS=1
+export DS_BUILD_AIO=1
+export DS_BUILD_FUSED_ADAM=1
+export DS_BUILD_FUSED_LAMB=1
+export DS_BUILD_TRANSFORMER=1
+export DS_BUILD_STOCHASTIC_TRANSFORMER=1
+export DS_BUILD_TRANSFORMER_INFERENCE=1
+
+pip install --no-cache-dir --no-build-isolation "deepspeed==0.15.*"
+
+# Horovod variables
+export LDSHARED="$CC -shared" &&
+export CMAKE_CXX_STANDARD=17 
+
+export HOROVOD_MPI_THREADS_DISABLE=1
+export HOROVOD_CPU_OPERATIONS=MPI
+
+export HOROVOD_GPU_ALLREDUCE=NCCL
+export HOROVOD_NCCL_LINK=SHARED
+export HOROVOD_NCCL_HOME=$EBROOTNCCL
+
+export HOROVOD_WITH_PYTORCH=1
+export HOROVOD_WITHOUT_TENSORFLOW=1
+export HOROVOD_WITHOUT_MXNET=1
+
+pip install --no-cache-dir 'horovod[pytorch] @ git+https://github.com/horovod/horovod'
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,23 +25,23 @@ classifiers = [
 ]
 
 dependencies = [
-    "wandb",
-    "mlflow",
-    "jsonargparse[signatures]",
-    "pyyaml",
-    "omegaconf",
     "rich>=13.5.3",
     "typer>=0.9.0",
-    "wheel",
-    "pydantic",
-    # "prov4ml@git+https://github.com/HPCI-Lab/ProvML@main", # Prov4ML
-    # "prov4ml@git+https://github.com/matbun/ProvML@main",
-    "pandas",
-    "seaborn"
+    "numpy<2.0.0",
+    "wandb>=0.18.7",
+    "mlflow>=2.17.2",
+    "wheel>=0.45.0",
+    "seaborn>=0.13.2",
+    "py-cpuinfo>=9.0.0",
+    "packaging>=24.2",
+    "pydantic>=2.9.2",
+    "pyyaml>=6.0.2",
+    "omegaconf>=2.3.0",
+    "jsonargparse[signatures]>=4.34.0",
+    "matplotlib>=3.9.2",
+    "pip>=24.3.1",
 ]
 
-# dynamic = ["version", "description"]
-
 [project.optional-dependencies]
 torch = [
   "torch==2.4.*",
@@ -51,13 +51,19 @@ torch = [
   "torchaudio>=2.4.1",
 ]
 dev = [
-    "pytest>=7.4.2",
-    "pytest-mock>=3.11.1",
-    "pytest-cov>=4.1.0",
-    "ipykernel",
-    "ipython",
-    "isort",
-    "tensorflow==2.16.*",  # needed by tests on tensorboard
+  "pytest>=7.4.2",
+  "pytest-mock>=3.11.1",
+  "pytest-cov>=4.1.0",
+  "tensorflow==2.16.*", # needed by tests on tensorboard
+  "ipykernel>=6.29.5",
+  "ipython",
+  "isort>=5.13.2",
+]
+macos = [
+    "prov4ml[apple]@git+https://github.com/matbun/ProvML"
+]
+linux = [
+    "prov4ml[linux]@git+https://github.com/matbun/ProvML"
 ]
 docs = [ 
   "sphinx-rtd-theme==2.0.0",
@@ -66,12 +72,6 @@ docs = [
   "IPython",
   "tensorflow==2.16.*",
 ]
-macos = [
-    "prov4ml[apple]@git+https://github.com/matbun/ProvML"
-]
-linux = [
-    "prov4ml[linux]@git+https://github.com/matbun/ProvML"
-]
 
 [project.urls]
 Homepage = "https://www.intertwin.eu/"

diff --git a/src/itwinai/torch/distributed.py b/src/itwinai/torch/distributed.py
@@ -400,7 +400,9 @@ def init(self) -> None:
                 which is already initialized.
         """
         if not distributed_resources_available():
-            raise RuntimeError("Trying to run distributed on insufficient resources.")
+            raise RuntimeError(
+                "Trying to run distributed on insufficient resources."
+            )
         if self.is_initialized:
             raise DistributedStrategyError("Strategy was already initialized")
         dist.init_process_group(backend=self.backend)
@@ -567,7 +569,9 @@ def init(self) -> None:
 
         self.deepspeed = deepspeed
         if not distributed_resources_available():
-            raise RuntimeError("Trying to run distributed on insufficient resources.")
+            raise RuntimeError(
+                "Trying to run distributed on insufficient resources."
+            )
 
         if self.is_initialized:
             raise DistributedStrategyError("Strategy was already initialized")
@@ -736,7 +740,9 @@ def init(self) -> None:
                 already initialized.
         """
         if not distributed_resources_available():
-            raise RuntimeError("Trying to run distributed on insufficient resources.")
+            raise RuntimeError(
+                "Trying to run distributed on insufficient resources."
+            )
         if self.is_initialized:
             raise DistributedStrategyError("Strategy was already initialized")
 
@@ -867,9 +873,7 @@ def gather_obj(self, obj: Any, dst_rank: int = 0) -> Optional[list[Any]]:
             return result
 
     @check_initialized
-    def gather(
-        self, tensor: torch.Tensor, dst_rank: int = 0
-    ) -> Optional[List[torch.Tensor]]:
+    def gather(self, tensor: torch.Tensor, dst_rank: int = 0) -> Optional[List[torch.Tensor]]:
         """Gathers a tensor from the whole group in a list
         (to all workers). Under the hood it relies on allgather as gather is
         not supported by Horovod.

diff --git a/src/itwinai/torch/profiling/profiler.py b/src/itwinai/torch/profiling/profiler.py
@@ -107,7 +107,6 @@ def profiled_method(self: TorchTrainer, *args, **kwargs) -> Any:
         # Extracting and storing the profiling data
         key_averages = profiler.key_averages()
 
-        # strategy.barrier()
         profiling_dataframe = gather_profiling_data(key_averages=key_averages)
         profiling_dataframe["strategy"] = strategy_name
         profiling_dataframe["num_gpus"] = num_gpus_global

diff --git a/tutorials/distributed-ml/torch-scaling-test/README.md b/tutorials/distributed-ml/torch-scaling-test/README.md
@@ -32,9 +32,8 @@ python ddp_trainer.py -c config/base.yaml -c config/ddp.yaml --log-int 42
 ## Run a single training
 
 Training runs are meant to be submitted via SLURM, from a unified job script file:
-`slurm.sh`.
-You can select the distributed training algorithm and provide the command to execute
-setting SLURM environment variables using the `--export` option:
+`slurm.sh`. You can select the distributed training algorithm and provide the command
+to execute by setting SLURM environment variables using the `--export` option:
 
 ```bash
 # Launch a distributed training setup with Torch DDP

diff --git a/use-cases/eurac/config.yaml b/use-cases/eurac/config.yaml
@@ -6,7 +6,7 @@ tmp_stats: /p/scratch/intertwin/datasets/eurac/stats
 
 experiment: "drought use case lstm"
 run_name: "alps_test"
-epochs: 4
+epochs: 5
 random_seed: 1010
 lr: 0.001
 batch_size: 256

diff --git a/use-cases/eurac/requirements.txt b/use-cases/eurac/requirements.txt
@@ -4,3 +4,4 @@ tqdm
 cf_xarray
 requests
 aiohttp
+ray
diff --git a/use-cases/eurac/trainer.py b/use-cases/eurac/trainer.py
@@ -1,7 +1,7 @@
 import os
 from pathlib import Path
 from timeit import default_timer
-from typing import Dict, Literal, Optional, Union, Any, Tuple
+from typing import Any, Dict, Literal, Optional, Tuple, Union
 
 import pandas as pd
 import torch
@@ -25,10 +25,9 @@
     NonDistributedStrategy,
     TorchDDPStrategy,
 )
+from itwinai.torch.profiling.profiler import profile_torch_trainer
 from itwinai.torch.trainer import TorchTrainer
 from itwinai.torch.type import Metric
-from itwinai.torch.profiling.profiler import profile_torch_trainer
-from itwinai.torch.monitoring.monitoring import measure_gpu_utilization
 
 
 class RNNDistributedTrainer(TorchTrainer):
@@ -155,11 +154,11 @@ def train(self):
             num_nodes = int(os.environ.get("SLURM_NNODES", "unk"))
             epoch_time_output_dir = Path("scalability-metrics/epoch-time")
             epoch_time_file_name = f"epochtime_{self.strategy.name}_{num_nodes}N.csv"
-            epoch_time_output_path = epoch_time_output_dir / epoch_time_file_name 
+            epoch_time_output_path = epoch_time_output_dir / epoch_time_file_name
 
             epoch_time_tracker = EpochTimeTracker(
                 strategy_name=self.strategy.name,
-                save_path=epoch_time_output_path, 
+                save_path=epoch_time_output_path,
                 num_nodes=num_nodes
             )
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ tqdm @@
     cf_xarray
     requests
     aiohttp
+    ray