geoelements · kks32 · Jul 24, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/.circleci/config.yml b/.circleci/config.yml
diff --git a/.github/workflows/container-gpu.yml b/.github/workflows/container-gpu.yml
@@ -0,0 +1,53 @@
+# Builds the CUDA-enabled image from Dockerfile-GPU and publishes it to the
+# GitHub Container Registry (GHCR) under the fixed `gpu` tag.
+name: Build and Push GPU Image to GHCR
+
+on:
+  # Allow manual rebuilds from the Actions tab.
+  workflow_dispatch:
+  push:
+    # Rebuild only when an input to the image, or this workflow itself, changes.
+    paths:
+      - Dockerfile-GPU
+      - requirements.txt
+      - .github/workflows/container-gpu.yml
+
+env:
+  REGISTRY: ghcr.io
+  # owner/repo of the repository running the workflow; used as the image name.
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Required so GITHUB_TOKEN can push the image to GHCR.
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Only the generated labels are consumed below; the tag is fixed to `gpu`.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./Dockerfile-GPU
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:gpu
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml
@@ -7,13 +7,13 @@ jobs:
   gns:
     runs-on: ubuntu-latest
     container:
-      image: ghcr.io/geoelements/gns:config
+      image: ghcr.io/geoelements/gns:dataloader
 
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
-        
-    - name: Black linter check
+
+    - name: Black linter
       run: |
         black --check .
 
@@ -26,4 +26,4 @@ jobs:
         TMP_DIR="../gns-sample"
         DATASET_NAME="WaterDropSample"
         git clone https://github.com/geoelements/gns-sample ../gns-sample
-        python -m gns.train
+        python -m gns.train mode="train" training.steps=10
diff --git a/Dockerfile-GPU b/Dockerfile-GPU
@@ -0,0 +1,25 @@
+# GPU image for GNS: CUDA 11.8 / cuDNN 8 runtime on Ubuntu 22.04 with the
+# matching cu118 builds of PyTorch 2.3.0 and the PyG extension wheels.
+FROM nvcr.io/nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+# Refresh the package index and install in ONE layer: a separately cached
+# `apt-get update` layer goes stale, so later installs can fail or pull old
+# packages. The index is removed afterwards to keep the layer small.
+RUN apt-get update \
+    && apt-get upgrade -y \
+    && apt-get install -y python3 python3-pip git \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip install --upgrade pip ipython ipykernel
+
+COPY requirements.txt requirements.txt
+# Let every later `pip install` also resolve packages from the cu118 index.
+ENV PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cu118
+RUN pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu118
+RUN pip install torch_geometric
+# Prebuilt extension wheels must match the torch/CUDA pair installed above.
+RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cu118.html
+RUN pip install absl-py autopep8 numpy==1.23.1 dm-tree matplotlib pyevtk pytest tqdm toml
+RUN pip install -r requirements.txt
+
+CMD ["/bin/bash"]
diff --git a/README.md b/README.md
@@ -1,8 +1,8 @@
 # Graph Network Simulator (GNS) and MeshNet
 
 [![DOI](https://zenodo.org/badge/427487727.svg)](https://zenodo.org/badge/latestdoi/427487727)
-[![CircleCI](https://dl.circleci.com/status-badge/img/gh/geoelements/gns/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/geoelements/gns/tree/main)
-[![Docker](https://quay.io/repository/geoelements/gns/status "Docker Repository on Quay")](https://quay.io/repository/geoelements/gns)
+[![GitHub Actions](https://github.com/geoelements/gns/actions/workflows/train.yml/badge.svg)](https://github.com/geoelements/gns/actions/workflows/train.yml)
+[![Docker](https://img.shields.io/badge/container-gpu-limegreen.svg)](https://ghcr.io/geoelements/gns:gpu)
 [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://raw.githubusercontent.com/geoelements/gns/main/license.md)
 
 > Krishna Kumar, The University of Texas at Austin.
@@ -227,63 +227,29 @@ The dataset is shared on [DesignSafe DataDepot](https://doi.org/10.17603/ds2-fzg
 
 GNS uses [pytorch geometric](https://www.pyg.org/) and [CUDA](https://developer.nvidia.com/cuda-downloads). These packages have specific requirements, please see [PyG installation]((https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) for details. 
 
-> CPU-only installation on Linux (Conda)
+> CPU-only installation on Linux/macOS
 
 ```shell
-conda install -y pytorch torchvision torchaudio cpuonly -c pytorch
-conda install -y pyg -c pyg
-conda install -y pytorch-cluster -c pyg
-conda install -y absl-py -c anaconda 
-conda install -y numpy dm-tree matplotlib-base pyevtk -c conda-forge 
+pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+pip3 install torch_geometric
+pip3 install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cpu.html
+pip3 install -r requirements.txt
 ```
-You can use the [WaterDropletSample](https://github.com/geoelements/gns-sample) dataset to check if your `gns` code is working correctly.
 
 To test the code you can run:
 
 ```
 pytest test/
 ```
 
-To test on the small waterdroplet sample:
-
-```
-git clone https://github.com/geoelements/gns-sample
-
-TMP_DIR="./gns-sample"
-DATASET_NAME="WaterDropSample"
+### Build Docker Image
 
-mkdir -p ${TMP_DIR}/${DATASET_NAME}/models/
-mkdir -p ${TMP_DIR}/${DATASET_NAME}/rollout/
+`Dockerfile-GPU` is supplied to build an image with GPU support; a prebuilt image can be pulled from GHCR:
 
-DATA_PATH="${TMP_DIR}/${DATASET_NAME}/dataset/"
-MODEL_PATH="${TMP_DIR}/${DATASET_NAME}/models/"
-ROLLOUT_PATH="${TMP_DIR}/${DATASET_NAME}/rollout/"
-
-python -m gns.train --data_path=${DATA_PATH} --model_path=${MODEL_PATH} --ntraining_steps=10
 ```
-
-### Building GNS environment on TACC (LS6 and Frontera)
-
-- to setup a virtualenv
-
-```shell
-sh ./build_venv.sh
+docker pull ghcr.io/geoelements/gns:gpu
 ```
 
-- check tests run sucessfully.
-- start your environment
-
-```shell
-source start_venv.sh 
-```
-
-### Building GNS on MacOS
-```shell
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-pip3 install torch_geometric
-pip3 install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cpu.html
-pip3 install -r requirements.txt
-```
 
 ## GNS training in parallel
 GNS can be trained in parallel on multiple nodes with multiple GPUs.
@@ -296,17 +262,12 @@ GNS can be trained in parallel on multiple nodes with multiple GPUs.
 > GNS scaling result on [TACC lonestar6 GPU nodes](https://docs.tacc.utexas.edu/hpc/lonestar6/#table2) with A100 GPUs.
 
 ### Usage
-#### Single Node, Multi-GPU
-```shell
-python -m torch.distributed.launch --nnodes=1  --nproc_per_node=[GPU_PER_NODE] --node_rank=[LOCAL_RANK] --master_addr=[MAIN_RANK] gns/train_multinode.py [ARGS] 
-```
 
-#### Multi-node, Multi-GPU
-On each node, run
 ```shell
-python -m torch.distributed.launch --nnodes=[NNODES]  --nproc_per_node=[GPU_PER_NODE] --node_rank=[LOCAL_RANK] --master_addr=[MAIN_RANK ]gns/train_multinode.py [ARGS] 
+mpiexec.hydra -np $NNODES -ppn 1 ../slurm_scripts/launch_helper.sh $DOCKER_IMG_LOCATION $n_gpu_per_node
 ```
 
+
 ### Inspiration
 PyTorch version of Graph Network Simulator and Mesh Graph Network Simulator are based on:
 * [https://arxiv.org/abs/2002.09405](https://arxiv.org/abs/2002.09405) and [https://github.com/deepmind/deepmind-research/tree/master/learning_to_simulate](https://github.com/deepmind/deepmind-research/tree/master/learning_to_simulate)

diff --git a/config.yaml b/config.yaml
@@ -45,7 +45,6 @@ training:
 # Hardware configuration
 hardware:
   cuda_device_number: null
-  n_gpus: 1
 
 # Logging configuration
 logging: