Performance Improvements to Main (#131)
* config updates and add timing

* remove unneeded resorting for in memory training

* simplify use of dst_sort

* fix pipeline lock and isolate performMap to forward pass

* bulk commit of pipeline, sampling, and loading performance improvements to main branch

* GAT support for both incoming and outgoing nbrs

* remove timing code

* fix argsort function arguments for compatibility with latest pytorch versions

* update to latest tox version

* apply autoformatter

* fix linting errors

* add dummy at::cuda classes to fix cpu-only builds

* update python bindings

* autoformat

* update docs

* fix merge

* cleanup merge

* remove pin memory call for CPU only build

* autoformat

* hide pinned memory calls in cpu code paths + autoformatter update

* fix transfer_tensor

* update pybind11, cmake fix, and for loop init in neighbor.cpp

* only use -undefined dynamic_lookup for MacOS

* only use -undefined dynamic_lookup for MacOS

* fix transfer tensor for cpu tensors

* autoformat

* restructure cmake and fix dataloader init when using python API

* autoformatter

* revert to closest commit (9b5084f) with approximately working linux build

* Fix linux build

* add back all non cmake/pybind updates undone during revert of commit 370fe51

* fix rare edge case handling in neighborhood sampler (from artifact commit 64a0cea)

* update marius/torch import order in docs/examples and fix perform map called twice bug

* update marius/torch import order in src/python/tools/

* Skip flaky tests which periodically hang; update marius/torch import order for tests

* attempt to fix test seg fault

* autoformat

* autoformat v2

* Update readme install instructions

* readme typo

---------

Co-authored-by: Roger Waleffe <[email protected]>
Co-authored-by: Jason Mohoney <[email protected]>
Co-authored-by: Jason <[email protected]>
Co-authored-by: Roger Waleffe <[email protected]>
Co-authored-by: waleffe <[email protected]>
6 people authored Jul 20, 2023
1 parent 0a02b04 commit e10cade
Showing 73 changed files with 708 additions and 415 deletions.
2 changes: 1 addition & 1 deletion .clang-format
@@ -105,7 +105,7 @@ IndentAccessModifiers: false
IndentCaseLabels: true
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentPPDirectives: BeforeHash
IndentExternBlock: AfterExternBlock
IndentRequires: false
IndentWidth: 4
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
@@ -52,5 +52,5 @@ jobs:
- name: Run Tests
working-directory: ${{github.workspace}}
shell: bash
run: MARIUS_TEST_HOME=test/ python3 -m pytest test/python --verbose
run: OMP_NUM_THREADS=1 MARIUS_TEST_HOME=test/ python3 -m pytest test/python --verbose
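The same single-threaded invocation can also be run locally; a minimal sketch, assuming the repository root as the working directory and that the Python test dependencies are installed:

```
# mirror the CI setting above: run the Python test suite with a single OpenMP thread
OMP_NUM_THREADS=1 MARIUS_TEST_HOME=test/ python3 -m pytest test/python --verbose
```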

2 changes: 2 additions & 0 deletions .gitignore
@@ -11,6 +11,8 @@ CTestTestfile.cmake
cmake-*/
logs/
data/
!src/cpp/src/data
!src/cpp/include/data
test/test_data/generated/
*.dylib

2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -142,7 +142,7 @@ add_library(${PROJECT_NAME}
${project_CUDA_THIRD_PARTY_SOURCES})

if(NOT APPLE)
target_link_libraries(${PROJECT_NAME} PUBLIC ${PYTHON_LIBRARIES})
target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES})
else()
set_target_properties(${PROJECT_NAME} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
endif()
57 changes: 33 additions & 24 deletions README.md
@@ -9,35 +9,43 @@ Marius ([OSDI '21 Paper](https://www.usenix.org/conference/osdi21/presentation/m
- Pipelined training and IO
- Partition caching and a buffer-aware data ordering to minimize IO for disk-based training (called BETA)

MariusGNN ([arxiv](https://arxiv.org/abs/2202.02365), to appear in EuroSys '23)
MariusGNN ([EuroSys '23 Paper](https://dl.acm.org/doi/abs/10.1145/3552326.3567501))
utilizes the data movement optimizations from Marius and adds support for scalable graph neural network training through:
- An optimized data structure for neighbor sampling and GNN aggregation (called DENSE)
- An improved data ordering for disk-based training (called COMET) which minimizes IO and maximizes model accuracy (note that COMET subsumes BETA)
- An improved data ordering for disk-based training (called COMET) which minimizes IO and maximizes model accuracy (with COMET now subsuming BETA)

## Build and Install ##

### Requirements ###

* CUDA >= 10.1
* CuDNN >= 7
* pytorch >= 1.8
* python >= 3.7
* PyTorch >= 1.8
* Python >= 3.7
* GCC >= 7 (On Linux) or Clang >= 11.0 (On MacOS)
* cmake >= 3.12
* make >= 3.8
* CMake >= 3.12
* Make >= 3.8

### Docker Installation ###
We recommend using Docker for build and installation.
We provide a Dockerfile which installs all the necessary
requirements and provide end-to-end instructions in `examples/docker/`.


### Pip Installation ###
With the required dependencies installed, Marius and MariusGNN can be built using Pip:

```
git clone https://github.com/marius-team/marius.git
cd marius
pip3 install .
```

### Installation Result ###

After installation, the Python API can be accessed with ``import marius``.

The Python API can be accessed with ``import marius``

The following command line tools will be installed:
The following command line tools will also be installed:
- marius_train: Train models using configuration files and the command line
- marius_eval: Command line model evaluation
- marius_preprocess: Built-in dataset downloading and preprocessing
@@ -52,7 +60,7 @@ an exact experiment artifact for each paper in separate branches).

### Quick Start: ###

First make sure Marius is installed with `pip3 install .`
First make sure Marius is installed.

Preprocess the FB15K_237 dataset with `marius_preprocess --dataset fb15k_237 --output_dir datasets/fb15k_237_example/`

@@ -68,33 +76,34 @@ See the [full example](http://marius-project.org/marius/examples/config/lp_fb15k

The Python API is currently experimental and can be used to perform in-memory training and evaluation of graph learning models.

See the [documentation](http://marius-project.org/marius/examples/python/index.html#) for Python API usage and examples.
See the [documentation](http://marius-project.org/marius/examples/python/index.html#) and `examples/python/` for Python API usage and examples.


## Citing Marius ##
## Citing Marius or MariusGNN ##
Marius (out-of-core graph embeddings)
```
@inproceedings {273733,
@inproceedings{Marius,
author = {Jason Mohoney and Roger Waleffe and Henry Xu and Theodoros Rekatsinas and Shivaram Venkataraman},
title = {Marius: Learning Massive Graph Embeddings on a Single Machine},
booktitle = {15th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 21)},
year = {2021},
isbn = {978-1-939133-22-9},
isbn = {9781939133229},
pages = {533--549},
url = {https://www.usenix.org/conference/osdi21/presentation/mohoney},
publisher = {{USENIX} Association},
month = jul,
publisher = {{USENIX} Association}
}
```

MariusGNN (out-of-core GNN training)
```
@misc{waleffe2022marius,
doi = {10.48550/ARXIV.2202.02365},
url = {https://arxiv.org/abs/2202.02365},
author = {Waleffe, Roger and Mohoney, Jason and Rekatsinas, Theodoros and Venkataraman, Shivaram},
keywords = {Machine Learning (cs.LG), Databases (cs.DB), FOS: Computer and information sciences, FOS: Computer and information sciences},
title = {MariusGNN: Resource-Efficient Out-of-Core Training of Graph Neural Networks},
publisher = {arXiv},
year = {2022},
@inproceedings{MariusGNN,
author = {Roger Waleffe and Jason Mohoney and Theodoros Rekatsinas and Shivaram Venkataraman},
title = {MariusGNN: Resource-Efficient Out-of-Core Training of Graph Neural Networks},
booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
year = {2023},
isbn = {9781450394871},
pages = {144–161},
url = {https://doi.org/10.1145/3552326.3567501},
publisher = {Association for Computing Machinery}
}
```
18 changes: 8 additions & 10 deletions docs/config_interface/full_schema.rst
@@ -187,6 +187,14 @@ Encoder Configuration
- Type
- Description
- Required
* - use_incoming_nbrs
- Boolean
- Whether to use incoming neighbors for the encoder. One of use_incoming_nbrs or use_outgoing_nbrs must be set to true.
- No
* - use_outgoing_nbrs
- Boolean
- Whether to use outgoing neighbors for the encoder. One of use_incoming_nbrs or use_outgoing_nbrs must be set to true.
- No
* - layers
- List[List[:ref:`LayerConfig<layer-conf-section>`]]
- Defines architecture of the encoder. Layers of the encoder are grouped into stages, where the layers within a stage are executed in parallel and the output of stage is the input to the successive stage.
@@ -267,16 +275,6 @@ The below example depicts a configuration where there is one embedding layer, fo
- Specific options depending on the type of sampling layer.
- No

In the following configuration snippet, the GNN layer samples all neighbors for a given node during training. All neighbors with incoming
edges to the given node are sampled while the outgoing edges are ignored.

.. code-block:: yaml
train_neighbor_sampling:
- type: ALL
use_incoming_nbrs: true
use_outgoing_nbrs: false

.. list-table:: UniformSamplingOptions[NeighborSamplingOptions]
:widths: 15 10 50 15
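For reference, a minimal configuration sketch using the encoder-level options documented above; the exact key nesting, and the placement of `train_neighbor_sampling` alongside the new flags, is an assumption based on the removed sampling snippet rather than text taken verbatim from the schema:

```
# hypothetical encoder section; key nesting is illustrative
model:
  encoder:
    use_incoming_nbrs: true    # at least one of the two flags must be true
    use_outgoing_nbrs: false
    train_neighbor_sampling:
      - type: ALL              # sample all neighbors during training
```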
2 changes: 1 addition & 1 deletion docs/quickstart.rst
@@ -220,8 +220,8 @@ Import marius and preprocess ogbn_arxiv for node classifcation.

.. code-block:: python
import torch
import marius as m
import torch
from marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv
# initialize and preprocess dataset
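The same reordering (marius imported before torch) is applied across the example scripts changed below, where the torch import is deferred with an isort skip marker; a minimal sketch of the resulting pattern:

```
# import marius before torch, matching the reordered docs and examples
import marius as m
from marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv

import torch  # isort:skip
```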
47 changes: 39 additions & 8 deletions examples/docker/README.md
@@ -1,12 +1,43 @@
# Sample dockerfile
# Docker Installation

Build an image with the name `marius` and the tag `example`:
`docker build -t marius:gpu -f examples/docker/gpu_ubuntu/dockerfile examples/docker/gpu_ubuntu/`
The following instructions install the necessary dependencies and build
the system using Docker. We describe the installation for GPU-based machines,
although Marius and MariusGNN can run on CPU only machines as well.

Create and start a new container instance named `gaius` with:
`docker run --name marius_gpu -itd marius:gpu`
### Build and Install Instructions ###
1. Check if docker is installed (`which docker`) and if not install it: https://docs.docker.com/engine/install/
2. Check if docker can access the GPUs by running `sudo docker run --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi`. If this doesn't print the output of `nvidia-smi`, docker cannot access the CUDA driver on the host machine and you need to install the NVIDIA drivers for GPU support.
3. Once the above succeeds, you should no longer need anything installed on the host machine.
4. Create a docker image using the provided Dockerfile: `docker build -t image_name:image_tag gpu_ubuntu/.`
5. Run the docker image: `docker run --gpus all -it image_name:image_tag bash`. It is often useful to link the current directory into the containers `/working_dir/` using the `-v` option (see below).
6. Once the container is running, install and build the system:
```
cd marius
pip3 install . --no-build-isolation
```

Run `docker ps` to verify the container is running
**Full List of Example Commands for GPU Installation**:

Start a bash session inside the container:
`docker exec -it marius_gpu bash`
```
CURRENT_DIR=`pwd`
git clone https://github.com/marius-team/marius.git
cd marius/examples/docker/
docker build -t marius:latest gpu_ubuntu/.
docker run --gpus all -it -v $CURRENT_DIR:/working_dir/ marius:latest bash
cd marius
pip3 install . --no-build-isolation
```

**CPU Only Installation**: If your machine does not have a GPU, remove the `--gpus all` from the docker run command in the GPU installation instructions.
You can also optionally use the Dockerfile in `cpu_ubuntu/` rather than `gpu_ubuntu/`.
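A sketch of the corresponding CPU-only command sequence, assuming the `cpu_ubuntu/` Dockerfile and an illustrative image tag of `marius:cpu`:

```
CURRENT_DIR=`pwd`
git clone https://github.com/marius-team/marius.git
cd marius/examples/docker/
docker build -t marius:cpu cpu_ubuntu/.
docker run -it -v $CURRENT_DIR:/working_dir/ marius:cpu bash
cd marius
pip3 install . --no-build-isolation
```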

**Installation Notes**:
1. The installation requires Docker to have at least 8GB of memory to work with. This is generally satisfied by
default, but if not (often on Mac), the `docker build` command may throw an error code 137. See
[here](https://stackoverflow.com/questions/44533319/how-to-assign-more-memory-to-docker-container/44533437#44533437),
[here](https://stackoverflow.com/questions/34674325/error-build-process-returned-exit-code-137-during-docker-build-on-tutum), and
[here](https://stackoverflow.com/questions/57291806/docker-build-failed-after-pip-installed-requirements-with-exit-code-137)
for StackOverflow threads on how to increase Docker available memory or fix this issue. The `pip3 install .` command
may also cause Docker memory issues. Increase the memory available to Docker or decrease the number of threads used for building
MariusGNN (to decrease the number of threads change `-j{}` in line 45 of `setup.py` to `-j1` for example). One thread
should build with 8GB of memory but may take some time (~30mins).
7 changes: 5 additions & 2 deletions examples/docker/cpu_ubuntu/dockerfile
@@ -1,4 +1,4 @@
FROM ubuntu:18.04
FROM ubuntu:22.04
RUN apt update

RUN apt install -y g++ \
@@ -25,4 +25,7 @@ RUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake

# install pytorch
RUN python3 -m pip install torch==1.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
RUN python3 -m pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

RUN mkdir /working_dir
WORKDIR /working_dir
7 changes: 5 additions & 2 deletions examples/docker/gpu_ubuntu/dockerfile
@@ -1,4 +1,4 @@
FROM nvidia/cuda:11.4.0-cudnn8-devel-ubuntu18.04
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
RUN apt update

RUN apt install -y g++ \
@@ -25,4 +25,7 @@ RUN sh cmake-3.20.0-linux-x86_64.sh --skip-license --prefix=/opt/cmake/
RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake

# install pytorch
RUN python3 -m pip install torch==1.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html
RUN python3 -m pip install torch==2.0.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html

RUN mkdir /working_dir
WORKDIR /working_dir
3 changes: 2 additions & 1 deletion examples/python/custom_lp.py
@@ -1,13 +1,14 @@
from pathlib import Path

import torch
from omegaconf import OmegaConf

import marius as m
from marius.tools.preprocess.converters.torch_converter import TorchEdgeListConverter
from marius.tools.preprocess.dataset import LinkPredictionDataset
from marius.tools.preprocess.utils import download_url, extract_file

import torch # isort:skip


class MYDATASET(LinkPredictionDataset):
def __init__(self, output_directory: Path, spark=False):
3 changes: 2 additions & 1 deletion examples/python/custom_nc_graphsage.py
@@ -2,7 +2,6 @@

import numpy as np
import pandas as pd
import torch
from omegaconf import OmegaConf

import marius as m
@@ -12,6 +11,8 @@
from marius.tools.preprocess.datasets.dataset_helpers import remap_nodes
from marius.tools.preprocess.utils import download_url, extract_file

import torch # isort:skip


def switch_to_num(row):
names = [
3 changes: 2 additions & 1 deletion examples/python/fb15k_237.py
@@ -1,11 +1,12 @@
from pathlib import Path

import torch
from omegaconf import OmegaConf

import marius as m
from marius.tools.preprocess.datasets.fb15k_237 import FB15K237

import torch # isort:skip


def init_model(embedding_dim, num_nodes, num_relations, device, dtype):
# setup shallow embedding encoder
3 changes: 2 additions & 1 deletion examples/python/fb15k_237_gpu.py
@@ -1,11 +1,12 @@
from pathlib import Path

import torch
from omegaconf import OmegaConf

import marius as m
from marius.tools.preprocess.datasets.fb15k_237 import FB15K237

import torch # isort:skip


def init_model(embedding_dim, num_nodes, num_relations, device, dtype):
# setup shallow embedding encoder
3 changes: 2 additions & 1 deletion examples/python/ogbn_arxiv_nc.py
@@ -1,11 +1,12 @@
from pathlib import Path

import torch
from omegaconf import OmegaConf

import marius as m
from marius.tools.preprocess.datasets.ogbn_arxiv import OGBNArxiv

import torch # isort:skip


def init_model(feature_dim, num_classes, device):
feature_layer = m.nn.layers.FeatureLayer(dimension=feature_dim, device=device)
