Skip to content

Commit

Permalink
Merge pull request #68 from duggalsu/opt_pytorch_graviton
Browse files Browse the repository at this point in the history
Optimize docker for multi-arch builds
  • Loading branch information
duggalsu authored Feb 9, 2024
2 parents 484d5ae + c22bc58 commit 575b6c4
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 7 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/docker-push-vidvec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on: workflow_dispatch

jobs:
api:
runs-on: macos-14
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
Expand All @@ -16,6 +16,8 @@ jobs:
echo "setting variables"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- uses: elgohr/Publish-Docker-Github-Action@master
with:
username: tattletech
Expand All @@ -24,6 +26,7 @@ jobs:
workdir: src/api/
dockerfile: Dockerfile.vid_vec_rep_resnet
tags: ${{ steps.vars.outputs.sha_short }}
platforms: linux/amd64,linux/arm64

# - name: deploy to cluster
# uses: steebchen/[email protected]
Expand Down
24 changes: 23 additions & 1 deletion src/api/Dockerfile.vid_vec_rep_resnet
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM arm64v8/python:3.11-slim-bullseye AS base
FROM python:3.11-slim-bullseye AS base
RUN apt-get update \
&& apt-get -y upgrade \
&& apt-get install -y \
Expand All @@ -14,6 +14,28 @@ RUN pip install --no-cache-dir --upgrade pip
WORKDIR /app
COPY ./core/operators/vid_vec_rep_resnet_requirements.txt /app/core/operators/vid_vec_rep_resnet_requirements.txt
RUN pip install --no-cache-dir --user -r /app/core/operators/vid_vec_rep_resnet_requirements.txt

### AWS Graviton Optimization ###
# NOTE(review): the original lines here were bare shell statements (grep/export),
# which are not valid Dockerfile instructions — `docker build` fails on them with
# "unknown instruction". Static tuning values are set with ENV so they persist
# into the running container; CPU-dependent values are deferred to container
# start-up via a sourceable script, because the build host's CPU (BF16 support,
# vCPU count) is not the runtime host's CPU.

# Enable oneDNN primitive caching to avoid redundant primitive-allocation
# latency. This increases the memory footprint — tune the capacity down if
# memory is constrained.
ENV LRU_CACHE_CAPACITY=1024

# Enable transparent-huge-page allocations from the PyTorch C10 allocator.
ENV THP_MEM_ALLOC_ENABLE=1

# Spread OpenMP threads across cores without hard-binding them, so multi-process
# deployments do not oversubscribe vCPUs.
ENV OMP_PROC_BIND=false \
    OMP_PLACES=cores

# Runtime-dependent settings: BF16 fast-math (Graviton3/3E advertise bf16 in
# /proc/cpuinfo) and the OMP thread count (one thread per vCPU per application
# process; num_processes=1 means all vCPUs map one-to-one to OMP threads).
# Source this from the container entrypoint, e.g.: . /app/graviton-env.sh
# (COPY heredoc requires BuildKit, which the CI workflow enables via buildx.)
COPY <<'EOF' /app/graviton-env.sh
#!/bin/sh
# Enable oneDNN BF16 fast-math only when the runtime CPU supports it.
grep -q bf16 /proc/cpuinfo && export DNNL_DEFAULT_FPMATH_MODE=BF16
# max(1, num_vcpus / num_processes) OMP threads per process.
num_vcpus=$(getconf _NPROCESSORS_ONLN)
num_processes=1
export OMP_NUM_THREADS=$((1 > (num_vcpus / num_processes) ? 1 : (num_vcpus / num_processes)))
EOF
###

COPY ./core/operators/vid_vec_rep_resnet.py /app/core/operators/vid_vec_rep_resnet.py

COPY ./core/operators/sample_data/sample-cat-video.mp4 /app/core/operators/sample_data/sample-cat-video.mp4
Expand Down
10 changes: 5 additions & 5 deletions src/api/core/operators/vid_vec_rep_resnet_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile --find-links=https://download.pytorch.org/whl/torch_stable.html vid_vec_rep_resnet_requirements.in
#
--find-links https://download.pytorch.org/whl/torch_stable.html
--find-links https://download.pytorch.org/whl/cpu

certifi==2024.2.2
# via requests
Expand Down Expand Up @@ -65,19 +65,19 @@ scipy==1.11.4
# via -r vid_vec_rep_resnet_requirements.in
sympy==1.12
# via torch
textual==0.48.2
textual==0.50.0
# via memray
torch==2.1.2+cpu
torch
# via
# -r vid_vec_rep_resnet_requirements.in
# torchvision
torchvision==0.16.2+cpu
torchvision
# via -r vid_vec_rep_resnet_requirements.in
typing-extensions==4.9.0
# via
# textual
# torch
uc-micro-py==1.0.2
# via linkify-it-py
urllib3==2.0.7
urllib3==2.2.0
# via requests

0 comments on commit 575b6c4

Please sign in to comment.