Added working Docker GPU training with nvidia-docker 2.0

fengshi-cherish · Mar 12, 2018 · 725a12b · 725a12b
1 parent 557a7c5
commit 725a12b
Show file tree

Hide file tree

Showing 7 changed files with 269 additions and 189 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,9 +1,32 @@
-FROM pytorch/pytorch
+FROM nvidia/cuda:9.0-cudnn7-devel
+
 MAINTAINER Vithursan Thangarasa
 
-RUN \
-	apt-get -qq -y update && apt-get -y install && \
-	apt-get -y install ipython ipython-notebook python-tk
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install some dependencies
+RUN apt-get update && apt-get install -y \
+        build-essential \
+        curl \
+        libfreetype6-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        libssl-dev \
+        pkg-config \
+        rsync \
+        software-properties-common \
+        unzip \
+        python2.7 \
+        python2.7-dev \
+        python-tk \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install python-pip
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
 
 RUN \
 	pip install -U numpy \
@@ -13,11 +36,14 @@ RUN \
 	pandas \
 	tqdm \
 	pillow \
-	setuptools --ignore-installed \
+	setuptools \
 	sklearn \
 	scipy \
 	visdom
 
+RUN pip install http://download.pytorch.org/whl/cu90/torch-0.3.1-cp27-cp27mu-linux_x86_64.whl
+RUN pip install torchvision
+
 COPY ./ /root/MagnetLoss
 
 WORKDIR /root/MagnetLoss

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ PyTorch implementation of the Magnet Loss for Deep Metric Learning, based onthe
 The program requires the following dependencies (easy to install using pip, Anaconda or Docker):
 
 * python (tested on 2.7 and 3.6)
-* pytorch (tested with v0.3 and v0.3.1 with CUDA 8.0)
+* pytorch (tested with v0.3 and v0.3.1 with CUDA 8.0/9.0)
 * numpy
 * matplotlib
 * seaborn
@@ -52,7 +52,27 @@ Train ConvNet with Magnet Loss on the local machine using MNIST dataset:
 python magnet_loss_test.py --lr 1e-4 --batch-size 64 --mnist --dml
 ```
 
-## Docker GPU Training (WIP)
+## Docker GPU Training
+
+### Prerequisites:
+1. Install Docker on your machine. If you don't have Docker installed already, follow the [Docker Setup](https://docs.docker.com/engine/getstarted/step_one/) guide.
+2. Install `nvidia-docker 2.0` from [Nvidia Docker 2.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0))
+3. Register the `nvidia` runtime with the Docker engine (see [Nvidia Container Runtime](https://github.com/NVIDIA/nvidia-container-runtime))
+
+### Docker: Build Image
+```sh
+docker build -t magnetloss .
+```
+
+### Docker: Train
+Deploy and train in a Docker container:
+```sh
+docker run --rm -it --runtime=nvidia magnetloss python magnet_loss_test.py --lr 1e-4 --mnist --batch-size 64 --dml
+```
+or
+```sh
+./run_gpu_docker.sh <DOCKER IMAGE NAME>
+```
 
 ## Results
 ### MNIST

diff --git a/config.json b/config.json
@@ -0,0 +1,178 @@
+{
+	"ociVersion": "1.0.0",
+	"process": {
+		"terminal": true,
+		"user": {
+			"uid": 0,
+			"gid": 0
+		},
+		"args": [
+			"sh"
+		],
+		"env": [
+			"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
+			"TERM=xterm"
+		],
+		"cwd": "/",
+		"capabilities": {
+			"bounding": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"effective": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"inheritable": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"permitted": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			],
+			"ambient": [
+				"CAP_AUDIT_WRITE",
+				"CAP_KILL",
+				"CAP_NET_BIND_SERVICE"
+			]
+		},
+		"rlimits": [
+			{
+				"type": "RLIMIT_NOFILE",
+				"hard": 1024,
+				"soft": 1024
+			}
+		],
+		"noNewPrivileges": true
+	},
+	"root": {
+		"path": "rootfs",
+		"readonly": true
+	},
+	"hostname": "runc",
+	"mounts": [
+		{
+			"destination": "/proc",
+			"type": "proc",
+			"source": "proc"
+		},
+		{
+			"destination": "/dev",
+			"type": "tmpfs",
+			"source": "tmpfs",
+			"options": [
+				"nosuid",
+				"strictatime",
+				"mode=755",
+				"size=65536k"
+			]
+		},
+		{
+			"destination": "/dev/pts",
+			"type": "devpts",
+			"source": "devpts",
+			"options": [
+				"nosuid",
+				"noexec",
+				"newinstance",
+				"ptmxmode=0666",
+				"mode=0620",
+				"gid=5"
+			]
+		},
+		{
+			"destination": "/dev/shm",
+			"type": "tmpfs",
+			"source": "shm",
+			"options": [
+				"nosuid",
+				"noexec",
+				"nodev",
+				"mode=1777",
+				"size=65536k"
+			]
+		},
+		{
+			"destination": "/dev/mqueue",
+			"type": "mqueue",
+			"source": "mqueue",
+			"options": [
+				"nosuid",
+				"noexec",
+				"nodev"
+			]
+		},
+		{
+			"destination": "/sys",
+			"type": "sysfs",
+			"source": "sysfs",
+			"options": [
+				"nosuid",
+				"noexec",
+				"nodev",
+				"ro"
+			]
+		},
+		{
+			"destination": "/sys/fs/cgroup",
+			"type": "cgroup",
+			"source": "cgroup",
+			"options": [
+				"nosuid",
+				"noexec",
+				"nodev",
+				"relatime",
+				"ro"
+			]
+		}
+	],
+	"linux": {
+		"resources": {
+			"devices": [
+				{
+					"allow": false,
+					"access": "rwm"
+				}
+			]
+		},
+		"namespaces": [
+			{
+				"type": "pid"
+			},
+			{
+				"type": "network"
+			},
+			{
+				"type": "ipc"
+			},
+			{
+				"type": "uts"
+			},
+			{
+				"type": "mount"
+			}
+		],
+		"maskedPaths": [
+			"/proc/kcore",
+			"/proc/latency_stats",
+			"/proc/timer_list",
+			"/proc/timer_stats",
+			"/proc/sched_debug",
+			"/sys/firmware",
+			"/proc/scsi"
+		],
+		"readonlyPaths": [
+			"/proc/asound",
+			"/proc/bus",
+			"/proc/fs",
+			"/proc/irq",
+			"/proc/sys",
+			"/proc/sysrq-trigger"
+		]
+	}
+}
diff --git a/magnet_loss_test.py b/magnet_loss_test.py
@@ -1,3 +1,5 @@
+import matplotlib
+matplotlib.use('agg')
 import matplotlib.pyplot as plt
 import numpy as np
 from math import ceil
@@ -16,7 +18,9 @@
 from torchvision.datasets import MNIST, CIFAR10
 from torchvision import transforms
 import torchvision.models as models
-from models.vgg_cifar import VGG
+
+from models.vgg import VGG
+from models.lenet import LeNet
 
 from magnet_loss.magnet_tools import *
 from magnet_loss.magnet_loss import MagnetLoss
@@ -32,75 +36,6 @@
 
 args = parse_settings()
 
-# Build Network
-class LeNet(nn.Module):
-
-	def __init__(self, emb_dim):
-		self.emb_dim = emb_dim
-
-		'''
-		Define the initialization function of LeNet, this function defines
-		the basic structure of the neural network
-		'''
-		super(LeNet, self).__init__()
-		self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2)
-		self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)
-		self.emb = nn.Linear(64*7*7, self.emb_dim)
-
-		self.layer1 = None
-		self.layer2 = None
-		self.features = None
-		self.embeddings = None
-		self.norm_embeddings = None
-
-	def forward(self, x):
-		'''
-		Define the forward propagation function and automatically generates
-		the backward propagation function (autograd)
-		'''
-
-		x = F.max_pool2d(F.relu(self.conv1(x)), 2)
-		self.layer1 = x
-
-		x = F.max_pool2d(F.relu(self.conv2(x)), 2)
-		self.layer2 = x
-
-		x = x.view(-1, self.num_flat_features(x))
-		self.features = x
-
-		x = self.emb(x)
-		embeddings = x
-
-		return embeddings, self.features
-
-	def num_flat_features(self, x):
-		'''
-			Calculate the total tensor x feature amount
-		'''
-
-		size = x.size()[1:] # All dimensions except batch dimension
-		num_features = 1
-		for s in size:
-			num_features *= s
-
-		return num_features
-
-	def l2_normalize(self, x, dim):
-
-	    if not (isinstance(x, torch.DoubleTensor) or isinstance(x, torch.FloatTensor)):
-	        x = x.float()
-
-	    if len(x.size()) == 1:
-	        x = x.view(1, -1)
-
-	    norm = torch.sqrt(torch.sum(x * x, dim=dim))
-	    norm = norm.view(-1, 1)
-
-	    return torch.div(x, norm)
-
-	def name(self):
-		return 'lenet-magnet'
-
 def run_magnet_loss():
 	'''
 	Test function for the magnet loss
@@ -122,7 +57,10 @@ def run_magnet_loss():
 	n_steps = epoch_steps * 15
 	cluster_refresh_interval = epoch_steps
 
-	model = torch.nn.DataParallel(LeNet(emb_dim)).cuda()
+	if args.mnist:
+		model = torch.nn.DataParallel(LeNet(emb_dim)).cuda()
+	if args.cifar10:
+		model = torch.nn.DataParallel(VGG(depth=16, num_classes=emb_dim))
 	print(model)
 
 	#model = EncoderCNN(64)