Skip to content

Commit

Permalink
Added working Docker GPU training with nvidia-docker 2.0
Browse files Browse the repository at this point in the history
  • Loading branch information
vithursant committed Mar 12, 2018
1 parent 557a7c5 commit 725a12b
Show file tree
Hide file tree
Showing 7 changed files with 269 additions and 189 deletions.
36 changes: 31 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,9 +1,32 @@
FROM pytorch/pytorch
FROM nvidia/cuda:9.0-cudnn7-devel

MAINTAINER Vithursan Thangarasa

RUN \
apt-get -qq -y update && apt-get -y install && \
apt-get -y install ipython ipython-notebook python-tk
ENV DEBIAN_FRONTEND=noninteractive

# Install the system packages the image needs: compiler toolchain and
# download/sync utilities, native libraries (freetype, png, zmq, ssl),
# and the Python 2.7 interpreter with its headers and Tk bindings.
# The apt cache and package lists are removed in the same RUN step so
# they are not stored in the resulting layer.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
libfreetype6-dev \
libpng12-dev \
libzmq3-dev \
libssl-dev \
pkg-config \
rsync \
software-properties-common \
unzip \
python2.7 \
python2.7-dev \
python-tk \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install pip for the Python 2.7 interpreter installed above.
# The generic get-pip.py script no longer supports Python 2.7, so the
# 2.7-specific bootstrap script is fetched instead. `-f` makes curl exit
# non-zero on an HTTP error rather than saving the error page as
# get-pip.py; the script is deleted afterwards to keep the layer clean.
RUN curl -fsSL -o get-pip.py https://bootstrap.pypa.io/pip/2.7/get-pip.py && \
python get-pip.py && \
rm get-pip.py

RUN \
pip install -U numpy \
Expand All @@ -13,11 +36,14 @@ RUN \
pandas \
tqdm \
pillow \
setuptools --ignore-installed \
setuptools \
sklearn \
scipy \
visdom

# Install the CUDA 9.0 / Python 2.7 build of PyTorch 0.3.1, then torchvision.
# The wheel is fetched over HTTPS so it cannot be tampered with in transit.
RUN pip install https://download.pytorch.org/whl/cu90/torch-0.3.1-cp27-cp27mu-linux_x86_64.whl
RUN pip install torchvision

COPY ./ /root/MagnetLoss

WORKDIR /root/MagnetLoss
Expand Down
24 changes: 22 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ PyTorch implementation of the Magnet Loss for Deep Metric Learning, based onthe
The program requires the following dependencies (easy to install using pip, Anaconda or Docker):

* python (tested on 2.7 and 3.6)
* pytorch (tested with v0.3 and v0.3.1 with CUDA 8.0)
* pytorch (tested with v0.3 and v0.3.1 with CUDA 8.0/9.0)
* numpy
* matplotlib
* seaborn
Expand Down Expand Up @@ -52,7 +52,27 @@ Train ConvNet with Magnet Loss on the local machine using MNIST dataset:
python magnet_loss_test.py --lr 1e-4 --batch-size 64 --mnist --dml
```

## Docker GPU Training (WIP)
## Docker GPU Training

### Prerequisites:
1. Docker installed on your machine. If you don't have Docker installed already, follow the [Docker Setup](https://docs.docker.com/engine/getstarted/step_one/) guide.
2. Install `nvidia-docker 2.0` by following [Nvidia Docker 2.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)).
3. Register the `nvidia` runtime with the Docker engine; see [Nvidia Container Runtime](https://github.com/NVIDIA/nvidia-container-runtime).

### Docker: Build Image
```sh
docker build -t magnetloss .
```

### Docker: Train
Deploy and train in a Docker container:
```sh
docker run --rm -it --runtime=nvidia magnetloss python magnet_loss_test.py --lr 1e-4 --mnist --batch-size 64 --dml
```
or
```sh
./run_gpu_docker.sh <DOCKER IMAGE NAME>
```

## Results
### MNIST
Expand Down
178 changes: 178 additions & 0 deletions config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
{
"ociVersion": "1.0.0",
"process": {
"terminal": true,
"user": {
"uid": 0,
"gid": 0
},
"args": [
"sh"
],
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": {
"bounding": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"effective": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"inheritable": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"permitted": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"ambient": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
]
},
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs",
"readonly": true
},
"hostname": "runc",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"nosuid",
"noexec",
"nodev",
"relatime",
"ro"
]
}
],
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
}
]
},
"namespaces": [
{
"type": "pid"
},
{
"type": "network"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
}
],
"maskedPaths": [
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi"
],
"readonlyPaths": [
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
}
}
80 changes: 9 additions & 71 deletions magnet_loss_test.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import numpy as np
from math import ceil
Expand All @@ -16,7 +18,9 @@
from torchvision.datasets import MNIST, CIFAR10
from torchvision import transforms
import torchvision.models as models
from models.vgg_cifar import VGG

from models.vgg import VGG
from models.lenet import LeNet

from magnet_loss.magnet_tools import *
from magnet_loss.magnet_loss import MagnetLoss
Expand All @@ -32,75 +36,6 @@

args = parse_settings()

# Build Network
class LeNet(nn.Module):
    '''
    LeNet-style convolutional encoder producing ``emb_dim``-sized embeddings.

    Two 5x5 convolutions (1 -> 32 -> 64 channels, padding 2), each followed
    by ReLU and 2x2 max-pooling, feed a single linear layer. The linear
    layer is sized for 64*7*7 flattened features, which is what the conv
    stack yields for single-channel inputs of e.g. 28x28 pixels.
    '''

    def __init__(self, emb_dim):
        '''
        Build the layers of the network.

        Args:
            emb_dim: size of the embedding produced by the final linear layer.
        '''
        super(LeNet, self).__init__()
        self.emb_dim = emb_dim

        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2)
        self.emb = nn.Linear(64 * 7 * 7, self.emb_dim)

        # Activations cached by forward() on every call.
        self.layer1 = None
        self.layer2 = None
        self.features = None
        # Not written anywhere in this class; kept so that existing code
        # reading these attributes keeps working.
        self.embeddings = None
        self.norm_embeddings = None

    def forward(self, x):
        '''
        Run the network on a batch of images.

        The output of each pooling stage and the flattened features are
        stored on ``self.layer1``, ``self.layer2`` and ``self.features``.

        Args:
            x: input batch fed to the first convolution.

        Returns:
            Tuple ``(embeddings, features)``: the output of the linear
            embedding layer and the flattened features it was computed from.
        '''
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        self.layer1 = x

        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        self.layer2 = x

        x = x.view(-1, self.num_flat_features(x))
        self.features = x

        embeddings = self.emb(x)

        return embeddings, self.features

    def num_flat_features(self, x):
        '''
        Return the number of elements per sample in ``x``, i.e. the product
        of every dimension except the first (batch) one.
        '''
        num_features = 1
        for s in x.size()[1:]:
            num_features *= s

        return num_features

    def l2_normalize(self, x, dim, eps=1e-12):
        '''
        Scale ``x`` to unit L2 norm along dimension ``dim``.

        Args:
            x: tensor to normalize. Anything that is not a CPU float or
                double tensor is converted with ``.float()`` first; a 1-D
                tensor is treated as a single row of shape (1, N).
            dim: dimension along which the norm is computed.
            eps: lower bound applied to the norm so that all-zero vectors
                map to zeros instead of NaN.

        Returns:
            Tensor of the same shape as the (possibly reshaped) input.
        '''
        if not isinstance(x, (torch.DoubleTensor, torch.FloatTensor)):
            x = x.float()

        if len(x.size()) == 1:
            x = x.view(1, -1)

        # keepdim=True keeps the reduced axis as size 1, so the division
        # broadcasts correctly for any ``dim`` (a fixed view(-1, 1) is only
        # right when reducing along dim 1).
        norm = torch.sqrt(torch.sum(x * x, dim=dim, keepdim=True))

        return torch.div(x, norm.clamp(min=eps))

    def name(self):
        '''Return the identifier string of this model.'''
        return 'lenet-magnet'

def run_magnet_loss():
'''
Test function for the magnet loss
Expand All @@ -122,7 +57,10 @@ def run_magnet_loss():
n_steps = epoch_steps * 15
cluster_refresh_interval = epoch_steps

model = torch.nn.DataParallel(LeNet(emb_dim)).cuda()
if args.mnist:
model = torch.nn.DataParallel(LeNet(emb_dim)).cuda()
if args.cifar10:
model = torch.nn.DataParallel(VGG(depth=16, num_classes=emb_dim))
print(model)

#model = EncoderCNN(64)
Expand Down
Loading

0 comments on commit 725a12b

Please sign in to comment.