From 73c3ab6216de67d99ffd3ae5ccc6c2f5251d19d1 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Wed, 16 Dec 2020 11:14:01 -0800 Subject: [PATCH 01/13] add alphapose model loader --- libs/detectors/x86/alphapose/alphapose.py | 32 +++++++++ .../x86/alphapose/builders/__init__.py | 0 .../x86/alphapose/builders/builder.py | 23 +++++++ .../x86/alphapose/configs/config.yaml | 65 +++++++++++++++++++ .../x86/alphapose/utils/config_parser.py | 7 ++ 5 files changed, 127 insertions(+) create mode 100644 libs/detectors/x86/alphapose/alphapose.py create mode 100644 libs/detectors/x86/alphapose/builders/__init__.py create mode 100644 libs/detectors/x86/alphapose/builders/builder.py create mode 100644 libs/detectors/x86/alphapose/configs/config.yaml create mode 100644 libs/detectors/x86/alphapose/utils/config_parser.py diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py new file mode 100644 index 00000000..2249166c --- /dev/null +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -0,0 +1,32 @@ +from utils import config_parser +from builders import builder + +import torch +import pathlib + + +class Detector: + def __init__(self, config): + self.config = config + self.cfg = config_parser.parse("configs/config.yaml") + self.device = torch.device("cuda" if config.get_section_dict('Detector')['Gpu'] else "cpu") + self._input_size = self.cfg.DATA_PRESET.IMAGE_SIZE + self.load_model() + self.detection_model = builder.build_detection_model(self.args) + self.detection_model.load_model() + self._aspect_ratio = float(self._input_size[1]) / self._input_size[0] + self.hm_size = self.cfg.DATA_PRESET.HEATMAP_SIZE + self.eval_joints = list(range(self.cfg.DATA_PRESET.NUM_JOINTS)) + + def load_model(self): + # TODO: add download checkpoint script + model_file = pathlib.Path('/repo/data/x86/fast_res50_256x192.pth') + if not model_file.exists(): + # TODO: add model link + pass + + self.pose_model = builder.build_sppe_model(self.cfg.MODEL, preset_cfg=self.cfg.DATA_PRESET) + print(f'Loading pose model from {model_file}...') + self.pose_model.load_state_dict(torch.load(model_file, map_location=self.device)) + self.pose_model.to(self.device) + self.pose_model.eval() diff --git a/libs/detectors/x86/alphapose/builders/__init__.py b/libs/detectors/x86/alphapose/builders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libs/detectors/x86/alphapose/builders/builder.py b/libs/detectors/x86/alphapose/builders/builder.py new file mode 100644 index 00000000..36c3c191 --- /dev/null +++ b/libs/detectors/x86/alphapose/builders/builder.py @@ -0,0 +1,23 @@ +from models.fastpose import FastPose +from detectors.yolo_wrapper import YoloWrapper +from easydict import EasyDict as edict + +def build_sppe_model(cfg, preset_cfg): + args = cfg.copy() + default_args = { + 'PRESET': preset_cfg, + } + for name, value in default_args.items(): + args.setdefault(name, value) + return FastPose(**args) + + +def build_detection_model(opt): + cfg = edict() + cfg.CONFIG = 'detectors/yolo/cfg/yolov3-spp.cfg' + cfg.WEIGHTS = 'detectors/yolo/data/yolov3-spp.weights' + cfg.INP_DIM = 608 + cfg.NMS_THRES = 0.6 + cfg.CONFIDENCE = 0.1 + cfg.NUM_CLASSES = 80 + return YoloWrapper(cfg, opt) diff --git a/libs/detectors/x86/alphapose/configs/config.yaml b/libs/detectors/x86/alphapose/configs/config.yaml new file mode 100644 index 00000000..2a864b1a --- /dev/null +++ b/libs/detectors/x86/alphapose/configs/config.yaml @@ -0,0 +1,65 @@ +DATASET: + TRAIN: + TYPE: 'Mscoco' + ROOT: './data/coco/' + IMG_PREFIX: 
'train2017' + ANN: 'annotations/person_keypoints_train2017.json' + AUG: + FLIP: true + ROT_FACTOR: 40 + SCALE_FACTOR: 0.3 + NUM_JOINTS_HALF_BODY: 8 + PROB_HALF_BODY: -1 + VAL: + TYPE: 'Mscoco' + ROOT: './data/coco/' + IMG_PREFIX: 'val2017' + ANN: 'annotations/person_keypoints_val2017.json' + TEST: + TYPE: 'Mscoco_det' + ROOT: './data/coco/' + IMG_PREFIX: 'val2017' + DET_FILE: './exp/json/test_det_yolo.json' + ANN: 'annotations/person_keypoints_val2017.json' +DATA_PRESET: + TYPE: 'simple' + SIGMA: 2 + NUM_JOINTS: 17 + IMAGE_SIZE: + - 256 + - 192 + HEATMAP_SIZE: + - 64 + - 48 +MODEL: + TYPE: 'SimplePose' + PRETRAINED: '' + TRY_LOAD: '' + NUM_DECONV_FILTERS: + - 256 + - 256 + - 256 + NUM_LAYERS: 50 +LOSS: + TYPE: 'MSELoss' +DETECTOR: + NAME: 'yolo' + CONFIG: 'detector/yolo/cfg/yolov3-spp.cfg' + WEIGHTS: 'detector/yolo/data/yolov3-spp.weights' + NMS_THRES: 0.6 + CONFIDENCE: 0.1 +TRAIN: + WORLD_SIZE: 4 + BATCH_SIZE: 32 + BEGIN_EPOCH: 0 + END_EPOCH: 200 + OPTIMIZER: 'adam' + LR: 0.001 + LR_FACTOR: 0.1 + LR_STEP: + - 90 + - 120 + DPG_MILESTONE: 140 + DPG_STEP: + - 160 + - 190 \ No newline at end of file diff --git a/libs/detectors/x86/alphapose/utils/config_parser.py b/libs/detectors/x86/alphapose/utils/config_parser.py new file mode 100644 index 00000000..92be5ddb --- /dev/null +++ b/libs/detectors/x86/alphapose/utils/config_parser.py @@ -0,0 +1,7 @@ +import yaml +from easydict import EasyDict as edict + +def parse(config_file): + with open(config_file) as f: + config = edict(yaml.load(f, Loader=yaml.FullLoader)) + return config \ No newline at end of file From 6f41fe24f25a7c30243965a9228b00daddca1d4a Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Wed, 16 Dec 2020 12:31:31 -0800 Subject: [PATCH 02/13] add alphapose utils --- libs/detectors/x86/alphapose/utils/bbox.py | 33 +++ .../detectors/x86/alphapose/utils/pose_nms.py | 200 ++++++++++++++++++ .../x86/alphapose/utils/transformations.py | 119 +++++++++++ 3 files changed, 352 insertions(+) create mode 100644 libs/detectors/x86/alphapose/utils/bbox.py create mode 100644 libs/detectors/x86/alphapose/utils/pose_nms.py create mode 100644 libs/detectors/x86/alphapose/utils/transformations.py diff --git a/libs/detectors/x86/alphapose/utils/bbox.py b/libs/detectors/x86/alphapose/utils/bbox.py new file mode 100644 index 00000000..7d287f65 --- /dev/null +++ b/libs/detectors/x86/alphapose/utils/bbox.py @@ -0,0 +1,33 @@ +import numpy as np + + +def box_to_center_scale(x, y, w, h, aspect_ratio=1.0, scale_mult=1.25): + """Convert box coordinates to center and scale. 
+ adapted from https://github.com/Microsoft/human-pose-estimation.pytorch + """ + pixel_std = 1 + center = np.zeros((2), dtype=np.float32) + center[0] = x + w * 0.5 + center[1] = y + h * 0.5 + + if w > aspect_ratio * h: + h = w / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + scale = np.array( + [w * 1.0 / pixel_std, h * 1.0 / pixel_std], dtype=np.float32) + if center[0] != -1: + scale = scale * scale_mult + return center, scale + + +def center_scale_to_box(center, scale): + pixel_std = 1.0 + w = scale[0] * pixel_std + h = scale[1] * pixel_std + xmin = center[0] - w * 0.5 + ymin = center[1] - h * 0.5 + xmax = xmin + w + ymax = ymin + h + bbox = [xmin, ymin, xmax, ymax] + return bbox diff --git a/libs/detectors/x86/alphapose/utils/pose_nms.py b/libs/detectors/x86/alphapose/utils/pose_nms.py new file mode 100644 index 00000000..7fc30195 --- /dev/null +++ b/libs/detectors/x86/alphapose/utils/pose_nms.py @@ -0,0 +1,200 @@ +import numpy as np +import torch + +delta1 = 1 +mu = 1.7 +delta2 = 2.65 +gamma = 22.48 +scoreThreds = 0.3 +matchThreds = 5 +alpha = 0.1 +vis_thr = 0.2 +oks_thr = 0.9 + + +def pose_nms(bboxes, bbox_scores, bbox_ids, pose_preds, pose_scores, areaThres=0): + ''' + Parametric Pose NMS algorithm + bboxes: bbox locations list (n, 4) + bbox_scores: bbox scores list (n, 1) + bbox_ids: bbox tracking ids list (n, 1) + pose_preds: pose locations list (n, kp_num, 2) + pose_scores: pose scores list (n, kp_num, 1) + ''' + # global ori_pose_preds, ori_pose_scores, ref_dists + + pose_scores[pose_scores == 0] = 1e-5 + kp_nums = pose_preds.size()[1] + res_bboxes, res_bbox_scores, res_bbox_ids, res_pose_preds, res_pose_scores, res_pick_ids = [], [], [], [], [], [] + + ori_bboxes = bboxes.clone() + ori_bbox_scores = bbox_scores.clone() + ori_bbox_ids = bbox_ids.clone() + ori_pose_preds = pose_preds.clone() + ori_pose_scores = pose_scores.clone() + + xmax = bboxes[:, 2] + xmin = bboxes[:, 0] + ymax = bboxes[:, 3] + ymin = bboxes[:, 1] + + widths = xmax - xmin + heights = ymax - ymin + ref_dists = alpha * np.maximum(widths, heights) + + nsamples = bboxes.shape[0] + human_scores = pose_scores.mean(dim=1) + + human_ids = np.arange(nsamples) + mask = np.ones(len(human_ids)).astype(bool) + + # Do pPose-NMS + pick = [] + merge_ids = [] + while (mask.any()): + tensor_mask = torch.Tensor(mask) == True + # Pick the one with highest score + pick_id = torch.argmax(human_scores[tensor_mask]) + pick.append(human_ids[mask][pick_id]) + + # Get numbers of match keypoints by calling PCK_match + ref_dist = ref_dists[human_ids[mask][pick_id]] + simi = get_parametric_distance(pick_id, pose_preds[tensor_mask], pose_scores[tensor_mask], ref_dist) + num_match_keypoints = PCK_match(pose_preds[tensor_mask][pick_id], pose_preds[tensor_mask], ref_dist) + + # Delete humans who have more than matchThreds keypoints overlap and high similarity + delete_ids = torch.from_numpy(np.arange(human_scores[tensor_mask].shape[0]))[ + ((simi > gamma) | (num_match_keypoints >= matchThreds))] + + if delete_ids.shape[0] == 0: + delete_ids = pick_id + + merge_ids.append(human_ids[mask][delete_ids]) + newmask = mask[mask] + newmask[delete_ids] = False + mask[mask] = newmask + + assert len(merge_ids) == len(pick) + preds_pick = ori_pose_preds[pick] + scores_pick = ori_pose_scores[pick] + bbox_scores_pick = ori_bbox_scores[pick] + bboxes_pick = ori_bboxes[pick] + bbox_ids_pick = ori_bbox_ids[pick] + # final_result = pool.map(filter_result, zip(scores_pick, merge_ids, preds_pick, pick, bbox_scores_pick)) + # final_result = 
[item for item in final_result if item is not None] + + for j in range(len(pick)): + ids = np.arange(kp_nums) + max_score = torch.max(scores_pick[j, ids, 0]) + + if max_score < scoreThreds: + continue + + # Merge poses + merge_id = merge_ids[j] + merge_pose, merge_score = p_merge_fast( + preds_pick[j], ori_pose_preds[merge_id], ori_pose_scores[merge_id], ref_dists[pick[j]]) + + max_score = torch.max(merge_score[ids]) + if max_score < scoreThreds: + continue + + xmax = max(merge_pose[:, 0]) + xmin = min(merge_pose[:, 0]) + ymax = max(merge_pose[:, 1]) + ymin = min(merge_pose[:, 1]) + bbox = bboxes_pick[j].cpu().tolist() + bbox_score = bbox_scores_pick[j].cpu() + + if (1.5 ** 2 * (xmax - xmin) * (ymax - ymin) < areaThres): + continue + + res_bboxes.append(bbox) + res_bbox_scores.append(bbox_score) + res_bbox_ids.append(ori_bbox_ids[merge_id].tolist()) + res_pose_preds.append(merge_pose) + res_pose_scores.append(merge_score) + res_pick_ids.append(pick[j]) + + return res_bboxes, res_bbox_scores, res_bbox_ids, res_pose_preds, res_pose_scores, res_pick_ids + + +def get_parametric_distance(i, all_preds, keypoint_scores, ref_dist): + pick_preds = all_preds[i] + pred_scores = keypoint_scores[i] + dist = torch.sqrt(torch.sum( + torch.pow(pick_preds[np.newaxis, :] - all_preds, 2), + dim=2 + )) + mask = (dist <= 1) + + kp_nums = all_preds.size()[1] + # Define a keypoints distance + score_dists = torch.zeros(all_preds.shape[0], kp_nums) + keypoint_scores.squeeze_() + if keypoint_scores.dim() == 1: + keypoint_scores.unsqueeze_(0) + if pred_scores.dim() == 1: + pred_scores.unsqueeze_(1) + # The predicted scores are repeated up to do broadcast + pred_scores = pred_scores.repeat(1, all_preds.shape[0]).transpose(0, 1) + + score_dists[mask] = torch.tanh(pred_scores[mask] / delta1) * torch.tanh(keypoint_scores[mask] / delta1) + + point_dist = torch.exp((-1) * dist / delta2) + final_dist = torch.sum(score_dists, dim=1) + mu * torch.sum(point_dist, dim=1) + + return final_dist + + +def PCK_match(pick_pred, all_preds, ref_dist): + dist = torch.sqrt(torch.sum( + torch.pow(pick_pred[np.newaxis, :] - all_preds, 2), + dim=2 + )) + ref_dist = min(ref_dist, 7) + num_match_keypoints = torch.sum( + dist / ref_dist <= 1, + dim=1 + ) + + return num_match_keypoints + + +def p_merge_fast(ref_pose, cluster_preds, cluster_scores, ref_dist): + ''' + Score-weighted pose merging + INPUT: + ref_pose: reference pose -- [kp_num, 2] + cluster_preds: redundant poses -- [n, kp_num, 2] + cluster_scores: redundant poses score -- [n, kp_num, 1] + ref_dist: reference scale -- Constant + OUTPUT: + final_pose: merged pose -- [kp_num, 2] + final_score: merged score -- [kp_num] + ''' + dist = torch.sqrt(torch.sum( + torch.pow(ref_pose[np.newaxis, :] - cluster_preds, 2), + dim=2 + )) + + kp_num = ref_pose.size()[0] + ref_dist = min(ref_dist, 15) + + mask = (dist <= ref_dist) + final_pose = torch.zeros(kp_num, 2) + final_score = torch.zeros(kp_num) + + if cluster_preds.dim() == 2: + cluster_preds.unsqueeze_(0) + cluster_scores.unsqueeze_(0) + if mask.dim() == 1: + mask.unsqueeze_(0) + + # Weighted Merge + masked_scores = cluster_scores.mul(mask.float().unsqueeze(-1)) + normed_scores = masked_scores / torch.sum(masked_scores, dim=0) + + final_pose = torch.mul(cluster_preds, normed_scores.repeat(1, 1, 2)).sum(dim=0) + final_score = torch.mul(masked_scores, normed_scores).sum(dim=0) + return final_pose, final_score diff --git a/libs/detectors/x86/alphapose/utils/transformations.py b/libs/detectors/x86/alphapose/utils/transformations.py new file 
mode 100644
index 00000000..3ec3e485
--- /dev/null
+++ b/libs/detectors/x86/alphapose/utils/transformations.py
@@ -0,0 +1,119 @@
+import numpy as np
+import cv2
+import torch
+
+
+def get_affine_transform(center,
+                         scale,
+                         rot,
+                         output_size,
+                         shift=np.array([0, 0], dtype=np.float32),
+                         inv=0):
+    if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
+        scale = np.array([scale, scale])
+
+    scale_tmp = scale
+    src_w = scale_tmp[0]
+    dst_w = output_size[0]
+    dst_h = output_size[1]
+
+    rot_rad = np.pi * rot / 180
+    src_dir = get_dir([0, src_w * -0.5], rot_rad)
+    dst_dir = np.array([0, dst_w * -0.5], np.float32)
+
+    src = np.zeros((3, 2), dtype=np.float32)
+    dst = np.zeros((3, 2), dtype=np.float32)
+    src[0, :] = center + scale_tmp * shift
+    src[1, :] = center + src_dir + scale_tmp * shift
+    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+
+    src[2:, :] = get_3rd_point(src[0, :], src[1, :])
+    dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
+
+    if inv:
+        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+    else:
+        trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+    return trans
+
+
+def get_dir(src_point, rot_rad):
+    """Rotate the point by `rot_rad` radians."""
+    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+
+    src_result = [0, 0]
+    src_result[0] = src_point[0] * cs - src_point[1] * sn
+    src_result[1] = src_point[0] * sn + src_point[1] * cs
+
+    return src_result
+
+
+def get_3rd_point(a, b):
+    """Return point c such that (c - b) is perpendicular to (a - b)."""
+    direct = a - b
+    return b + np.array([-direct[1], direct[0]], dtype=np.float32)
+
+
+def im_to_torch(img):
+    """Transform ndarray image to torch tensor.
+    Parameters
+    ----------
+    img: numpy.ndarray
+        An ndarray with shape: `(H, W, 3)`.
+    Returns
+    -------
+    torch.Tensor
+        A tensor with shape: `(3, H, W)`.
+ """ + img = np.transpose(img, (2, 0, 1)) # C*H*W + img = to_torch(img).float() + if img.max() > 1: + img /= 255 + return img + + +def to_torch(ndarray): + # numpy.ndarray => torch.Tensor + if type(ndarray).__module__ == 'numpy': + return torch.from_numpy(ndarray) + elif not torch.is_tensor(ndarray): + raise ValueError("Cannot convert {} to torch tensor" + .format(type(ndarray))) + return ndarray + + +def get_max_pred(heatmaps): + num_joints = heatmaps.shape[0] + width = heatmaps.shape[2] + heatmaps_reshaped = heatmaps.reshape((num_joints, -1)) + idx = np.argmax(heatmaps_reshaped, 1) + maxvals = np.max(heatmaps_reshaped, 1) + + maxvals = maxvals.reshape((num_joints, 1)) + idx = idx.reshape((num_joints, 1)) + + preds = np.tile(idx, (1, 2)).astype(np.float32) + + preds[:, 0] = (preds[:, 0]) % width + preds[:, 1] = np.floor((preds[:, 1]) / width) + + pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 2)) + pred_mask = pred_mask.astype(np.float32) + + preds *= pred_mask + return preds, maxvals + + +def transform_preds(coords, center, scale, output_size): + target_coords = np.zeros(coords.shape) + trans = get_affine_transform(center, scale, 0, output_size, inv=1) + target_coords[0:2] = affine_transform(coords[0:2], trans) + return target_coords + + +def affine_transform(pt, t): + new_pt = np.array([pt[0], pt[1], 1.]).T + new_pt = np.dot(t, new_pt) + return new_pt[:2] From d0656d572277951a0be955a4a1d0adfe877a0916 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Wed, 16 Dec 2020 12:32:15 -0800 Subject: [PATCH 03/13] finalize alphapose class skeleton --- = | 0 libs/detectors/x86/alphapose/alphapose.py | 122 +++++++++++++++++- .../x86/alphapose/builders/builder.py | 17 ++- 3 files changed, 128 insertions(+), 11 deletions(-) create mode 100644 = diff --git a/= b/= new file mode 100644 index 00000000..e69de29b diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index 2249166c..0e147e73 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -1,19 +1,24 @@ from utils import config_parser from builders import builder +from utils.bbox import box_to_center_scale, center_scale_to_box +from utils.pose_nms import pose_nms +from utils.transformations import get_affine_transform, transform_preds, im_to_torch, get_max_pred import torch +import cv2 +import numpy as np import pathlib class Detector: def __init__(self, config): self.config = config + self.name = config.get_section_dict('Detector')['Name'] self.cfg = config_parser.parse("configs/config.yaml") self.device = torch.device("cuda" if config.get_section_dict('Detector')['Gpu'] else "cpu") self._input_size = self.cfg.DATA_PRESET.IMAGE_SIZE self.load_model() - self.detection_model = builder.build_detection_model(self.args) - self.detection_model.load_model() + self.detection_model = builder.build_detection_model(self.name, config) self._aspect_ratio = float(self._input_size[1]) / self._input_size[0] self.hm_size = self.cfg.DATA_PRESET.HEATMAP_SIZE self.eval_joints = list(range(self.cfg.DATA_PRESET.NUM_JOINTS)) @@ -30,3 +35,116 @@ def load_model(self): self.pose_model.load_state_dict(torch.load(model_file, map_location=self.device)) self.pose_model.to(self.device) self.pose_model.eval() + + def inference(self, image): + detections = self.detection_model.inference(image) + # TODO + detections = prepare_detection_results(detections) + with torch.no_grad(): + inps, cropped_boxes, boxes, scores, ids = self.transform_detections(image, detections) + inps = 
inps.to(self.device) + hm = self.pose_model(inps) + poses = self.post_process(hm, cropped_boxes, boxes, scores, ids) + # TODO + results = prepare_poses_results(poses) + return results + + def transform_detections(self, image, dets): + if isinstance(dets, int): + return 0, 0 + dets = dets[dets[:, 0] == 0] + boxes = dets[:, 1:5] + scores = dets[:, 5:6] + ids = torch.zeros(scores.shape) + inps = torch.zeros(boxes.size(0), 3, *self._input_size) + cropped_boxes = torch.zeros(boxes.size(0), 4) + for i, box in enumerate(boxes): + inps[i], cropped_box = self.transform_single_detection(image, box) + cropped_boxes[i] = torch.FloatTensor(cropped_box) + return inps, cropped_boxes, boxes, scores, ids + + def transform_single_detection(self, image, bbox): + xmin, ymin, xmax, ymax = bbox + center, scale = box_to_center_scale( + xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio) + scale = scale * 1.0 + + input_size = self._input_size + inp_h, inp_w = input_size + + trans = get_affine_transform(center, scale, 0, [inp_w, inp_h]) + inp_h, inp_w = self._input_size + img = cv2.warpAffine(image, trans, (int(inp_w), int(inp_h)), flags=cv2.INTER_LINEAR) + bbox = center_scale_to_box(center, scale) + + img = im_to_torch(img) + img[0].add_(-0.406) + img[1].add_(-0.457) + img[2].add_(-0.480) + + return img, bbox + + def post_process(self, hm, cropped_boxes, boxes, scores, ids): + assert hm.dim() == 4 + pose_coords = [] + pose_scores = [] + for i in range(hm.shape[0]): + bbox = cropped_boxes[i].tolist() + pose_coord, pose_score = self.heatmap_to_coord(hm[i][self.eval_joints], bbox, hm_shape=self.hm_size, + norm_type=None) + pose_coords.append(torch.from_numpy(pose_coord).unsqueeze(0)) + pose_scores.append(torch.from_numpy(pose_score).unsqueeze(0)) + + preds_img = torch.cat(pose_coords) + preds_scores = torch.cat(pose_scores) + + boxes, scores, ids, preds_img, preds_scores, pick_ids = \ + pose_nms(boxes, scores, ids, preds_img, preds_scores, 0) + + _result = [] + for k in range(len(scores)): + _result.append( + { + 'keypoints': preds_img[k], + 'kp_score': preds_scores[k], + 'proposal_score': torch.mean(preds_scores[k]) + scores[k] + 1.25 * max(preds_scores[k]), + 'idx': ids[k], + 'bbox': [boxes[k][0], boxes[k][1], boxes[k][2] - boxes[k][0], boxes[k][3] - boxes[k][1]] + } + ) + return _result + + def heatmap_to_coord(self, hms, bbox, hms_flip=None, **kwargs): + if hms_flip is not None: + hms = (hms + hms_flip) / 2 + if not isinstance(hms, np.ndarray): + hms = hms.cpu().data.numpy() + coords, maxvals = get_max_pred(hms) + + hm_h = hms.shape[1] + hm_w = hms.shape[2] + + # post-processing + for p in range(coords.shape[0]): + hm = hms[p] + px = int(round(float(coords[p][0]))) + py = int(round(float(coords[p][1]))) + if 1 < px < hm_w - 1 and 1 < py < hm_h - 1: + diff = np.array((hm[py][px + 1] - hm[py][px - 1], + hm[py + 1][px] - hm[py - 1][px])) + coords[p] += np.sign(diff) * .25 + + preds = np.zeros_like(coords) + + # transform bbox to scale + xmin, ymin, xmax, ymax = bbox + w = xmax - xmin + h = ymax - ymin + center = np.array([xmin + w * 0.5, ymin + h * 0.5]) + scale = np.array([w, h]) + # Transform back + for i in range(coords.shape[0]): + preds[i] = transform_preds(coords[i], center, scale, + [hm_w, hm_h]) + + return preds, maxvals diff --git a/libs/detectors/x86/alphapose/builders/builder.py b/libs/detectors/x86/alphapose/builders/builder.py index 36c3c191..dab9086a 100644 --- a/libs/detectors/x86/alphapose/builders/builder.py +++ b/libs/detectors/x86/alphapose/builders/builder.py @@ -12,12 +12,11 @@ def 
build_sppe_model(cfg, preset_cfg): return FastPose(**args) -def build_detection_model(opt): - cfg = edict() - cfg.CONFIG = 'detectors/yolo/cfg/yolov3-spp.cfg' - cfg.WEIGHTS = 'detectors/yolo/data/yolov3-spp.weights' - cfg.INP_DIM = 608 - cfg.NMS_THRES = 0.6 - cfg.CONFIDENCE = 0.1 - cfg.NUM_CLASSES = 80 - return YoloWrapper(cfg, opt) +def build_detection_model(name, config): + detector_name = name.split("_")[-1] + if detector_name == "ssd": + from libs.detectors.x86 import mobilenet_ssd + detector = mobilenet_ssd.Detector(config=config) + else: + raise ValueError('Not supported detector named: ', name, ' for AlphaPose.') + return detector From 4744f6b93bc3db2755281f2d001bf217ee62cbd3 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Thu, 17 Dec 2020 13:08:22 -0800 Subject: [PATCH 04/13] add wrappers skeleton --- libs/detectors/x86/alphapose/alphapose.py | 1 + libs/detectors/x86/alphapose/wrappers.py | 6 ++++++ 2 files changed, 7 insertions(+) create mode 100644 libs/detectors/x86/alphapose/wrappers.py diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index 0e147e73..4eaaf8a8 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -3,6 +3,7 @@ from utils.bbox import box_to_center_scale, center_scale_to_box from utils.pose_nms import pose_nms from utils.transformations import get_affine_transform, transform_preds, im_to_torch, get_max_pred +from wrappers import prepare_detection_results, prepare_poses_results import torch import cv2 diff --git a/libs/detectors/x86/alphapose/wrappers.py b/libs/detectors/x86/alphapose/wrappers.py new file mode 100644 index 00000000..9df0c25a --- /dev/null +++ b/libs/detectors/x86/alphapose/wrappers.py @@ -0,0 +1,6 @@ +def prepare_detection_results(detections): + pass + + +def prepare_poses_results(poses): + pass From 1ce8f1610acb792caeb28565dbacec00515a03f1 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sun, 20 Dec 2020 07:26:47 -0800 Subject: [PATCH 05/13] add alphapose wrappers --- libs/detectors/x86/alphapose/alphapose.py | 5 ++- libs/detectors/x86/alphapose/wrappers.py | 49 +++++++++++++++++++++-- 2 files changed, 48 insertions(+), 6 deletions(-) diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index 4eaaf8a8..d9a2376a 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -15,6 +15,7 @@ class Detector: def __init__(self, config): self.config = config self.name = config.get_section_dict('Detector')['Name'] + self.w, self.h, _ = [int(i) for i in self.config.get_section_dict('Detector')['ImageSize'].split(',')] self.cfg = config_parser.parse("configs/config.yaml") self.device = torch.device("cuda" if config.get_section_dict('Detector')['Gpu'] else "cpu") self._input_size = self.cfg.DATA_PRESET.IMAGE_SIZE @@ -40,14 +41,14 @@ def load_model(self): def inference(self, image): detections = self.detection_model.inference(image) # TODO - detections = prepare_detection_results(detections) + detections = prepare_detection_results(detections, self.w, self.h) with torch.no_grad(): inps, cropped_boxes, boxes, scores, ids = self.transform_detections(image, detections) inps = inps.to(self.device) hm = self.pose_model(inps) poses = self.post_process(hm, cropped_boxes, boxes, scores, ids) # TODO - results = prepare_poses_results(poses) + results = prepare_poses_results(poses, self.w, self.h, scores) return results def transform_detections(self, image, dets): diff --git 
a/libs/detectors/x86/alphapose/wrappers.py b/libs/detectors/x86/alphapose/wrappers.py index 9df0c25a..a6f0b525 100644 --- a/libs/detectors/x86/alphapose/wrappers.py +++ b/libs/detectors/x86/alphapose/wrappers.py @@ -1,6 +1,47 @@ -def prepare_detection_results(detections): - pass +import numpy as np +import torch -def prepare_poses_results(poses): - pass +def prepare_detection_results(object_list, w, h): + scale_factors = torch.tensor([w, h, w, h]) + num_of_objects = len(object_list) + output = torch.zeros(num_of_objects, 8, dtype=torch.float32) + output[:, 6] = 0.99 + for i, obj in enumerate(object_list): + bbox = torch.tensor([obj["bbox"][1], obj["bbox"][0], obj["bbox"][3], obj["bbox"][2]]) + bbox_scaled = bbox * scale_factors + output[i, 1:5] = bbox_scaled + output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, w) + output[i, [4, 2]] = torch.clamp(output[i, [2, 4]], 0.0, h) + output[i, 5] = obj["score"] + + return output + + +def prepare_poses_results(poses, w, h, scores): + scales = np.array([h, w, h, w]) + results = [] + for i, item in enumerate(poses): + object_dict = dict() + bboxes = np.array([item["bbox"][1], item["bbox"][0], item["bbox"][3], item["bbox"][2]]) + bboxes_scaled = np.divide(bboxes, scales) + object_dict["id"] = "1-" + str(i) + object_dict["bbox"] = bboxes_scaled.tolist() + object_dict["score"] = scores[i].item() + kp_scores = item["kp_score"].numpy() + keypoints = item["keypoints"] + if np.all(kp_scores[[0, 1, 2, 5, 6]] > 0.15): + x_min_face = int(keypoints[6, 0]) + x_max_face = int(keypoints[5, 0]) + y_max_face = int((keypoints[5, 1] + keypoints[6, 1]) / 2) + y_eyes = int((keypoints[1, 1] + keypoints[2, 1]) / 2) + y_min_face = 2 * y_eyes - y_max_face + if (y_max_face - y_min_face > 0) and (x_max_face - x_min_face > 0): + h_crop = y_max_face - y_min_face + x_min_face = int(max(0, x_min_face - 0.1 * h_crop)) + y_min_face = int(max(0, y_min_face - 0.1 * h_crop)) + x_max_face = int(min(w, x_min_face + 1.1 * h_crop)) + y_max_face = int(min(h, y_min_face + 1.1 * h_crop)) + object_dict["face"] = [y_min_face / h, x_min_face / w, y_max_face / h, x_max_face / w] + results.append(object_dict) + return results From dfbce752b27a2eff2e4e07d2d65de6af613d2e22 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 11:10:49 -0800 Subject: [PATCH 06/13] alphapose tested --- libs/classifiers/classifier.py | 9 +- libs/detectors/x86/alphapose/alphapose.py | 18 +- .../x86/alphapose/builders/builder.py | 3 +- libs/detectors/x86/alphapose/setup.py | 204 ++++++++++++++++++ .../x86/alphapose/utils/config_parser.py | 5 +- libs/detectors/x86/alphapose/wrappers.py | 9 +- libs/detectors/x86/detector.py | 3 + x86-alphapose-gpu.Dockerfile | 103 +++++++++ 8 files changed, 340 insertions(+), 14 deletions(-) create mode 100644 libs/detectors/x86/alphapose/setup.py create mode 100644 x86-alphapose-gpu.Dockerfile diff --git a/libs/classifiers/classifier.py b/libs/classifiers/classifier.py index d1085d79..4bac0901 100644 --- a/libs/classifiers/classifier.py +++ b/libs/classifiers/classifier.py @@ -18,7 +18,8 @@ def inference(self, objects): return self.classifier.inference(objects) def object_post_process(self, object, classifier_result, classifier_score): - if object['face'] is not None and classifier_score > self.min_threshold: - object['face_label'] = classifier_result - else: - object['face_label'] = -1 + if 'face' in object.keys(): + if object['face'] is not None and classifier_score > self.min_threshold: + object['face_label'] = classifier_result + else: + object['face_label'] = -1 
diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index d9a2376a..567c5436 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -1,3 +1,5 @@ +import sys +sys.path.append("libs/detectors/x86/alphapose") from utils import config_parser from builders import builder from utils.bbox import box_to_center_scale, center_scale_to_box @@ -9,6 +11,8 @@ import cv2 import numpy as np import pathlib +import time +from libs.detectors.utils.fps_calculator import convert_infr_time_to_fps class Detector: @@ -16,14 +20,15 @@ def __init__(self, config): self.config = config self.name = config.get_section_dict('Detector')['Name'] self.w, self.h, _ = [int(i) for i in self.config.get_section_dict('Detector')['ImageSize'].split(',')] - self.cfg = config_parser.parse("configs/config.yaml") - self.device = torch.device("cuda" if config.get_section_dict('Detector')['Gpu'] else "cpu") + self.cfg = config_parser.parse("libs/detectors/x86/alphapose/configs/config.yaml") + self.device = torch.device("cuda" if config.get_section_dict('Detector')['Device'].endswith("gpu") else "cpu") self._input_size = self.cfg.DATA_PRESET.IMAGE_SIZE self.load_model() self.detection_model = builder.build_detection_model(self.name, config) self._aspect_ratio = float(self._input_size[1]) / self._input_size[0] self.hm_size = self.cfg.DATA_PRESET.HEATMAP_SIZE self.eval_joints = list(range(self.cfg.DATA_PRESET.NUM_JOINTS)) + self.fps = None def load_model(self): # TODO: add download checkpoint script @@ -39,7 +44,10 @@ def load_model(self): self.pose_model.eval() def inference(self, image): + t_begin = time.perf_counter() detections = self.detection_model.inference(image) + if len(detections) == 0: + return [] # TODO detections = prepare_detection_results(detections, self.w, self.h) with torch.no_grad(): @@ -47,8 +55,10 @@ def inference(self, image): inps = inps.to(self.device) hm = self.pose_model(inps) poses = self.post_process(hm, cropped_boxes, boxes, scores, ids) + inference_time = time.perf_counter() - t_begin + self.fps = convert_infr_time_to_fps(inference_time) # TODO - results = prepare_poses_results(poses, self.w, self.h, scores) + results = prepare_poses_results(poses, self.w, self.h, scores) return results def transform_detections(self, image, dets): @@ -111,7 +121,7 @@ def post_process(self, hm, cropped_boxes, boxes, scores, ids): 'kp_score': preds_scores[k], 'proposal_score': torch.mean(preds_scores[k]) + scores[k] + 1.25 * max(preds_scores[k]), 'idx': ids[k], - 'bbox': [boxes[k][0], boxes[k][1], boxes[k][2] - boxes[k][0], boxes[k][3] - boxes[k][1]] + 'bbox': [boxes[k][0], boxes[k][1], boxes[k][2], boxes[k][3]] } ) return _result diff --git a/libs/detectors/x86/alphapose/builders/builder.py b/libs/detectors/x86/alphapose/builders/builder.py index dab9086a..df1250f1 100644 --- a/libs/detectors/x86/alphapose/builders/builder.py +++ b/libs/detectors/x86/alphapose/builders/builder.py @@ -1,6 +1,5 @@ from models.fastpose import FastPose -from detectors.yolo_wrapper import YoloWrapper -from easydict import EasyDict as edict + def build_sppe_model(cfg, preset_cfg): args = cfg.copy() diff --git a/libs/detectors/x86/alphapose/setup.py b/libs/detectors/x86/alphapose/setup.py new file mode 100644 index 00000000..e02ecf17 --- /dev/null +++ b/libs/detectors/x86/alphapose/setup.py @@ -0,0 +1,204 @@ +import os +import platform +import subprocess +import time + +import numpy as np +from Cython.Build import cythonize +from setuptools import 
Extension, find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+MAJOR = 0
+MINOR = 3
+PATCH = 0
+SUFFIX = ''
+SHORT_VERSION = '{}.{}.{}{}'.format(MAJOR, MINOR, PATCH, SUFFIX)
+
+# version_file = 'alphapose/version.py'
+
+
+# def readme():
+#     with open('README.md') as f:
+#         content = f.read()
+#     return content
+
+
+# def get_git_hash():
+#
+#     def _minimal_ext_cmd(cmd):
+#         # construct minimal environment
+#         env = {}
+#         for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+#             v = os.environ.get(k)
+#             if v is not None:
+#                 env[k] = v
+#         # LANGUAGE is used on win32
+#         env['LANGUAGE'] = 'C'
+#         env['LANG'] = 'C'
+#         env['LC_ALL'] = 'C'
+#         out = subprocess.Popen(
+#             cmd, stdout=subprocess.PIPE, env=env).communicate()[0]
+#         return out
+#
+#     try:
+#         out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
+#         sha = out.strip().decode('ascii')
+#     except OSError:
+#         sha = 'unknown'
+#
+#     return sha
+
+
+# def get_hash():
+#     if os.path.exists('.git'):
+#         sha = get_git_hash()[:7]
+#     elif os.path.exists(version_file):
+#         try:
+#             from alphapose.version import __version__
+#             sha = __version__.split('+')[-1]
+#         except ImportError:
+#             raise ImportError('Unable to get git version')
+#     else:
+#         sha = 'unknown'
+#
+#     return sha
+
+
+# def write_version_py():
+#     content = """# GENERATED VERSION FILE
+# # TIME: {}
+#
+# __version__ = '{}'
+# short_version = '{}'
+# """
+#     sha = get_hash()
+#     VERSION = SHORT_VERSION + '+' + sha
+#
+#     with open(version_file, 'w') as f:
+#         f.write(content.format(time.asctime(), VERSION, SHORT_VERSION))
+#
+#
+# def get_version():
+#     with open(version_file, 'r') as f:
+#         exec(compile(f.read(), version_file, 'exec'))
+#     return locals()['__version__']
+
+
+def make_cython_ext(name, module, sources):
+    extra_compile_args = None
+    if platform.system() != 'Windows':
+        extra_compile_args = {
+            'cxx': ['-Wno-unused-function', '-Wno-write-strings']
+        }
+
+    extension = Extension(
+        '{}.{}'.format(module, name),
+        [os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=[np.get_include()],
+        language='c++',
+        extra_compile_args=extra_compile_args)
+    extension, = cythonize(extension)
+    return extension
+
+
+def make_cuda_ext(name, module, sources):
+
+    return CUDAExtension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        extra_compile_args={
+            'cxx': [],
+            'nvcc': [
+                '-D__CUDA_NO_HALF_OPERATORS__',
+                '-D__CUDA_NO_HALF_CONVERSIONS__',
+                '-D__CUDA_NO_HALF2_OPERATORS__',
+            ]
+        })
+
+
+def get_ext_modules():
+    ext_modules = []
+    # Only Windows with Visual Studio 2013+ supports compiling the C/CUDA extensions.
+    # If you force compiling the extensions on Windows and ensure an appropriate Visual Studio
+    # is installed, you can try to use these ext_modules.
+ force_compile = False + if platform.system() != 'Windows' or force_compile: + ext_modules = [ + make_cuda_ext( + name='deform_conv_cuda', + module='models.layers.dcn', + sources=[ + 'src/deform_conv_cuda.cpp', + 'src/deform_conv_cuda_kernel.cu' + ]), + make_cuda_ext( + name='deform_pool_cuda', + module='models.layers.dcn', + sources=[ + 'src/deform_pool_cuda.cpp', + 'src/deform_pool_cuda_kernel.cu' + ]), + ] + return ext_modules + + +def get_install_requires(): + install_requires = [ + 'pillow>=6.2.0', 'six', 'terminaltables', 'visdom', + 'tqdm', 'easydict', + 'pyyaml', + 'munkres', 'timm==0.1.20', 'natsort' + ] + # official pycocotools doesn't support Windows, we will install it by third-party git repository later + if platform.system() != 'Windows': + install_requires.append('pycocotools') + return install_requires + + +def is_installed(package_name): + from pip._internal.utils.misc import get_installed_distributions + for p in get_installed_distributions(): + if package_name in p.egg_name(): + return True + return False + + +if __name__ == '__main__': + # write_version_py() + setup( + name='alphapose', + version="0.1", + description='Code for AlphaPose', + long_description=" ", + keywords='computer vision, human pose estimation', + url='https://github.com/MVIG-SJTU/AlphaPose', + packages=find_packages(exclude=('data', 'exp',)), + package_data={'': ['*.json', '*.txt']}, + classifiers=[ + 'Development Status :: 4 - Beta', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + ], + license='GPLv3', + python_requires=">=3", + setup_requires=['pytest-runner', 'numpy', 'cython'], + tests_require=['pytest'], + install_requires=get_install_requires(), + ext_modules=get_ext_modules(), + cmdclass={'build_ext': BuildExtension}, + zip_safe=False) + # Windows need pycocotools here: https://github.com/philferriere/cocoapi#subdirectory=PythonAPI + if platform.system() == 'Windows' and not is_installed('pycocotools'): + print("\nInstall third-party pycocotools for Windows...") + cmd = 'python -m pip install git+https://github.com/philferriere/cocoapi.git#subdirectory=PythonAPI' + os.system(cmd) + if not is_installed('cython_bbox'): + print("\nInstall `cython_bbox`...") + cmd = 'python3 -m pip install git+https://github.com/yanfengliu/cython_bbox.git' + os.system(cmd) diff --git a/libs/detectors/x86/alphapose/utils/config_parser.py b/libs/detectors/x86/alphapose/utils/config_parser.py index 92be5ddb..8030c093 100644 --- a/libs/detectors/x86/alphapose/utils/config_parser.py +++ b/libs/detectors/x86/alphapose/utils/config_parser.py @@ -1,7 +1,10 @@ +import sys +sys.path.append("libs/detectors/x86/alphapose") import yaml from easydict import EasyDict as edict def parse(config_file): with open(config_file) as f: config = edict(yaml.load(f, Loader=yaml.FullLoader)) - return config \ No newline at end of file + return config + diff --git a/libs/detectors/x86/alphapose/wrappers.py b/libs/detectors/x86/alphapose/wrappers.py index a6f0b525..5ad2d60d 100644 --- a/libs/detectors/x86/alphapose/wrappers.py +++ b/libs/detectors/x86/alphapose/wrappers.py @@ -9,11 +9,12 @@ def prepare_detection_results(object_list, w, h): output[:, 6] = 0.99 for i, obj in enumerate(object_list): bbox = torch.tensor([obj["bbox"][1], 
obj["bbox"][0], obj["bbox"][3], obj["bbox"][2]]) - bbox_scaled = bbox * scale_factors + bbox_scaled = (bbox.float() * scale_factors.float()) output[i, 1:5] = bbox_scaled output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, w) - output[i, [4, 2]] = torch.clamp(output[i, [2, 4]], 0.0, h) - output[i, 5] = obj["score"] + output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, h) + # TODO + output[i, 5] = float(obj["score"].numpy()) return output @@ -28,6 +29,7 @@ def prepare_poses_results(poses, w, h, scores): object_dict["id"] = "1-" + str(i) object_dict["bbox"] = bboxes_scaled.tolist() object_dict["score"] = scores[i].item() + object_dict["face"] = None kp_scores = item["kp_score"].numpy() keypoints = item["keypoints"] if np.all(kp_scores[[0, 1, 2, 5, 6]] > 0.15): @@ -43,5 +45,6 @@ def prepare_poses_results(poses, w, h, scores): x_max_face = int(min(w, x_min_face + 1.1 * h_crop)) y_max_face = int(min(h, y_min_face + 1.1 * h_crop)) object_dict["face"] = [y_min_face / h, x_min_face / w, y_max_face / h, x_max_face / w] + results.append(object_dict) return results diff --git a/libs/detectors/x86/detector.py b/libs/detectors/x86/detector.py index 10d64878..4137cc45 100644 --- a/libs/detectors/x86/detector.py +++ b/libs/detectors/x86/detector.py @@ -25,6 +25,9 @@ def __init__(self, config): elif self.name == "openpifpaf_tensorrt": from libs.detectors.x86.openpifpaf_tensorrt import openpifpaf_tensorrt self.net = openpifpaf_tensorrt.Detector(self.config) + elif self.name.startswith("alphapose"): + from libs.detectors.x86.alphapose import alphapose + self.net = alphapose.Detector(self.config) else: raise ValueError('Not supported network named: ', self.name) diff --git a/x86-alphapose-gpu.Dockerfile b/x86-alphapose-gpu.Dockerfile new file mode 100644 index 00000000..7563ca11 --- /dev/null +++ b/x86-alphapose-gpu.Dockerfile @@ -0,0 +1,103 @@ +FROM nvcr.io/nvidia/tensorflow:20.03-tf2-py3 +#RUN apt-get update && apt-get install -y python3-dev && conda update -y wrapt && pip3 install tensorflow==2.2 openpifpaf wget + +#RUN ln -s /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so.10.2 /usr/lib/x86_64-linux-gnu/libcudart.so.10.1 +#RUN pip uninstall python3-opencv python-opencv +# The `python3-opencv` package isn't built with gstreamer on Ubuntu. So we need to manually build opencv. 
+ARG OPENCV_VERSION=4.3.0 +# http://amritamaz.net/blog/opencv-config +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + curl \ + git \ + gstreamer1.0-plugins-bad \ + gstreamer1.0-plugins-good \ + gstreamer1.0-plugins-ugly \ + gstreamer1.0-vaapi \ + libavcodec-dev \ + libavformat-dev \ + libgstreamer-plugins-base1.0-dev \ + libgstreamer1.0-dev \ + libsm6 \ + libswscale-dev \ + libxext6 \ + libxrender-dev \ + mesa-va-drivers \ + python3-dev \ + python3-numpy \ + && rm -rf /var/lib/apt/lists/* \ + && cd /tmp/ \ + && curl -L https://github.com/opencv/opencv/archive/${OPENCV_VERSION}.tar.gz -o opencv.tar.gz \ + && tar zxvf opencv.tar.gz && rm opencv.tar.gz \ + && cd /tmp/opencv-${OPENCV_VERSION} \ + && mkdir build \ + && cd build \ + && cmake \ + -DBUILD_opencv_python3=yes \ + -DPYTHON_EXECUTABLE=$(which python3) \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DBUILD_TESTS=OFF \ + -DBUILD_PERF_TESTS=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DINSTALL_TESTS=OFF \ + -DBUILD_opencv_apps=OFF \ + -DBUILD_DOCS=OFF \ + ../ \ + && make -j$(nproc) \ + && make install \ + && cd /tmp \ + && rm -rf opencv-${OPENCV_VERSION} \ + && apt-get purge -y \ + cmake \ + libgstreamer-plugins-base1.0-dev \ + libgstreamer1.0-dev \ + libxrender-dev \ + python3-dev \ + && apt-get autoremove -y + +# https://askubuntu.com/questions/909277/avoiding-user-interaction-with-tzdata-when-installing-certbot-in-a-docker-contai +ARG DEBIAN_FRONTEND=noninteractive + +COPY api/requirements.txt / + +RUN apt-get update && apt-get install -y --no-install-recommends \ + tzdata \ + pkg-config \ + python3-dev \ + python3-numpy \ + python3-pillow \ + python3-pip \ + python3-scipy \ + python3-wget \ + supervisor \ + && rm -rf /var/lib/apt/lists/* \ + && python3 -m pip install --upgrade pip setuptools==41.0.0 && pip install -r /requirements.txt \ + && apt-get purge -y \ + python3-dev \ + && apt-get autoremove -y + +RUN apt-get update && apt-get install -y python3-dev +ENV DEV_ALLOW_ALL_ORIGINS=true +ENV AWS_SHARED_CREDENTIALS_FILE=/repo/.aws/credentials +ENV AWS_CONFIG_FILE=/repo/.aws/config +ENV TF_FORCE_GPU_ALLOW_GROWTH=true +ENV CONFIG_FILE=config-x86-gpu.ini +RUN python3 -m site --user-site > /root/tmp_variable && DCN_PATH=$(cat /root/tmp_variable) +COPY . 
/repo +RUN pip3 install torch==1.2 torchvision==0.4.0 +#COPY ./libs/detectors/x86/alphapose/ /alphapose_packages +#COPY ./libs/detectors/x86/alphapose/setup.py /alphapose_packages/setup.py +WORKDIR /repo/libs/detectors/x86/alphapose +#ENV PYTHONPATH=/alphapose_packages +# +#RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ +#cd $(cat /root/tmp_variable)/alphapose_package && \ +RUN apt-get update && apt-get install -y libyaml-dev && pip3 install cython && python3 setup.py build develop --user +RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ +cp /repo/libs/detectors/x86/alphapose/models/layers/dcn/*.so $(cat /root/tmp_variable)/alphapose_package +#mv /repo/libs/detectors/x86/alphapose/setup.py $(cat /root/tmp_variable)/alphapose_package + +WORKDIR /repo +HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash +CMD supervisord -c supervisord.conf -n From cd65485999b246498e706d7903078200e06ed04d Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 11:23:16 -0800 Subject: [PATCH 07/13] add alphapose models module --- .../x86/alphapose/models/__init__.py | 0 .../x86/alphapose/models/fastpose.py | 61 ++ .../x86/alphapose/models/layers/DUC.py | 29 + .../x86/alphapose/models/layers/SE_Resnet.py | 231 +++++ .../x86/alphapose/models/layers/__init__.py | 0 .../x86/alphapose/models/layers/dcn/DCN.py | 61 ++ .../alphapose/models/layers/dcn/__init__.py | 13 + .../models/layers/dcn/deform_conv.py | 337 +++++++ .../models/layers/dcn/deform_pool.py | 252 +++++ .../layers/dcn/src/deform_conv_cuda.cpp | 701 ++++++++++++++ .../layers/dcn/src/deform_conv_cuda_kernel.cu | 866 ++++++++++++++++++ .../layers/dcn/src/deform_pool_cuda.cpp | 90 ++ .../layers/dcn/src/deform_pool_cuda_kernel.cu | 364 ++++++++ .../detectors/x86/alphapose/utils/__init__.py | 0 14 files changed, 3005 insertions(+) create mode 100644 libs/detectors/x86/alphapose/models/__init__.py create mode 100644 libs/detectors/x86/alphapose/models/fastpose.py create mode 100644 libs/detectors/x86/alphapose/models/layers/DUC.py create mode 100644 libs/detectors/x86/alphapose/models/layers/SE_Resnet.py create mode 100644 libs/detectors/x86/alphapose/models/layers/__init__.py create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/DCN.py create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/__init__.py create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/deform_conv.py create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/deform_pool.py create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda.cpp create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda_kernel.cu create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda.cpp create mode 100644 libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda_kernel.cu create mode 100644 libs/detectors/x86/alphapose/utils/__init__.py diff --git a/libs/detectors/x86/alphapose/models/__init__.py b/libs/detectors/x86/alphapose/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libs/detectors/x86/alphapose/models/fastpose.py b/libs/detectors/x86/alphapose/models/fastpose.py new file mode 100644 index 00000000..cca1dcb4 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/fastpose.py @@ -0,0 +1,61 @@ +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. 
+# Written by Jiefeng Li (jeff.lee.sjtu@gmail.com) +# ----------------------------------------------------- + +import torch.nn as nn + +from .layers.DUC import DUC +from .layers.SE_Resnet import SEResnet + + +class FastPose(nn.Module): + conv_dim = 128 + + def __init__(self, norm_layer=nn.BatchNorm2d, **cfg): + super(FastPose, self).__init__() + self._preset_cfg = cfg['PRESET'] + + if 'DCN' in cfg.keys(): + stage_with_dcn = cfg['STAGE_WITH_DCN'] + dcn = cfg['DCN'] + self.preact = SEResnet( + f"resnet{cfg['NUM_LAYERS']}", dcn=dcn, stage_with_dcn=stage_with_dcn) + else: + self.preact = SEResnet(f"resnet{cfg['NUM_LAYERS']}") + + # Imagenet pretrain model + import torchvision.models as tm # noqa: F401,F403 + assert cfg['NUM_LAYERS'] in [18, 34, 50, 101, 152] + x = eval(f"tm.resnet{cfg['NUM_LAYERS']}(pretrained=True)") + + model_state = self.preact.state_dict() + state = {k: v for k, v in x.state_dict().items() + if k in self.preact.state_dict() and v.size() == self.preact.state_dict()[k].size()} + model_state.update(state) + self.preact.load_state_dict(model_state) + + self.suffle1 = nn.PixelShuffle(2) + self.duc1 = DUC(512, 1024, upscale_factor=2, norm_layer=norm_layer) + self.duc2 = DUC(256, 512, upscale_factor=2, norm_layer=norm_layer) + + self.conv_out = nn.Conv2d( + self.conv_dim, self._preset_cfg['NUM_JOINTS'], kernel_size=3, stride=1, padding=1) + + def forward(self, x): + out = self.preact(x) + out = self.suffle1(out) + out = self.duc1(out) + out = self.duc2(out) + + out = self.conv_out(out) + return out + + def _initialize(self): + for m in self.conv_out.modules(): + if isinstance(m, nn.Conv2d): + # nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + # logger.info('=> init {}.weight as normal(0, 0.001)'.format(name)) + # logger.info('=> init {}.bias as 0'.format(name)) + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) diff --git a/libs/detectors/x86/alphapose/models/layers/DUC.py b/libs/detectors/x86/alphapose/models/layers/DUC.py new file mode 100644 index 00000000..96993bb0 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/DUC.py @@ -0,0 +1,29 @@ +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. +# Written by Jiefeng Li (jeff.lee.sjtu@gmail.com) +# ----------------------------------------------------- + +import torch.nn as nn + + +class DUC(nn.Module): + ''' + Initialize: inplanes, planes, upscale_factor + OUTPUT: (planes // upscale_factor^2) * ht * wd + ''' + + def __init__(self, inplanes, planes, + upscale_factor=2, norm_layer=nn.BatchNorm2d): + super(DUC, self).__init__() + self.conv = nn.Conv2d( + inplanes, planes, kernel_size=3, padding=1, bias=False) + self.bn = norm_layer(planes, momentum=0.1) + self.relu = nn.ReLU(inplace=True) + self.pixel_shuffle = nn.PixelShuffle(upscale_factor) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + x = self.pixel_shuffle(x) + return x diff --git a/libs/detectors/x86/alphapose/models/layers/SE_Resnet.py b/libs/detectors/x86/alphapose/models/layers/SE_Resnet.py new file mode 100644 index 00000000..73568f53 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/SE_Resnet.py @@ -0,0 +1,231 @@ +# ----------------------------------------------------- +# Copyright (c) Shanghai Jiao Tong University. All rights reserved. 
+# Written by Jiefeng Li (jeff.lee.sjtu@gmail.com) +# ----------------------------------------------------- + +import torch.nn as nn +import torch.nn.functional as F + +from .dcn import DeformConv, ModulatedDeformConv + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + +class SELayer(nn.Module): + def __init__(self, channel, reduction=1): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), + nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, + reduction=False, norm_layer=nn.BatchNorm2d): + super(BasicBlock, self).__init__() + + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + if reduction: + self.se = SELayer(planes) + self.reduc = reduction + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.reduc: + out = self.se(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, + downsample=None, reduction=False, + norm_layer=nn.BatchNorm2d, + dcn=None): + super(Bottleneck, self).__init__() + self.dcn = dcn + self.with_dcn = dcn is not None + + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(planes, momentum=0.1) + if self.with_dcn: + fallback_on_stride = dcn.get('FALLBACK_ON_STRIDE', False) + self.with_modulated_dcn = dcn.get('MODULATED', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, + padding=1, bias=False) + else: + self.deformable_groups = dcn.get('DEFORM_GROUP', 1) + if not self.with_modulated_dcn: + conv_op = DeformConv + offset_channels = 18 + else: + conv_op = ModulatedDeformConv + offset_channels = 27 + + self.conv2_offset = nn.Conv2d( + planes, + self.deformable_groups * offset_channels, + kernel_size=3, + stride=stride, + padding=1) + self.conv2 = conv_op( + planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + deformable_groups=self.deformable_groups, + bias=False) + + self.bn2 = norm_layer(planes, momentum=0.1) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = norm_layer(planes * 4, momentum=0.1) + if reduction: + self.se = SELayer(planes * 4) + + self.reduc = reduction + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = F.relu(self.bn1(self.conv1(x)), inplace=True) + if not self.with_dcn: + out = F.relu(self.bn2(self.conv2(out)), inplace=True) + elif self.with_modulated_dcn: + offset_mask = self.conv2_offset(out) + offset = 
offset_mask[:, :18 * self.deformable_groups, :, :] + mask = offset_mask[:, -9 * self.deformable_groups:, :, :] + mask = mask.sigmoid() + out = F.relu(self.bn2(self.conv2(out, offset, mask))) + else: + offset = self.conv2_offset(out) + out = F.relu(self.bn2(self.conv2(out, offset)), inplace=True) + + out = self.conv3(out) + out = self.bn3(out) + if self.reduc: + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = F.relu(out) + + return out + + +class SEResnet(nn.Module): + """ SEResnet """ + + def __init__(self, architecture, norm_layer=nn.BatchNorm2d, + dcn=None, stage_with_dcn=(False, False, False, False)): + super(SEResnet, self).__init__() + self._norm_layer = norm_layer + assert architecture in ["resnet18", "resnet50", "resnet101", 'resnet152'] + layers = { + 'resnet18': [2, 2, 2, 2], + 'resnet34': [3, 4, 6, 3], + 'resnet50': [3, 4, 6, 3], + 'resnet101': [3, 4, 23, 3], + 'resnet152': [3, 8, 36, 3], + } + self.inplanes = 64 + if architecture == "resnet18" or architecture == 'resnet34': + self.block = BasicBlock + else: + self.block = Bottleneck + self.layers = layers[architecture] + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, + stride=2, padding=3, bias=False) + self.bn1 = norm_layer(64, eps=1e-5, momentum=0.1, affine=True) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + stage_dcn = [dcn if with_dcn else None for with_dcn in stage_with_dcn] + + self.layer1 = self.make_layer(self.block, 64, self.layers[0], dcn=stage_dcn[0]) + self.layer2 = self.make_layer( + self.block, 128, self.layers[1], stride=2, dcn=stage_dcn[1]) + self.layer3 = self.make_layer( + self.block, 256, self.layers[2], stride=2, dcn=stage_dcn[2]) + + self.layer4 = self.make_layer( + self.block, 512, self.layers[3], stride=2, dcn=stage_dcn[3]) + + def forward(self, x): + x = self.maxpool(self.relu(self.bn1(self.conv1(x)))) # 64 * h/4 * w/4 + x = self.layer1(x) # 256 * h/4 * w/4 + x = self.layer2(x) # 512 * h/8 * w/8 + x = self.layer3(x) # 1024 * h/16 * w/16 + x = self.layer4(x) # 2048 * h/32 * w/32 + return x + + def stages(self): + return [self.layer1, self.layer2, self.layer3, self.layer4] + + def make_layer(self, block, planes, blocks, stride=1, dcn=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(self.inplanes, planes * block.expansion, + kernel_size=1, stride=stride, bias=False), + self._norm_layer(planes * block.expansion, momentum=0.1), + ) + + layers = [] + if downsample is not None: + layers.append(block(self.inplanes, planes, + stride, downsample, reduction=True, + norm_layer=self._norm_layer, dcn=dcn)) + else: + layers.append(block(self.inplanes, planes, stride, downsample, + norm_layer=self._norm_layer, dcn=dcn)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, + norm_layer=self._norm_layer, dcn=dcn)) + + return nn.Sequential(*layers) diff --git a/libs/detectors/x86/alphapose/models/layers/__init__.py b/libs/detectors/x86/alphapose/models/layers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/DCN.py b/libs/detectors/x86/alphapose/models/layers/dcn/DCN.py new file mode 100644 index 00000000..6848705d --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/DCN.py @@ -0,0 +1,61 @@ +# ----------------------------------------------------- +# Copyright (c) Shanghai 
Jiao Tong University. All rights reserved. +# Written by Jiefeng Li (jeff.lee.sjtu@gmail.com) +# ----------------------------------------------------- + +import torch.nn as nn + +from . import DeformConv, ModulatedDeformConv + + +class DCN(nn.Module): + ''' + Initialize: inplanes, planes, upscale_factor + OUTPUT: (planes // upscale_factor^2) * ht * wd + ''' + + def __init__(self, inplanes, planes, dcn, + kernel_size, stride=1, + padding=0, bias=False): + super(DCN, self).__init__() + fallback_on_stride = dcn.get('FALLBACK_ON_STRIDE', False) + self.with_modulated_dcn = dcn.get('MODULATED', False) + if fallback_on_stride: + self.conv = nn.Conv2d(inplanes, planes, kernel_size=kernel_size, stride=stride, + padding=padding, bias=bias) + else: + self.deformable_groups = dcn.get('DEFORM_GROUP', 1) + if not self.with_modulated_dcn: + conv_op = DeformConv + offset_channels = 18 + else: + conv_op = ModulatedDeformConv + offset_channels = 27 + + self.conv_offset = nn.Conv2d( + inplanes, + self.deformable_groups * offset_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding) + self.conv = conv_op( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + deformable_groups=self.deformable_groups, + bias=bias) + + def forward(self, x): + if self.with_modulated_dcn: + offset_mask = self.conv_offset(x) + offset = offset_mask[:, :18 * self.deformable_groups, :, :] + mask = offset_mask[:, -9 * self.deformable_groups:, :, :] + mask = mask.sigmoid() + out = self.conv(x, offset, mask) + else: + offset = self.conv_offset(x) + out = self.conv(x, offset) + + return out diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/__init__.py b/libs/detectors/x86/alphapose/models/layers/dcn/__init__.py new file mode 100644 index 00000000..b0d986ca --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/__init__.py @@ -0,0 +1,13 @@ +from .deform_conv import (DeformConv, DeformConvPack, ModulatedDeformConv, + ModulatedDeformConvPack, deform_conv, + modulated_deform_conv) +from .deform_pool import (DeformRoIPooling, DeformRoIPoolingPack, + ModulatedDeformRoIPoolingPack, deform_roi_pooling) +from .DCN import DCN + +__all__ = [ + 'DeformConv', 'DeformConvPack', 'ModulatedDeformConv', + 'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv', + 'deform_roi_pooling', 'DCN' +] diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/deform_conv.py b/libs/detectors/x86/alphapose/models/layers/dcn/deform_conv.py new file mode 100644 index 00000000..8ad6044e --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/deform_conv.py @@ -0,0 +1,337 @@ +import math + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from alphapose_package import deform_conv_cuda + + +class DeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format( + input.dim())) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = 
input.new_empty( + DeformConvFunction._output_size(input, weight, ctx.padding, + ctx.dilation, ctx.stride)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + deform_conv_cuda.deform_conv_forward_cuda( + input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1], + weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + deform_conv_cuda.deform_conv_backward_input_cuda( + input, offset, grad_output, grad_input, + grad_offset, weight, ctx.bufs_[0], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, + cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + deform_conv_cuda.deform_conv_backward_parameters_cuda( + input, offset, grad_output, + grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3), + weight.size(2), ctx.stride[1], ctx.stride[0], + ctx.padding[1], ctx.padding[0], ctx.dilation[1], + ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1, + cur_im2col_step) + + return (grad_input, grad_offset, grad_weight, None, None, None, None, + None) + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + 'x'.join(map(str, output_size)))) + return output_size + + +class ModulatedDeformConvFunction(Function): + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + deform_conv_cuda.modulated_deform_conv_cuda_forward( + input, weight, bias, ctx._bufs[0], offset, mask, output, + ctx._bufs[1], weight.shape[2], weight.shape[3], 
ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + deform_conv_cuda.modulated_deform_conv_cuda_backward( + input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1], + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, + grad_output, weight.shape[2], weight.shape[3], ctx.stride, + ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation, + ctx.groups, ctx.deformable_groups, ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply + + +class DeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False): + super(DeformConv, self).__init__() + + assert not bias + assert in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, x, offset): + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class DeformConvPack(DeformConv): + + def __init__(self, *args, **kwargs): + super(DeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deformable_groups * 2 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deformable_groups) + + +class ModulatedDeformConv(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True): + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) + + +class ModulatedDeformConvPack(ModulatedDeformConv): + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConvPack, self).__init__(*args, **kwargs) + + self.conv_offset_mask = nn.Conv2d( + self.in_channels, + self.deformable_groups * 3 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, x): + out = self.conv_offset_mask(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, self.dilation, + self.groups, self.deformable_groups) diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/deform_pool.py b/libs/detectors/x86/alphapose/models/layers/dcn/deform_pool.py new file mode 100644 index 00000000..60c2b3ba --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/deform_pool.py @@ -0,0 +1,252 @@ +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from alphapose_package import deform_pool_cuda + + +class DeformRoIPoolingFunction(Function): + + @staticmethod + def forward(ctx, + data, + rois, + offset, + 
spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + # TODO: support unsquare RoIs + out_h, out_w = _pair(out_size) + assert isinstance(out_h, int) and isinstance(out_w, int) + assert out_h == out_w + out_size = out_h # out_h and out_w must be equal + + ctx.spatial_scale = spatial_scale + ctx.out_size = out_size + ctx.out_channels = out_channels + ctx.no_trans = no_trans + ctx.group_size = group_size + ctx.part_size = out_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + assert 0.0 <= ctx.trans_std <= 1.0 + if not data.is_cuda: + raise NotImplementedError + + n = rois.shape[0] + output = data.new_empty(n, out_channels, out_size, out_size) + output_count = data.new_empty(n, out_channels, out_size, out_size) + deform_pool_cuda.deform_psroi_pooling_cuda_forward( + data, rois, offset, output, output_count, ctx.no_trans, + ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size, + ctx.part_size, ctx.sample_per_part, ctx.trans_std) + + if data.requires_grad or rois.requires_grad or offset.requires_grad: + ctx.save_for_backward(data, rois, offset) + ctx.output_count = output_count + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + + data, rois, offset = ctx.saved_tensors + output_count = ctx.output_count + grad_input = torch.zeros_like(data) + grad_rois = None + grad_offset = torch.zeros_like(offset) + + deform_pool_cuda.deform_psroi_pooling_cuda_backward( + grad_output, data, rois, offset, output_count, grad_input, + grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels, + ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part, + ctx.trans_std) + return (grad_input, grad_rois, grad_offset, None, None, None, None, + None, None, None, None) + + +deform_roi_pooling = DeformRoIPoolingFunction.apply + + +class DeformRoIPooling(nn.Module): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DeformRoIPooling, self).__init__() + self.spatial_scale = spatial_scale + self.out_size = _pair(out_size) + self.out_channels = out_channels + self.no_trans = no_trans + self.group_size = group_size + self.part_size = out_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, data, rois, offset): + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, + self.no_trans, self.group_size, + self.part_size, self.sample_per_part, + self.trans_std) + + +class DeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + deform_fc_channels=1024): + super(DeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + seq = [] + ic = self.out_size[0] * self.out_size[1] * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size[0] * self.out_size[1] * 2 + 
seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, + self.no_trans, self.group_size, + self.part_size, self.sample_per_part, + self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size[0], self.out_size[1]) + return deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, + self.no_trans, self.group_size, + self.part_size, self.sample_per_part, + self.trans_std) + + +class ModulatedDeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + num_offset_fcs=3, + num_mask_fcs=2, + deform_fc_channels=1024): + super(ModulatedDeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.num_offset_fcs = num_offset_fcs + self.num_mask_fcs = num_mask_fcs + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + offset_fc_seq = [] + ic = self.out_size[0] * self.out_size[1] * self.out_channels + for i in range(self.num_offset_fcs): + if i < self.num_offset_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size[0] * self.out_size[1] * 2 + offset_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_offset_fcs - 1: + offset_fc_seq.append(nn.ReLU(inplace=True)) + self.offset_fc = nn.Sequential(*offset_fc_seq) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + mask_fc_seq = [] + ic = self.out_size[0] * self.out_size[1] * self.out_channels + for i in range(self.num_mask_fcs): + if i < self.num_mask_fcs - 1: + oc = self.deform_fc_channels + else: + oc = self.out_size[0] * self.out_size[1] + mask_fc_seq.append(nn.Linear(ic, oc)) + ic = oc + if i < self.num_mask_fcs - 1: + mask_fc_seq.append(nn.ReLU(inplace=True)) + else: + mask_fc_seq.append(nn.Sigmoid()) + self.mask_fc = nn.Sequential(*mask_fc_seq) + self.mask_fc[-2].weight.data.zero_() + self.mask_fc[-2].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, + self.no_trans, self.group_size, + self.part_size, self.sample_per_part, + self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size[0], self.out_size[1]) + mask = self.mask_fc(x.view(n, -1)) + mask = mask.view(n, 1, self.out_size[0], self.out_size[1]) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + 
self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda.cpp b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda.cpp new file mode 100644 index 00000000..a07426a0 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda.cpp @@ -0,0 +1,701 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include +#include + +#include +#include + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) { + AT_CHECK(weight.ndimension() == 4, + "4D weight tensor 
(nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + AT_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + AT_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + AT_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + AT_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + AT_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + AT_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + AT_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if 
(input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step) { + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + 
gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, 
inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + at::DeviceGuard guard(input.device()); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = 
gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
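+    // NOTE: `ones` is an all-ones scratch plane that is lazily (re)allocated
+    // only when the cached buffer is smaller than the current output plane.
+    // This forward path does not read it further (the bias is added by
+    // broadcasting below); the backward pass reuses it to sum grad_output
+    // over spatial positions when accumulating grad_bias.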
+ ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... 
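+    // NOTE: same lazy (re)allocation as in the forward pass. Further down,
+    // grad_bias is accumulated by multiplying grad_output[b][g] with this
+    // all-ones column, which amounts to summing the output gradient over
+    // spatial positions for each output channel.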
+ ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda, + "deform forward (CUDA)"); + m.def("deform_conv_backward_input_cuda", &deform_conv_backward_input_cuda, + "deform_conv_backward_input (CUDA)"); + m.def("deform_conv_backward_parameters_cuda", + &deform_conv_backward_parameters_cuda, + "deform_conv_backward_parameters (CUDA)"); + m.def("modulated_deform_conv_cuda_forward", + &modulated_deform_conv_cuda_forward, + "modulated deform conv forward (CUDA)"); + m.def("modulated_deform_conv_cuda_backward", + 
&modulated_deform_conv_cuda_backward, + "modulated deform conv backward (CUDA)"); +} \ No newline at end of file diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda_kernel.cu b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda_kernel.cu new file mode 100644 index 00000000..a2b94286 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_conv_cuda_kernel.cu @@ -0,0 +1,866 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += 
(argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const 
int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = 
get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = 
col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= 
width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t 
*data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * 
kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + 
const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), 
"modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda.cpp b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda.cpp new file mode 100644 index 00000000..9e0e3ffc --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda.cpp @@ -0,0 +1,90 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include +#include + +#include +#include + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void DeformablePSROIPoolBackwardAcc( + const 
at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) { + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) { + AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + at::DeviceGuard guard(input.device()); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward, + "deform psroi pooling forward(CUDA)"); + m.def("deform_psroi_pooling_cuda_backward", + &deform_psroi_pooling_cuda_backward, + "deform psroi pooling backward(CUDA)"); +} diff --git a/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda_kernel.cu b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda_kernel.cu new file mode 100644 index 00000000..1922d724 --- /dev/null +++ b/libs/detectors/x86/alphapose/models/layers/dcn/src/deform_pool_cuda_kernel.cu @@ -0,0 +1,364 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ scalar_t bilinear_interp( + const scalar_t *data, + const scalar_t x, + const scalar_t y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + scalar_t dist_x = (scalar_t)(x - x1); + scalar_t dist_y = (scalar_t)(y - y1); + scalar_t value11 = data[y1 * width + x1]; + scalar_t value12 = data[y2 * width + x1]; + scalar_t value21 = data[y1 * width + x2]; + scalar_t value22 = data[y2 * width + x2]; + scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const scalar_t *bottom_data, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const scalar_t *bottom_rois, const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + scalar_t *top_data, + scalar_t *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +void DeformablePSROIPoolForward(const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data.scalar_type(), "deformable_psroi_pool_forward", ([&] { + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *top_data = out.data(); + scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} + +void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, + const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + const at::Tensor top_count, + at::Tensor in_grad, + at::Tensor trans_grad, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const int num_rois = num_bbox; + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] { + const scalar_t *top_diff = out_grad.data(); + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *bottom_data_diff = in_grad.data(); + scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data(); + const scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/libs/detectors/x86/alphapose/utils/__init__.py b/libs/detectors/x86/alphapose/utils/__init__.py new file mode 100644 index 00000000..e69de29b From c7d3aee2e875cf4fcb7ad0ea98a4e820ac416a3e Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 13:58:21 -0800 Subject: [PATCH 08/13] add checkpoint downloader --- libs/detectors/x86/alphapose/alphapose.py | 13 ++++++++++--- x86-alphapose-gpu.Dockerfile | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index 567c5436..a1f44b42 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -1,4 +1,5 @@ import sys + sys.path.append("libs/detectors/x86/alphapose") from utils import config_parser from builders import builder @@ -14,6 +15,9 @@ import time from libs.detectors.utils.fps_calculator import convert_infr_time_to_fps +import logging +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) class Detector: def __init__(self, config): @@ -34,8 +38,11 @@ def load_model(self): # TODO: add download checkpoint script model_file = pathlib.Path('/repo/data/x86/fast_res50_256x192.pth') if not model_file.exists(): - # TODO: add model link - pass + import gdown + logger.info("did not find model's checkpoint file, start downloading ...") + url = 'https://drive.google.com/uc?id=1kQhnMRURFiy7NsdS8EFL-8vtqEXOgECn' + output = '/repo/data/x86/fast_res50_256x192.pth' + gdown.download(url, output, quiet=False) self.pose_model = builder.build_sppe_model(self.cfg.MODEL, preset_cfg=self.cfg.DATA_PRESET) print(f'Loading pose model from {model_file}...') @@ -58,7 +65,7 @@ def inference(self, image): inference_time = time.perf_counter() - t_begin self.fps = convert_infr_time_to_fps(inference_time) # TODO - results = prepare_poses_results(poses, self.w, self.h, scores) + results = prepare_poses_results(poses, self.w, self.h, scores) return results def transform_detections(self, image, dets): diff --git a/x86-alphapose-gpu.Dockerfile b/x86-alphapose-gpu.Dockerfile index 7563ca11..2203db29 100644 --- a/x86-alphapose-gpu.Dockerfile +++ b/x86-alphapose-gpu.Dockerfile @@ -93,7 +93,7 @@ WORKDIR /repo/libs/detectors/x86/alphapose # #RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ #cd $(cat /root/tmp_variable)/alphapose_package && \ -RUN apt-get update && apt-get install -y libyaml-dev && pip3 install cython && python3 setup.py build develop --user +RUN apt-get update && apt-get install -y libyaml-dev && pip3 install cython gdown && python3 setup.py build develop --user RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ cp /repo/libs/detectors/x86/alphapose/models/layers/dcn/*.so $(cat /root/tmp_variable)/alphapose_package #mv /repo/libs/detectors/x86/alphapose/setup.py $(cat /root/tmp_variable)/alphapose_package From 
4b79df566cd3448257c5649e4dde4e8dd87cae9b Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 14:03:42 -0800 Subject: [PATCH 09/13] get values of score in mobilenet_ssd detector instead of tensor --- libs/detectors/x86/alphapose/alphapose.py | 3 --- libs/detectors/x86/alphapose/wrappers.py | 2 +- libs/detectors/x86/mobilenet_ssd.py | 4 +++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index a1f44b42..5430f769 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -35,7 +35,6 @@ def __init__(self, config): self.fps = None def load_model(self): - # TODO: add download checkpoint script model_file = pathlib.Path('/repo/data/x86/fast_res50_256x192.pth') if not model_file.exists(): import gdown @@ -55,7 +54,6 @@ def inference(self, image): detections = self.detection_model.inference(image) if len(detections) == 0: return [] - # TODO detections = prepare_detection_results(detections, self.w, self.h) with torch.no_grad(): inps, cropped_boxes, boxes, scores, ids = self.transform_detections(image, detections) @@ -64,7 +62,6 @@ def inference(self, image): poses = self.post_process(hm, cropped_boxes, boxes, scores, ids) inference_time = time.perf_counter() - t_begin self.fps = convert_infr_time_to_fps(inference_time) - # TODO results = prepare_poses_results(poses, self.w, self.h, scores) return results diff --git a/libs/detectors/x86/alphapose/wrappers.py b/libs/detectors/x86/alphapose/wrappers.py index 5ad2d60d..7559c021 100644 --- a/libs/detectors/x86/alphapose/wrappers.py +++ b/libs/detectors/x86/alphapose/wrappers.py @@ -14,7 +14,7 @@ def prepare_detection_results(object_list, w, h): output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, w) output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, h) # TODO - output[i, 5] = float(obj["score"].numpy()) + output[i, 5] = obj["score"] return output diff --git a/libs/detectors/x86/mobilenet_ssd.py b/libs/detectors/x86/mobilenet_ssd.py index 747016b9..b4951c43 100644 --- a/libs/detectors/x86/mobilenet_ssd.py +++ b/libs/detectors/x86/mobilenet_ssd.py @@ -7,6 +7,7 @@ from libs.detectors.utils.fps_calculator import convert_infr_time_to_fps + def load_model(model_name): model_dir = pathlib.Path('/repo/data/x86') / model_name if not model_dir.exists(): @@ -72,6 +73,7 @@ def inference(self, resized_rgb_image): result = [] for i in range(boxes.shape[1]): # number of boxes if labels[0, i] == class_id and scores[0, i] > score_threshold: - result.append({"id": str(class_id) + '-' + str(i), "bbox": boxes[0, i, :].numpy(), "score": scores[0, i]}) + result.append({"id": str(class_id) + '-' + str(i), "bbox": boxes[0, i, :].numpy().tolist(), + "score": float(scores[0, i].numpy())}) return result From aeb3311fe1096f602b45bbf8212177590f8efce7 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 14:08:02 -0800 Subject: [PATCH 10/13] change wrapper.py name to convert_results_format.py --- libs/detectors/x86/alphapose/alphapose.py | 4 +++- .../x86/alphapose/{wrappers.py => convert_results_format.py} | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) rename libs/detectors/x86/alphapose/{wrappers.py => convert_results_format.py} (99%) diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index 5430f769..af7dd57d 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -6,7 +6,7 @@ from 
utils.bbox import box_to_center_scale, center_scale_to_box from utils.pose_nms import pose_nms from utils.transformations import get_affine_transform, transform_preds, im_to_torch, get_max_pred -from wrappers import prepare_detection_results, prepare_poses_results +from convert_results_format import prepare_detection_results, prepare_poses_results import torch import cv2 @@ -16,9 +16,11 @@ from libs.detectors.utils.fps_calculator import convert_infr_time_to_fps import logging + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + class Detector: def __init__(self, config): self.config = config diff --git a/libs/detectors/x86/alphapose/wrappers.py b/libs/detectors/x86/alphapose/convert_results_format.py similarity index 99% rename from libs/detectors/x86/alphapose/wrappers.py rename to libs/detectors/x86/alphapose/convert_results_format.py index 7559c021..bb65a857 100644 --- a/libs/detectors/x86/alphapose/wrappers.py +++ b/libs/detectors/x86/alphapose/convert_results_format.py @@ -13,7 +13,6 @@ def prepare_detection_results(object_list, w, h): output[i, 1:5] = bbox_scaled output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, w) output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, h) - # TODO output[i, 5] = obj["score"] return output From 59499875ac5c9eecbb1dc885a6725e6e8f6f0613 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 14:12:11 -0800 Subject: [PATCH 11/13] change alphapose dockerfile name --- x86-alphapose-gpu.Dockerfile => x86-gpu-alphapose.Dockerfile | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename x86-alphapose-gpu.Dockerfile => x86-gpu-alphapose.Dockerfile (100%) diff --git a/x86-alphapose-gpu.Dockerfile b/x86-gpu-alphapose.Dockerfile similarity index 100% rename from x86-alphapose-gpu.Dockerfile rename to x86-gpu-alphapose.Dockerfile From ae1adcce8f84e58a3b1ef6f8f0f82eb93fd44aed Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 14:39:10 -0800 Subject: [PATCH 12/13] add docstring and update README --- README.md | 5 +++ config-x86-gpu.ini | 4 +-- libs/detectors/x86/alphapose/alphapose.py | 31 ++++++++++++++----- .../x86/alphapose/convert_results_format.py | 25 +++++++++++++++ x86-gpu-alphapose.Dockerfile | 13 +------- 5 files changed, 56 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index f07060a6..4f5e9dc8 100644 --- a/README.md +++ b/README.md @@ -190,9 +190,14 @@ Note that you should have [Nvidia Docker Toolkit](https://github.com/NVIDIA/nvid # 1) Build Docker image (This step is optional, you can skip it if you want to pull the container from neuralet dockerhub) docker build -f x86-gpu.Dockerfile -t "neuralet/smart-social-distancing:latest-x86_64_gpu" . +# For AlphaPose: +docker build -f x86-gpu-alphapose.Dockerfile -t "neuralet/smart-social-distancing:latest-x86_64_alphapose_gpu" . + # 2) Run Docker container: Notice: you must have Docker >= 19.03 to run the container with `--gpus` flag. 
docker run -it --gpus all -p HOST_PORT:8000 -v "$PWD":/repo -e TZ=`./timezone.sh` neuralet/smart-social-distancing:latest-x86_64_gpu +# Or for AlphaPose: +docker run -it --gpus all -p HOST_PORT:8000 -v "$PWD":/repo -e TZ=`./timezone.sh` neuralet/smart-social-distancing:latest-x86_64_alphapose_gpu ``` ##### Run on x86 with GPU using TensorRT optimization diff --git a/config-x86-gpu.ini b/config-x86-gpu.ini index a342c6a1..360d193f 100644 --- a/config-x86-gpu.ini +++ b/config-x86-gpu.ini @@ -61,8 +61,8 @@ DailyReportTime = 06:00 [Detector] ; Supported devices: Jetson , EdgeTPU, Dummy, x86, x86-gpu Device = x86-gpu -; Supported models: mobilenet_ssd_v2 and openpifpaf -Name = openpifpaf +; Supported models: mobilenet_ssd_v2 , openpifpaf and alphapose_mobilenet_ssd +Name = alphapose_mobilenet_ssd ;ImageSize should be 3 numbers seperated by commas, no spaces: 300,300,3 (for better accuracy use higher resolution when ; using openpifpaf (openpifpaf detects both faces and pedestrians) ImageSize = 1281,721,3 diff --git a/libs/detectors/x86/alphapose/alphapose.py b/libs/detectors/x86/alphapose/alphapose.py index af7dd57d..da704c7f 100644 --- a/libs/detectors/x86/alphapose/alphapose.py +++ b/libs/detectors/x86/alphapose/alphapose.py @@ -22,6 +22,12 @@ class Detector: + """ + Perform object detection with the AlphaPose model. + original repository of the model: https://github.com/MVIG-SJTU/AlphaPose + + :param config: Is a ConfigEngine instance which provides necessary parameters. + """ def __init__(self, config): self.config = config self.name = config.get_section_dict('Detector')['Name'] @@ -52,22 +58,31 @@ def load_model(self): self.pose_model.eval() def inference(self, image): + """ + inference function sets input tensor to input image and gets the output. + The model provides corresponding detection output which is used for creating result + Args: + resized_rgb_image: uint8 numpy array with shape (img_height, img_width, channels) + + Returns: + result: a dictionary contains of [{"id": 0, "bbox": [y1, x1, y2, x2], "score":s%, "face": [y1, x1, y2, x2]}, {...}, {...}, ...] 
+        """
         t_begin = time.perf_counter()
         detections = self.detection_model.inference(image)
         if len(detections) == 0:
             return []
         detections = prepare_detection_results(detections, self.w, self.h)
         with torch.no_grad():
-            inps, cropped_boxes, boxes, scores, ids = self.transform_detections(image, detections)
+            inps, cropped_boxes, boxes, scores, ids = self._transform_detections(image, detections)
             inps = inps.to(self.device)
             hm = self.pose_model(inps)
-            poses = self.post_process(hm, cropped_boxes, boxes, scores, ids)
+            poses = self._post_process(hm, cropped_boxes, boxes, scores, ids)
         inference_time = time.perf_counter() - t_begin
         self.fps = convert_infr_time_to_fps(inference_time)
         results = prepare_poses_results(poses, self.w, self.h, scores)
         return results

-    def transform_detections(self, image, dets):
+    def _transform_detections(self, image, dets):
         if isinstance(dets, int):
             return 0, 0
         dets = dets[dets[:, 0] == 0]
@@ -77,11 +92,11 @@ def transform_detections(self, image, dets):
         inps = torch.zeros(boxes.size(0), 3, *self._input_size)
         cropped_boxes = torch.zeros(boxes.size(0), 4)
         for i, box in enumerate(boxes):
-            inps[i], cropped_box = self.transform_single_detection(image, box)
+            inps[i], cropped_box = self._transform_single_detection(image, box)
             cropped_boxes[i] = torch.FloatTensor(cropped_box)
         return inps, cropped_boxes, boxes, scores, ids

-    def transform_single_detection(self, image, bbox):
+    def _transform_single_detection(self, image, bbox):
         xmin, ymin, xmax, ymax = bbox
         center, scale = box_to_center_scale(
             xmin, ymin, xmax - xmin, ymax - ymin, self._aspect_ratio)
@@ -102,13 +117,13 @@ def transform_single_detection(self, image, bbox):

         return img, bbox

-    def post_process(self, hm, cropped_boxes, boxes, scores, ids):
+    def _post_process(self, hm, cropped_boxes, boxes, scores, ids):
         assert hm.dim() == 4
         pose_coords = []
         pose_scores = []
         for i in range(hm.shape[0]):
             bbox = cropped_boxes[i].tolist()
-            pose_coord, pose_score = self.heatmap_to_coord(hm[i][self.eval_joints], bbox, hm_shape=self.hm_size,
+            pose_coord, pose_score = self._heatmap_to_coord(hm[i][self.eval_joints], bbox, hm_shape=self.hm_size,
                                                            norm_type=None)
             pose_coords.append(torch.from_numpy(pose_coord).unsqueeze(0))
             pose_scores.append(torch.from_numpy(pose_score).unsqueeze(0))
@@ -132,7 +147,7 @@ def post_process(self, hm, cropped_boxes, boxes, scores, ids):
         )
         return _result

-    def heatmap_to_coord(self, hms, bbox, hms_flip=None, **kwargs):
+    def _heatmap_to_coord(self, hms, bbox, hms_flip=None, **kwargs):
         if hms_flip is not None:
             hms = (hms + hms_flip) / 2
         if not isinstance(hms, np.ndarray):
diff --git a/libs/detectors/x86/alphapose/convert_results_format.py b/libs/detectors/x86/alphapose/convert_results_format.py
index bb65a857..ad754b5f 100644
--- a/libs/detectors/x86/alphapose/convert_results_format.py
+++ b/libs/detectors/x86/alphapose/convert_results_format.py
@@ -3,6 +3,18 @@


 def prepare_detection_results(object_list, w, h):
+    """
+    Convert the output format of Neuralet's Detector class to the detection format expected by AlphaPose.
+    Args:
+        object_list: A list of dictionaries, each of the form {"id": 0, "bbox": [y1, x1, y2, x2], "score": s}
+        w: Width of the input image
+        h: Height of the input image
+
+    Returns:
+        A torch tensor of shape (num_of_objects, 8); each row has the form
+        (batch_index, x_min, y_min, x_max, y_max, detection_score, class_score, 0)
+
+    """
     scale_factors = torch.tensor([w, h, w, h])
     num_of_objects = len(object_list)
     output = torch.zeros(num_of_objects, 8, dtype=torch.float32)
@@ -19,6 +31,17 @@ def prepare_detection_results(object_list, w, h):


 def prepare_poses_results(poses, w, h, scores):
+    """
+    Convert the AlphaPose estimator output to the output format of Neuralet's Detectors.
+    Args:
+        poses: AlphaPose output, a list of dictionaries with "keypoints", "kp_score", "proposal_score", "idx" and "bbox" keys
+        w: Width of the input image
+        h: Height of the input image
+        scores: Detection scores
+
+    Returns:
+        A list of dictionaries, each of the form {"id": 0, "bbox": [y1, x1, y2, x2], "score": s, "face": [y1, x1, y2, x2]}
+    """
     scales = np.array([h, w, h, w])
     results = []
     for i, item in enumerate(poses):
@@ -31,6 +54,8 @@ def prepare_poses_results(poses, w, h, scores):
         object_dict["face"] = None
         kp_scores = item["kp_score"].numpy()
         keypoints = item["keypoints"]
+
+        # Extract face bounding box
         if np.all(kp_scores[[0, 1, 2, 5, 6]] > 0.15):
             x_min_face = int(keypoints[6, 0])
             x_max_face = int(keypoints[5, 0])
diff --git a/x86-gpu-alphapose.Dockerfile b/x86-gpu-alphapose.Dockerfile
index 2203db29..b28dbf30 100644
--- a/x86-gpu-alphapose.Dockerfile
+++ b/x86-gpu-alphapose.Dockerfile
@@ -1,8 +1,4 @@
 FROM nvcr.io/nvidia/tensorflow:20.03-tf2-py3
-#RUN apt-get update && apt-get install -y python3-dev && conda update -y wrapt && pip3 install tensorflow==2.2 openpifpaf wget
-
-#RUN ln -s /usr/local/cuda-10.2/targets/x86_64-linux/lib/libcudart.so.10.2 /usr/lib/x86_64-linux-gnu/libcudart.so.10.1
-#RUN pip uninstall python3-opencv python-opencv
 # The `python3-opencv` package isn't built with gstreamer on Ubuntu. So we need to manually build opencv.
 ARG OPENCV_VERSION=4.3.0
 # http://amritamaz.net/blog/opencv-config
@@ -86,17 +82,10 @@ ENV CONFIG_FILE=config-x86-gpu.ini
 RUN python3 -m site --user-site > /root/tmp_variable && DCN_PATH=$(cat /root/tmp_variable)
 COPY . 
/repo RUN pip3 install torch==1.2 torchvision==0.4.0 -#COPY ./libs/detectors/x86/alphapose/ /alphapose_packages -#COPY ./libs/detectors/x86/alphapose/setup.py /alphapose_packages/setup.py WORKDIR /repo/libs/detectors/x86/alphapose -#ENV PYTHONPATH=/alphapose_packages -# -#RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ -#cd $(cat /root/tmp_variable)/alphapose_package && \ RUN apt-get update && apt-get install -y libyaml-dev && pip3 install cython gdown && python3 setup.py build develop --user RUN mkdir -p $(cat /root/tmp_variable)/alphapose_package && \ -cp /repo/libs/detectors/x86/alphapose/models/layers/dcn/*.so $(cat /root/tmp_variable)/alphapose_package -#mv /repo/libs/detectors/x86/alphapose/setup.py $(cat /root/tmp_variable)/alphapose_package + cp /repo/libs/detectors/x86/alphapose/models/layers/dcn/*.so $(cat /root/tmp_variable)/alphapose_package WORKDIR /repo HEALTHCHECK --interval=30s --retries=2 --start-period=15s CMD bash healthcheck.bash From 345885ee047777f3a9f0961f0b8f02eaa4e927b9 Mon Sep 17 00:00:00 2001 From: alpha-carinae29 Date: Sat, 26 Dec 2020 14:39:38 -0800 Subject: [PATCH 13/13] add AlphaPose License --- .../detectors/x86/alphapose/AlphaPose_LICENSE | 515 ++++++++++++++++++ 1 file changed, 515 insertions(+) create mode 100644 libs/detectors/x86/alphapose/AlphaPose_LICENSE diff --git a/libs/detectors/x86/alphapose/AlphaPose_LICENSE b/libs/detectors/x86/alphapose/AlphaPose_LICENSE new file mode 100644 index 00000000..c0383742 --- /dev/null +++ b/libs/detectors/x86/alphapose/AlphaPose_LICENSE @@ -0,0 +1,515 @@ +ALPHAPOSE: MULTIPERSON KEYPOINT DETECTION +SOFTWARE LICENSE AGREEMENT +ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY + +BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. + +This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Shanghai Jiao Tong University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor. + +RESERVATION OF OWNERSHIP AND GRANT OF LICENSE: +Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive, +non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i). + +CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. 
Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication. + +PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto. + +DERIVATIVES: You may create derivatives of or make modifications to the Software, however, You agree that all and any such derivatives and modifications will be owned by Licensor and become a part of the Software licensed to You under this Agreement. You may only use such derivatives and modifications for your own noncommercial internal research purposes, and you may not otherwise use, distribute or copy such derivatives and modifications in violation of this Agreement. + +BACKUPS: If Licensee is an organization, it may make that number of copies of the Software necessary for internal noncommercial use at a single site within its organization provided that all information appearing in or on the original labels, including the copyright and trademark notices are copied onto the labels of the copies. + +USES NOT PERMITTED: You may not distribute, copy or use the Software except as explicitly permitted herein. Licensee has not been granted any trademark license as part of this Agreement and may not use the name or mark “AlphaPose", "Shanghai Jiao Tong" or any renditions thereof without the prior written permission of Licensor. + +You may not sell, rent, lease, sublicense, lend, time-share or transfer, in whole or in part, or provide third parties access to prior or present versions (or any parts thereof) of the Software. + +ASSIGNMENT: You may not assign this Agreement or your rights hereunder without the prior written consent of Licensor. Any attempted assignment without such consent shall be null and void. + +TERM: The term of the license granted by this Agreement is from Licensee's acceptance of this Agreement by downloading the Software or by using the Software until terminated as provided below. + +The Agreement automatically terminates without notice if you fail to comply with any provision of this Agreement. Licensee may terminate this Agreement by ceasing using the Software. Upon any termination of this Agreement, Licensee will delete any and all copies of the Software. You agree that all provisions which operate to protect the proprietary rights of Licensor shall remain in force should breach occur and that the obligation of confidentiality described in this Agreement is binding in perpetuity and, as such, survives the term of the Agreement. + +FEE: Provided Licensee abides completely by the terms and conditions of this Agreement, there is no fee due to Licensor for Licensee's use of the Software in accordance with this Agreement. + +DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS. + +SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement. 
+ +EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage. + +EXPORT REGULATION: Licensee agrees to comply with any and all applicable +U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control. + +SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby. + +NO IMPLIED WAIVERS: No failure or delay by Licensor in enforcing any right or remedy under this Agreement shall be construed as a waiver of any future or other exercise of such right or remedy by Licensor. + +ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire agreement between Licensee and Licensor as to the matter set forth herein and supersedes any previous agreements, understandings, and arrangements between the parties relating hereto. + + + +************************************************************************ + +THIRD-PARTY SOFTWARE NOTICES AND INFORMATION + +This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. + +1. Torch, (https://github.com/torch/distro) + +Copyright (c) 2016, Soumith Chintala, Ronan Collobert, Koray Kavukcuoglu, Clement Farabet All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +Neither the name of distro nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +2. TensorFlow (https://github.com/tensorflow/tensorflow) +Copyright 2018 The TensorFlow Authors. All rights reserved. 
+ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2017, The TensorFlow Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +3. tf-faster-rcnn (https://github.com/endernewton/tf-faster-rcnn) +MIT License + +Copyright (c) 2017 Xinlei Chen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +4.PyraNet (https://github.com/bearpaw/PyraNet) + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +5. pose-hg-demo (https://github.com/umich-vl/pose-hg-demo) +Copyright (c) 2016, University of Michigan +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** \ No newline at end of file