# eval.py

"""Adapted from:
    @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch
    @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn
    Licensed under The MIT License [see LICENSE for details]
"""

from __future__ import print_function
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.autograd import Variable
from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform
from data import CUSTOM_ROOT, CustomAnnotationTransform, CustomDetection, BaseTransform
from data.custom import get_targets
import torch.utils.data as data
from data.config import coco, voc, custom, MEANS

from ssd import build_ssd

import sys
import os
import time
import argparse
import numpy as np
import pickle
import cv2

if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET


def str2bool(v):
    """Interpret a command-line string as a boolean flag.

    Returns True when the lower-cased value is one of "yes", "true", "t"
    or "1"; every other string yields False.
    """
    truthy = ("yes", "true", "t", "1")
    normalized = v.lower()
    return normalized in truthy


# Command-line interface. Arguments are parsed at import time because the
# module-level configuration that follows reads them.
parser = argparse.ArgumentParser(
    description='Single Shot MultiBox Detector Evaluation')
parser.add_argument('--dataset', default='VOC',
                    choices=['VOC', 'COCO', 'Custom'],
                    type=str, help='VOC, COCO or Custom')
parser.add_argument('--trained_model',
                    default='weights/ssd300_mAP_77.43_v2.pth', type=str,
                    help='Trained state_dict file path to open')
parser.add_argument('--save_folder', default='eval/', type=str,
                    help='File path to save results')
parser.add_argument('--confidence_threshold', default=0.01, type=float,
                    help='Detection confidence threshold')
parser.add_argument('--top_k', default=5, type=int,
                    help='Further restrict the number of predictions to parse')
# This script only evaluates, so the help text says so (it used to say
# "train").
parser.add_argument('--cuda', default=True, type=str2bool,
                    help='Use cuda to evaluate model')
# The root is used for every dataset, not only VOC; VOC_ROOT is merely the
# default value.
parser.add_argument('--dataset_root', default=VOC_ROOT,
                    help='Location of the dataset root directory')
parser.add_argument('--cleanup', default=True, type=str2bool,
                    help='Cleanup and remove results files following eval')

args = parser.parse_args()

# Pick the class label map that matches the requested dataset.
# TODO add COCO (any non-VOC choice currently uses the custom label map)
if args.dataset == 'VOC':
    from data import VOC_CLASSES as labelmap
else:
    from data import CUSTOM_CLASSES as labelmap

# TODO: move this to a better spot
if not os.path.exists(args.save_folder):
    os.makedirs(args.save_folder)

# Output directories (all rooted at the dataset root)
#   - cache_dir caches the annotations in a pickle file
#   - output_dir receives the detection results
#   - eval_dir is defined here but not used by the code visible in this file
cache_dir = os.path.join(args.dataset_root, 'annotations_cache')
output_dir = os.path.join(args.dataset_root, 'detection_output')
eval_dir = os.path.join(args.dataset_root, 'eval_output')

# Assign to either CPU or GPU as device. The device must honour --cuda as
# well as hardware availability; otherwise the model would be moved to the
# GPU while the default tensor type stays on the CPU.
use_cuda = args.cuda and torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

if use_cuda:
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
else:
    if torch.cuda.is_available():
        # Adjacent string literals keep the message free of the indentation
        # whitespace a backslash continuation would embed.
        print("WARNING: It looks like you have a CUDA device, but aren't "
              "using CUDA.  Run with --cuda for optimal eval speed.")
    torch.set_default_tensor_type('torch.FloatTensor')

# %-style templates that are filled in with an image name.
if args.dataset == 'VOC':
    annopath = os.path.join(args.dataset_root, 'Annotations', '%s.xml')
else:
    # TODO - generalize this else; Custom and every other non-VOC dataset
    # currently share the same JSON annotation layout.
    annopath = os.path.join(args.dataset_root, 'test', 'annot', '%s.json')
imgpath = os.path.join(args.dataset_root,  'test', '%s.*')
# Text file listing the images to evaluate, one name per line.
imgsetpath = os.path.join(args.dataset_root, 'imagenames.txt')
devkit_path = os.path.join(args.dataset_root, args.dataset)
# TODO make this an arg
set_type = 'test'

class Timer(object):
    """Minimal wall-clock stopwatch tracking per-call and mean durations."""

    def __init__(self):
        # Accumulated duration over all completed tic/toc intervals.
        self.total_time = 0.
        # Number of completed intervals.
        self.calls = 0
        # Timestamp recorded by the most recent tic().
        self.start_time = 0.
        # Duration of the most recent interval.
        self.diff = 0.
        # total_time / calls, refreshed on every toc().
        self.average_time = 0.

    def tic(self):
        """Mark the start of an interval using the wall clock."""
        # time.time() is used rather than time.clock(), which does not
        # normalize for multithreading.
        self.start_time = time.time()

    def toc(self, average=True):
        """Close the interval opened by tic() and update the statistics.

        Returns the running mean duration when ``average`` is truthy,
        otherwise the duration of this interval alone.
        """
        elapsed = time.time() - self.start_time
        self.diff = elapsed
        self.total_time += elapsed
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff

def parse_rec(filename):
    """Read one PASCAL VOC XML annotation and list its objects.

    ``filename`` is handed straight to ``ET.parse``. Every ``object``
    element becomes a dict with the keys ``name``, ``pose``, ``truncated``,
    ``difficult`` and ``bbox`` (``[xmin, ymin, xmax, ymax]``, each reduced
    by one). The list of those dicts is returned.
    """
    def _text(node, tag):
        # Text content of the first child element called `tag`.
        return node.find(tag).text

    records = []
    for node in ET.parse(filename).findall('object'):
        box_node = node.find('bndbox')
        records.append({
            'name': _text(node, 'name'),
            'pose': _text(node, 'pose'),
            'truncated': int(_text(node, 'truncated')),
            'difficult': int(_text(node, 'difficult')),
            # Each corner is shifted down by one (VOC coordinates are
            # presumably 1-based; confirm against the dataset).
            'bbox': [int(_text(box_node, corner)) - 1
                     for corner in ('xmin', 'ymin', 'xmax', 'ymax')],
        })
    return records

def parse_rec_custom(filename):
    """Load the custom JSON annotations and convert them to box records.

    ``get_targets(filename)`` supplies a mapping from a target id to the
    regions annotated for it; each region is read through its ``x``, ``y``,
    ``width`` and ``height`` entries. The result is a dict that maps every
    target id to a list of dicts with the keys ``bbox`` (array
    ``[x1, y1, x2, y2]``), ``name`` (always ``'object'``) and ``difficult``
    (always ``0``).
    """
    targets = get_targets(filename)
    records_by_target = {}
    for target_id in targets:
        records = []
        for region in targets[target_id]:
            # Convert (x, y, width, height) into corner form.
            corners = np.zeros(shape=4)
            corners[0] = region['x']
            corners[1] = region['y']
            corners[2] = corners[0] + region['width']
            corners[3] = corners[1] + region['height']
            records.append({
                'bbox': corners,
                # TODO: grab class names; a single class is assumed for now
                'name': 'object',
                # False for now, no difficult gt boxes
                'difficult': 0,
            })
        records_by_target[target_id] = records
    return records_by_target

def get_output_dir(name, phase):
    """Return the path ``<name>/<phase>``, creating it when it is missing.

    The two components are joined with ``os.path.join``; if nothing exists
    at the resulting path yet, the directory (and any missing parents) is
    created before the path is returned.
    """
    target = os.path.join(name, phase)
    if os.path.exists(target):
        return target
    os.makedirs(target)
    return target


def get_voc_results_file_template(image_set, cls_name):
    """Build the path of the detection results file for one class.

    The file is named ``det_<image_set>_<cls_name>.txt`` and lives in the
    ``results`` sub-directory of the module-level ``output_dir``; that
    sub-directory is created when it does not exist yet.
    """
    # e.g. <output_dir>/results/det_test_aeroplane.txt
    prefix = 'det_' + image_set
    basename = prefix + '_%s.txt' % cls_name
    results_dir = os.path.join(output_dir, 'results')
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    return os.path.join(results_dir, basename)


def write_results_file(all_boxes, dataset):
    """Write one detection results text file per class.

    all_boxes: nested list indexed as all_boxes[cls][image]; index 0 is
        skipped (class ``k`` of the label map is read from ``k + 1``). Each
        slot is either an empty list or an N x 5 array whose columns are
        read as (x1, y1, x2, y2, score).
    dataset: object exposing ``ids``, iterated in the same order as the
        image axis of ``all_boxes``.

    Each written line is tab-separated: image id, score, then the four box
    coordinates shifted by +1. For the 'VOC' dataset every detection is
    written and the image id is ``index[1]``. For the 'Custom' dataset only
    detections scoring above ``args.confidence_threshold`` are written and,
    in addition, a copy of the image with the first detection drawn on it
    is saved as ``recs_<image id>`` in the module-level ``output_dir``.
    """
    # The label map may be a single class-name string or a sequence of
    # names; normalise it so iteration always yields one name at a time
    # (wrapping a sequence in a list would treat it as one class).
    class_names = [labelmap] if isinstance(labelmap, str) else list(labelmap)
    line_fmt = '{:s}\t{:.3f}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}\n'

    for cls_ind, cls_name in enumerate(class_names):
        print('Writing {:s} detection results file'.format(cls_name))
        filename = get_voc_results_file_template(set_type, cls_name)
        with open(filename, 'wt') as f:
            for im_ind, index in enumerate(dataset.ids):
                dets = all_boxes[cls_ind + 1][im_ind]
                # len() works for both the empty-list placeholder and a
                # (possibly zero-row) array; comparing an array with `[]`
                # is an elementwise comparison and is not a valid
                # emptiness test.
                if len(dets) == 0:
                    continue

                if args.dataset == 'VOC':
                    # the VOCdevkit expects 1-based indices
                    for k in range(dets.shape[0]):
                        f.write(line_fmt.format(
                            index[1], dets[k, -1],
                            dets[k, 0] + 1, dets[k, 1] + 1,
                            dets[k, 2] + 1, dets[k, 3] + 1))

                elif args.dataset == 'Custom':
                    for k in range(dets.shape[0]):
                        # If score is higher than given threshold
                        if dets[k, -1] > args.confidence_threshold:
                            f.write(line_fmt.format(
                                index, dets[k, -1],
                                dets[k, 0] + 1, dets[k, 1] + 1,
                                dets[k, 2] + 1, dets[k, 3] + 1))

                    # Read the image only when there is something to draw,
                    # and only for Custom, where `index` is a file name.
                    # NOTE(review): images are read from CUSTOM_ROOT rather
                    # than args.dataset_root -- confirm this is intended.
                    img = cv2.imread(os.path.join(CUSTOM_ROOT, 'test', index))
                    if img is None:
                        print('WARNING: could not read image {} to draw '
                              'detections'.format(index))
                        continue
                    # Draw the first detection (regardless of its score).
                    # OpenCV needs integer pixel coordinates.
                    x1, y1, x2, y2 = [int(v) for v in dets[0, :4]]
                    img = cv2.rectangle(img, (x1, y1), (x2, y2),
                                        (0, 255, 0), 3)
                    cv2.imwrite(os.path.join(output_dir, 'recs_' + index), img)

    # TODO:  add COCO


def do_python_eval(use_07=True):
    """Evaluate the written detection files and print per-class and mean AP.

    use_07: when true, AP is computed with the VOC07 11-point metric.

    Creates the module-level ``cache_dir`` and ``output_dir`` if they are
    missing, runs ``voc_eval`` for every class of the label map with an
    overlap threshold of 0.3, pickles each class's recall/precision/AP to
    ``<output_dir>/<class>_pr.pkl`` and prints a summary. Returns None.
    """
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # The label map may be a single class-name string or a sequence of
    # names; normalise it so iteration always yields one name at a time
    # (wrapping a sequence in a list would treat it as one class).
    class_names = [labelmap] if isinstance(labelmap, str) else list(labelmap)

    aps = []
    # The PASCAL VOC metric changed in 2010
    print('VOC07 metric? ' + ('Yes' if use_07 else 'No'))
    for cls_name in class_names:
        filename = get_voc_results_file_template(set_type, cls_name)
        rec, prec, ap = voc_eval(
           filename, annopath, imgsetpath.format(set_type), cls_name,
           ovthresh=0.3, use_07_metric=use_07)
        aps += [ap]
        print('AP for {} = {:.4f}'.format(cls_name, ap))
        with open(os.path.join(output_dir, cls_name + '_pr.pkl'), 'wb') as f:
            pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
    print('Mean AP = {:.4f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('Results:')
    for ap in aps:
        print('{:.3f}'.format(ap))
    print('{:.3f}'.format(np.mean(aps)))
    print('~~~~~~~~')
    print('')
    print('--------------------------------------------------------------')
    print('Results computed with the **unofficial** Python eval code.')
    print('Results should be very close to the official MATLAB eval code.')
    print('--------------------------------------------------------------')


def voc_ap(rec, prec, use_07_metric=True):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute average precision from recall and precision arrays.
    With use_07_metric true (the default) the VOC07 11-point
    interpolation is used; otherwise the area under the
    monotone precision envelope is returned.
    """
    if not use_07_metric:
        # pad both curves with sentinel values at each end
        recall = np.concatenate(([0.], rec, [1.]))
        envelope = np.concatenate(([0.], prec, [0.]))

        # make precision non-increasing: running maximum taken from the right
        envelope = np.maximum.accumulate(envelope[::-1])[::-1]

        # positions at which recall moves to a new value
        steps = np.flatnonzero(recall[1:] != recall[:-1])

        # sum of (\Delta recall) * precision over those positions
        return np.sum((recall[steps + 1] - recall[steps]) * envelope[steps + 1])

    # 11 point metric: average the best precision reachable at each of the
    # recall levels 0.0, 0.1, ..., 1.0
    ap = 0.
    for level in np.arange(0., 1.1, 0.1):
        reached = rec >= level
        best = np.max(prec[reached]) if np.sum(reached) != 0 else 0
        ap = ap + best / 11.
    return ap


def voc_eval(detpath,
             annopath,
             imagesetfile,
             classname,
             ovthresh=0.5,
             use_07_metric=True):
    """rec, prec, ap = voc_eval(detpath,
                           annopath,
                           imagesetfile,
                           classname,
                           [ovthresh],
                           [use_07_metric])
    Top level function that does the PASCAL VOC style evaluation for one
    class.
    detpath: Path to detections
    detpath.format(classname) should produce the detection results file,
    made of tab-separated lines: image id, score, x1, y1, x2, y2.
    annopath: Path template to annotations
    annopath % imagename should be the xml annotations file (only used
    when args.dataset is 'VOC').
    imagesetfile: Text file containing the list of images, one image per line.
    classname: Category name
    [ovthresh]: Overlap threshold (default = 0.5)
    [use_07_metric]: Whether to use VOC07's 11 point AP computation
    (default True)

    Besides its parameters the function reads the module-level
    ``args.dataset`` (to choose the annotation parser) and ``cache_dir``
    (the parsed annotations are pickled to ``<cache_dir>/annots.pkl`` on
    every call; that directory must already exist).

    Returns (rec, prec, ap). rec and prec are arrays with one entry per
    detection, ordered by decreasing score. When the detection file holds
    no detections all three values are -1.
    """
    cachefile = os.path.join(cache_dir, 'annots.pkl')

    # read list of images, ignoring blank lines
    with open(imagesetfile, 'r') as f:
        imagenames = [x.strip() for x in f if x.strip()]

    # Load the ground truth. The annotations are re-parsed on every call;
    # the pickle is only written, never read back.
    if args.dataset == 'VOC':
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath % (imagename))
            if i % 100 == 0:
                print('Reading annotation for {:d}/{:d}'.format(
                    i + 1, len(imagenames)))
    # TODO: COCO annots
    elif args.dataset == 'Custom':
        recs = parse_rec_custom(os.path.join(CUSTOM_ROOT, 'test', 'annot',
                                             'via_region_data.json'))
    # TODO: return warning message since no valid dataset name was supplied
    else:
        recs = {}
    # Save all annotations to pickle file
    print('Saving cached annotations to {:s}'.format(cachefile))
    with open(cachefile, 'wb') as f:
        pickle.dump(recs, f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        if imagename not in recs:
            continue
        R = [obj for obj in recs[imagename] if obj['name'] == classname]
        bbox = np.array([x['bbox'] for x in R])
        # The builtin bool is used because the np.bool alias no longer
        # exists in current NumPy releases.
        difficult = np.array([x['difficult'] for x in R]).astype(bool)
        det = [False] * len(R)
        # TODO: figure out general replacement for 'difficult'
        npos = npos + int(np.sum(~difficult))
        class_recs[imagename] = {'bbox': bbox,
                                 'difficult': difficult,
                                 'det': det}

    # read dets, skipping blank lines so they cannot break the parsing below
    detfile = detpath.format(classname)
    with open(detfile, 'r') as f:
        splitlines = [x.strip().split('\t') for x in f if x.strip()]

    if not splitlines:
        # no detections for this class
        return -1., -1., -1.

    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence, highest first
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)
    for d in range(nd):
        R = class_recs.get(image_ids[d])
        if R is None:
            # No ground truth is known for this image, so the detection
            # cannot match anything: count it as a false positive.
            fp[d] = 1.
            continue

        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R['bbox'].astype(float)
        if BBGT.size > 0:
            # compute overlaps
            # intersection
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin, 0.)
            ih = np.maximum(iymax - iymin, 0.)
            inters = iw * ih
            # union = area(det) + area(gt) - intersection
            uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) +
                   (BBGT[:, 2] - BBGT[:, 0]) *
                   (BBGT[:, 3] - BBGT[:, 1]) - inters)
            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            # A match with a 'difficult' box is neither TP nor FP.
            if not R['difficult'][jmax]:
                if not R['det'][jmax]:
                    tp[d] = 1.
                    # each ground-truth box can be claimed only once
                    R['det'][jmax] = 1
                else:
                    # duplicate detection of an already matched box
                    fp[d] = 1.
        else:
            fp[d] = 1.

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    # With no positive ground truth tp is all zeros; the guard turns the
    # 0/0 division into a recall of 0 instead of NaN.
    rec = tp / np.maximum(float(npos), np.finfo(np.float64).eps)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap


def test_net(save_folder, net, cuda, dataset, transform, top_k,
             im_size=300, thresh=0.05):
    """Run the network over the dataset, store detections and evaluate them.

    net: the detector; called on one batched image at a time.
    dataset: supports ``len()`` and ``pull_item(i)``, from which the image
        tensor, its height and its width are used.
    save_folder, cuda, transform, top_k, im_size, thresh: accepted for
        interface compatibility; not read by this function.

    For every image the per-class detections with a positive score are
    scaled to pixel coordinates and collected. The collection is pickled to
    ``<output_dir>/detections.pkl``, written out with ``write_results_file``
    and evaluated with ``do_python_eval(use_07=False)``. Returns None.
    """
    num_images = len(dataset)
    # The label map may be a single class-name string or a sequence of
    # names; normalise it so the class count is right in both cases.
    class_names = [labelmap] if isinstance(labelmap, str) else list(labelmap)
    # all detections are collected into:
    #    all_boxes[cls][image] = N x 5 array of detections in
    #    (x1, y1, x2, y2, score)
    # with one extra leading slot for the background class
    all_boxes = [[[] for _ in range(num_images)]
                 for _ in range(len(class_names) + 1)]

    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}
    # The detections pickle is written before any other code creates
    # output_dir, so make sure the directory exists.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    det_file = os.path.join(output_dir, 'detections.pkl')

    for i in range(num_images):
        im, gt, h, w, _ = dataset.pull_item(i)

        x = im.unsqueeze(0).to(device)
        _t['im_detect'].tic()
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            detections = net(x).data
        detect_time = _t['im_detect'].toc(average=False)

        # skip j = 0, because it's the background class
        for j in range(1, detections.size(1)):
            dets = detections[0, j, :]
            # keep only the rows whose score (column 0) is positive
            mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t()
            dets = torch.masked_select(dets, mask).view(-1, 5)
            # After view(-1, 5) the tensor is always 2-D, so emptiness has
            # to be detected through the row count.
            if dets.size(0) == 0:
                continue
            # scale the box coordinates by the image width / height
            boxes = dets[:, 1:]
            boxes[:, 0] *= w # x1
            boxes[:, 2] *= w # x2
            boxes[:, 1] *= h # y1
            boxes[:, 3] *= h # y2
            scores = dets[:, 0].cpu().numpy()
            cls_dets = np.hstack((boxes.cpu().numpy(),
                                  scores[:, np.newaxis])).astype(np.float32,
                                                                 copy=False)
            all_boxes[j][i] = cls_dets

        print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1,
                                                    num_images, detect_time))

    with open(det_file, 'wb') as f:
        pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)

    print('Evaluating detections')
    write_results_file(all_boxes, dataset)
    do_python_eval(use_07=False)

if __name__ == '__main__':
    # Pick the model configuration; every non-VOC dataset uses the custom
    # configuration for now.
    cfg = voc if args.dataset == 'VOC' else custom

    # load net
    # NOTE(review): cfg['num_classes'] presumably already includes the
    # background class -- confirm against data.config.
    net = build_ssd(phase='test', size=cfg['min_dim'],
                    num_classes=cfg['num_classes'])
    # map_location remaps the stored tensors onto the evaluation device, so
    # a checkpoint saved on a GPU can also be loaded on a CPU-only run.
    net.load_state_dict(torch.load(args.trained_model, map_location=device))
    net.eval()
    print('Finished loading model!')

    # Load data (TODO:  add COCO)
    if args.dataset == 'VOC':
        dataset = VOCDetection(args.dataset_root, [(set_type)],
                               BaseTransform(cfg['min_dim'], MEANS),
                               VOCAnnotationTransform())
    else:
        dataset = CustomDetection(
            root=args.dataset_root,
            image_set=[(set_type)],
            transform=BaseTransform(cfg['min_dim'], MEANS),
            target_transform=CustomAnnotationTransform(train=False))

    net = net.to(device)

    # Evaluation
    test_net(args.save_folder, net, args.cuda, dataset,
             BaseTransform(net.size, MEANS), args.top_k, cfg['min_dim'],
             thresh=args.confidence_threshold)