diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/__pycache__/trt_yolov3.cpython-36.pyc" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/__pycache__/trt_yolov3.cpython-36.pyc"
new file mode 100644
index 0000000..81d8f81
Binary files /dev/null and "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/__pycache__/trt_yolov3.cpython-36.pyc" differ
diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/camera.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/camera.py"
new file mode 100644
index 0000000..0ce578f
--- /dev/null
+++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/camera.py"
@@ -0,0 +1,233 @@
+"""camera.py
+
+This code implements the Camera class, which encapsulates code to
+handle IP CAM, USB webcam or the Jetson onboard camera.  In
+addition, this Camera class is further extended to take a video
+file or an image file as input.
+"""
+
+
+import logging
+import threading
+import subprocess
+
+import numpy as np
+import cv2
+
+
+# The following flag is used to control whether to use a GStreamer
+# pipeline to open the USB webcam source.  If set to False, we just open
+# the webcam using the cv2.VideoCapture(index) machinery, i.e. relying
+# on cv2's built-in function to capture images from the webcam.
+USB_GSTREAMER = True
+
+
+def add_camera_args(parser):
+    """Add parser arguments for camera options."""
+    parser.add_argument('--file', dest='use_file',
+                        help='use a video file as input (remember to '
+                        'also set --filename)',
+                        action='store_true')
+    parser.add_argument('--image', dest='use_image',
+                        help='use an image file as input (remember to '
+                        'also set --filename)',
+                        action='store_true')
+    parser.add_argument('--filename', dest='filename',
+                        help='video file name, e.g. test.mp4',
+                        default=None, type=str)
+    parser.add_argument('--rtsp', dest='use_rtsp',
+                        help='use IP CAM (remember to also set --uri)',
+                        action='store_true')
+    parser.add_argument('--uri', dest='rtsp_uri',
+                        help='RTSP URI, e.g. rtsp://192.168.1.64:554',
+                        default=None, type=str)
+    parser.add_argument('--latency', dest='rtsp_latency',
+                        help='latency in ms for RTSP [200]',
+                        default=200, type=int)
+    parser.add_argument('--usb', dest='use_usb',
+                        help='use USB webcam (remember to also set --vid)',
+                        action='store_true')
+    parser.add_argument('--vid', dest='video_dev',
+                        help='device # of USB webcam (/dev/video?) [0]',
+                        default=0, type=int)
+    parser.add_argument('--width', dest='image_width',
+                        help='image width [640]',
+                        default=640, type=int)
+    parser.add_argument('--height', dest='image_height',
+                        help='image height [480]',
+                        default=480, type=int)
+    return parser
+
+
+def open_cam_rtsp(uri, width, height, latency):
+    """Open an RTSP URI (IP CAM)."""
+    gst_elements = str(subprocess.check_output('gst-inspect-1.0'))
+    if 'omxh264dec' in gst_elements:
+        # Use hardware H.264 decoder on Jetson platforms
+        gst_str = ('rtspsrc location={} latency={} ! '
+                   'rtph264depay ! h264parse ! omxh264dec ! '
+                   'nvvidconv ! '
+                   'video/x-raw, width=(int){}, height=(int){}, '
+                   'format=(string)BGRx ! videoconvert ! '
+                   'appsink').format(uri, latency, width, height)
+    elif 'avdec_h264' in gst_elements:
+        # Otherwise try to use the software decoder 'avdec_h264'
+        # NOTE: in case resizing images is necessary, try adding
+        #       a 'videoscale' into the pipeline
+        gst_str = ('rtspsrc location={} latency={} ! '
+                   'rtph264depay ! h264parse ! avdec_h264 ! '
+                   'videoconvert ! appsink').format(uri, latency)
+    else:
+        raise RuntimeError('H.264 decoder not found!')
+    return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER)
+
+
+def open_cam_usb(dev, width, height):
+    """Open a USB webcam."""
+    if USB_GSTREAMER:
+        gst_str = ('v4l2src device=/dev/video{} ! '
+                   'video/x-raw, width=(int){}, height=(int){} ! '
+                   'videoconvert ! appsink').format(dev, width, height)
+        return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER)
+    else:
+        return cv2.VideoCapture(dev)
+
+
+def open_cam_onboard(width, height):
+    """Open the Jetson onboard camera."""
+    gst_elements = str(subprocess.check_output('gst-inspect-1.0'))
+    if 'nvcamerasrc' in gst_elements:
+        # On versions of L4T prior to 28.1, you might need to add
+        # 'flip-method=2' into gst_str below.
+        gst_str = ('nvcamerasrc ! '
+                   'video/x-raw(memory:NVMM), '
+                   'width=(int)2592, height=(int)1458, '
+                   'format=(string)I420, framerate=(fraction)30/1 ! '
+                   'nvvidconv ! '
+                   'video/x-raw, width=(int){}, height=(int){}, '
+                   'format=(string)BGRx ! '
+                   'videoconvert ! appsink').format(width, height)
+    elif 'nvarguscamerasrc' in gst_elements:
+        gst_str = ('nvarguscamerasrc ! '
+                   'video/x-raw(memory:NVMM), '
+                   'width=(int)1920, height=(int)1080, '
+                   'format=(string)NV12, framerate=(fraction)30/1 ! '
+                   'nvvidconv flip-method=2 ! '
+                   'video/x-raw, width=(int){}, height=(int){}, '
+                   'format=(string)BGRx ! '
+                   'videoconvert ! appsink').format(width, height)
+    else:
+        raise RuntimeError('onboard camera source not found!')
+    return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER)
+
+
+def grab_img(cam):
+    """This 'grab_img' function is designed to be run in the sub-thread.
+    Once started, this thread continues to grab a new image and put it
+    into 'cam.img_handle', until 'cam.thread_running' is set to False.
+    """
+    while cam.thread_running:
+        _, cam.img_handle = cam.cap.read()
+        if cam.img_handle is None:
+            logging.warning('grab_img(): cap.read() returns None...')
+            break
+    cam.thread_running = False
+
+
+class Camera():
+    """Camera class which supports reading images from these video sources:
+
+    1. Video file
+    2. Image (jpg, png, etc.) file, repeating indefinitely
+    3. RTSP (IP CAM)
+    4. USB webcam
+    5. Jetson onboard camera
+    """
+
+    def __init__(self, args):
+        self.args = args
+        self.is_opened = False
+        self.use_thread = False
+        self.thread_running = False
+        self.img_handle = None
+        self.img_width = 0
+        self.img_height = 0
+        self.cap = None
+        self.thread = None
+
+    def open(self):
+        """Open camera based on command line arguments."""
+        assert self.cap is None, 'Camera is already opened!'
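+        # Select the video source according to the command-line flags.  File
+        # and image inputs are read synchronously, while RTSP, USB and onboard
+        # camera inputs are grabbed in a background thread (see grab_img()).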
+        args = self.args
+        if args.use_file:
+            self.cap = cv2.VideoCapture(args.filename)
+            # ignore image width/height settings here
+            self.use_thread = False
+        elif args.use_image:
+            self.cap = 'OK'
+            self.img_handle = cv2.imread(args.filename)
+            # ignore image width/height settings here
+            if self.img_handle is not None:
+                self.is_opened = True
+                self.img_height, self.img_width, _ = self.img_handle.shape
+            self.use_thread = False
+        elif args.use_rtsp:
+            self.cap = open_cam_rtsp(
+                args.rtsp_uri,
+                args.image_width,
+                args.image_height,
+                args.rtsp_latency
+            )
+            self.use_thread = True
+        elif args.use_usb:
+            self.cap = open_cam_usb(
+                args.video_dev,
+                args.image_width,
+                args.image_height
+            )
+            self.use_thread = True
+        else:  # by default, use the Jetson onboard camera
+            self.cap = open_cam_onboard(
+                args.image_width,
+                args.image_height
+            )
+            self.use_thread = True
+        if self.cap != 'OK':
+            if self.cap.isOpened():
+                # Try to grab the 1st image and determine width and height
+                _, img = self.cap.read()
+                if img is not None:
+                    self.img_height, self.img_width, _ = img.shape
+                    self.is_opened = True
+
+    def start(self):
+        assert not self.thread_running
+        if self.use_thread:
+            self.thread_running = True
+            self.thread = threading.Thread(target=grab_img, args=(self,))
+            self.thread.start()
+
+    def stop(self):
+        self.thread_running = False
+        if self.use_thread:
+            self.thread.join()
+
+    def read(self):
+        if self.args.use_file:
+            _, img = self.cap.read()
+            if img is None:
+                #logging.warning('grab_img(): cap.read() returns None...')
+                # looping around
+                self.cap.release()
+                self.cap = cv2.VideoCapture(self.args.filename)
+                _, img = self.cap.read()
+            return img
+        elif self.args.use_image:
+            return np.copy(self.img_handle)
+        else:
+            return self.img_handle
+
+    def release(self):
+        assert not self.thread_running
+        if self.cap != 'OK':
+            self.cap.release()
diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/display.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/display.py"
new file mode 100644
index 0000000..41192e9
--- /dev/null
+++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/display.py"
@@ -0,0 +1,42 @@
+
+"""display.py
+"""
+
+
+import cv2
+
+
+def open_window(window_name, width, height, title):
+    """Open the display window."""
+    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
+    cv2.resizeWindow(window_name, width, height)
+    cv2.setWindowTitle(window_name, title)
+
+
+def show_help_text(img, help_text):
+    """Draw help text on image."""
+    cv2.putText(img, help_text, (11, 20), cv2.FONT_HERSHEY_PLAIN, 1.0,
+                (32, 32, 32), 4, cv2.LINE_AA)
+    cv2.putText(img, help_text, (10, 20), cv2.FONT_HERSHEY_PLAIN, 1.0,
+                (240, 240, 240), 1, cv2.LINE_AA)
+    return img
+
+
+def show_fps(img, fps):
+    """Draw fps number at top-left corner of the image."""
+    font = cv2.FONT_HERSHEY_PLAIN
+    line = cv2.LINE_AA
+    fps_text = 'FPS: {:.2f}'.format(fps)
+    cv2.putText(img, fps_text, (11, 20), font, 1.0, (32, 32, 32), 4, line)
+    cv2.putText(img, fps_text, (10, 20), font, 1.0, (240, 240, 240), 1, line)
+    return img
+
+
+def set_display(window_name, full_scrn):
+    """Set display window to either full screen or normal."""
+    if full_scrn:
+        cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN,
+                              cv2.WINDOW_FULLSCREEN)
+    else:
+        cv2.setWindowProperty(window_name, cv2.WND_PROP_FULLSCREEN,
+                              cv2.WINDOW_NORMAL)
diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/trt_yolov3_node.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/trt_yolov3_node.py"
new file mode 100644
index 0000000..0b62a53
--- /dev/null
+++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/trt_yolov3_node.py"
@@ -0,0 +1,142 @@
+### Update List
+# Date: 0807
+# Author: Gu Lee
+# Custom messages are no longer used; Int16MultiArray is now used instead of
+# the BoundingBox and BoundingBoxes messages.
+# Through this, the TRT_yolov3/Bbox topic can be recorded with rosbag.
+
+# Date: 0816
+# Author: Gu Lee
+# Separated the image callback from the detection function.
+# Use rclpy.spin_once instead of rclpy.spin.
+
+
+import os
+import cv2
+import rclpy
+import numpy as np
+from std_msgs.msg import String, Int16MultiArray, MultiArrayDimension
+from sensor_msgs.msg import Image
+from cv_bridge import CvBridge
+import time
+import argparse
+import pycuda.autoinit  # This is needed for initializing CUDA driver
+from trt_yolov3.yolov3 import TrtYOLOv3
+from trt_yolov3.camera import add_camera_args, Camera
+from trt_yolov3.display import open_window, set_display, show_fps
+from trt_yolov3.visualization import BBoxVisualization
+
+bridge = CvBridge()
+
+COCO_CLASSES_LIST = ['person','bicycle','car','motorbike','aeroplane','bus','train','truck','boat','traffic light','fire hydrant','stop sign','parking meter','bench','bird','cat','dog','horse','sheep','cow','elephant','bear','zebra','giraffe','backpack','umbrella','handbag','tie','suitcase','frisbee','skis',
+'snowboard','sports ball','kite','baseball bat','baseball glove','skateboard','surfboard','tennis racket','bottle','wine glass','cup','fork',
+'knife','spoon','bowl','banana','apple','sandwich','orange','broccoli','carrot','hot dog','pizza','donut','cake','chair','sofa','pottedplant',
+'bed','diningtable','toilet','tvmonitor','laptop','mouse','remote','keyboard','cell phone','microwave','oven','toaster','sink','refrigerator',
+'book','clock','vase','scissors','teddy bear','hair drier','toothbrush',]
+
+KCITY_CUSTOM = ['car','crossWalk_sign','bicycle_sign','bust_sign','construction_sign','parking_sign','kidSafeSero_sign','busArrowDown_sign','trafficLightRedYellow','trafficLightGreenLeft',
+'trafficLightYellow','trafficGreen','trafficLightRed','trafficLightRedLeft',]
+
+def get_cls_dict(category_num):
+    """Get the class ID to name translation dictionary."""
+    if category_num == 80:
+        return {i: n for i, n in enumerate(COCO_CLASSES_LIST)}
+    elif category_num == 14:
+        return {i: n for i, n in enumerate(KCITY_CUSTOM)}
+
+def parse_args():
+    desc = ('Capture and display live camera video, while doing '
+            'real-time object detection with TensorRT optimized '
+            'YOLOv3 model on Jetson Nano')
+    parser = argparse.ArgumentParser(description=desc)
+    parser = add_camera_args(parser)
+    parser.add_argument('--model', type=str, default='yolov3-416',
+                        help='yolov3[-spp|-tiny]-[288|416|608]')
+    parser.add_argument('--category_num', type=int, default=80,
+                        help='number of object categories [80]')
+    args = parser.parse_args()
+
+    os.system("ls")
+
+    return args
+
+def image_callback(msg: Image):
+    global trt_yolov3
+    global conf_th
+    global vis
+    global pub
+    global pub_
+    global node
+    global img
+    now = time.time()
+    time_now = time.time()
+    img = bridge.imgmsg_to_cv2(msg, "rgb8")
+    boxes, confs, clss = trt_yolov3.detect(img, conf_th)
+    boxes = boxes.tolist()
+    confs = confs.tolist()
+    clss = clss.tolist()
+    detection_results = Int16MultiArray()
+    if boxes is not None:
+        # Allocate the flat data array once, before the loop, so earlier
+        # detections are not overwritten while later ones are filled in.
+        # Each detection occupies six consecutive ints:
+        # [class_id, confidence*100, x_min, y_min, x_max, y_max].
+        detection_results.data = [0] * 6 * len(boxes)
+        for i in range(len(boxes)):
+            detection_results.layout.dim.append(MultiArrayDimension())
+            detection_results.layout.dim[i].label = "object" + str(i)
+            detection_results.layout.dim[i].size = 6
+            detection_results.data[i * 6] = clss[i]
+            detection_results.data[i * 6 + 1] = int(confs[i] * 100)
+            detection_results.data[i * 6 + 2] = boxes[i][0]
+            detection_results.data[i * 6 + 3] = boxes[i][1]
+            detection_results.data[i * 6 + 4] = boxes[i][2]
+            detection_results.data[i * 6 + 5] = boxes[i][3]
+    pub_.publish(detection_results)
+
+    # for result image publish
+    # This leads to a serious decline in performance.
+    # Use just for debugging.
+    img = vis.draw_bboxes(img, boxes, confs, clss)
+    img = cv2.resize(img, None, fx=0.2, fy=0.2)
+    imgmsg = bridge.cv2_to_imgmsg(img, "rgb8")
+    pub.publish(imgmsg)
+
+node = 0
+
+def main():
+    global trt_yolov3
+    global conf_th
+    global vis
+    # for result image publish
+    global pub
+    global pub_
+    global node
+    conf_th = 0.5
+    rclpy.init(args=None)
+    args = parse_args()
+    node = rclpy.create_node('image_sub_py')
+    #sub = node.create_subscription(Image, '/image', image_callback)
+    sub = node.create_subscription(Image, '/movie', image_callback)
+
+    # for result image publish
+    pub = node.create_publisher(Image, '/TRT_yolov3/result_image')
+    pub_ = node.create_publisher(Int16MultiArray, '/TRT_yolov3/Bbox')
+
+    if args.category_num <= 0:
+        raise SystemExit('ERROR: bad category_num (%d)!' % args.category_num)
+
+    cls_dict = get_cls_dict(args.category_num)
+    yolo_dim = args.model.split('-')[-1]
+    if 'x' in yolo_dim:
+        dim_split = yolo_dim.split('x')
+        if len(dim_split) != 2:
+            raise SystemExit('ERROR: bad yolo_dim (%s)!' % yolo_dim)
+        w, h = int(dim_split[0]), int(dim_split[1])
+    else:
+        h = w = int(yolo_dim)
+    if h % 32 != 0 or w % 32 != 0:
+        raise SystemExit('ERROR: bad yolo_dim (%s)!' % yolo_dim)
+    trt_yolov3 = TrtYOLOv3(args.model, (h, w), args.category_num)
+    vis = BBoxVisualization(cls_dict)
+    rclpy.spin(node)
+    node.destroy_node()
+    rclpy.shutdown()
+
+if __name__ == '__main__':
+    main()
diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/visualization.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/visualization.py"
new file mode 100644
index 0000000..776d90f
--- /dev/null
+++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/visualization.py"
@@ -0,0 +1,102 @@
+"""visualization.py
+
+The BBoxVisualization class implements drawing of nice looking
+bounding boxes based on object detection results.
+"""
+
+
+import numpy as np
+import cv2
+
+
+# Constants
+ALPHA = 0.5
+FONT = cv2.FONT_HERSHEY_PLAIN
+TEXT_SCALE = 1.0
+TEXT_THICKNESS = 1
+BLACK = (0, 0, 0)
+WHITE = (255, 255, 255)
+
+
+def gen_colors(num_colors):
+    """Generate different colors.
+
+    # Arguments
+        num_colors: total number of colors/classes.
+
+    # Output
+        bgrs: a list of (B, G, R) tuples which correspond to each of
+            the colors/classes.
+    """
+    import random
+    import colorsys
+
+    hsvs = [[float(x) / num_colors, 1., 0.7] for x in range(num_colors)]
+    random.seed(1234)
+    random.shuffle(hsvs)
+    rgbs = list(map(lambda x: list(colorsys.hsv_to_rgb(*x)), hsvs))
+    bgrs = [(int(rgb[2] * 255), int(rgb[1] * 255), int(rgb[0] * 255))
+            for rgb in rgbs]
+    return bgrs
+
+
+def draw_boxed_text(img, text, topleft, color):
+    """Draw a translucent boxed text in white, overlaid on top of a
+    colored patch surrounded by a black border.  FONT, TEXT_SCALE,
+    TEXT_THICKNESS and ALPHA values are constants (fixed) as defined
+    on top.
+
+    # Arguments
+        img: the input image as a numpy array.
+ text: the text to be drawn. + topleft: XY coordinate of the topleft corner of the boxed text. + color: color of the patch, i.e. background of the text. + + # Output + img: note the original image is modified inplace. + """ + assert img.dtype == np.uint8 + img_h, img_w, _ = img.shape + if topleft[0] >= img_w or topleft[1] >= img_h: + return img + margin = 3 + size = cv2.getTextSize(text, FONT, TEXT_SCALE, TEXT_THICKNESS) + w = size[0][0] + margin * 2 + h = size[0][1] + margin * 2 + # the patch is used to draw boxed text + patch = np.zeros((h, w, 3), dtype=np.uint8) + patch[...] = color + cv2.putText(patch, text, (margin+1, h-margin-2), FONT, TEXT_SCALE, + WHITE, thickness=TEXT_THICKNESS, lineType=cv2.LINE_8) + cv2.rectangle(patch, (0, 0), (w-1, h-1), BLACK, thickness=1) + w = min(w, img_w - topleft[0]) # clip overlay at image boundary + h = min(h, img_h - topleft[1]) + # Overlay the boxed text onto region of interest (roi) in img + roi = img[topleft[1]:topleft[1]+h, topleft[0]:topleft[0]+w, :] + cv2.addWeighted(patch[0:h, 0:w, :], ALPHA, roi, 1 - ALPHA, 0, roi) + return img + + +class BBoxVisualization(): + """BBoxVisualization class implements nice drawing of boudning boxes. + + # Arguments + cls_dict: a dictionary used to translate class id to its name. + """ + + def __init__(self, cls_dict): + self.cls_dict = cls_dict + self.colors = gen_colors(len(cls_dict)) + + def draw_bboxes(self, img, boxes, confs, clss): + """Draw detected bounding boxes on the original image.""" + for bb, cf, cl in zip(boxes, confs, clss): + cl = int(cl) + x_min, y_min, x_max, y_max = bb[0], bb[1], bb[2], bb[3] + color = self.colors[cl] + cv2.rectangle(img, (x_min, y_min), (x_max, y_max), color, 2) + txt_loc = (max(x_min+2, 0), max(y_min+2, 0)) + cls_name = self.cls_dict.get(cl, 'CLS{}'.format(cl)) + txt = '{} {:.2f}'.format(cls_name, cf) + img = draw_boxed_text(img, txt, txt_loc, color) + return img diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3.py" new file mode 100644 index 0000000..97ba398 --- /dev/null +++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3.py" @@ -0,0 +1,486 @@ +# yolov3.py +# +# Copyright 1993-2019 NVIDIA Corporation. All rights reserved. +# +# NOTICE TO LICENSEE: +# +# This source code and/or documentation ("Licensed Deliverables") are +# subject to NVIDIA intellectual property rights under U.S. and +# international Copyright laws. +# +# These Licensed Deliverables contained herein is PROPRIETARY and +# CONFIDENTIAL to NVIDIA and is being provided under the terms and +# conditions of a form of NVIDIA software license agreement by and +# between NVIDIA and Licensee ("License Agreement") or electronically +# accepted by Licensee. Notwithstanding any terms or conditions to +# the contrary in the License Agreement, reproduction or disclosure +# of the Licensed Deliverables to any third party without the express +# written consent of NVIDIA is prohibited. +# +# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +# SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +# PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +# NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +# DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +# NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+# NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +# LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +# SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THESE LICENSED DELIVERABLES. +# +# U.S. Government End Users. These Licensed Deliverables are a +# "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +# 1995), consisting of "commercial computer software" and "commercial +# computer software documentation" as such terms are used in 48 +# C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +# only as a commercial end item. Consistent with 48 C.F.R.12.212 and +# 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +# U.S. Government End Users acquire the Licensed Deliverables with +# only those rights set forth herein. +# +# Any use of the Licensed Deliverables in individual and commercial +# software must include, in the user documentation and internal +# comments to the code, the above Disclaimer and U.S. Government End +# Users Notice. +# + +from __future__ import print_function + +import numpy as np +import cv2 +import tensorrt as trt +import pycuda.driver as cuda +import os +import getpass + +def _preprocess_yolov3(img, shape): + """Preprocess an image before TRT YOLOv3 inferencing.""" + img = cv2.resize(img, shape) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = img.transpose((2, 0, 1)).astype(np.float32) + img /= 255.0 + return img + + +class PostprocessYOLO(object): + """Class for post-processing the three output tensors from YOLOv3.""" + + def __init__(self, + yolo_masks, + yolo_anchors, + nms_threshold, + yolo_input_resolution, + category_num=80): + """Initialize with all values that will be kept when processing several frames. + Assuming 3 outputs of the network in the case of (large) YOLOv3. + + Keyword arguments: + yolo_masks -- a list of 3 three-dimensional tuples for the YOLO masks + yolo_anchors -- a list of 9 two-dimensional tuples for the YOLO anchors + object_threshold -- threshold for object coverage, float value between 0 and 1 + nms_threshold -- threshold for non-max suppression algorithm, + float value between 0 and 1 + input_resolution_yolo -- two-dimensional tuple with the target network's (spatial) + input resolution in HW order + category_num -- number of output categories/classes + """ + self.masks = yolo_masks + self.anchors = yolo_anchors + self.nms_threshold = nms_threshold + self.input_resolution_yolo = yolo_input_resolution + self.category_num = category_num + + def process(self, outputs, resolution_raw, conf_th): + """Take the YOLOv3 outputs generated from a TensorRT forward pass, post-process them + and return a list of bounding boxes for detected object together with their category + and their confidences in separate lists. + + Keyword arguments: + outputs -- outputs from a TensorRT engine in NCHW format + resolution_raw -- the original spatial resolution from the input PIL image in WH order + conf_th -- confidence threshold, e.g. 
0.3 + """ + outputs_reshaped = list() + for output in outputs: + outputs_reshaped.append(self._reshape_output(output)) + + boxes_xywh, categories, confidences = self._process_yolo_output( + outputs_reshaped, resolution_raw, conf_th) + + if len(boxes_xywh) > 0: + # convert (x, y, width, height) to (x1, y1, x2, y2) + img_w, img_h = resolution_raw + xx = boxes_xywh[:, 0].reshape(-1, 1) + yy = boxes_xywh[:, 1].reshape(-1, 1) + ww = boxes_xywh[:, 2].reshape(-1, 1) + hh = boxes_xywh[:, 3].reshape(-1, 1) + boxes = np.concatenate([xx, yy, xx+ww, yy+hh], axis=1) + 0.5 + boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0., float(img_w-1)) + boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0., float(img_h-1)) + boxes = boxes.astype(np.int) + else: + boxes = np.zeros((0, 4), dtype=np.int) # empty + + return boxes, categories, confidences + + def _reshape_output(self, output): + """Reshape a TensorRT output from NCHW to NHWC format (with expected C=255), + and then return it in (height,width,3,85) dimensionality after further reshaping. + + Keyword argument: + output -- an output from a TensorRT engine after inference + """ + output = np.transpose(output, [0, 2, 3, 1]) + _, height, width, _ = output.shape + dim1, dim2 = height, width + dim3 = 3 + # There are CATEGORY_NUM=80 object categories: + dim4 = (4 + 1 + self.category_num) + return np.reshape(output, (dim1, dim2, dim3, dim4)) + + def _process_yolo_output(self, outputs_reshaped, resolution_raw, conf_th): + """Take in a list of three reshaped YOLO outputs in (height,width,3,85) shape and return + return a list of bounding boxes for detected object together with their category and their + confidences in separate lists. + + Keyword arguments: + outputs_reshaped -- list of three reshaped YOLO outputs as NumPy arrays + with shape (height,width,3,85) + resolution_raw -- the original spatial resolution from the input PIL image in WH order + conf_th -- confidence threshold + """ + + # E.g. in YOLOv3-608, there are three output tensors, which we associate with their + # respective masks. 
Then we iterate through all output-mask pairs and generate candidates + # for bounding boxes, their corresponding category predictions and their confidences: + boxes, categories, confidences = list(), list(), list() + for output, mask in zip(outputs_reshaped, self.masks): + box, category, confidence = self._process_feats(output, mask) + box, category, confidence = self._filter_boxes(box, category, confidence, conf_th) + boxes.append(box) + categories.append(category) + confidences.append(confidence) + + boxes = np.concatenate(boxes) + categories = np.concatenate(categories) + confidences = np.concatenate(confidences) + + # Scale boxes back to original image shape: + width, height = resolution_raw + image_dims = [width, height, width, height] + boxes = boxes * image_dims + + # Using the candidates from the previous (loop) step, we apply the non-max suppression + # algorithm that clusters adjacent bounding boxes to a single bounding box: + nms_boxes, nms_categories, nscores = list(), list(), list() + for category in set(categories): + idxs = np.where(categories == category) + box = boxes[idxs] + category = categories[idxs] + confidence = confidences[idxs] + + keep = self._nms_boxes(box, confidence) + + nms_boxes.append(box[keep]) + nms_categories.append(category[keep]) + nscores.append(confidence[keep]) + + if not nms_categories and not nscores: + return (np.empty((0, 4), dtype=np.float32), + np.empty((0, 1), dtype=np.float32), + np.empty((0, 1), dtype=np.float32)) + + boxes = np.concatenate(nms_boxes) + categories = np.concatenate(nms_categories) + confidences = np.concatenate(nscores) + + return boxes, categories, confidences + + def _process_feats(self, output_reshaped, mask): + """Take in a reshaped YOLO output in height,width,3,85 format together with its + corresponding YOLO mask and return the detected bounding boxes, the confidence, + and the class probability in each cell/pixel. + + Keyword arguments: + output_reshaped -- reshaped YOLO output as NumPy arrays with shape (height,width,3,85) + mask -- 2-dimensional tuple with mask specification for this output + """ + + def sigmoid_v(array): + return np.reciprocal(np.exp(-array) + 1.0) + + def exponential_v(array): + return np.exp(array) + + grid_h, grid_w, _, _ = output_reshaped.shape + + anchors = [self.anchors[i] for i in mask] + + # Reshape to N, height, width, num_anchors, box_params: + anchors_tensor = np.reshape(anchors, [1, 1, len(anchors), 2]) + box_xy = sigmoid_v(output_reshaped[..., 0:2]) + box_wh = exponential_v(output_reshaped[..., 2:4]) * anchors_tensor + box_confidence = sigmoid_v(output_reshaped[..., 4:5]) + box_class_probs = sigmoid_v(output_reshaped[..., 5:]) + + col = np.tile(np.arange(0, grid_w), grid_h).reshape(-1, grid_w) + row = np.tile(np.arange(0, grid_h).reshape(-1, 1), grid_w) + + col = col.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) + row = row.reshape(grid_h, grid_w, 1, 1).repeat(3, axis=-2) + grid = np.concatenate((col, row), axis=-1) + + box_xy += grid + box_xy /= (grid_w, grid_h) + box_wh /= self.input_resolution_yolo + box_xy -= (box_wh / 2.) + boxes = np.concatenate((box_xy, box_wh), axis=-1) + + # boxes: centroids, box_confidence: confidence level, box_class_probs: + # class confidence + return boxes, box_confidence, box_class_probs + + def _filter_boxes(self, boxes, box_confidences, box_class_probs, conf_th): + """Take in the unfiltered bounding box descriptors and discard each cell + whose score is lower than the object threshold set during class initialization. 
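+        Here the score of a cell is the product of its objectness
+        confidence and its highest class probability.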
+ + Keyword arguments: + boxes -- bounding box coordinates with shape (height,width,3,4); 4 for + x,y,height,width coordinates of the boxes + box_confidences -- bounding box confidences with shape (height,width,3,1); 1 for as + confidence scalar per element + box_class_probs -- class probabilities with shape (height,width,3,CATEGORY_NUM) + conf_th -- confidence threshold + """ + box_scores = box_confidences * box_class_probs + box_classes = np.argmax(box_scores, axis=-1) + box_class_scores = np.max(box_scores, axis=-1) + pos = np.where(box_class_scores >= conf_th) + + boxes = boxes[pos] + classes = box_classes[pos] + scores = box_class_scores[pos] + + return boxes, classes, scores + + def _nms_boxes(self, boxes, box_confidences): + """Apply the Non-Maximum Suppression (NMS) algorithm on the bounding boxes with their + confidence scores and return an array with the indexes of the bounding boxes we want to + keep (and display later). + + Keyword arguments: + boxes -- a NumPy array containing N bounding-box coordinates that survived filtering, + with shape (N,4); 4 for x,y,height,width coordinates of the boxes + box_confidences -- a Numpy array containing the corresponding confidences with shape N + """ + x_coord = boxes[:, 0] + y_coord = boxes[:, 1] + width = boxes[:, 2] + height = boxes[:, 3] + + areas = width * height + ordered = box_confidences.argsort()[::-1] + + keep = list() + while ordered.size > 0: + # Index of the current element: + i = ordered[0] + keep.append(i) + xx1 = np.maximum(x_coord[i], x_coord[ordered[1:]]) + yy1 = np.maximum(y_coord[i], y_coord[ordered[1:]]) + xx2 = np.minimum(x_coord[i] + width[i], x_coord[ordered[1:]] + width[ordered[1:]]) + yy2 = np.minimum(y_coord[i] + height[i], y_coord[ordered[1:]] + height[ordered[1:]]) + + width1 = np.maximum(0.0, xx2 - xx1 + 1) + height1 = np.maximum(0.0, yy2 - yy1 + 1) + intersection = width1 * height1 + union = (areas[i] + areas[ordered[1:]] - intersection) + + # Compute the Intersection over Union (IoU) score: + iou = intersection / union + + # The goal of the NMS algorithm is to reduce the number of adjacent bounding-box + # candidates to a minimum. In this step, we keep only those elements whose overlap + # with the current bounding box is lower than the threshold: + indexes = np.where(iou <= self.nms_threshold)[0] + ordered = ordered[indexes + 1] + + keep = np.array(keep) + return keep + + +class HostDeviceMem(object): + """Simple helper data class that's a little nicer to use than a 2-tuple.""" + def __init__(self, host_mem, device_mem): + self.host = host_mem + self.device = device_mem + + def __str__(self): + return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) + + def __repr__(self): + return self.__str__() + + +def allocate_buffers(engine): + """Allocates all host/device in/out buffers required for an engine.""" + inputs = [] + outputs = [] + bindings = [] + stream = cuda.Stream() + for binding in engine: + size = trt.volume(engine.get_binding_shape(binding)) * \ + engine.max_batch_size + dtype = trt.nptype(engine.get_binding_dtype(binding)) + # Allocate host and device buffers + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + # Append the device buffer to device bindings. + bindings.append(int(device_mem)) + # Append to the appropriate list. 
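+        # Input bindings are filled on the host before inference, while output
+        # bindings are read back from the device afterwards (see do_inference()).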
+ if engine.binding_is_input(binding): + inputs.append(HostDeviceMem(host_mem, device_mem)) + else: + outputs.append(HostDeviceMem(host_mem, device_mem)) + return inputs, outputs, bindings, stream + + +def do_inference(context, bindings, inputs, outputs, stream, batch_size=1): + """do_inference (for TensorRT 6.x or lower) + + This function is generalized for multiple inputs/outputs. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async(batch_size=batch_size, + bindings=bindings, + stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. + return [out.host for out in outputs] + + +def do_inference_v2(context, bindings, inputs, outputs, stream): + """do_inference_v2 (for TensorRT 7.0+) + + This function is generalized for multiple inputs/outputs for full + dimension networks. + Inputs and outputs are expected to be lists of HostDeviceMem objects. + """ + # Transfer input data to the GPU. + [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs] + # Run inference. + context.execute_async_v2(bindings=bindings, stream_handle=stream.handle) + # Transfer predictions back from the GPU. + [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs] + # Synchronize the stream + stream.synchronize() + # Return only the host outputs. + return [out.host for out in outputs] + + +class TrtYOLOv3(object): + """TrtYOLOv3 class encapsulates things needed to run TRT YOLOv3.""" + + def _load_engine(self): + TRTbin = "/home/%s/colcon_ws/install/trt_yolov3/lib/trt_yolov3/%s.trt" %(getpass.getuser(), self.model) + print(os.path.isfile(TRTbin)) + + with open(TRTbin, 'rb') as f, trt.Runtime(self.trt_logger) as runtime: + return runtime.deserialize_cuda_engine(f.read()) + + def _create_context(self): + return self.engine.create_execution_context() + + def __init__(self, model, input_shape, category_num=80): + """Initialize TensorRT plugins, engine and conetxt.""" + self.model = model + self.input_shape = input_shape + h, w = input_shape + # filters count + filters = (category_num + 5) * 3 + if 'tiny' in model: + self.output_shapes = [(1, filters, h // 32, w // 32), + (1, filters, h // 16, w // 16)] + else: + self.output_shapes = [(1, filters, h // 32, w // 32), + (1, filters, h // 16, w // 16), + (1, filters, h // 8, w // 8)] + if 'tiny' in model: + postprocessor_args = { + # A list of 2 three-dimensional tuples for the Tiny YOLO masks + 'yolo_masks': [(3, 4, 5), (0, 1, 2)], + # A list of 6 two-dimensional tuples for the Tiny YOLO anchors + 'yolo_anchors': [(10, 14), (23, 27), (37, 58), + (81, 82), (135, 169), (344, 319)], + # Threshold for non-max suppression algorithm, float + # value between 0 and 1 + 'nms_threshold': 0.7, + 'yolo_input_resolution': input_shape, + 'category_num': category_num + } + else: + postprocessor_args = { + # A list of 3 three-dimensional tuples for the YOLO masks + 'yolo_masks': [(6, 7, 8), (3, 4, 5), (0, 1, 2)], + # A list of 9 two-dimensional tuples for the YOLO anchors + 'yolo_anchors': [(10, 13), (16, 30), (33, 23), + (30, 61), (62, 45), (59, 119), + (116, 90), (156, 198), (373, 326)], + # Threshold for non-max suppression algorithm, float + # value between 0 and 1 + # between 0 and 1 + 'nms_threshold': 
0.5, + 'yolo_input_resolution': input_shape, + 'category_num': category_num + } + self.postprocessor = PostprocessYOLO(**postprocessor_args) + + self.trt_logger = trt.Logger(trt.Logger.INFO) + self.engine = self._load_engine() + self.context = self._create_context() + self.inputs, self.outputs, self.bindings, self.stream = \ + allocate_buffers(self.engine) + self.inference_fn = do_inference if trt.__version__[0] < '7' \ + else do_inference_v2 + + def __del__(self): + """Free CUDA memories.""" + del self.stream + del self.outputs + del self.inputs + + def detect(self, img, conf_th=0.3): + """Detect objects in the input image.""" + shape_orig_WH = (img.shape[1], img.shape[0]) + img_resized = _preprocess_yolov3(img, self.input_shape) + + # Set host input to the image. The do_inference() function + # will copy the input to the GPU before executing. + self.inputs[0].host = np.ascontiguousarray(img_resized) + trt_outputs = self.inference_fn( + context=self.context, + bindings=self.bindings, + inputs=self.inputs, + outputs=self.outputs, + stream=self.stream) + + # Before doing post-processing, we need to reshape the outputs + # as do_inference() will give us flat arrays. + trt_outputs = [output.reshape(shape) for output, shape + in zip(trt_outputs, self.output_shapes)] + + # Run the post-processing algorithms on the TensorRT outputs + # and get the bounding box details of detected objects + boxes, classes, scores = self.postprocessor.process( + trt_outputs, shape_orig_WH, conf_th) + return boxes, scores, classes diff --git "a/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3_classes.py" "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3_classes.py" new file mode 100644 index 0000000..f5e2a12 --- /dev/null +++ "b/\352\260\235\354\262\264/yolov3-with-TensorRT-revised/trt_yolov3/yolov3_classes.py" @@ -0,0 +1,104 @@ +"""yolov3_classes.py + +NOTE: Number of YOLOv3 COCO output classes differs from SSD COCO models. 
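+The yolov3_cls_to_ssd list below translates the contiguous YOLOv3 class
+ids (0~79) to the corresponding, non-contiguous SSD/COCO class ids.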
+""" + +COCO_CLASSES_LIST = [ + 'person', + 'bicycle', + 'car', + 'motorbike', + 'aeroplane', + 'bus', + 'train', + 'truck', + 'boat', + 'traffic light', + 'fire hydrant', + 'stop sign', + 'parking meter', + 'bench', + 'bird', + 'cat', + 'dog', + 'horse', + 'sheep', + 'cow', + 'elephant', + 'bear', + 'zebra', + 'giraffe', + 'backpack', + 'umbrella', + 'handbag', + 'tie', + 'suitcase', + 'frisbee', + 'skis', + 'snowboard', + 'sports ball', + 'kite', + 'baseball bat', + 'baseball glove', + 'skateboard', + 'surfboard', + 'tennis racket', + 'bottle', + 'wine glass', + 'cup', + 'fork', + 'knife', + 'spoon', + 'bowl', + 'banana', + 'apple', + 'sandwich', + 'orange', + 'broccoli', + 'carrot', + 'hot dog', + 'pizza', + 'donut', + 'cake', + 'chair', + 'sofa', + 'pottedplant', + 'bed', + 'diningtable', + 'toilet', + 'tvmonitor', + 'laptop', + 'mouse', + 'remote', + 'keyboard', + 'cell phone', + 'microwave', + 'oven', + 'toaster', + 'sink', + 'refrigerator', + 'book', + 'clock', + 'vase', + 'scissors', + 'teddy bear', + 'hair drier', + 'toothbrush', +] + +# For translating YOLOv3 class ids (0~79) to SSD class ids (0~90) +yolov3_cls_to_ssd = [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, +] + + +def get_cls_dict(category_num): + """Get the class ID to name translation dictionary.""" + if category_num == 80: + return {i: n for i, n in enumerate(COCO_CLASSES_LIST)} + else: + return {i: 'CLS%d' % i for i in range(category_num)}