关于pytorch:PyTorch模型推理及多任务通用范式-课程5-作业

PyTorch 模型推理及多任务通用范式课程 5:

给大家安利了大白老师之前的好文：对于 Yolo 系列算法的理论知识解读和实战训练代码。
依据 pytorch 模型推理的三板斧：数据预处理、数据进网络、数据后处理，逐行实现了 Yolox 的推理代码。

本人找 2 张其余图，用 Yolox_s 进行指标检测，并注明输出尺寸和两个阈值。

Yolox_s：用 time 模块和 for 循环，对”./images/1.jpg”间断推理 100 次，统计工夫开销。有 CUDA 的同学，改下代码：self.device=torch.device(‘cuda’)，统计工夫开销。
有 CUDA 的同学，别离用 Yolox_tiny、Yolox_s、Yolox_m、Yolox_l、Yolox_x 对”./images/1.jpg”间断推理 100 次，统计工夫开销。

import torch
import torch.nn as nn
import torchvision
import numpy as np
import cv2
from models_yolox.visualize import vis
from models_yolox.yolo_head import YOLOXHead
from models_yolox.yolo_pafpn import YOLOPAFPN
import time

# write output image
def write_output(result, image, output):
    if result is not None:
        bboxes, scores, labels = result
        image = vis(image, bboxes, scores, labels, label_names)
    text1 = 'image size : {}*{}'.format(model_detect.w, model_detect.h)
    text2 = 'conf threshold : {}'.format(model_detect.conf_threshold)
    text3 = 'nms threshold : {}'.format(model_detect.nms_threshold)
    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.putText(image, text1, (10, 20), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA)
    cv2.putText(image, text2, (10, 50), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA)
    cv2.putText(image, text3, (10, 80), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA)
    cv2.imwrite(output, image)

# run a model multiple times
def run_evl(times, model, device, image):
    image = cv2.imread(image)
    # CPU run 100 times
    model_detect = ModelPipline(device=device, model_name=model)
    t_all=0
    for i in range(times):
        t_start = time.time()
        model_detect.predict(image)
        t_end = time.time()
        t_all += t_end - t_start
    print('{} run model {} {} time lapse: {:.4f} seconds.'.format(device, model, times, t_all))

class YOLOX(nn.Module):
    def __init__(self, num_classes, depth=0.33, width=0.5, in_channels=[256, 512, 1024]):
        super().__init__()
        #yolox_s by default
        self.backbone = YOLOPAFPN(depth=depth, width=width, in_channels=in_channels)
        self.head = YOLOXHead(num_classes=num_classes, width=width, in_channels=in_channels)

    def forward(self, x):
        fpn_outs = self.backbone(x)
        outputs = self.head(fpn_outs)
        return outputs

class ModelPipline(object):
    def __init__(self, device=torch.device('cuda'), model_name='yolox_s'):
        # 进入模型的图片大小：为数据预处理和后处理做筹备
        self.inputs_size = (640, 640)  # (h,w)

        # CPU or CUDA：为数据预处理和模型加载做筹备
        self.device = device

        # 后处理的阈值
        self.conf_threshold = 0.5
        self.nms_threshold = 0.4

        # 标签载入
        label_names = open('./labels/coco_label.txt', 'r').readlines()
        self.label_names = [line.strip('\n') for line in label_names]

        # image size
        self.w = 0
        self.h = 0

        # for model selection
        self.model_info = {'yolox_tiny':   [0.33,  0.375,  './weights/yolox_tiny_coco.pth.tar'],
            'yolox_s':      [0.33,  0.5,    './weights/yolox_s_coco.pth.tar'],
            'yolox_m':      [0.67,  0.75,   './weights/yolox_m_coco.pth.tar'],
            'yolox_l':      [1.0,   1.0,    './weights/yolox_l_coco.pth.tar'],
            'yolox_x':      [1.33,  1.25,   './weights/yolox_x_coco.pth.tar'],
        }

        # 载入模型构造和模型权重
        self.num_classes = 80
        self.model = self.get_model(model_name)

    def predict(self, image):
        # 数据预处理
        inputs, r = self.preprocess(image)
        # 数据进网络
        outputs = self.model(inputs)
        # 数据后处理
        results = self.postprocess(outputs, r)
        return results

    def get_model(self, model_name):
        model_info = self.model_info[model_name]
        depth = model_info[0]
        width = model_info[1]
        path = model_info[2]
        if model_name == 'yolox_tiny':
            self.inputs_size = (416, 416)
        model = YOLOX(self.num_classes, depth, width)
        pretrained_state_dict = torch.load(path, map_location=lambda storage, loc: storage)["model"]
        model.load_state_dict(pretrained_state_dict, strict=True)
        model.to(self.device)
        model.eval()
        return model

    def preprocess(self, image):
        # 原图尺寸
        h, w = image.shape[:2]
        # to print image size
        self.h = h
        self.w = w
        # 生成一张 w=h=640 的 mask，数值全是 114
        padded_img = np.ones((self.inputs_size[0], self.inputs_size[1], 3)) * 114.0
        # 计算原图的长边缩放到 640 所须要的比例
        r = min(self.inputs_size[0] / h, self.inputs_size[1] / w)
        # 对原图做等比例缩放，使得长边 =640
        resized_img = cv2.resize(image, (int(w * r), int(h * r)), interpolation=cv2.INTER_LINEAR).astype(np.float32)
        # 将缩放后的原图填充到 640×640 的 mask 的左上方
        padded_img[: int(h * r), : int(w * r)] = resized_img
        # BGR——>RGB
        padded_img = padded_img[:, :, ::-1]
        # 归一化和标准化，和训练时保持一致
        inputs = padded_img / 255
        inputs = (inputs - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
        ## 以下是图像工作的通用解决
        # (H,W,C) ——> (C,H,W)
        inputs = inputs.transpose(2, 0, 1)
        # (C,H,W) ——> (1,C,H,W)
        inputs = inputs[np.newaxis, :, :, :]
        # NumpyArray ——> Tensor
        inputs = torch.from_numpy(inputs)
        # dtype float32
        inputs = inputs.type(torch.float32)
        # 与 self.model 放在雷同硬件上
        inputs = inputs.to(self.device)
        return inputs, r

    def postprocess(self, prediction, r):
        # prediction.shape=[1,8400,85]，上面先将 85 中的前 4 列进行转换，从 xc,yc,w,h 变为 x0,y0,x1,y1
        box_corner = prediction.new(prediction.shape)
        box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
        box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
        box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
        box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
        prediction[:, :, :4] = box_corner[:, :, :4]
        # 只解决单张图
        image_pred = prediction[0]
        # class_conf.shape=[8400,1], 求每个 anchor 在 80 个类别中的最高分数。class_pred.shape=[8400,1], 每个 anchor 的 label index。class_conf, class_pred = torch.max(image_pred[:, 5: 5 + self.num_classes], 1, keepdim=True)
        conf_score = image_pred[:, 4].unsqueeze(dim=1) * class_conf
        conf_mask = (conf_score >= self.conf_threshold).squeeze()
        # detections.shape=[8400,6]，别离是 x0 ,y0, x1, y1, obj_score*class_score, class_label
        detections = torch.cat((image_pred[:, :4], conf_score, class_pred.float()), 1)
        # 将 obj_score*class_score > conf_thre 筛选进去
        detections = detections[conf_mask]
        # 通过阈值筛选后，如果没有残余指标则完结
        if not detections.size(0):
            return None
        # NMS
        nms_out_index = torchvision.ops.batched_nms(detections[:, :4], detections[:, 4], detections[:, 5],
                                                    self.nms_threshold)
        detections = detections[nms_out_index]
        # 把坐标映射回原图
        detections = detections.data.cpu().numpy()
        bboxes = (detections[:, :4] / r).astype(np.int64)
        scores = detections[:, 4]
        labels = detections[:, 5].astype(np.int64)
        return bboxes, scores, labels


if __name__ == '__main__':

    model_detect = ModelPipline(torch.device('cuda'), 'yolox_s')
    label_names = model_detect.label_names

    # detect image 2
    image = cv2.imread('./images/2.jpg')
    result = model_detect.predict(image)
    write_output(result, image, './demos/2.jpg')

    # detect image 3
    image = cv2.imread('./images/3.jpg')
    result = model_detect.predict(image)
    write_output(result, image, './demos/3.jpg')

    # get time lapse of 100 iteration
    run_evl(100, 'yolox_s', torch.device('cpu'), './images/1.jpg')
    run_evl(100, 'yolox_tiny', torch.device('cuda'), './images/1.jpg')
    run_evl(100, 'yolox_s', torch.device('cuda'), './images/1.jpg')
    run_evl(100, 'yolox_m', torch.device('cuda'), './images/1.jpg')
    run_evl(100, 'yolox_l', torch.device('cuda'), './images/1.jpg')
    run_evl(100, 'yolox_x', torch.device('cuda'), './images/1.jpg')

提交 2 张图的检测成果，并注明输出尺寸和两个阈值。
图一

图二

CPU 推理和 CUDA 推理，各自的工夫开销。
运行后果：
cpu run model yolox_s 100 time lapse: 13.2529 seconds.
cuda run model yolox_s 100 time lapse: 3.6188 seconds.
GPU 显著比 CPU 运算快。
不同 Backbone 各自的工夫开销。
运行后果：
cuda run model yolox_tiny 100 time lapse: 2.5452 seconds.
cuda run model yolox_s 100 time lapse: 3.6188 seconds.
cuda run model yolox_m 100 time lapse: 4.0251 seconds.
cuda run model yolox_l 100 time lapse: 4.7148 seconds.
cuda run model yolox_x 100 time lapse: 6.5571 seconds.
模型越大工夫开销越大。

这次课程对你的帮忙有哪些？技术上、学习办法上、思路上等都行，随便施展。
很久没写过作业了，督促了我写点货色，相熟一下之前不器重的细节。推理通用范式有实用价值，后续能够用在试验代码里。
对课程的优化改良有什么倡议？内容上、流程上、模式上等都行，随便施展。
本课程给我的感觉是老师们很尽心，内容丰盛，解说分明，有播种。但限于文字的模式，看得出来一些常识老师想讲但没法讲。心愿当前能录视频对着画板写写画画的讲，这样老师讲的分明，学生学得明确。另外心愿授课内容和代码能更贴近生产一线，多一些业界实际，能带入一些偏日常工作可能遇到的问题去思考和解决是最好的了。
最好能够附上本人所处的阶段，比方：学生，接触 AI 一年；退职，3 年 AI 我的项目教训等。
自己研一，接触深度学习不到一年，课题是机器视觉相干。

关于pytorch:PyTorch模型推理及多任务通用范式-课程5-作业

Course

Assignment

必做题

思考题

Solutions

Code

必做题

思考题

学习心得