Course
PyTorch模型推理及多任务通用范式 课程5:
- 给大家安利了大白老师之前的好文:对于Yolo系列算法的理论知识解读和实战训练代码。
- 依据 pytorch模型推理的三板斧:数据预处理、数据进网络、数据后处理,逐行实现了Yolox的推理代码。
Assignment
必做题
- 本人找2张其余图,用Yolox_s进行指标检测,并注明输出尺寸和两个阈值。
思考题
- Yolox_s:用time模块和for循环,对”./images/1.jpg”间断推理100次,统计工夫开销。有CUDA的同学,改下代码:self.device=torch.device('cuda'),统计工夫开销。
- 有CUDA的同学,别离用 Yolox_tiny、Yolox_s、Yolox_m、Yolox_l、Yolox_x 对”./images/1.jpg”间断推理100次,统计工夫开销。
Solutions
Code
import torchimport torch.nn as nnimport torchvisionimport numpy as npimport cv2from models_yolox.visualize import visfrom models_yolox.yolo_head import YOLOXHeadfrom models_yolox.yolo_pafpn import YOLOPAFPNimport time# write output imagedef write_output(result, image, output): if result is not None: bboxes, scores, labels = result image = vis(image, bboxes, scores, labels, label_names) text1 = 'image size : {}*{}'.format(model_detect.w, model_detect.h) text2 = 'conf threshold : {}'.format(model_detect.conf_threshold) text3 = 'nms threshold : {}'.format(model_detect.nms_threshold) font = cv2.FONT_HERSHEY_SIMPLEX cv2.putText(image, text1, (10, 20), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA) cv2.putText(image, text2, (10, 50), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA) cv2.putText(image, text3, (10, 80), font, 0.7, (0, 255, 0), 1, cv2.LINE_AA) cv2.imwrite(output, image)# run a model multiple timesdef run_evl(times, model, device, image): image = cv2.imread(image) # CPU run 100 times model_detect = ModelPipline(device=device, model_name=model) t_all=0 for i in range(times): t_start = time.time() model_detect.predict(image) t_end = time.time() t_all += t_end - t_start print('{} run model {} {} time lapse: {:.4f} seconds.'.format(device, model, times, t_all))class YOLOX(nn.Module): def __init__(self, num_classes, depth=0.33, width=0.5, in_channels=[256, 512, 1024]): super().__init__() #yolox_s by default self.backbone = YOLOPAFPN(depth=depth, width=width, in_channels=in_channels) self.head = YOLOXHead(num_classes=num_classes, width=width, in_channels=in_channels) def forward(self, x): fpn_outs = self.backbone(x) outputs = self.head(fpn_outs) return outputsclass ModelPipline(object): def __init__(self, device=torch.device('cuda'), model_name='yolox_s'): # 进入模型的图片大小:为数据预处理和后处理做筹备 self.inputs_size = (640, 640) # (h,w) # CPU or CUDA:为数据预处理和模型加载做筹备 self.device = device # 后处理的阈值 self.conf_threshold = 0.5 self.nms_threshold = 0.4 # 标签载入 label_names = open('./labels/coco_label.txt', 'r').readlines() self.label_names = [line.strip('\n') for line in label_names] # image size self.w = 0 self.h = 0 # for model selection self.model_info = { 'yolox_tiny': [0.33, 0.375, './weights/yolox_tiny_coco.pth.tar'], 'yolox_s': [0.33, 0.5, './weights/yolox_s_coco.pth.tar'], 'yolox_m': [0.67, 0.75, './weights/yolox_m_coco.pth.tar'], 'yolox_l': [1.0, 1.0, './weights/yolox_l_coco.pth.tar'], 'yolox_x': [1.33, 1.25, './weights/yolox_x_coco.pth.tar'], } # 载入模型构造和模型权重 self.num_classes = 80 self.model = self.get_model(model_name) def predict(self, image): # 数据预处理 inputs, r = self.preprocess(image) # 数据进网络 outputs = self.model(inputs) # 数据后处理 results = self.postprocess(outputs, r) return results def get_model(self, model_name): model_info = self.model_info[model_name] depth = model_info[0] width = model_info[1] path = model_info[2] if model_name == 'yolox_tiny': self.inputs_size = (416, 416) model = YOLOX(self.num_classes, depth, width) pretrained_state_dict = torch.load(path, map_location=lambda storage, loc: storage)["model"] model.load_state_dict(pretrained_state_dict, strict=True) model.to(self.device) model.eval() return model def preprocess(self, image): # 原图尺寸 h, w = image.shape[:2] # to print image size self.h = h self.w = w # 生成一张 w=h=640的mask,数值全是114 padded_img = np.ones((self.inputs_size[0], self.inputs_size[1], 3)) * 114.0 # 计算原图的长边缩放到640所须要的比例 r = min(self.inputs_size[0] / h, self.inputs_size[1] / w) # 对原图做等比例缩放,使得长边=640 resized_img = cv2.resize(image, (int(w * r), int(h * r)), interpolation=cv2.INTER_LINEAR).astype(np.float32) # 将缩放后的原图填充到 640×640的mask的左上方 padded_img[: int(h * r), : int(w * r)] = resized_img # BGR——>RGB padded_img = padded_img[:, :, ::-1] # 归一化和标准化,和训练时保持一致 inputs = padded_img / 255 inputs = (inputs - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225]) ##以下是图像工作的通用解决 # (H,W,C) ——> (C,H,W) inputs = inputs.transpose(2, 0, 1) # (C,H,W) ——> (1,C,H,W) inputs = inputs[np.newaxis, :, :, :] # NumpyArray ——> Tensor inputs = torch.from_numpy(inputs) # dtype float32 inputs = inputs.type(torch.float32) # 与self.model放在雷同硬件上 inputs = inputs.to(self.device) return inputs, r def postprocess(self, prediction, r): # prediction.shape=[1,8400,85],上面先将85中的前4列进行转换,从 xc,yc,w,h 变为 x0,y0,x1,y1 box_corner = prediction.new(prediction.shape) box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2 box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2 box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2 box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2 prediction[:, :, :4] = box_corner[:, :, :4] # 只解决单张图 image_pred = prediction[0] # class_conf.shape=[8400,1],求每个anchor在80个类别中的最高分数。class_pred.shape=[8400,1],每个anchor的label index。 class_conf, class_pred = torch.max(image_pred[:, 5: 5 + self.num_classes], 1, keepdim=True) conf_score = image_pred[:, 4].unsqueeze(dim=1) * class_conf conf_mask = (conf_score >= self.conf_threshold).squeeze() # detections.shape=[8400,6],别离是 x0 ,y0, x1, y1, obj_score*class_score, class_label detections = torch.cat((image_pred[:, :4], conf_score, class_pred.float()), 1) # 将obj_score*class_score > conf_thre 筛选进去 detections = detections[conf_mask] # 通过阈值筛选后,如果没有残余指标则完结 if not detections.size(0): return None # NMS nms_out_index = torchvision.ops.batched_nms(detections[:, :4], detections[:, 4], detections[:, 5], self.nms_threshold) detections = detections[nms_out_index] # 把坐标映射回原图 detections = detections.data.cpu().numpy() bboxes = (detections[:, :4] / r).astype(np.int64) scores = detections[:, 4] labels = detections[:, 5].astype(np.int64) return bboxes, scores, labelsif __name__ == '__main__': model_detect = ModelPipline(torch.device('cuda'), 'yolox_s') label_names = model_detect.label_names # detect image 2 image = cv2.imread('./images/2.jpg') result = model_detect.predict(image) write_output(result, image, './demos/2.jpg') # detect image 3 image = cv2.imread('./images/3.jpg') result = model_detect.predict(image) write_output(result, image, './demos/3.jpg') # get time lapse of 100 iteration run_evl(100, 'yolox_s', torch.device('cpu'), './images/1.jpg') run_evl(100, 'yolox_tiny', torch.device('cuda'), './images/1.jpg') run_evl(100, 'yolox_s', torch.device('cuda'), './images/1.jpg') run_evl(100, 'yolox_m', torch.device('cuda'), './images/1.jpg') run_evl(100, 'yolox_l', torch.device('cuda'), './images/1.jpg') run_evl(100, 'yolox_x', torch.device('cuda'), './images/1.jpg')
必做题
- 提交2张图的检测成果,并注明输出尺寸和两个阈值。
图一
图二
思考题
- CPU推理和CUDA推理,各自的工夫开销。
运行后果:
cpu run model yolox_s 100 time lapse: 13.2529 seconds.
cuda run model yolox_s 100 time lapse: 3.6188 seconds.
GPU显著比CPU运算快。 - 不同Backbone各自的工夫开销。
运行后果:
cuda run model yolox_tiny 100 time lapse: 2.5452 seconds.
cuda run model yolox_s 100 time lapse: 3.6188 seconds.
cuda run model yolox_m 100 time lapse: 4.0251 seconds.
cuda run model yolox_l 100 time lapse: 4.7148 seconds.
cuda run model yolox_x 100 time lapse: 6.5571 seconds.
模型越大工夫开销越大。
学习心得
- 这次课程对你的帮忙有哪些?技术上、学习办法上、思路上等都行,随便施展。
很久没写过作业了,督促了我写点货色,相熟一下之前不器重的细节。推理通用范式有实用价值,后续能够用在试验代码里。 - 对课程的优化改良有什么倡议?内容上、流程上、模式上等都行,随便施展。
本课程给我的感觉是老师们很尽心,内容丰盛,解说分明,有播种。但限于文字的模式,看得出来一些常识老师想讲但没法讲。心愿当前能录视频对着画板写写画画的讲,这样老师讲的分明,学生学得明确。另外心愿授课内容和代码能更贴近生产一线,多一些业界实际,能带入一些偏日常工作可能遇到的问题去思考和解决是最好的了。 - 最好能够附上本人所处的阶段,比方:学生,接触AI一年; 退职,3年AI我的项目教训 等。
自己研一,接触深度学习不到一年,课题是机器视觉相干。