import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import time
from PIL import Image
import cv2,os
import torchvision
import numpy as np
from scipy.special import softmax
# Modify get_img_np_nchw and postprocess_the_outputs below as needed for your own model.
TRT_LOGGER = trt.Logger()
def get_img_np_nchw(img_path):
    img = Image.open(img_path).convert('L')  # load as single-channel grayscale
    img = np.asarray(img, dtype='float32')
    img = cv2.resize(np.array(img), (224, 224), interpolation=cv2.INTER_CUBIC)
    img = img / 255.
    img = img[np.newaxis, np.newaxis]        # add batch and channel dims -> NCHW
    return img
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """host_mem is the pinned host (CPU) buffer, device_mem is the GPU buffer."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """
    params max_batch_size: maximum batch size, fixed in advance so device memory can be allocated
    params onnx_file_path: path to the ONNX file
    params engine_file_path: path where the serialized engine will be saved
    params fp16_mode: whether to build in FP16
    params int8_mode: whether to build in INT8
    params save_engine: whether to save the engine to disk
    returns: ICudaEngine
    """
    # If a serialized engine already exists, deserialize it directly into an ICudaEngine
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    else:  # build the ICudaEngine from the ONNX file
        # Create a builder from the logger; the builder creates the computation graph (INetworkDefinition)
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that your
        # network definition must be created with the explicitBatch flag set.
        # For more information, see Working With Dynamic Shapes.
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                builder.create_builder_config() as config:
            # Bind the ONNX parser to the network; parsing will populate the graph
            profile = builder.create_optimization_profile()
            profile.set_shape("inputs", (1, 1, 224, 224), (1, 1, 224, 224), (1, 1, 224, 224))
            config.add_optimization_profile(profile)
            config.max_workspace_size = 1 << 30  # workspace reserved up front: the maximum GPU scratch space the ICudaEngine may use at runtime
            builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
            builder.fp16_mode = fp16_mode
            builder.int8_mode = int8_mode
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse the ONNX file and populate the network
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('loading onnx file from path {} ...'.format(onnx_file_path))
            # with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
            #     print("Beginning onnx file parsing")
            #     parser.parse(model.read())  # parse the ONNX file
            parser.parse_from_file(onnx_file_path)  # the parser can also read the ONNX file directly from disk
            print("Completed parsing of onnx file")
            # Once the network is populated, use the builder to create the ICudaEngine from it
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            #################
            # import pdb; pdb.set_trace()
            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
            engine = builder.build_engine(network, config)  # note: network here is the populated INetworkDefinition
            print("Completed creating Engine")
            if save_engine:  # save the engine so it can be deserialized directly later
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize
            return engine
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def postprocess_the_outputs(outputs, shape_of_output):
    outputs = outputs.reshape(*shape_of_output)
    out = np.argmax(softmax(outputs, axis=1)[0, ...], axis=0)
    # import pdb; pdb.set_trace()
    return out
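The builder code above assumes an ONNX model already exists at the path passed in (./Net.onnx in the verification script below). As a rough sketch, not part of the original article, such a file could be exported from PyTorch as follows; the Conv2d layer is only a hypothetical stand-in for the real network, which must likewise map a (1, 1, 224, 224) input to a (1, 2, 224, 224) output, and the input name must match the one given to profile.set_shape().

import torch
import torch.nn as nn

# Hypothetical stand-in for the real model: maps (1, 1, 224, 224) -> (1, 2, 224, 224)
net = nn.Conv2d(1, 2, kernel_size=3, padding=1)
net.eval()
dummy_input = torch.randn(1, 1, 224, 224)
torch.onnx.export(
    net, dummy_input, './Net.onnx',
    input_names=['inputs'],    # must match the binding name used in profile.set_shape("inputs", ...)
    output_names=['outputs'],
    opset_version=11,
)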
Verify that the TensorRT model is correct
onnx_model_path = './Net.onnx'
max_batch_size = 1
# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode, save_engine=True)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings
# Do inference
img_path = './test.jpg'  # placeholder path to a test image; point this at a real file
img_np_nchw = get_img_np_nchw(img_path)
inputs[0].host = img_np_nchw.reshape(-1)
shape_of_output = (max_batch_size, 2, 224, 224)
# inputs[1].host = … for multiple inputs
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2 - t1))
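To check that the engine really reproduces the original network, the same input can be pushed through the PyTorch model and the two results compared. A minimal sketch, assuming the original PyTorch model is available as net (for example the hypothetical placeholder exported above):

net = net.cuda().eval()
input_tensor = torch.from_numpy(img_np_nchw).cuda()
with torch.no_grad():
    t3 = time.time()
    torch_outputs = net(input_tensor)
    t4 = time.time()
feat_torch = postprocess_the_outputs(torch_outputs.cpu().numpy(), shape_of_output)
print("Inference time with PyTorch: {}".format(t4 - t3))
print("Max difference between TensorRT and PyTorch: {}".format(np.max(np.abs(feat - feat_torch))))

If the two argmax maps match (or differ only marginally when FP16 is enabled), the converted engine can be considered correct.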