import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import time
from PIL import Image
import cv2,os
import torchvision
import numpy as np
from scipy.special import softmax
# Modify get_img_np_nchw and postprocess_the_outputs below as needed for your own model.
TRT_LOGGER = trt.Logger()
def get_img_np_nchw(img_path):
    img = Image.open(img_path).convert('L')  # load as single-channel grayscale
    img = np.asarray(img, dtype='float32')
    img = cv2.resize(np.array(img), (224, 224), interpolation=cv2.INTER_CUBIC)
    img = img / 255.
    img = img[np.newaxis, np.newaxis]        # add batch and channel dims -> NCHW
    return img
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """host_mem is the pinned host (CPU) buffer, device_mem is the GPU buffer."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """
    params max_batch_size: maximum batch size, fixed in advance so device memory can be allocated
    params onnx_file_path: path to the ONNX file
    params engine_file_path: path where the serialized engine will be saved
    params fp16_mode: whether to build in FP16
    params int8_mode: whether to build in INT8
    params save_engine: whether to save the engine to disk
    returns: ICudaEngine
    """
    # If a serialized engine already exists, deserialize it directly into an ICudaEngine
    if os.path.exists(engine_file_path):
        print("Reading engine from file: {}".format(engine_file_path))
        with open(engine_file_path, 'rb') as f, \
                trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())  # deserialize
    else:  # build the ICudaEngine from the ONNX file
        # Create a builder from the logger; the builder creates the computation graph (INetworkDefinition)
        explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        # In TensorRT 7.0, the ONNX parser only supports full-dimensions mode, meaning that your
        # network definition must be created with the explicitBatch flag set.
        # For more information, see Working With Dynamic Shapes.
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(explicit_batch) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser, \
                builder.create_builder_config() as config:
            # Bind the ONNX parser to the network; parsing will populate the graph
            profile = builder.create_optimization_profile()
            profile.set_shape("inputs", (1, 1, 224, 224), (1, 1, 224, 224), (1, 1, 224, 224))
            config.add_optimization_profile(profile)
            config.max_workspace_size = 1 << 30  # workspace reserved up front: the maximum GPU scratch space the ICudaEngine may use at runtime
            builder.max_batch_size = max_batch_size  # maximum batch size usable at execution time
            builder.fp16_mode = fp16_mode
            builder.int8_mode = int8_mode
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse the ONNX file and populate the network
            if not os.path.exists(onnx_file_path):
                quit("ONNX file {} not found!".format(onnx_file_path))
            print('loading onnx file from path {} ...'.format(onnx_file_path))
            # with open(onnx_file_path, 'rb') as model:  # serialized network structure and weights
            #     print("Beginning onnx file parsing")
            #     parser.parse(model.read())  # parse the ONNX file
            parser.parse_from_file(onnx_file_path)  # the parser can also read the ONNX file directly from disk
            print("Completed parsing of onnx file")
            # Once the network is populated, use the builder to create the ICudaEngine from it
            print("Building an engine from file {}; this may take a while...".format(onnx_file_path))
            #################
            # import pdb; pdb.set_trace()
            print(network.get_layer(network.num_layers - 1).get_output(0).shape)
            # network.mark_output(network.get_layer(network.num_layers - 1).get_output(0))
            engine = builder.build_engine(network, config)  # note: network here is the populated INetworkDefinition
            print("Completed creating Engine")
            if save_engine:  # save the engine so it can be deserialized directly later
                with open(engine_file_path, 'wb') as f:
                    f.write(engine.serialize())  # serialize
            return engine
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
def postprocess_the_outputs(outputs, shape_of_output):
    outputs = outputs.reshape(*shape_of_output)
    out = np.argmax(softmax(outputs, axis=1)[0, ...], axis=0)
    # import pdb; pdb.set_trace()
    return out
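The builder code above assumes an ONNX model already exists at the path passed in (./Net.onnx in the verification script below). As a rough sketch, not part of the original article, such a file could be exported from PyTorch as follows; the Conv2d layer is only a hypothetical stand-in for the real network, which must likewise map a (1, 1, 224, 224) input to a (1, 2, 224, 224) output, and the input name must match the one given to profile.set_shape().

import torch
import torch.nn as nn

# Hypothetical stand-in for the real model: maps (1, 1, 224, 224) -> (1, 2, 224, 224)
net = nn.Conv2d(1, 2, kernel_size=3, padding=1)
net.eval()
dummy_input = torch.randn(1, 1, 224, 224)
torch.onnx.export(
    net, dummy_input, './Net.onnx',
    input_names=['inputs'],    # must match the binding name used in profile.set_shape("inputs", ...)
    output_names=['outputs'],
    opset_version=11,
)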
Verify that the TensorRT model is correct
onnx_model_path = './Net.onnx'
max_batch_size = 1
# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = './model_fp16_{}_int8_{}.trt'.format(fp16_mode, int8_mode)
# Build an engine
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode, save_engine=True)
# Create the context for this engine
context = engine.create_execution_context()
# Allocate buffers for input and output
inputs, outputs, bindings, stream = allocate_buffers(engine)  # input, output: host # bindings
# Do inference
img_path = './test.jpg'  # placeholder path to a test image; point this at a real file
img_np_nchw = get_img_np_nchw(img_path)
inputs[0].host = img_np_nchw.reshape(-1)
shape_of_output = (max_batch_size, 2, 224, 224)
# inputs[1].host = … for multiple inputs
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream) # numpy data
t2 = time.time()
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2 - t1))
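To check that the engine really reproduces the original network, the same input can be pushed through the PyTorch model and the two results compared. A minimal sketch, assuming the original PyTorch model is available as net (for example the hypothetical placeholder exported above):

net = net.cuda().eval()
input_tensor = torch.from_numpy(img_np_nchw).cuda()
with torch.no_grad():
    t3 = time.time()
    torch_outputs = net(input_tensor)
    t4 = time.time()
feat_torch = postprocess_the_outputs(torch_outputs.cpu().numpy(), shape_of_output)
print("Inference time with PyTorch: {}".format(t4 - t3))
print("Max difference between TensorRT and PyTorch: {}".format(np.max(np.abs(feat - feat_torch))))

If the two argmax maps match (or differ only marginally when FP16 is enabled), the converted engine can be considered correct.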