Speeding up TensorFlow inference with TensorRT (Python and C++)
My environment setup can be found in my earlier blog posts.
With the preparation done, let's move on to the experiments.
Experiment 1: convert the TensorFlow pb model to a UFF model, then load the UFF model with TensorRT to predict an image
Using the pb model obtained earlier, convert it to a UFF model, build a TensorRT engine from it, and predict the cat image to see how long inference takes (as shown earlier, it took about 2.5 s without acceleration).
The pb-to-UFF conversion process is covered in my earlier blog post; a minimal sketch of the conversion call is also given below.
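For reference, the conversion boils down to a single call like this (a minimal sketch, assuming the legacy uff Python converter package that ships with TensorRT is installed; the file and node names follow the code further down):
# pb -> UFF conversion sketch (assumes TensorRT's legacy "uff" converter package).
import uff

uff.from_tensorflow_frozen_model(
    "weights.pb",                                   # frozen TensorFlow graph from the earlier post
    output_nodes=["resnet50/predictions/Softmax"],  # output node name of the frozen graph
    output_filename="weights.uff",                  # UFF file consumed by trt.UffParser below
)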
I modified that code so that it predicts a single image. The UFF model used for testing is the one obtained in the blog post above, and the image is the same cat picture used in the previous posts.
from random import randint

import tensorrt
from PIL import Image
import numpy as np
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import pycuda.driver as cuda
# This import causes pycuda to automatically manage CUDA context creation and cleanup.
import pycuda.autoinit
from tensorflow.keras.preprocessing import image
import tensorrt as trt

import sys, os
sys.path.insert(1, os.path.join(sys.path[0], ".."))
import common
import cv2
import time

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Frozen model layers:
# Input
# resnet50/conv1_pad/Pad/paddings
# resnet50/conv1_pad/Pad
# ...
# resnet50/predictions/BiasAdd
# resnet50/predictions/Softmax
# Identity
class ModelData(object):
    MODEL_FILE = "weights.uff"
    INPUT_NAME = "Input"
    INPUT_SHAPE = (224, 224, 3)
    OUTPUT_NAME = "resnet50/predictions/Softmax"


def build_engine():
    # For more information on TRT basics, refer to the introductory samples.
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, builder.create_builder_config() as config, trt.UffParser() as parser:
        config.max_workspace_size = common.GiB(1)
        # Parse the Uff Network
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE, tensorrt.UffInputOrder.NHWC)
        parser.register_output(ModelData.OUTPUT_NAME)
        parser.parse(ModelData.MODEL_FILE, network)
        engine = builder.build_engine(network, config)

        return engine


def main():
    engine = build_engine()
    # Build an engine, allocate buffers and create a stream.
    # For more information on buffer allocation, refer to the introductory samples.
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    with engine.create_execution_context() as context:
        # Load and preprocess the test image exactly as Keras ResNet50 expects.
        img = image.load_img('2008_002682.jpg', target_size=(224, 224))
        img = image.img_to_array(img)
        img = preprocess_input(img)
        print(img.shape)
        img = img[np.newaxis, :]
        inputs[0].host = img.ravel()
        print(inputs[0].host.shape)

        t_model = time.perf_counter()
        result = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
        print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

        output = np.array(result[1])
        output = output[np.newaxis, :]
        print(output.shape)
        print('Predicted:', decode_predictions(output, top=5)[0])


if __name__ == '__main__':
    main()
The prediction result is as follows:
Compared with predicting directly from the pb model (which takes about 2 s), the prediction result is identical, but the inference time drops to roughly 0.004 s here, a huge improvement.
The key statements are the NHWC input registration and the timed do_inference call, excerpted below:
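These lines are taken from the script above (not a standalone snippet; parser, ModelData, inputs and context are defined there):
# Excerpt from the script above: the UFF graph keeps TensorFlow's NHWC layout,
# so the input is registered as NHWC, and the preprocessed image is flattened
# into the pinned host buffer before the timed inference call.
parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE, tensorrt.UffInputOrder.NHWC)

inputs[0].host = img.ravel()
result = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)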
Experiment 2: convert the TensorFlow pb model to an ONNX model, then load the ONNX model with TensorRT to predict the image
Still using the pb model generated in the post mentioned earlier for this experiment.
Install the following package:
pip install -U tf2onnx
Then the conversion can be done conveniently with one command (the name of the converted ONNX file is your choice; I use weights.onnx here to match weights.pb):
python -m tf2onnx.convert --graphdef weights.pb --output weights.onnx --inputs Input:0 --outputs resnet50/predictions/Softmax:0
The input and output node names are required (the earlier post printed every layer's name, so I already had them; a small sketch for listing them is shown below).
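If you need to recover the node names yourself, a minimal sketch such as the following prints them (assuming TensorFlow 2.x and the frozen graph weights.pb):
# Print every node name in the frozen graph so the --inputs/--outputs
# arguments for tf2onnx can be filled in.
import tensorflow as tf

with tf.io.gfile.GFile("weights.pb", "rb") as f:
    graph_def = tf.compat.v1.GraphDef()
    graph_def.ParseFromString(f.read())

for node in graph_def.node:
    print(node.name)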
Next, use TensorRT to load this ONNX model and convert it to an engine before predicting the image; you can refer to my earlier blog post for this step. With trtexec the conversion looks like this (a Python sketch of the equivalent build step follows the command):
./trtexec --onnx=/home/sxhlvye/Trial1/ --saveEngine=/home/sxhlvye/Trial1/
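The same ONNX-to-engine step can also be done from Python instead of trtexec. Below is a minimal sketch, assuming the TensorRT 7.x/8.0 Python API (where build_engine and max_workspace_size still exist) and placeholder file names weights.onnx / weights.trt:
# Build and serialize a TensorRT engine from an ONNX file (sketch; file names are placeholders).
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

def build_engine_from_onnx(onnx_path, engine_path):
    with trt.Builder(TRT_LOGGER) as builder, \
         builder.create_network(EXPLICIT_BATCH) as network, \
         builder.create_builder_config() as config, \
         trt.OnnxParser(network, TRT_LOGGER) as parser:
        config.max_workspace_size = 1 << 30  # 1 GiB
        with open(onnx_path, "rb") as f:
            if not parser.parse(f.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                return None
        engine = builder.build_engine(network, config)
        # Serialize the engine so it can be loaded later, like the trtexec --saveEngine output.
        with open(engine_path, "wb") as f:
            f.write(engine.serialize())
        return engine

# build_engine_from_onnx("weights.onnx", "weights.trt")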
Once that is done, run the following code (unlike the earlier blog post, I made a few small changes here):
import sys
import cv2
from PIL import Image
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications import resnet50
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
import tensorflow as tf
import time
import numpy as np
# This import causes pycuda to automatically manage CUDA context creation and cleanup.
import pycuda.autoinit
import tensorrt as trt
import common
import pycuda.driver as cuda
import matplotlib.pyplot as plt

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

filename = "2008_002682.jpg"
engine_file_path = ""  # path to the engine serialized by trtexec above


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU memory and device_mem means the GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))

    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]

    # Run inference.
    t_model = time.perf_counter()
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    print(f'only one line cost:{time.perf_counter() - t_model:.8f}s')

    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]

    # Synchronize the stream
    stream.synchronize()

    # Return only the host outputs.
    return [out.host for out in outputs]


def main():
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    # create the context for this engine
    context = engine.create_execution_context()
    # allocate buffers for input and output (host/device pairs plus the binding pointers)
    inputs, outputs, bindings, stream = allocate_buffers(engine)
    # read an image
    img = image.load_img('2008_002682.jpg', target_size=(224, 224))
    img = image.img_to_array(img)
    img = preprocess_input(img)
    print(img.shape)
    img = img[np.newaxis, :]

    # Load data to the buffer
    inputs[0].host = img.ravel()
    print(inputs[0].host.shape)

    # Do inference
    t_model = time.perf_counter()
    result = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
    print(f'do inference cost:{time.perf_counter() - t_model:.8f}s')

    output = np.array(result[0])
    output = output[np.newaxis, :]
    print(output.shape)
    print('Predicted:', decode_predictions(output, top=5)[0])


if __name__ == '__main__':
    main()
The execution output is as follows:
/home/sxhlvye/anaconda3/bin/python /home/sxhlvye/Trial1/Tensorrt/test_onnx1.py
2022-03-20 18:13:45.682920: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
Reading engine from file
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.0 but loaded cuDNN 8.0.5
[TensorRT] WARNING: TensorRT was linked against cuBLAS/cuBLAS LT 11.2.0 but loaded cuBLAS/cuBLAS LT 11.1.0
[TensorRT] WARNING: TensorRT was linked against cuDNN 8.2.0 but loaded cuDNN 8.0.5
(224, 224, 3)
(150528,)
only one line cost:0.33635211s
do inference cost:0.33758112s
(1, 1000)
Predicted: [('n02123597', 'Siamese_cat', 0.16550788), ('n02108915', 'French_bulldog', 0.14138032), ('n04409515', 'tennis_ball', 0.08570899), ('n02095314'

Process finished with exit code 0
Compare this with the result of predicting the image directly from the pb model:
The results are identical, and with TensorRT acceleration the inference time comes down to about 0.3376 s.
Experiment 3: after converting the TensorFlow pb model to an ONNX model (the conversion was described above), deploy it with C++
For this you can refer to my earlier blog post, which demonstrates exactly how to deploy an ONNX model once you have it, so I won't repeat it here.
Experiment 4: after converting the TensorFlow pb model to a UFF model (the conversion was described above), deploy it with C++
This is much the same as the blog post above; here I referred to the sample that ships with TensorRT.
I only made small modifications to get the whole pipeline running, and the image normalization is kept very simple. The full code is below (see the blog post mentioned above for the C++ environment setup).
1#include "BatchStream.h"
2#include "EntropyCalibrator.h"
3#include "argsParr.h"