
Network Performance Analysis

By inspecting the per-layer performance measurements, you can identify the most time-consuming layers.

Implementation

Enable collection of performance counters on the specified device through configuration

from openvino.runtime import Core

core = Core()
core.set_property(device_name, {"PERF_COUNT": "YES"})
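
The same property can instead be passed as a config when compiling the model, which limits profiling to that one compiled model. A minimal sketch, assuming model and device_name are already defined (the exact config handling may vary slightly between OpenVINO releases):

# Enable per-layer performance counters only for this compiled model
compiled_model = core.compile_model(model, device_name, {"PERF_COUNT": "YES"})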

Retrieve the profiling data from the inference request

request = compiled_model.create_infer_request()
results = request.infer({0: input_tensor})
prof_info = request.get_profiling_info()
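
Each entry returned by get_profiling_info() exposes node_name, node_type, exec_type, real_time, cpu_time and status. A quick way to surface the most time-consuming layers is to sort by real_time; a minimal sketch (the top-5 cut-off is an arbitrary choice):

# Sort the per-layer measurements by execution time, longest first
slowest = sorted(prof_info, key=lambda node: node.real_time, reverse=True)
for node in slowest[:5]:
    print(f"{node.node_name}  {node.node_type}  "
          f"{node.real_time.total_seconds():.6f} s  ({node.exec_type})")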

Visualize the profiling data

def print_infer_request_profiling_info(prof_info):
    # Derive column widths from the longest value in each text column
    name_width = max(len(node.node_name) for node in prof_info)
    type_width = max(len(node.node_type) for node in prof_info)
    exec_width = max(len(node.exec_type) for node in prof_info)
    index_width = 6
    time_width = 10

    headers = ['Index', 'Node Name', 'Node Type', 'Exec Type', 'Real Time', 'CPU Time', 'Status']
    print(f"{headers[0]:<{index_width}}"
          f"{headers[1]:<{name_width + 1}}"
          f"{headers[2]:<{type_width + 1}}"
          f"{headers[3]:<{exec_width + 1}}"
          f"{headers[4]:<{time_width}}"
          f"{headers[5]:<{time_width}}"
          f"{headers[6]}")

    print('-' * (name_width + type_width + exec_width + 2 * time_width + 22))

    for i, node in enumerate(prof_info):
        print(f"{i:<{index_width}}"
              f"{node.node_name:<{name_width + 1}}"
              f"{node.node_type:<{type_width + 1}}"
              f"{node.exec_type:<{exec_width + 1}}"
              f"{node.real_time.total_seconds(): 2.6f} "
              f"{node.cpu_time.total_seconds(): 2.6f} "
              f"{node.status}")


log.info(f'Latency {request.latency}')
print_infer_request_profiling_info(prof_info)
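
Beyond the per-layer table, it can help to aggregate the measurements, for example the total execution time per node type. summarize_profiling_info below is a hypothetical helper sketched on top of the same prof_info list, not part of the OpenVINO sample:

from collections import defaultdict


def summarize_profiling_info(prof_info):
    # Sum the measured real_time per node type (Convolution, Reorder, ...)
    totals = defaultdict(float)
    for node in prof_info:
        totals[node.node_type] += node.real_time.total_seconds()
    for node_type, seconds in sorted(totals.items(), key=lambda kv: kv[1], reverse=True):
        print(f'{node_type:<16}{seconds:.6f} s')
    print(f'Total: {sum(totals.values()):.6f} s')


summarize_profiling_info(prof_info)

Note that total_seconds() returns seconds while request.latency is reported in milliseconds; any gap between the summed layer times and the latency is mostly runtime overhead outside the profiled nodes.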

Applying It to the Object Detection Program

Program Code

import logging as log
import os
import sys

import cv2
import numpy as np

from openvino.preprocess import PrePostProcessor, ResizeAlgorithm
from openvino.runtime import Core, Layout, Type


def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)

    # Parsing and validation of input arguments
    if len(sys.argv) != 4:
        log.info(f'Usage: {sys.argv[0]} <path_to_model> <path_to_image> <device_name>')
        return 1

    model_path = sys.argv[1]
    image_path = sys.argv[2]
    device_name = sys.argv[3]

    log.info('1. Creating OpenVINO Runtime Core')
    core = Core()
    core.set_property(device_name, {"PERF_COUNT": "YES"})

    log.info(f'2. Reading the model: {model_path}')
    # (.xml and .bin files) or (.onnx file)
    model = core.read_model(model_path)

    if len(model.inputs) != 1:
        log.error('Sample supports only single input topologies')
        return -1

    if len(model.outputs) != 1:
        log.error('Sample supports only single output topologies')
        return -1

    log.info('3. Set up input')
    # Read input image
    image = cv2.imread(image_path)
    # Add N dimension
    input_tensor = np.expand_dims(image, 0)

    log.info('4. Apply preprocessing')
    ppp = PrePostProcessor(model)
    _, h, w, _ = input_tensor.shape

    # 1) Set input tensor information:
    # - input() provides information about a single model input
    # - precision of tensor is supposed to be 'u8'
    # - layout of data is 'NHWC'
    ppp.input().tensor() \
        .set_element_type(Type.u8) \
        .set_layout(Layout('NHWC')) \
        .set_spatial_static_shape(h, w)  # noqa: ECE001, N400

    # 2) Adding explicit preprocessing steps:
    # - apply linear resize from tensor spatial dims to model spatial dims
    ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR)

    # 3) Here we suppose model has 'NCHW' layout for input
    ppp.input().model().set_layout(Layout('NCHW'))

    # 4) Set output tensor information:
    # - precision of tensor is supposed to be 'f32'
    ppp.output().tensor().set_element_type(Type.f32)

    # 5) Apply preprocessing modifying the original 'model'
    model = ppp.build()

    log.info('5. Loading the model to the plugin')
    compiled_model = core.compile_model(model, device_name)

    log.info('6. Starting inference in synchronous mode')
    # results = compiled_model.infer_new_request({0: input_tensor})
    request = compiled_model.create_infer_request()
    results = request.infer({0: input_tensor})

    log.info('7. Process output')
    predictions = next(iter(results.values()))

    # Change a shape of a numpy.ndarray with results ([1, 1, N, 7]) to get another one ([N, 7]),
    # where N is the number of detected bounding boxes
    detections = predictions.reshape(-1, 7)

    for detection in detections:
        confidence = detection[2]

        if confidence > 0.5:
            class_id = int(detection[1])

            xmin = int(detection[3] * w)
            ymin = int(detection[4] * h)
            xmax = int(detection[5] * w)
            ymax = int(detection[6] * h)

            log.info(f'Found: class_id = {class_id}, confidence = {confidence:.2f}, ' f'coords = ({xmin}, {ymin}), ({xmax}, {ymax})')

            # Draw a bounding box on an output image
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

    cv2.imwrite('out.bmp', image)

    if os.path.exists('out.bmp'):
        log.info('Image out.bmp was created!')
    else:
        log.error('Image out.bmp was not created. Check your permissions.')

    log.info(f'Latency {request.latency}')

    print_infer_request_profiling_info(request.get_profiling_info())

    return 0


def print_infer_request_profiling_info(prof_info):
    # Derive column widths from the longest value in each text column
    name_width = max(len(node.node_name) for node in prof_info)
    type_width = max(len(node.node_type) for node in prof_info)
    exec_width = max(len(node.exec_type) for node in prof_info)
    index_width = 6
    time_width = 10

    headers = ['Index', 'Node Name', 'Node Type', 'Exec Type', 'Real Time', 'CPU Time', 'Status']
    print(f"{headers[0]:<{index_width}}"
          f"{headers[1]:<{name_width + 1}}"
          f"{headers[2]:<{type_width + 1}}"
          f"{headers[3]:<{exec_width + 1}}"
          f"{headers[4]:<{time_width}}"
          f"{headers[5]:<{time_width}}"
          f"{headers[6]}")

    print('-' * (name_width + type_width + exec_width + 2 * time_width + 22))

    for i, node in enumerate(prof_info):
        print(f"{i:<{index_width}}"
              f"{node.node_name:<{name_width + 1}}"
              f"{node.node_type:<{type_width + 1}}"
              f"{node.exec_type:<{exec_width + 1}}"
              f"{node.real_time.total_seconds(): 2.6f} "
              f"{node.cpu_time.total_seconds(): 2.6f} "
              f"{node.status}")


if __name__ == '__main__':
    sys.exit(main())

Running the Program

$ python object_detection.py public/ssd300/FP32/ssd300.xml catdog.jpg CPU
[ INFO ] 1. Creating OpenVINO Runtime Core
[ INFO ] 2. Reading the model: public/ssd300/FP32/ssd300.xml
[ INFO ] 3. Set up input
[ INFO ] 4. Apply preprocessing
[ INFO ] 5. Loading the model to the plugin
[ INFO ] 6. Starting inference in synchronous mode
[ INFO ] 7. Process output
[ INFO ] Found: class_id = 8, confidence = 0.97, coords = (390, 112), (665, 473)
[ INFO ] Found: class_id = 12, confidence = 1.00, coords = (64, 57), (437, 499)
[ INFO ] Image out.bmp was created!
[ INFO ] Latency 134.02814899999998
Index Node Name                                                                    Node Type       Exec Type         Real Time CPU Time  Status
------------------------------------------------------------------------------------------------------------------------------------------------------
0     data                                                                         Parameter       unknown_I8         0.000000  0.000000 Status.NOT_RUN
1     data_abcd_acdb_fake                                                          Reorder         reorder_I8         0.000000  0.000000 Status.NOT_RUN
2     Interpolate_3017                                                             Interpolate     jit_avx2_I8        0.000414  0.000414 Status.EXECUTED
3     Convert_3022                                                                 Convert         unknown_I8         0.000023  0.000023 Status.EXECUTED
4     data/mean                                                                    Subgraph        jit_avx2_FP32      0.000051  0.000051 Status.EXECUTED
5     data/mean_acdb_abcd_conv1_1/WithoutBiases                                    Reorder         jit_uni_FP32       0.000098  0.000098 Status.EXECUTED
6     conv1_1/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.001759  0.001759 Status.EXECUTED
7     relu1_1                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
8     conv1_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.010845  0.010845 Status.EXECUTED
9     relu1_2                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
10    pool1                                                                        MaxPool         jit_avx2_FP32      0.001154  0.001154 Status.EXECUTED
11    conv2_1/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.004804  0.004804 Status.EXECUTED
12    relu2_1                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
13    conv2_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.009644  0.009644 Status.EXECUTED
14    relu2_2                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
15    pool2                                                                        MaxPool         jit_avx2_FP32      0.000551  0.000551 Status.EXECUTED
16    conv3_1/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.004833  0.004833 Status.EXECUTED
17    relu3_1                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
18    conv3_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.009765  0.009765 Status.EXECUTED
19    relu3_2                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
20    conv3_3/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.009831  0.009831 Status.EXECUTED
21    relu3_3                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
22    pool3                                                                        MaxPool         jit_avx2_FP32      0.000261  0.000261 Status.EXECUTED
23    conv4_1/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.005128  0.005128 Status.EXECUTED
24    relu4_1                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
25    conv4_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.010256  0.010256 Status.EXECUTED
26    relu4_2                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
27    conv4_3/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.022195  0.022195 Status.EXECUTED
28    relu4_3                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
29    pool4                                                                        MaxPool         jit_avx2_FP32      0.000151  0.000151 Status.EXECUTED
30    conv5_1/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.002811  0.002811 Status.EXECUTED
31    relu5_1                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
32    conv5_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.002784  0.002784 Status.EXECUTED
33    relu5_2                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
34    conv5_3/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.003487  0.003487 Status.EXECUTED
35    relu5_3                                                                      Relu            undef              0.000000  0.000000 Status.NOT_RUN
36    pool5                                                                        MaxPool         jit_avx2_FP32      0.000103  0.000103 Status.EXECUTED
37    pool5_aBcd8b_abcd_fc6/WithoutBiases                                          Reorder         jit_FP32           0.000040  0.000040 Status.EXECUTED
38    fc6/WithoutBiases                                                            Convolution     jit_gemm_FP32      0.022724  0.022724 Status.EXECUTED
39    relu6                                                                        Relu            undef              0.000000  0.000000 Status.NOT_RUN
40    fc6/WithoutBiases_abcd_aBcd8b_fc7/WithoutBiases                              Reorder         jit_FP32           0.000059  0.000059 Status.EXECUTED
41    fc7/WithoutBiases                                                            Convolution     jit_avx2_1x1_FP32  0.001290  0.001290 Status.EXECUTED
42    relu7                                                                        Relu            undef              0.000000  0.000000 Status.NOT_RUN
43    fc7_mbox_conf/WithoutBiases                                                  Convolution     jit_avx2_FP32      0.001455  0.001455 Status.EXECUTED
44    fc7_mbox_conf/WithoutBiases_aBcd8b_abcd_fc7_mbox_conf_perm                   Reorder         ref_any_FP32       0.000036  0.000036 Status.EXECUTED
45    fc7_mbox_conf_perm                                                           Transpose       unknown_FP32       0.000020  0.000020 Status.EXECUTED
46    fc7_mbox_conf_flat                                                           Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
47    fc7_mbox_conf_flat___mbox_conf                                               Reorder         ref_any_FP32       0.000015  0.000015 Status.EXECUTED
48    conv6_1/WithoutBiases                                                        Convolution     jit_avx2_1x1_FP32  0.000321  0.000321 Status.EXECUTED
49    conv6_1_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
50    conv6_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.000478  0.000478 Status.EXECUTED
51    conv6_2_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
52    conv6_2_mbox_conf/WithoutBiases                                              Convolution     jit_avx2_FP32      0.000245  0.000245 Status.EXECUTED
53    conv6_2_mbox_conf/WithoutBiases_aBcd8b_abcd_conv6_2_mbox_conf_perm           Reorder         ref_any_FP32       0.000012  0.000012 Status.EXECUTED
54    conv6_2_mbox_conf_perm                                                       Transpose       unknown_FP32       0.000007  0.000007 Status.EXECUTED
55    conv6_2_mbox_conf_flat                                                       Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
56    conv6_2_mbox_conf_flat___mbox_conf                                           Reorder         ref_any_FP32       0.000005  0.000005 Status.EXECUTED
57    conv7_1/WithoutBiases                                                        Convolution     jit_avx2_1x1_FP32  0.000035  0.000035 Status.EXECUTED
58    conv7_1_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
59    conv7_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.000066  0.000066 Status.EXECUTED
60    conv7_2_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
61    conv7_2_mbox_conf/WithoutBiases                                              Convolution     jit_avx2_FP32      0.000063  0.000063 Status.EXECUTED
62    conv7_2_mbox_conf/WithoutBiases_aBcd8b_abcd_conv7_2_mbox_conf_perm           Reorder         ref_any_FP32       0.000009  0.000009 Status.EXECUTED
63    conv7_2_mbox_conf_perm                                                       Transpose       unknown_FP32       0.000004  0.000004 Status.EXECUTED
64    conv7_2_mbox_conf_flat                                                       Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
65    conv7_2_mbox_conf_flat___mbox_conf                                           Reorder         ref_any_FP32       0.000005  0.000005 Status.EXECUTED
66    conv8_1/WithoutBiases                                                        Convolution     jit_avx2_1x1_FP32  0.000010  0.000010 Status.EXECUTED
67    conv8_1_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
68    conv8_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.000054  0.000054 Status.EXECUTED
69    conv8_2_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
70    conv8_2_mbox_conf/WithoutBiases                                              Convolution     jit_avx2_FP32      0.000038  0.000038 Status.EXECUTED
71    conv8_2_mbox_conf/WithoutBiases_aBcd8b_abcd_conv8_2_mbox_conf_perm           Reorder         ref_any_FP32       0.000008  0.000008 Status.EXECUTED
72    conv8_2_mbox_conf_perm                                                       Transpose       unknown_FP32       0.000003  0.000003 Status.EXECUTED
73    conv8_2_mbox_conf_flat                                                       Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
74    conv8_2_mbox_conf_flat___mbox_conf                                           Reorder         ref_any_FP32       0.000004  0.000004 Status.EXECUTED
75    conv9_1/WithoutBiases                                                        Convolution     jit_avx2_1x1_FP32  0.000011  0.000011 Status.EXECUTED
76    conv9_1_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
77    conv9_2/WithoutBiases                                                        Convolution     jit_avx2_FP32      0.000047  0.000047 Status.EXECUTED
78    conv9_2_relu                                                                 Relu            undef              0.000000  0.000000 Status.NOT_RUN
79    conv9_2_mbox_conf/WithoutBiases                                              Convolution     jit_avx2_FP32      0.000020  0.000020 Status.EXECUTED
80    conv9_2_mbox_conf/WithoutBiases_aBcd8b_abcd_conv9_2_mbox_conf_perm           Reorder         ref_any_FP32       0.000005  0.000005 Status.EXECUTED
81    conv9_2_mbox_conf_perm                                                       Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
82    conv9_2_mbox_conf_flat                                                       Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
83    conv9_2_mbox_conf_flat___mbox_conf                                           Reorder         ref_any_FP32       0.000004  0.000004 Status.EXECUTED
84    conv9_2_mbox_loc/WithoutBiases                                               Convolution     jit_avx2_FP32      0.000010  0.000010 Status.EXECUTED
85    conv9_2_mbox_loc/WithoutBiases_aBcd8b_abcd_conv9_2_mbox_loc_perm             Reorder         jit_uni_FP32       0.000004  0.000004 Status.EXECUTED
86    conv9_2_mbox_loc_perm                                                        Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
87    conv9_2_mbox_loc_flat                                                        Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
88    conv9_2_mbox_loc_flat___mbox_loc                                             Reorder         ref_any_FP32       0.000004  0.000004 Status.EXECUTED
89    conv8_2_mbox_loc/WithoutBiases                                               Convolution     jit_avx2_FP32      0.000011  0.000011 Status.EXECUTED
90    conv8_2_mbox_loc_perm                                                        Transpose       unknown_FP32       0.000003  0.000003 Status.EXECUTED
91    conv8_2_mbox_loc_flat                                                        Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
92    conv8_2_mbox_loc_flat___mbox_loc                                             Reorder         ref_any_FP32       0.000003  0.000003 Status.EXECUTED
93    conv7_2_mbox_loc/WithoutBiases                                               Convolution     jit_avx2_FP32      0.000017  0.000017 Status.EXECUTED
94    conv7_2_mbox_loc_perm                                                        Transpose       unknown_FP32       0.000003  0.000003 Status.EXECUTED
95    conv7_2_mbox_loc_flat                                                        Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
96    conv7_2_mbox_loc_flat___mbox_loc                                             Reorder         ref_any_FP32       0.000004  0.000004 Status.EXECUTED
97    conv6_2_mbox_loc/WithoutBiases                                               Convolution     jit_avx2_FP32      0.000063  0.000063 Status.EXECUTED
98    conv6_2_mbox_loc_perm                                                        Transpose       unknown_FP32       0.000004  0.000004 Status.EXECUTED
99    conv6_2_mbox_loc_flat                                                        Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
100   conv6_2_mbox_loc_flat___mbox_loc                                             Reorder         ref_any_FP32       0.000005  0.000005 Status.EXECUTED
101   fc7_mbox_loc/WithoutBiases                                                   Convolution     jit_avx2_FP32      0.000345  0.000345 Status.EXECUTED
102   fc7_mbox_loc_perm                                                            Transpose       unknown_FP32       0.000004  0.000004 Status.EXECUTED
103   fc7_mbox_loc_flat                                                            Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
104   fc7_mbox_loc_flat___mbox_loc                                                 Reorder         ref_any_FP32       0.000006  0.000006 Status.EXECUTED
105   6613                                                                         NormalizeL2     unknown_FP32       0.000244  0.000244 Status.EXECUTED
106   conv4_3_norm                                                                 Multiply        undef              0.000000  0.000000 Status.NOT_RUN
107   conv4_3_norm_mbox_conf/WithoutBiases                                         Convolution     jit_avx2_FP32      0.001962  0.001962 Status.EXECUTED
108   conv4_3_norm_mbox_conf/WithoutBiases_aBcd8b_abcd_conv4_3_norm_mbox_conf_perm Reorder         ref_any_FP32       0.000086  0.000086 Status.EXECUTED
109   conv4_3_norm_mbox_conf_perm                                                  Transpose       unknown_FP32       0.000021  0.000021 Status.EXECUTED
110   conv4_3_norm_mbox_conf_flat                                                  Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
111   conv4_3_norm_mbox_conf_flat___mbox_conf                                      Reorder         ref_any_FP32       0.000020  0.000020 Status.EXECUTED
112   mbox_conf                                                                    Concat          unknown_FP32       0.000000  0.000000 Status.NOT_RUN
113   mbox_conf_reshape                                                            Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
114   mbox_conf_softmax                                                            Softmax         jit_avx2_FP32      0.000070  0.000070 Status.EXECUTED
115   mbox_conf_flatten                                                            Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
116   conv4_3_norm_mbox_loc/WithoutBiases                                          Convolution     jit_avx2_FP32      0.000490  0.000490 Status.EXECUTED
117   conv4_3_norm_mbox_loc_perm                                                   Transpose       unknown_FP32       0.000006  0.000006 Status.EXECUTED
118   conv4_3_norm_mbox_loc_flat                                                   Reshape         unknown_FP32       0.000000  0.000000 Status.NOT_RUN
119   conv4_3_norm_mbox_loc_flat___mbox_loc                                        Reorder         ref_any_FP32       0.000007  0.000007 Status.EXECUTED
120   mbox_loc                                                                     Concat          unknown_FP32       0.000000  0.000000 Status.NOT_RUN
121   detection_out                                                                DetectionOutput ref_any_FP32       0.002029  0.002029 Status.EXECUTED
122   detection_out/sink_port_0                                                    Result          unknown_FP32       0.000000  0.000000 Status.NOT_RUN

Memory Analysis

memory-profiler

A tool that monitors the memory usage of each line of code.

Installation

pip install memory-profiler

Usage

from memory_profiler import profile

@profile
def func():
    pass
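
With profile imported and the decorator applied as above, running the script normally prints a line-by-line memory report when the decorated function returns, as the runs below show. memory-profiler can also be invoked as a module, in which case the @profile decorator works without the import (your_script.py is a placeholder):

$ python -m memory_profiler your_script.py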

Applying It to the Object Detection Program

Program Code

import logging as log
import os
import sys

import cv2
import numpy as np

from openvino.preprocess import PrePostProcessor, ResizeAlgorithm
from openvino.runtime import Core, Layout, Type

from memory_profiler import profile


@profile
def main():
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)

    # Parsing and validation of input arguments
    if len(sys.argv) != 4:
        log.info(f'Usage: {sys.argv[0]} <path_to_model> <path_to_image> <device_name>')
        return 1

    model_path = sys.argv[1]
    image_path = sys.argv[2]
    device_name = sys.argv[3]

    log.info('1. Creating OpenVINO Runtime Core')
    core = Core()
    core.set_property(device_name, {"PERF_COUNT": "YES"})

    log.info(f'2. Reading the model: {model_path}')
    # (.xml and .bin files) or (.onnx file)
    model = core.read_model(model_path)

    if len(model.inputs) != 1:
        log.error('Sample supports only single input topologies')
        return -1

    if len(model.outputs) != 1:
        log.error('Sample supports only single output topologies')
        return -1

    log.info('3. Set up input')
    # Read input image
    image = cv2.imread(image_path)
    # Add N dimension
    input_tensor = np.expand_dims(image, 0)

    log.info('4. Apply preprocessing')
    ppp = PrePostProcessor(model)
    _, h, w, _ = input_tensor.shape

    # 1) Set input tensor information:
    # - input() provides information about a single model input
    # - precision of tensor is supposed to be 'u8'
    # - layout of data is 'NHWC'
    ppp.input().tensor() \
        .set_element_type(Type.u8) \
        .set_layout(Layout('NHWC')) \
        .set_spatial_static_shape(h, w)  # noqa: ECE001, N400

    # 2) Adding explicit preprocessing steps:
    # - apply linear resize from tensor spatial dims to model spatial dims
    ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR)

    # 3) Here we suppose model has 'NCHW' layout for input
    ppp.input().model().set_layout(Layout('NCHW'))

    # 4) Set output tensor information:
    # - precision of tensor is supposed to be 'f32'
    ppp.output().tensor().set_element_type(Type.f32)

    # 5) Apply preprocessing modifying the original 'model'
    model = ppp.build()

    log.info('5. Loading the model to the plugin')
    compiled_model = core.compile_model(model, device_name)

    log.info('6. Starting inference in synchronous mode')
    # results = compiled_model.infer_new_request({0: input_tensor})
    request = compiled_model.create_infer_request()
    results = request.infer({0: input_tensor})

    log.info('7. Process output')
    predictions = next(iter(results.values()))

    # Change a shape of a numpy.ndarray with results ([1, 1, N, 7]) to get another one ([N, 7]),
    # where N is the number of detected bounding boxes
    detections = predictions.reshape(-1, 7)

    for detection in detections:
        confidence = detection[2]

        if confidence > 0.5:
            class_id = int(detection[1])

            xmin = int(detection[3] * w)
            ymin = int(detection[4] * h)
            xmax = int(detection[5] * w)
            ymax = int(detection[6] * h)

            log.info(f'Found: class_id = {class_id}, confidence = {confidence:.2f}, ' f'coords = ({xmin}, {ymin}), ({xmax}, {ymax})')

            # Draw a bounding box on an output image
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

    cv2.imwrite('out.bmp', image)

    if os.path.exists('out.bmp'):
        log.info('Image out.bmp was created!')
    else:
        log.error('Image out.bmp was not created. Check your permissions.')

    log.info(f'Latency {request.latency}')

    return 0


if __name__ == '__main__':
    sys.exit(main())

Running the Program

The tests below use both the FP16 and the FP32 versions of the model.

FP16 Model

$ ll -h public/ssd300/FP16/
total 51M
-rw-rw-r-- 1 wjunjian wjunjian  51M Apr 27 09:10 ssd300.bin
-rw-rw-r-- 1 wjunjian wjunjian  14K Apr 27 09:10 ssd300.mapping
-rw-rw-r-- 1 wjunjian wjunjian 217K Apr 27 09:10 ssd300.xml

FP32 Model

$ ll -h public/ssd300/FP32/
total 101M
-rw-rw-r-- 1 wjunjian wjunjian 101M Apr 27 09:10 ssd300.bin
-rw-rw-r-- 1 wjunjian wjunjian  14K Apr 27 09:10 ssd300.mapping
-rw-rw-r-- 1 wjunjian wjunjian 179K Apr 27 09:10 ssd300.xml

FP16 CPU

$ python object_detection.py public/ssd300/FP16/ssd300.xml catdog.jpg CPU
[ INFO ] 1. Creating OpenVINO Runtime Core
[ INFO ] 2. Reading the model: public/ssd300/FP16/ssd300.xml
[ INFO ] 3. Set up input
[ INFO ] 4. Apply preprocessing
[ INFO ] 5. Loading the model to the plugin
[ INFO ] 6. Starting inference in synchronous mode
[ INFO ] 7. Process output
[ INFO ] Found: class_id = 8, confidence = 0.97, coords = (390, 112), (665, 473)
[ INFO ] Found: class_id = 12, confidence = 1.00, coords = (64, 57), (437, 499)
[ INFO ] Image out.bmp was created!
[ INFO ] Latency 143.190296
Filename: /home/wjunjian/openvino/openvino/samples/python/object_detection/object_detection.py

Line #    Mem usage    Increment  Occurrences   Line Contents
=============================================================
    19     71.0 MiB     71.0 MiB           1   @profile
    20                                         def main():
    21     71.0 MiB      0.0 MiB           1       log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    22                                         
    23                                             # Parsing and validation of input arguments
    24     71.0 MiB      0.0 MiB           1       if len(sys.argv) != 4:
    25                                                 log.info(f'Usage: {sys.argv[0]} <path_to_model> <path_to_image> <device_name>')
    26                                                 return 1
    27                                         
    28     71.0 MiB      0.0 MiB           1       model_path = sys.argv[1]
    29     71.0 MiB      0.0 MiB           1       image_path = sys.argv[2]
    30     71.0 MiB      0.0 MiB           1       device_name = sys.argv[3]
    31                                         
    32     71.0 MiB      0.0 MiB           1       log.info('1. Creating OpenVINO Runtime Core')
    33     71.0 MiB      0.0 MiB           1       core = Core()
    34     71.0 MiB      0.0 MiB           1       core.set_property(device_name, {"PERF_COUNT": "YES"})
    35                                         
    36     71.0 MiB      0.0 MiB           1       log.info(f'2. Reading the model: {model_path}')
    37                                             # (.xml and .bin files) or (.onnx file)
    38    128.7 MiB     57.7 MiB           1       model = core.read_model(model_path)
    39                                         
    40    128.7 MiB      0.0 MiB           1       if len(model.inputs) != 1:
    41                                                 log.error('Sample supports only single input topologies')
    42                                                 return -1
    43                                         
    44    128.7 MiB      0.0 MiB           1       if len(model.outputs) != 1:
    45                                                 log.error('Sample supports only single output topologies')
    46                                                 return -1
    47                                         
    48    128.7 MiB      0.0 MiB           1       log.info('3. Set up input')
    49                                             # Read input image
    50    131.6 MiB      2.9 MiB           1       image = cv2.imread(image_path)
    51                                             # Add N dimension
    52    131.6 MiB      0.0 MiB           1       input_tensor = np.expand_dims(image, 0)
    53                                         
    54    131.6 MiB      0.0 MiB           1       log.info('4. Apply preprocessing')
    55    131.6 MiB      0.0 MiB           1       ppp = PrePostProcessor(model)
    56    131.6 MiB      0.0 MiB           1       _, h, w, _ = input_tensor.shape
    57                                         
    58                                             # 1) Set input tensor information:
    59                                             # - input() provides information about a single model input
    60                                             # - precision of tensor is supposed to be 'u8'
    61                                             # - layout of data is 'NHWC'
    62    131.6 MiB      0.0 MiB           4       ppp.input().tensor() \
    63    131.6 MiB      0.0 MiB           1           .set_element_type(Type.u8) \
    64    131.6 MiB      0.0 MiB           1           .set_layout(Layout('NHWC')) \
    65    131.6 MiB      0.0 MiB           1           .set_spatial_static_shape(h, w)  # noqa: ECE001, N400
    66                                         
    67                                             # 2) Adding explicit preprocessing steps:
    68                                             # - apply linear resize from tensor spatial dims to model spatial dims
    69    131.6 MiB      0.0 MiB           1       ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR)
    70                                         
    71                                             # 3) Here we suppose model has 'NCHW' layout for input
    72    131.6 MiB      0.0 MiB           1       ppp.input().model().set_layout(Layout('NCHW'))
    73                                         
    74                                             # 4) Set output tensor information:
    75                                             # - precision of tensor is supposed to be 'f32'
    76    131.6 MiB      0.0 MiB           1       ppp.output().tensor().set_element_type(Type.f32)
    77                                         
    78                                             # 5) Apply preprocessing modifing the original 'model'
    79    131.6 MiB      0.0 MiB           1       model = ppp.build()
    80                                         
    81    131.6 MiB      0.0 MiB           1       log.info('5. Loading the model to the plugin')
    82    422.2 MiB    290.6 MiB           1       compiled_model = core.compile_model(model, device_name)
    83                                         
    84    422.2 MiB      0.0 MiB           1       log.info('6. Starting inference in synchronous mode')
    85                                             # results = compiled_model.infer_new_request({0: input_tensor})
    86    422.2 MiB      0.0 MiB           1       request = compiled_model.create_infer_request()
    87    471.2 MiB     49.1 MiB           1       results = request.infer({0: input_tensor})
    88                                         
    89    471.2 MiB      0.0 MiB           1       log.info('7. Process output')
    90    471.2 MiB      0.0 MiB           1       predictions = next(iter(results.values()))
    91                                         
    92                                             # Change a shape of a numpy.ndarray with results ([1, 1, N, 7]) to get another one ([N, 7]),
    93                                             # where N is the number of detected bounding boxes
    94    471.2 MiB      0.0 MiB           1       detections = predictions.reshape(-1, 7)
    95                                         
    96    471.2 MiB      0.0 MiB         201       for detection in detections:
    97    471.2 MiB      0.0 MiB         200           confidence = detection[2]
    98                                         
    99    471.2 MiB      0.0 MiB         200           if confidence > 0.5:
   100    471.2 MiB      0.0 MiB           2               class_id = int(detection[1])
   101                                         
   102    471.2 MiB      0.0 MiB           2               xmin = int(detection[3] * w)
   103    471.2 MiB      0.0 MiB           2               ymin = int(detection[4] * h)
   104    471.2 MiB      0.0 MiB           2               xmax = int(detection[5] * w)
   105    471.2 MiB      0.0 MiB           2               ymax = int(detection[6] * h)
   106                                         
   107    471.2 MiB      0.0 MiB           2               log.info(f'Found: class_id = {class_id}, confidence = {confidence:.2f}, ' f'coords = ({xmin}, {ymin}), ({xmax}, {ymax})')
   108                                         
   109                                                     # Draw a bounding box on a output image
   110    471.2 MiB      0.0 MiB           2               cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
   111                                         
   112    471.2 MiB      0.0 MiB           1       cv2.imwrite('out.bmp', image)
   113                                         
   114    471.2 MiB      0.0 MiB           1       if os.path.exists('out.bmp'):
   115    471.2 MiB      0.0 MiB           1           log.info('Image out.bmp was created!')
   116                                             else:
   117                                                 log.error('Image out.bmp was not created. Check your permissions.')
   118                                         
   119    471.2 MiB      0.0 MiB           1       log.info(f'Latency {request.latency}')
   120                                         
   121    471.2 MiB      0.0 MiB           1       return 0

FP32 CPU

$ python object_detection.py public/ssd300/FP32/ssd300.xml catdog.jpg CPU
[ INFO ] 1. Creating OpenVINO Runtime Core
[ INFO ] 2. Reading the model: public/ssd300/FP32/ssd300.xml
[ INFO ] 3. Set up input
[ INFO ] 4. Apply preprocessing
[ INFO ] 5. Loading the model to the plugin
[ INFO ] 6. Starting inference in synchronous mode
[ INFO ] 7. Process output
[ INFO ] Found: class_id = 8, confidence = 0.97, coords = (390, 112), (665, 473)
[ INFO ] Found: class_id = 12, confidence = 1.00, coords = (64, 57), (437, 499)
[ INFO ] Image out.bmp was created!
[ INFO ] Latency 143.190296
Filename: /home/wjunjian/openvino/openvino/samples/python/object_detection/object_detection.py

Line #    Mem usage    Increment  Occurrences   Line Contents
=============================================================
    19     70.8 MiB     70.8 MiB           1   @profile
    20                                         def main():
    21     70.8 MiB      0.0 MiB           1       log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    22                                         
    23                                             # Parsing and validation of input arguments
    24     70.8 MiB      0.0 MiB           1       if len(sys.argv) != 4:
    25                                                 log.info(f'Usage: {sys.argv[0]} <path_to_model> <path_to_image> <device_name>')
    26                                                 return 1
    27                                         
    28     70.8 MiB      0.0 MiB           1       model_path = sys.argv[1]
    29     70.8 MiB      0.0 MiB           1       image_path = sys.argv[2]
    30     70.8 MiB      0.0 MiB           1       device_name = sys.argv[3]
    31                                         
    32     70.8 MiB      0.0 MiB           1       log.info('1. Creating OpenVINO Runtime Core')
    33     70.8 MiB      0.0 MiB           1       core = Core()
    34     70.8 MiB      0.0 MiB           1       core.set_property(device_name, {"PERF_COUNT": "YES"})
    35                                         
    36     70.8 MiB      0.0 MiB           1       log.info(f'2. Reading the model: {model_path}')
    37                                             # (.xml and .bin files) or (.onnx file)
    38    178.1 MiB    107.3 MiB           1       model = core.read_model(model_path)
    39                                         
    40    178.1 MiB      0.0 MiB           1       if len(model.inputs) != 1:
    41                                                 log.error('Sample supports only single input topologies')
    42                                                 return -1
    43                                         
    44    178.1 MiB      0.0 MiB           1       if len(model.outputs) != 1:
    45                                                 log.error('Sample supports only single output topologies')
    46                                                 return -1
    47                                         
    48    178.1 MiB      0.0 MiB           1       log.info('3. Set up input')
    49                                             # Read input image
    50    180.9 MiB      2.8 MiB           1       image = cv2.imread(image_path)
    51                                             # Add N dimension
    52    180.9 MiB      0.0 MiB           1       input_tensor = np.expand_dims(image, 0)
    53                                         
    54    180.9 MiB      0.0 MiB           1       log.info('4. Apply preprocessing')
    55    180.9 MiB      0.0 MiB           1       ppp = PrePostProcessor(model)
    56    180.9 MiB      0.0 MiB           1       _, h, w, _ = input_tensor.shape
    57                                         
    58                                             # 1) Set input tensor information:
    59                                             # - input() provides information about a single model input
    60                                             # - precision of tensor is supposed to be 'u8'
    61                                             # - layout of data is 'NHWC'
    62    180.9 MiB      0.0 MiB           4       ppp.input().tensor() \
    63    180.9 MiB      0.0 MiB           1           .set_element_type(Type.u8) \
    64    180.9 MiB      0.0 MiB           1           .set_layout(Layout('NHWC')) \
    65    180.9 MiB      0.0 MiB           1           .set_spatial_static_shape(h, w)  # noqa: ECE001, N400
    66                                         
    67                                             # 2) Adding explicit preprocessing steps:
    68                                             # - apply linear resize from tensor spatial dims to model spatial dims
    69    180.9 MiB      0.0 MiB           1       ppp.input().preprocess().resize(ResizeAlgorithm.RESIZE_LINEAR)
    70                                         
    71                                             # 3) Here we suppose model has 'NCHW' layout for input
    72    180.9 MiB      0.0 MiB           1       ppp.input().model().set_layout(Layout('NCHW'))
    73                                         
    74                                             # 4) Set output tensor information:
    75                                             # - precision of tensor is supposed to be 'f32'
    76    180.9 MiB      0.0 MiB           1       ppp.output().tensor().set_element_type(Type.f32)
    77                                         
    78                                             # 5) Apply preprocessing modifing the original 'model'
    79    180.9 MiB      0.0 MiB           1       model = ppp.build()
    80                                         
    81    180.9 MiB      0.0 MiB           1       log.info('5. Loading the model to the plugin')
    82    292.3 MiB    111.4 MiB           1       compiled_model = core.compile_model(model, device_name)
    83                                         
    84    292.3 MiB      0.0 MiB           1       log.info('6. Starting inference in synchronous mode')
    85                                             # results = compiled_model.infer_new_request({0: input_tensor})
    86    292.3 MiB      0.0 MiB           1       request = compiled_model.create_infer_request()
    87    341.3 MiB     49.0 MiB           1       results = request.infer({0: input_tensor})
    88                                         
    89    341.3 MiB      0.0 MiB           1       log.info('7. Process output')
    90    341.3 MiB      0.0 MiB           1       predictions = next(iter(results.values()))
    91                                         
    92                                             # Change a shape of a numpy.ndarray with results ([1, 1, N, 7]) to get another one ([N, 7]),
    93                                             # where N is the number of detected bounding boxes
    94    341.3 MiB      0.0 MiB           1       detections = predictions.reshape(-1, 7)
    95                                         
    96    341.3 MiB      0.0 MiB         201       for detection in detections:
    97    341.3 MiB      0.0 MiB         200           confidence = detection[2]
    98                                         
    99    341.3 MiB      0.0 MiB         200           if confidence > 0.5:
   100    341.3 MiB      0.0 MiB           2               class_id = int(detection[1])
   101                                         
   102    341.3 MiB      0.0 MiB           2               xmin = int(detection[3] * w)
   103    341.3 MiB      0.0 MiB           2               ymin = int(detection[4] * h)
   104    341.3 MiB      0.0 MiB           2               xmax = int(detection[5] * w)
   105    341.3 MiB      0.0 MiB           2               ymax = int(detection[6] * h)
   106                                         
   107    341.3 MiB      0.0 MiB           2               log.info(f'Found: class_id = {class_id}, confidence = {confidence:.2f}, ' f'coords = ({xmin}, {ymin}), ({xmax}, {ymax})')
   108                                         
   109                                                     # Draw a bounding box on a output image
   110    341.3 MiB      0.0 MiB           2               cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
   111                                         
   112    341.3 MiB      0.0 MiB           1       cv2.imwrite('out.bmp', image)
   113                                         
   114    341.3 MiB      0.0 MiB           1       if os.path.exists('out.bmp'):
   115    341.3 MiB      0.0 MiB           1           log.info('Image out.bmp was created!')
   116                                             else:
   117                                                 log.error('Image out.bmp was not created. Check your permissions.')
   118                                         
   119    341.3 MiB      0.0 MiB           1       log.info(f'Latency {request.latency}')
   120                                         
   121    341.3 MiB      0.0 MiB           1       return 0

On the CPU device, the FP16 model actually consumes more memory than the FP32 model: compile_model grows the process by about 290.6 MiB for FP16 versus 111.4 MiB for FP32. A likely reason is that the CPU plugin converts FP16 weights to FP32 internally, so an FP16 IR saves disk space but not runtime memory on CPU.
