Yolo11 Model Conversion and Inference

1. Convert the .pt model to ONNX format

Upgrade pip:

python3.10 -m pip install --upgrade pip
pip -V

Install ultralytics and onnx (the export script below uses the ultralytics package):

pip install ultralytics onnx
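
As a quick optional check that the packages are importable (a minimal sketch):

import onnx
import ultralytics

# Print the installed versions to confirm the environment is ready
print("ultralytics:", ultralytics.__version__)
print("onnx:", onnx.__version__)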

Set the relevant environment variables

Temporarily add the environment variable (takes effect immediately):

export PATH="$PATH:$HOME/.local/bin"

• Note: this only applies to the current terminal session and is lost when the terminal is closed.

• Verify: run yolo --version; if it prints a version number, the command is available.

Permanently add the environment variable (persists across sessions)

Write the path into the shell configuration file so that it takes effect permanently:

Append the export line to the user's shell configuration file:

echo 'export PATH="$PATH:$HOME/.local/bin"' >> ~/.bashrc
source ~/.bashrc   # apply the change immediately

Create export.py for model conversion and export:

from ultralytics import YOLO

# Load the .pt model file from the same directory
model = YOLO('yolo11n.pt')  # replace with the actual model file name

# ONNX export parameters
export_params = {
    'format': 'onnx',
    'opset': 12,          # recommended opset version
    'simplify': True,     # simplify the exported graph
    'dynamic': False,     # fixed input size
    'imgsz': 640,         # standard input size
    'half': False         # keep FP32 precision
}

# Run the export; the ONNX model is saved to the same directory
model.export(**export_params)

Run this script to export the .pt model as an ONNX model.
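
Optionally, you can verify the exported file and list its input and output names, which is also useful when filling in node information in AIMO later (a minimal sketch, assuming the export produced yolo11n.onnx in the same directory):

import onnx

onnx_model = onnx.load("yolo11n.onnx")   # assumed output filename from the export step
onnx.checker.check_model(onnx_model)
print("inputs :", [i.name for i in onnx_model.graph.input])
print("outputs:", [o.name for o in onnx_model.graph.output])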

2. Use AIMO to truncate and quantize the ONNX model into a QNN INT8 model

In AIMO, choose Model Optimization, set the model format to ONNX, and upload the model.

Select the chip model and target framework; here we choose QCS8550 + QNN 2.31.

Use Netron to inspect the model structure and fill in the input and output information.

As shown in the figure above, the output node is formed by a Concat of a Mul node and a Sigmoid node, so enter these two nodes in AIMO as the truncation points, enable quantization, and select INT8 data precision.
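
The exact node names differ between exports, so read them from Netron. As a rough aid, the sketch below (an assumption-laden helper, not part of AIMO) walks the ONNX graph and prints the op types and output names of the nodes feeding the final Concat, i.e. the candidate truncation points:

import onnx

m = onnx.load("yolo11n.onnx")  # assumed filename from the export step
graph_outputs = {o.name for o in m.graph.output}
# The detection head's final Concat writes directly to a graph output
final_concat = next((n for n in m.graph.node
                     if n.op_type == "Concat" and set(n.output) & graph_outputs), None)
if final_concat is not None:
    # Map every tensor name to the node that produces it
    producers = {out: n for n in m.graph.node for out in n.output}
    for tensor_name in final_concat.input:
        src = producers.get(tensor_name)
        if src is not None:
            print(src.op_type, "->", src.output[0])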

Then submit the job. When the conversion finishes, download the target model archive; after extracting it, the .bin.aidem file inside is the model file.

3. Use Aidlite to run inference with the Yolo11 QNN INT8 model

Check that the aidlite version in the AidLux environment matches the QNN version selected when converting the model. In a terminal, run:

sudo aid-pkg installed 

Confirm that Qnn231 is installed.

The inference code is as follows:

import numpy as np
import cv2
import argparse
import aidlite
import time

# Class names (COCO-80)
CLASSES = ("person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light",
           "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant",
           "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
           "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife",
           "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa",
           "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
           "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush")

# Image preprocessing: resize with aspect ratio preserved and pad the bottom/right to (size1, size2)
def eqprocess(image, size1, size2):
    h, w, _ = image.shape
    mask = np.zeros((size1, size2, 3), dtype=np.float32)
    scale1 = h / size1
    scale2 = w / size2
    if scale1 > scale2:
        scale = scale1
    else:
        scale = scale2
    img = cv2.resize(image, (int(w / scale), int(h / scale)))
    mask[:int(h / scale), :int(w / scale), :] = img
    return mask, scale

# Box format conversion: (center x, center y, width, height) -> (x1, y1, x2, y2)
def xywh2xyxy(x):
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2  # top left x
    y[:, 1] = x[:, 1] - x[:, 3] / 2  # top left y
    y[:, 2] = x[:, 0] + x[:, 2] / 2  # bottom right x
    y[:, 3] = x[:, 1] + x[:, 3] / 2  # bottom right y
    return y

# Box format conversion: (x1, y1, x2, y2) -> (x1, y1, width, height)
def xyxy2xywh(box):
    box[:, 2:] = box[:, 2:] - box[:, :2]
    return box

# Single-class NMS
def NMS(dets, scores, thresh):
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    keep = []
    index = scores.argsort()[::-1]
    while index.size > 0:
        i = index[0]  # the first index always has the highest score; keep it
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]])  # calculate the points of overlap
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)  # widths of the overlap regions
        h = np.maximum(0, y22 - y11 + 1)  # heights of the overlap regions
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= thresh)[0]
        index = index[idx + 1]  # offset by 1 because ious was computed against index[1:]

    return keep

# Draw detection results on the image
def draw_detect_res(img, det_pred):
    if det_pred is None:
        return img

    img = img.astype(np.uint8)
    im_canvas = img.copy()
    color_step = int(255 / len(CLASSES))
    for i in range(len(det_pred)):
        x1, y1, x2, y2 = [int(t) for t in det_pred[i][:4]]
        cls_id = int(det_pred[i][5])
        print(i + 1, [x1, y1, x2, y2], det_pred[i][4], f'{CLASSES[cls_id]}')
        cv2.putText(img, f'{CLASSES[cls_id]}', (x1, y1 - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, int(cls_id * color_step), int(255 - cls_id * color_step)), thickness=2)
    img = cv2.addWeighted(im_canvas, 0.3, img, 0.7, 0)
    return img

# Scale masks to the original image size (this and the following mask functions are segmentation helpers and are not used in the detection pipeline below)
def scale_mask(masks, im0_shape):
    masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]),
                       interpolation=cv2.INTER_LINEAR)
    if len(masks.shape) == 2:
        masks = masks[:, :, None]
    return masks

# Crop masks to their bounding boxes
def crop_mask(masks, boxes):
    n, h, w = masks.shape
    x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
    r = np.arange(w, dtype=x1.dtype)[None, None, :]
    c = np.arange(h, dtype=x1.dtype)[None, :, None]
    return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))

# Combine mask prototypes with mask coefficients, rescale, and crop to boxes
def process_mask(protos, masks_in, bboxes, im0_shape):
    c, mh, mw = protos.shape
    masks = np.matmul(masks_in, protos.reshape((c, -1))).reshape((-1, mh, mw)).transpose(1, 2, 0)  # HWN
    masks = np.ascontiguousarray(masks)
    masks = scale_mask(masks, im0_shape)  # re-scale mask from P3 shape to original input image shape
    masks = np.einsum('HWN -> NHW', masks)  # HWN -> NHW
    masks = crop_mask(masks, bboxes)
    return np.greater(masks, 0.5)

# Convert binary masks to polygon segments
def masks2segments(masks):
    segments = []
    for x in masks.astype('uint8'):
        c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]  # CHAIN_APPROX_SIMPLE
        if c:
            c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
        else:
            c = np.zeros((0, 2))  # no segments found
        segments.append(c.astype('float32'))
    return segments

OBJ_THRESH = 0.45
NMS_THRESH = 0.45

class Yolo11n(object):
    def __init__(self, model_path, width, height, class_num):
        self.class_num = class_num
        self.width = width
        self.height = height
        input_shape = [[1, height, width, 3]]
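        # Total prediction cells across strides 8/16/32: (H/8)*(W/8) + (H/16)*(W/16) + (H/32)*(W/32) = 8400 for 640x640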
        self.blocks = int(height * width * (1 / 64 + 1 / 256 + 1 / 1024))
        self.maskw = int(width / 4)
        self.maskh = int(height / 4)
        self.output_shape = [[1, 4, self.blocks], [1, class_num, self.blocks]]

        self.model = aidlite.Model.create_instance(model_path)
        if self.model is None:
            print("Create model failed !")
            return
        self.model.set_model_properties(input_shape, aidlite.DataType.TYPE_FLOAT32, self.output_shape,
                                        aidlite.DataType.TYPE_FLOAT32)

        self.config = aidlite.Config.create_instance()
        if self.config is None:
            print("build_interpretper_from_model_and_config failed !")
            return

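        # QNN 2.31 backend, quantized model, accelerated on the DSP (NPU)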
        self.config.framework_type = aidlite.FrameworkType.TYPE_QNN231
        self.config.accelerate_type = aidlite.AccelerateType.TYPE_DSP
        self.config.is_quantify_model = 1

        self.interpreter = aidlite.InterpreterBuilder.build_interpretper_from_model_and_config(self.model, self.config)
        if self.interpreter is None:
            print("build_interpretper_from_model_and_config failed !")
            return

        self.interpreter.init()
        self.interpreter.load_model()

    def __call__(self, frame, invoke_nums):
        img, scale = eqprocess(frame, self.height, self.width)
        img = img / 255
        img = img.astype(np.float32)
        self.interpreter.set_input_tensor(0, img.data)

        invoke_time = []
        for i in range(invoke_nums):
            t1 = time.time()
            self.interpreter.invoke()
            cost_time = (time.time() - t1) * 1000
            invoke_time.append(cost_time)

        max_invoke_time = max(invoke_time)
        min_invoke_time = min(invoke_time)
        mean_invoke_time = sum(invoke_time) / invoke_nums
        var_invoketime = np.var(invoke_time)
        print("====================================")
        print(f"QNN invoke {invoke_nums} times:\n --mean_invoke_time is {mean_invoke_time} \n --max_invoke_time is {max_invoke_time} \n --min_invoke_time is {min_invoke_time} \n --var_invoketime is {var_invoketime}")
        print("====================================")

        qnn_1 = self.interpreter.get_output_tensor(0)
        qnn_2 = self.interpreter.get_output_tensor(1)
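        # The outputs are flat buffers; sorting by length puts the box tensor (4 * blocks values) before the class-score tensor (class_num * blocks values)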
        qnn_out = sorted([qnn_1, qnn_2], key=len)

        qnn_local = qnn_out[0].reshape(*self.output_shape[0])
        qnn_conf = qnn_out[1].reshape(*self.output_shape[1])

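        # x: (1, blocks, 4 + class_num); keep predictions whose best class score exceeds OBJ_THRESH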
        x = np.concatenate([qnn_local, qnn_conf], axis=1).transpose(0, 2, 1)
        x = x[np.amax(x[..., 4:], axis=-1) > OBJ_THRESH]
        if len(x) < 1:
            return None

        x = np.c_[x[..., :4], np.amax(x[..., 4:], axis=-1), np.argmax(x[..., 4:], axis=-1)]

        x[:, :4] = xywh2xyxy(x[:, :4])
        index = NMS(x[:, :4], x[:, 4], NMS_THRESH)
        out_boxes = x[index]
        out_boxes[..., :4] = out_boxes[..., :4] * scale

        return out_boxes

def parser_args():
    parser = argparse.ArgumentParser(description="Run model benchmarks")
    parser.add_argument('--target_model', type=str, default='/home/aidlux/yolo11/8550_models/cutoff_yolo11n_qcs8550_w8a8.qnn231.ctx.bin',
                        help="inference model path")
    parser.add_argument('--imgs', type=str, default='bus.jpg', help="Predict images path")
    parser.add_argument('--height', type=int, default=640, help="Model input height")
    parser.add_argument('--weight', type=int, default=640, help="Model input width")
    parser.add_argument('--cls_num', type=int, default=80, help="Number of classes")
    parser.add_argument('--invoke_nums', type=int, default=100, help="Number of inference runs")
    parser.add_argument('--model_type', type=str, default='QNN', help="Inference backend type")
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = parser_args()
    height = args.height
    weight = args.weight

    model = Yolo11n(args.target_model, args.weight, args.height, args.cls_num)
    frame = cv2.imread(args.imgs)

    out_boxes = model(frame, args.invoke_nums)
    num_targets = 0 if out_boxes is None else len(out_boxes)
    print(f"=================== \n Detect {num_targets} targets.")
    result = draw_detect_res(frame, out_boxes)
    cv2.imwrite("result.jpg", result)
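
Assuming the code above is saved as, for example, run_yolo11_qnn.py (a hypothetical file name), it can be run with python3 run_yolo11_qnn.py --target_model <path to the .bin model> --imgs bus.jpg; the annotated result is written to result.jpg.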

Yolo11 object-detection inference speed on QCS8550 and QCS6490 (average over 100 inference runs):

| Model | Size (pixels) | QCS8550 NPU QNN (ms) | QCS6490 NPU QNN (ms) |
| --- | --- | --- | --- |
| YOLO11n | 640 | 1.99 | 5.12 |
| YOLO11s | 640 | 2.90 | 7.91 |
| YOLO11m | 640 | 5.14 | 20.44 |
| YOLO11l | 640 | 6.72 | 24.27 |
| YOLO11x | 640 | 13.39 | 63.79 |