# Yolo-standalone/inference.py

import torch
import cv2
import numpy as np
import torchvision
from yolo11_standalone import YOLO11, YOLOPostProcessor

# Class-id -> label lookup table: main() indexes it with the integer class id of
# each detection to build the text label. It holds 80 entries, matching the
# nc=80 the model is constructed with.
# NOTE(review): this looks like the standard COCO label ordering — confirm
# against the weights being loaded.
CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
    "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
    "hair drier", "toothbrush"
]

# One random 3-component drawing color per class (float values in [0, 255)),
# generated once at import time. No seed is set in this file, so the palette
# is expected to differ from run to run.
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114)):
    """Scale an image to fit inside ``new_shape`` and pad the remainder.

    The aspect ratio is preserved: the image is resized by a single factor so
    that it fits within the target, then a constant-colored border is added
    around it to reach the target size.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target size as (height, width), or one int for a square.
        color: Border value used for the padded area.

    Returns:
        A tuple ``(image, r, (dw, dh))`` holding the padded image, the scale
        factor that was applied, and the (unrounded) padding added on each
        side horizontally and vertically.
    """
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)

    dims = im.shape[:2]
    src_h, src_w = dims[0], dims[1]
    target_h, target_w = new_shape[0], new_shape[1]

    # A single scale factor for both axes keeps the aspect ratio intact.
    r = min(target_h / src_h, target_w / src_w)

    scaled_w = int(round(src_w * r))
    scaled_h = int(round(src_h * r))

    # Leftover space is split evenly between the two sides of each axis.
    dw = (target_w - scaled_w) / 2
    dh = (target_h - scaled_h) / 2

    if (src_w, src_h) != (scaled_w, scaled_h):
        im = cv2.resize(im, (scaled_w, scaled_h), interpolation=cv2.INTER_LINEAR)

    # The -0.1 / +0.1 nudges decide which side receives the extra pixel when
    # the total padding along an axis is odd.
    pad_top = int(round(dh - 0.1))
    pad_bottom = int(round(dh + 0.1))
    pad_left = int(round(dw - 0.1))
    pad_right = int(round(dw + 0.1))

    padded = cv2.copyMakeBorder(
        im, pad_top, pad_bottom, pad_left, pad_right, cv2.BORDER_CONSTANT, value=color
    )
    return padded, r, (dw, dh)

def xywh2xyxy(x):
    """Convert boxes from (center x, center y, width, height) to corner form.

    Args:
        x: ``torch.Tensor`` or array-like accepted by ``np.copy`` whose last
            dimension starts with ``[cx, cy, w, h]``.

    Returns:
        A copy of ``x`` (same container type) in which the first four entries
        of the last dimension are ``[x1, y1, x2, y2]``. Any further entries
        are carried over unchanged, and the input itself is not modified.
    """
    if isinstance(x, torch.Tensor):
        corners = x.clone()
    else:
        corners = np.copy(x)

    center_x, center_y = x[..., 0], x[..., 1]
    half_w, half_h = x[..., 2] / 2, x[..., 3] / 2

    corners[..., 0] = center_x - half_w  # left
    corners[..., 1] = center_y - half_h  # top
    corners[..., 2] = center_x + half_w  # right
    corners[..., 3] = center_y + half_h  # bottom
    return corners

def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, max_det=300, max_nms=30000):
    """Filter raw detector output with confidence thresholding and per-class NMS.

    Args:
        prediction: Tensor of shape ``(batch, 4 + num_classes, num_anchors)``.
            The first four channels are boxes in (cx, cy, w, h) form and the
            remaining channels are per-class scores.
        conf_thres: Minimum best-class score for a candidate to be kept.
        iou_thres: IoU above which a lower-scoring box of the same class is
            suppressed.
        max_det: Maximum number of detections returned per image.
        max_nms: Maximum number of candidates (highest scores first) handed
            to NMS per image.

    Returns:
        A list with one ``(n, 6)`` tensor per image, rows being
        ``[x1, y1, x2, y2, confidence, class_index]``. Images with no
        detections get their own empty ``(0, 6)`` tensor.
    """
    # Per-class coordinate shift used to run class-aware NMS in a single call;
    # assumed to exceed any box coordinate so classes never overlap.
    max_wh = 7680

    prediction = prediction.transpose(1, 2)  # -> (batch, anchors, 4 + classes)
    candidates = prediction[..., 4:].max(-1)[0] > conf_thres

    # Build a distinct empty tensor per image. `[t] * bs` would alias one
    # object across all slots, so an in-place edit by a caller would leak
    # into every other image's result.
    output = [
        torch.zeros((0, 6), device=prediction.device)
        for _ in range(prediction.shape[0])
    ]

    for xi, x in enumerate(prediction):
        x = x[candidates[xi]]
        if not x.shape[0]:
            continue

        box = xywh2xyxy(x[:, :4])
        conf, j = x[:, 4:].max(1, keepdim=True)
        x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        n = x.shape[0]
        if not n:
            continue
        if n > max_nms:
            # Only bound the NMS workload here. Cutting to max_det before NMS
            # would let duplicates of a few confident objects crowd out other
            # valid detections.
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]

        class_offset = x[:, 5:6] * max_wh
        keep = torchvision.ops.nms(x[:, :4] + class_offset, x[:, 4], iou_thres)
        # nms returns indices ordered by decreasing score, so a prefix slice
        # keeps the max_det most confident survivors.
        output[xi] = x[keep[:max_det]]

    return output

def _draw_detection(image, p1, p2, label, color):
    """Draw one box and its filled text tag onto ``image`` in place."""
    cv2.rectangle(image, p1, p2, color, 2, lineType=cv2.LINE_AA)

    text_w, text_h = cv2.getTextSize(label, 0, fontScale=0.5, thickness=1)[0]
    # Put the tag above the box when there is room; otherwise flip it inside
    # the box so labels of objects touching the top edge are not drawn
    # off-canvas.
    if p1[1] - text_h - 3 >= 0:
        tag_corner = (p1[0] + text_w, p1[1] - text_h - 3)
        text_origin = (p1[0], p1[1] - 2)
    else:
        tag_corner = (p1[0] + text_w, p1[1] + text_h + 3)
        text_origin = (p1[0], p1[1] + text_h + 2)

    cv2.rectangle(image, p1, tag_corner, color, -1, cv2.LINE_AA)
    cv2.putText(image, label, text_origin, 0, 0.5, [255, 255, 255], thickness=1, lineType=cv2.LINE_AA)


def main(weights="yolo11s.pth", img_path="1.jpg", out_path="result.jpg", conf_thres=0.25, iou_thres=0.45):
    """Run single-image detection and save an annotated copy of the image.

    Args:
        weights: Path passed to ``model.load_weights``.
        img_path: Image file to run detection on.
        out_path: Where the annotated image is written.
        conf_thres: Confidence threshold forwarded to ``non_max_suppression``.
        iou_thres: IoU threshold forwarded to ``non_max_suppression``.

    Raises:
        FileNotFoundError: If ``img_path`` cannot be read by OpenCV.
        OSError: If the annotated image cannot be written to ``out_path``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = YOLO11(nc=80, scale='s')
    model.load_weights(weights)
    model.to(device)
    model.eval()
    post_std = YOLOPostProcessor(model.model[-1], use_segmentation=False)

    img0 = cv2.imread(img_path)
    # cv2.imread signals failure by returning None. Raise explicitly rather
    # than assert, which is stripped when Python runs with -O.
    if img0 is None:
        raise FileNotFoundError(f"Image Not Found {img_path}")

    img, ratio, (dw, dh) = letterbox(img0, new_shape=(640, 640))

    # Reverse the channel axis and move it first (HWC -> CHW); the copy makes
    # the negatively-strided view contiguous for torch.from_numpy.
    img = np.ascontiguousarray(img[:, :, ::-1].transpose(2, 0, 1))

    img_tensor = torch.from_numpy(img).to(device).float()
    img_tensor /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img_tensor.ndim == 3:
        img_tensor = img_tensor.unsqueeze(0)

    print("开始推理...")
    # Post-processing needs no gradients either, so keep it under no_grad too.
    with torch.no_grad():
        pred = model(img_tensor)
        pred = post_std(pred)
        pred = non_max_suppression(pred, conf_thres=conf_thres, iou_thres=iou_thres)

    det = pred[0]

    if len(det):
        # letterbox() pads the left/top edges by int(round(d - 0.1)) pixels,
        # not by the fractional dw/dh it returns. Undo exactly that amount so
        # boxes are not shifted by half a pixel when the padding is odd.
        pad_left = int(round(dw - 0.1))
        pad_top = int(round(dh - 0.1))
        det[:, [0, 2]] -= pad_left  # x padding
        det[:, [1, 3]] -= pad_top  # y padding
        det[:, :4] /= ratio

        img_h, img_w = img0.shape[0], img0.shape[1]
        det[:, 0].clamp_(0, img_w)
        det[:, 1].clamp_(0, img_h)
        det[:, 2].clamp_(0, img_w)
        det[:, 3].clamp_(0, img_h)

        print(f"检测到 {len(det)} 个目标")

        for *xyxy, conf, cls in det:
            c = int(cls)
            label = f'{CLASSES[c]} {conf:.2f}'
            p1, p2 = (int(xyxy[0]), int(xyxy[1])), (int(xyxy[2]), int(xyxy[3]))

            # Hand OpenCV plain Python floats rather than a NumPy row.
            color = COLORS[c].tolist()
            _draw_detection(img0, p1, p2, label, color)

            print(f" - {label} at {p1}-{p2}")

    # cv2.imwrite reports failure through its return value; do not claim
    # success unless the file was actually written.
    if not cv2.imwrite(out_path, img0):
        raise OSError(f"Failed to write {out_path}")
    print(f"结果已保存至 {out_path}")

def import_os_exists(path):
    """Return whether ``path`` exists, as reported by ``os.path.exists``.

    The ``os.path`` module is imported lazily inside the function.
    """
    import os.path as _os_path

    found = _os_path.exists(path)
    return found

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()