Web App for Object Detection Using YOLOv12, OpenCV and the Gradio Framework

Ultralytics, the computer vision company behind the widely used YOLO family of object detection models, now supports the recently released YOLOv12 series, and we are going to try these models out. In this step-by-step tutorial, we will develop a web app for object detection using YOLOv12, OpenCV and the Gradio framework, and check how YOLOv12 performs.

Let's get started!

Prerequisites

Before we start developing our object detection web application, we need to install the prerequisites, i.e., the Python packages the application depends on. Here is the list of packages to install:

  • torch==2.2.2
  • torchvision==0.17.2
  • timm==1.0.14
  • albumentations==2.0.4
  • onnx==1.14.0
  • onnxruntime==1.15.1
  • pycocotools==2.0.7
  • PyYAML==6.0.1
  • scipy==1.13.0
  • gradio==4.44.1
  • opencv-python==4.9.0.80
  • psutil==5.9.8
  • py-cpuinfo==9.0.0
  • huggingface-hub==0.23.2
  • safetensors==0.4.3
  • numpy==1.26.4
  • thop

Note: To install these Python packages, make sure your internet connection is up and running. Save the list above as requirements.txt, and you can install all of the packages with a single pip command:

pip install -r /path/to/requirements.txt

If everything goes well, pip will fetch all the necessary files and install the packages. One caveat: the application code below also imports the ultralytics package, which is not pinned in the list above; if it is not already present in your environment (for example, bundled with the YOLOv12 repository you cloned), install it with pip install ultralytics.
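
To confirm that the core packages installed correctly, you can run a quick import check; a minimal sketch (the printed versions should match the pins above):

import torch
import cv2
import gradio

print("torch:", torch.__version__)
print("opencv:", cv2.__version__)
print("gradio:", gradio.__version__)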

Application Code

import gradio as gr
import cv2
import shutil
import tempfile
from ultralytics import YOLO


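# Run YOLOv12 on an image or a video and return a pair: (annotated_image, annotated_video_path).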
def yolov12_inference(image, video, model_id, image_size, conf_threshold):
    model = YOLO(model_id)
    if image:
        results = model.predict(source=image, imgsz=image_size, conf=conf_threshold)
        annotated_image = results[0].plot()
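        # plot() draws detections on a BGR copy of the image; [:, :, ::-1] flips it to RGB for Gradio.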
        return annotated_image[:, :, ::-1], None
    else:
        # Copy the uploaded video (Gradio passes a file path) to a temporary .webm file.
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp:
            video_path = tmp.name
        shutil.copyfile(video, video_path)

        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

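        # Write annotated frames to a new WebM file using the VP8 codec so the browser can play it inline.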
        with tempfile.NamedTemporaryFile(suffix=".webm", delete=False) as tmp:
            output_video_path = tmp.name
        out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'vp80'), fps, (frame_width, frame_height))

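        # Annotate every frame with the model's detections and append it to the output video.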
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            results = model.predict(source=frame, imgsz=image_size, conf=conf_threshold)
            annotated_frame = results[0].plot()
            out.write(annotated_frame)

        cap.release()
        out.release()

        return None, output_video_path


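# Wrapper for gr.Examples, which supplies image inputs only (no video).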
def yolov12_inference_for_examples(image, model_path, image_size, conf_threshold):
    annotated_image, _ = yolov12_inference(image, None, model_path, image_size, conf_threshold)
    return annotated_image


def app():
    with gr.Blocks():
        with gr.Row():
            with gr.Column():
                image = gr.Image(type="pil", label="Image", visible=True)
                video = gr.Video(label="Video", visible=False)
                input_type = gr.Radio(
                    choices=["Image", "Video"],
                    value="Image",
                    label="Input Type",
                )
                model_id = gr.Dropdown(
                    label="Model",
                    choices=[
                        "yolov12n.pt",
                        "yolov12s.pt",
                        "yolov12m.pt",
                        "yolov12l.pt",
                        "yolov12x.pt",
                    ],
                    value="yolov12m.pt",
                )
                image_size = gr.Slider(
                    label="Image Size",
                    minimum=320,
                    maximum=1280,
                    step=32,
                    value=640,
                )
                conf_threshold = gr.Slider(
                    label="Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.25,
                )
                yolov12_infer = gr.Button(value="Detect Objects")

            with gr.Column():
                output_image = gr.Image(type="numpy", label="Annotated Image", visible=True)
                output_video = gr.Video(label="Annotated Video", visible=False)

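        # Toggle which input/output components are visible based on the selected input type.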
        def update_visibility(input_type):
            image = gr.update(visible=True) if input_type == "Image" else gr.update(visible=False)
            video = gr.update(visible=False) if input_type == "Image" else gr.update(visible=True)
            output_image = gr.update(visible=True) if input_type == "Image" else gr.update(visible=False)
            output_video = gr.update(visible=False) if input_type == "Image" else gr.update(visible=True)

            return image, video, output_image, output_video

        input_type.change(
            fn=update_visibility,
            inputs=[input_type],
            outputs=[image, video, output_image, output_video],
        )

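        # Dispatch to the image or video branch of yolov12_inference based on the selected input type.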
        def run_inference(image, video, model_id, image_size, conf_threshold, input_type):
            if input_type == "Image":
                return yolov12_inference(image, None, model_id, image_size, conf_threshold)
            else:
                return yolov12_inference(None, video, model_id, image_size, conf_threshold)


        yolov12_infer.click(
            fn=run_inference,
            inputs=[image, video, model_id, image_size, conf_threshold, input_type],
            outputs=[output_image, output_video],
        )

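        # Clickable example images; with cache_examples="lazy", each example's output is cached the first time it runs.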
        gr.Examples(
            examples=[
                [
                    "ultralytics/assets/bus.jpg",
                    "yolov12s.pt",
                    640,
                    0.25,
                ],
                [
                    "ultralytics/assets/zidane.jpg",
                    "yolov12x.pt",
                    640,
                    0.25,
                ],
            ],
            fn=yolov12_inference_for_examples,
            inputs=[
                image,
                model_id,
                image_size,
                conf_threshold,
            ],
            outputs=[output_image],
            cache_examples='lazy',
        )

gradio_app = gr.Blocks()
with gradio_app:
    gr.HTML(
        """
    <h1 style='text-align: center'>
    YOLOv12: Attention-Centric Real-Time Object Detectors
    </h1>
    """)
    with gr.Row():
        with gr.Column():
            app()
if __name__ == '__main__':
    gradio_app.launch()
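
Before launching the full app, it is worth sanity-checking the model on a single image; a minimal sketch, assuming the yolov12n.pt weights are available locally or downloadable in your environment:

from ultralytics import YOLO

model = YOLO("yolov12n.pt")  # smallest YOLOv12 variant used by the app above
results = model.predict(source="ultralytics/assets/bus.jpg", imgsz=640, conf=0.25)
print(results[0].boxes)  # detected boxes with class ids and confidence scores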

Running the Gradio App

To run the Gradio object detection web application, save the code above as app.py and execute the command below in Command Prompt or a terminal:

python app.py

If everything goes well, the object detection web app will open in your default web browser, as shown in the image below. You can also open it in any browser of your choice via localhost; the local URL is printed in the Command Prompt or terminal window.

Figure-1: Web App Interface


Figure-2: YOLOv12 Object Detection
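
By default, Gradio serves the app on 127.0.0.1:7860. If you want to reach it from other machines on your network, or create a temporary public link, launch() accepts a few standard options; a small sketch (the port value shown is just the Gradio default):

gradio_app.launch(
    server_name="0.0.0.0",  # listen on all network interfaces instead of localhost only
    server_port=7860,       # default Gradio port; change it if the port is busy
    share=False,            # set to True for a temporary public *.gradio.live link
)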
