# testsAndMisc-archive/python_pkg/praca_magisterska_video/_q24_nms_final.py

"""NMS/IoU, detector-from-classifier, and methods comparison."""

from __future__ import annotations

from _q24_common import (
    BG_COLOR,
    FONT_B,
    FONT_R,
    FPS,
    STEP_DUR,
    H,
    W,
    _tc,
    _text_slide,
)
from moviepy import ColorClip, CompositeVideoClip, VideoClip
from moviepy.video.fx import FadeIn, FadeOut
import numpy as np


# ── NMS + IoU ─────────────────────────────────────────────────────
def _nms_iou_demo() -> list[CompositeVideoClip]:
    """Build the single animated slide explaining NMS and IoU.

    The slide combines a procedurally drawn background (overlapping
    detection boxes on the left, two overlapping squares illustrating IoU
    on the right) with a set of static text labels.

    Returns:
        A one-element list holding the composite clip, which lasts
        ``STEP_DUR`` seconds and fades in and out over 0.3 s.
    """
    # Anchor and size of the "reference" detection; the others are offsets.
    base_x, base_y = 100, 200
    base_w, base_h = 150, 120

    # (x, y, width, height, confidence, outline colour).  The first three
    # boxes overlap (same object); the last one is a separate object.
    detections = (
        (base_x, base_y, base_w, base_h, 0.95, (255, 80, 80)),  # best
        (base_x + 15, base_y - 10, base_w + 10, base_h + 5, 0.90, (200, 60, 60)),
        (base_x - 10, base_y + 5, base_w - 5, base_h + 10, 0.85, (160, 50, 50)),
        (base_x + 350, base_y + 50, 100, 100, 0.40, (80, 180, 255)),
    )
    suppress_after = 0.4  # progress value past which duplicates are dimmed
    first_other_object = 3  # indices 1..2 are the duplicates of box 0
    dimmed = (60, 40, 40)

    def render_frame(t: float) -> np.ndarray:
        """Return the RGB background frame for time ``t`` (seconds)."""
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)

        # Progress saturates at 1.0 once 70% of the step has elapsed.
        progress = min(t / (STEP_DUR * 0.7), 1.0)
        suppression_shown = progress > suppress_after

        for idx, (x, y, w, h, _score, outline) in enumerate(detections):
            is_duplicate = 0 < idx < first_other_object
            paint = dimmed if suppression_shown and is_duplicate else outline

            # Two passes, the second one pixel further out, thicken the outline.
            for grow in range(2):
                top, bottom = y - grow, y + h + grow
                left, right = x - grow, x + w + grow
                canvas[top:bottom, left : left + 2] = paint  # left edge
                canvas[top:bottom, right - 2 : right] = paint  # right edge
                canvas[top : top + 2, left:right] = paint  # top edge
                canvas[bottom - 2 : bottom, left:right] = paint  # bottom edge

        # IoU illustration: square A, square B shifted by 40 px on both axes,
        # then their 60x60 intersection painted over both.
        ix, iy = 700, 200
        canvas[iy : iy + 100, ix : ix + 100] = (80, 80, 200)
        canvas[iy + 40 : iy + 140, ix + 40 : ix + 140] = (200, 80, 80)
        canvas[iy + 40 : iy + 100, ix + 40 : ix + 100] = (200, 150, 200)

        return canvas

    background = VideoClip(render_frame, duration=STEP_DUR).with_fps(FPS)

    # (text, font size, colour, font, (x, y) position)
    captions = [
        ("NMS (Non-Maximum Suppression) + IoU", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "NMS = Najlepszy Ma Się dobrze — zachowaj najlepszą, usuń duplikaty",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("conf=0.95 ✓", 14, "#A5D6A7", FONT_B, (100, 340)),
        ("0.90 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 365)),
        ("0.85 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 390)),
        ("0.40 ✓ INNY obiekt", 13, "#64B5F6", FONT_R, (100, 420)),
        ("IoU = Intersection over Union", 18, "#FFE082", FONT_B, (700, 160)),
        ("IoU = pole(∩) / pole(AUB)", 16, "white", FONT_R, (700, 380)),
        ("Fioletowy = intersection", 14, "#CE93D8", FONT_R, (700, 410)),
        ("IoU > 0.5 → TEN SAM obiekt → usuń", 14, "#EF9A9A", FONT_R, (700, 440)),
        ("IoU < 0.5 → INNY obiekt → zachowaj", 14, "#A5D6A7", FONT_R, (700, 470)),
        (
            "DETR: jedyny detektor BEZ NMS (Hungarian matching zamiast tego)",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
    ]

    # The animated background goes first so every caption is layered on top.
    layers: list[VideoClip] = [
        background,
        *(
            _tc(text=caption, font_size=size, color=colour, font=face)
            .with_duration(STEP_DUR)
            .with_position(anchor)
            for caption, size, colour, face, anchor in captions
        ),
    ]

    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── Detector from Classifier ─────────────────────────────────────
def _detector_from_classifier() -> list[CompositeVideoClip]:
    """Build one text slide per approach to turning a classifier into a detector.

    Each slide has a title, a column of bullet points, and the same
    two-line "SRF" mnemonic footer.

    Returns:
        Three slides (sliding window, region proposals, fine-tuned
        backbone), each produced by ``_text_slide`` with duration
        ``STEP_DUR``.
    """
    # (slide title, [(bullet text, colour), ...], mnemonic tag)
    approaches = [
        (
            "Podejście 1: Sliding Window (NAJWOLNIEJSZE)",
            [
                ("Okno przesuwa się po obrazie w wielu skalach", "#B0BEC5"),
                ("Każde okno → klasyfikator (np. ResNet) → klasa + pewność", "#B0BEC5"),
                ("~18 000 okien x 10ms = ~3 minuty na obraz!", "#EF9A9A"),
                ("Mnemonik: WYCINAJ i PYTAJ — jak wycinanie ciasteczek", "#FFE082"),
            ],
            "SRF",
        ),
        (
            "Podejście 2: Region Proposals (= R-CNN)",
            [
                ("Selective Search → ~2000 inteligentnych regionów", "#B0BEC5"),
                ("Każdy region → CNN → wektor cech → SVM klasyfikuje", "#B0BEC5"),
                ("~2000 x 10ms = ~20 sec — 9x szybciej!", "#64B5F6"),
                (
                    "Mnemonik: INTELIGENTNE CIĘCIE — wytnij tylko tam gdzie wiśnie",
                    "#FFE082",
                ),
            ],
            "SRF",
        ),
        (
            "Podejście 3: Fine-tune backbone (NAJLEPSZE)",
            [
                (
                    "Pretrained backbone (ResNet) → odetnij FC → dodaj detection head",
                    "#B0BEC5",
                ),
                (
                    "Detection head = głowica klasyfikacji + głowica regresji bbox",
                    "#B0BEC5",
                ),
                ("~0.2 sec/obraz, najlepsza jakość (mAP ~42%)", "#A5D6A7"),
                ("Mnemonik: PRZESZCZEP GŁOWY — ten sam silnik, nowa głowa", "#FFE082"),
            ],
            "SRF",
        ),
    ]

    # Footer repeated verbatim at the bottom of every slide.
    footer = (
        (
            "Detektor z klasyfikatora: SRF = Sliding → Region → Fine-tune",
            16,
            "#78909C",
            FONT_R,
            (80, 520),
        ),
        (
            "= Szukaj Ręcznie, Finalnie optymalizuj!",
            16,
            "#90CAF9",
            FONT_R,
            (80, 550),
        ),
    )

    def slide_lines(heading: str, bullets: list[tuple[str, str]]) -> list[tuple]:
        """Assemble the title, bullet rows and footer for one slide."""
        header = (heading, 24, "#FFE082", FONT_B, (80, 140))
        # Bullets start at y=220 and are spaced 50 px apart.
        body = [
            (f"• {caption}", 18, colour, FONT_R, (100, 220 + row * 50))
            for row, (caption, colour) in enumerate(bullets)
        ]
        return [header, *body, *footer]

    return [
        _text_slide(slide_lines(heading, bullets), duration=STEP_DUR)
        for heading, bullets, _tag in approaches
    ]


# ── Methods comparison ────────────────────────────────────────────
def _methods_comparison() -> CompositeVideoClip:
    """Build the 10-second slide with a comparison table of detectors.

    The table is laid out as individual text clips over a solid
    ``BG_COLOR`` background: a highlighted header row followed by one row
    per detection method.

    Returns:
        The composite clip, sized ``(W, H)``, with 0.5 s fade-in and
        fade-out.
    """
    table_duration = 10.0
    column_offsets = (40, 200, 280, 400, 530)  # x position of each column
    first_row_y, row_pitch = 75, 72

    table = [
        ("Model", "Rok", "Typ", "Szybkość", "Kluczowe"),
        ("HOG+SVM", "2005", "Klasyczny", "~1 fps", "Gradient histogramy"),
        ("Viola-Jones", "2001", "Klasyczny", "30+ fps", "Haar+Cascade"),
        ("R-CNN", "2014", "Two-stage", "50 sec!", "CNN per region"),
        ("Fast R-CNN", "2015", "Two-stage", "2 sec", "ROI Pooling"),
        ("Faster R-CNN", "2015", "Two-stage", "5 fps", "RPN w sieci"),
        ("YOLO", "2016", "One-stage", "45+ fps", "Siatka SxS"),
        ("DETR", "2020", "Transformer", "~40 fps", "Bez NMS!"),
    ]

    backdrop = ColorClip(size=(W, H), color=BG_COLOR).with_duration(table_duration)
    heading = _tc(
        text="Porównanie detektorów",
        font_size=36,
        color="white",
        font=FONT_B,
    )
    heading = heading.with_duration(table_duration).with_position(("center", 20))

    layers: list[VideoClip] = [backdrop, heading]
    for row_idx, cells in enumerate(table):
        # The first row is the table header: larger, bold and tinted blue.
        if row_idx == 0:
            size, colour, face = 18, "#64B5F6", FONT_B
        else:
            size, colour, face = 16, "#E0E0E0", FONT_R

        y = first_row_y + row_idx * row_pitch
        for x, cell_text in zip(column_offsets, cells):
            cell = _tc(text=cell_text, font_size=size, color=colour, font=face)
            layers.append(cell.with_duration(table_duration).with_position((x, y)))

    composed = CompositeVideoClip(layers, size=(W, H))
    return composed.with_effects([FadeIn(0.5), FadeOut(0.5)])