# testsAndMisc/python_pkg/praca_magisterska_video/_q24_rpn_yolo.py

"""RPN anchor boxes and YOLO grid detection."""

from __future__ import annotations

from _q24_common import (
    BG_COLOR,
    FONT_B,
    FONT_R,
    FPS,
    STEP_DUR,
    H,
    W,
    _tc,
    _text_slide,
)
from moviepy import CompositeVideoClip, VideoClip
from moviepy.video.fx import FadeIn, FadeOut
import numpy as np


# ── RPN + Anchor Boxes ───────────────────────────────────────────
def _rpn_anchors_demo() -> list[CompositeVideoClip]:
    """Build the two Faster R-CNN slides: anchor boxes, then the RPN walkthrough.

    Slide 1 composites an animated frame with explanatory labels. The frame
    shows a 7x7 block of feature-map cells, one highlighted center point, and
    up to nine anchor outlines that are revealed one after another as the
    clip progresses. Slide 2 is a text-only slide produced by ``_text_slide``.

    Returns:
        The slides in presentation order (anchor slide first).
    """
    slides = []

    # One duration for the whole section: the animated frame clip, its labels
    # and the text slide all last this long.
    dur = STEP_DUR + 1

    # (half_width, half_height, color) for each anchor, in reveal order.
    # Constant data, so it is built once instead of once per rendered frame.
    anchor_specs = (
        (30, 30, (200, 80, 80)),  # small 1:1
        (20, 40, (200, 60, 60)),  # small 1:2
        (40, 20, (180, 60, 60)),  # small 2:1
        (60, 60, (80, 200, 80)),  # medium 1:1
        (40, 80, (60, 180, 60)),  # medium 1:2
        (80, 40, (60, 160, 60)),  # medium 2:1
        (90, 90, (80, 80, 200)),  # large 1:1
        (60, 120, (60, 60, 180)),  # large 1:2
        (120, 60, (60, 60, 160)),  # large 2:1
    )

    def stroke_box(
        frame: np.ndarray,
        x1: int,
        y1: int,
        x2: int,
        y2: int,
        color: tuple[int, int, int],
    ) -> None:
        """Paint a rectangle outline between (x1, y1) and (x2, y2) in place.

        Expects non-negative coordinates (the caller clamps them to the frame).
        """
        # Two passes, the second grown outward by one pixel, thicken the line.
        for grow in range(2):
            # Growing outward can push a start index to -1 when the box sits
            # on the frame border. NumPy would read that as "from the end"
            # and yield an empty slice, dropping the edge — so clamp at 0.
            left = max(0, x1 - grow)
            top = max(0, y1 - grow)
            right = x2 + grow
            bottom = y2 + grow
            frame[top:bottom, left : x1 - grow + 2] = color  # left edge
            frame[top:bottom, right - 2 : right] = color  # right edge
            frame[top : y1 - grow + 2, left:right] = color  # top edge
            frame[bottom - 2 : bottom, left:right] = color  # bottom edge

    # Slide 1: Anchor boxes concept
    def make_anchors_frame(t: float) -> np.ndarray:
        """Render the anchor animation frame for clip time ``t`` (seconds)."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        # Reaches 1.0 at 70% of STEP_DUR and then stays there, so the finished
        # picture is held for the remainder of the clip.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        cx, cy = 350, 360  # center point on feature map

        # "Feature map" background: 7x7 cells centered on (cx, cy). Each cell
        # is painted one pixel short of its pitch, leaving a visible gap.
        cell = 60
        for r in range(-3, 4):
            for c in range(-3, 4):
                x = cx + c * cell - cell // 2
                y = cy + r * cell - cell // 2
                frame[y : y + cell - 1, x : x + cell - 1] = (30, 35, 48)

        # Center point highlighted
        frame[cy - 5 : cy + 5, cx - 5 : cx + 5] = (255, 200, 50)

        # Reveal anchors progressively; at least one is always visible.
        total = len(anchor_specs)
        n_anchors = min(int(progress * total) + 1, total)
        for hw, hh, color in anchor_specs[:n_anchors]:
            stroke_box(
                frame,
                max(0, cx - hw),
                max(0, cy - hh),
                min(W - 1, cx + hw),
                min(H - 1, cy + hh),
                color,
            )

        return frame

    anch_clip = VideoClip(make_anchors_frame, duration=dur).with_fps(FPS)
    # Each label: (text, font size, color, font, (x, y) position).
    labels = [
        ("Anchor Boxes + RPN (Faster R-CNN)", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: Anchory = predefiniowane kształty w każdej pozycji",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        (
            "3 rozmiary x 3 proporcje = 9 anchorów per punkt",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 90),
        ),
        ("Małe (1:1, 1:2, 2:1)", 14, "#EF9A9A", FONT_R, (750, 170)),
        ("Średnie (1:1, 1:2, 2:1)", 14, "#A5D6A7", FONT_R, (750, 210)),
        ("Duże (1:1, 1:2, 2:1)", 14, "#64B5F6", FONT_R, (750, 250)),
        ("Żółty punkt = pozycja", 14, "#FFE082", FONT_R, (750, 310)),
        ("na feature mapie", 14, "#FFE082", FONT_R, (750, 335)),
        ("Sieć NIE predykuje bbox od zera!", 16, "white", FONT_R, (80, 530)),
        (
            "Predykuje OFFSET od najbliższego anchora: (Δx, Δy, Δw, Δh)",
            16,
            "#FFE082",
            FONT_R,
            (80, 565),
        ),
        (
            "+ P(obiekt) = 'czy w tym anchorze jest coś?'",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 600),
        ),
        (
            "Mnemonik: Anchor = KOTWICA — sieć dopasowuje bbox do kotwicy",
            15,
            "#78909C",
            FONT_R,
            (80, 645),
        ),
    ]
    # The animated frame is the bottom layer; labels are stacked on top of it.
    text_clips: list[VideoClip] = [anch_clip]
    text_clips.extend(
        _tc(text=text, font_size=fs, color=color, font=font)
        .with_duration(dur)
        .with_position(pos)
        for text, fs, color, font, pos in labels
    )
    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )

    # Slide 2: RPN step by step
    rpn_lines = [
        (
            "RPN: Region Proposal Network — krok po kroku",
            24,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        (
            "Zastępuje Selective Search SIECIĄ NEURONOWĄ (end-to-end!)",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 85),
        ),
        ("", 10, "white", FONT_R, (80, 110)),
        (
            "1. Backbone (ResNet) przetwarza obraz → feature mapa [40x60x256]",
            16,
            "#64B5F6",
            FONT_R,
            (100, 140),
        ),
        (
            "2. Filtr 3x3 przesuwa się po feature mapie",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 180),
        ),
        (
            "3. W KAŻDEJ pozycji (x,y) rozważ k=9 anchorów:",
            16,
            "#FFE082",
            FONT_R,
            (100, 220),
        ),
        ("   → P(obiekt) — 'czy tu jest coś?'", 15, "white", FONT_R, (120, 255)),
        ("   → (Δx, Δy, Δw, Δh) — poprawka pozycji", 15, "white", FONT_R, (120, 285)),
        (
            "4. 40x60 pozycji x 9 anchorów = 21 600 kandydatów!",
            16,
            "#EF9A9A",
            FONT_R,
            (100, 325),
        ),
        (
            "5. Weź ~300 z najwyższym P(obiekt) → ROI Pool → FC",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 365),
        ),
        ("", 10, "white", FONT_R, (100, 395)),
        ("Porównanie generowania propozycji:", 17, "white", FONT_B, (80, 420)),
        (
            "  Selective Search: ~2000 regionów, osobny algorytm, ~2 sec",
            15,
            "#EF9A9A",
            FONT_R,
            (100, 460),
        ),
        (
            "  RPN: ~300 regionów, W SIECI, ~10 ms → 200x szybciej!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 495),
        ),
        ("", 10, "white", FONT_R, (100, 520)),
        (
            "Faster R-CNN = Backbone + RPN + ROI Pool + FC — WSZYSTKO end-to-end",
            17,
            "#FFE082",
            FONT_R,
            (80, 545),
        ),
        (
            "→ 5 fps (0.2 sec/obraz) vs R-CNN 50 sec = 250x szybciej!",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 585),
        ),
        (
            "Wciąż two-stage: (1) RPN generuje propozycje, (2) FC klasyfikuje",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]
    slides.append(_text_slide(rpn_lines, duration=dur))

    return slides


# ── YOLO ──────────────────────────────────────────────────────────
def _yolo_demo() -> list[CompositeVideoClip]:
    """Build the YOLO slide: an animated grid-detection frame plus labels.

    The frame shows a mock image containing two filled rectangles ("car" and
    "person") under a 7x7 grid. As the clip progresses, the grid cell that
    holds each object's center is brightened, and finally an outline is drawn
    around every object.

    Returns:
        A single-element list containing the composited slide.
    """
    slides = []

    # Mock image placement and grid geometry (pixels, frame coordinates).
    img_x, img_y = 100, 140
    img_size = 420
    grid_n = 7
    cell = img_size // grid_n
    grid_color = (100, 100, 120)

    # Outlines appear once progress passes this threshold and sit this many
    # pixels outside the object's filled rectangle.
    bbox_phase = 0.6
    box_margin = 2

    # Per object: (top, bottom, left, right) relative to the image origin,
    # fill color, outline color, and the progress threshold after which the
    # object's center cell is highlighted.
    objects = (
        ((80, 200, 50, 180), (180, 60, 60), (255, 80, 80), 0.3),  # "car"
        ((150, 350, 250, 330), (60, 120, 180), (80, 180, 255), 0.5),  # "person"
    )

    def stroke_box(
        frame: np.ndarray,
        x1: int,
        y1: int,
        x2: int,
        y2: int,
        color: tuple[int, int, int],
    ) -> None:
        """Paint a rectangle outline between (x1, y1) and (x2, y2) in place."""
        # Two passes, the second grown outward by one pixel, thicken the line.
        for grow in range(2):
            left, top = x1 - grow, y1 - grow
            right, bottom = x2 + grow, y2 + grow
            frame[top:bottom, left : left + 2] = color  # left edge
            frame[top:bottom, right - 2 : right] = color  # right edge
            frame[top : top + 2, left:right] = color  # top edge
            frame[bottom - 2 : bottom, left:right] = color  # bottom edge

    def brighten_cell(frame: np.ndarray, col: int, row: int) -> None:
        """Lighten grid cell (col, row) by 40 per channel, saturating at 255."""
        hx = img_x + col * cell
        hy = img_y + row * cell
        region = frame[hy : hy + cell, hx : hx + cell]
        # Widen before adding so uint8 values cannot wrap around.
        frame[hy : hy + cell, hx : hx + cell] = np.clip(
            region.astype(int) + 40, 0, 255
        ).astype(np.uint8)

    def make_yolo_frame(t: float) -> np.ndarray:
        """Render the grid-detection frame for clip time ``t`` (seconds)."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR

        # Reaches 1.0 at 70% of STEP_DUR and then stays there.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Background "image" with the objects painted on it.
        frame[img_y : img_y + img_size, img_x : img_x + img_size] = (50, 55, 70)
        for (top, bottom, left, right), fill, _, _ in objects:
            frame[img_y + top : img_y + bottom, img_x + left : img_x + right] = fill

        # Grid lines: grid_n + 1 one-pixel lines in each direction.
        for i in range(grid_n + 1):
            x = img_x + i * cell
            y = img_y + i * cell
            frame[img_y : img_y + img_size, x : x + 1] = grid_color
            frame[y : y + 1, img_x : img_x + img_size] = grid_color

        # Highlight the cell containing each object's center. The cell is
        # computed from the rectangle: car -> (col 1, row 2),
        # person -> (col 4, row 4).
        for (top, bottom, left, right), _, _, phase in objects:
            if progress > phase:
                col = (left + right) // 2 // cell
                row = (top + bottom) // 2 // cell
                brighten_cell(frame, col, row)

        # Bounding-box predictions drawn around each object.
        if progress > bbox_phase:
            for (top, bottom, left, right), _, outline, _ in objects:
                stroke_box(
                    frame,
                    img_x + left - box_margin,
                    img_y + top - box_margin,
                    img_x + right + box_margin,
                    img_y + bottom + box_margin,
                    outline,
                )

        return frame

    yolo_clip = VideoClip(make_yolo_frame, duration=STEP_DUR).with_fps(FPS)
    # Each label: (text, font size, color, font, (x, y) position).
    labels = [
        ("YOLO — You Only Look Once", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Jednoetapowy detektor: siatka SxS → wszystkie detekcje naraz!",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("Siatka 7x7 = 49 komórek", 16, "#64B5F6", FONT_R, (600, 180)),
        ("Każda komórka predykuje:", 16, "white", FONT_R, (600, 220)),
        ("  • B bbox (x, y, w, h, conf)", 14, "#B0BEC5", FONT_R, (600, 255)),
        ("  • C klas (prawdopodobieństwa)", 14, "#B0BEC5", FONT_R, (600, 285)),
        ("Komórka odpowiada za obiekt", 14, "#A5D6A7", FONT_R, (600, 325)),
        ("którego ŚRODEK w niej wpada", 14, "#A5D6A7", FONT_R, (600, 350)),
        ("45-155 fps! (vs 5 fps Faster R-CNN)", 18, "#EF9A9A", FONT_B, (600, 400)),
        (
            "Jedno przejście przez sieć → WSZYSTKIE detekcje naraz → NMS → wynik",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
        (
            "Two-stage (R-CNN): propozycje+klasyfikacja "
            "| One-stage (YOLO): bez propozycji!",
            14,
            "#90CAF9",
            FONT_R,
            (80, 655),
        ),
    ]
    # The animated frame is the bottom layer; labels are stacked on top of it.
    text_clips: list[VideoClip] = [yolo_clip]
    text_clips.extend(
        _tc(text=text, font_size=fs, color=color, font=font)
        .with_duration(STEP_DUR)
        .with_position(pos)
        for text, fs, color, font, pos in labels
    )

    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )
    return slides