# testsAndMisc-archive/python_pkg/praca_magisterska_video/_q23_deeplab.py

"""DeepLab architecture animations for Q23 segmentation video."""

from __future__ import annotations

from moviepy import (
    CompositeVideoClip,
    VideoClip,
)
import numpy as np

from python_pkg.praca_magisterska_video._q23_helpers import (
    BG_COLOR,
    FONT_B,
    FONT_R,
    FPS,
    STEP_DUR,
    H,
    W,
    _compose_slide,
)


# ── DeepLab Architecture ─────────────────────────────────────────
def _make_dilated_frame(t: float) -> np.ndarray:
    """Draw one frame comparing a 3x3 kernel at dilation rates 1, 2 and 3.

    Three 7x7 cell grids are laid out side by side. In each grid the nine
    kernel taps, spaced ``rate`` cells apart, are painted in a brighter
    colour with lighter top and bottom edges. The grids appear from left to
    right: grid ``i`` is drawn once the reveal value reaches ``i * 0.3``,
    where the reveal value grows linearly with ``t`` and is capped at 1.0
    after 70% of ``STEP_DUR``.

    Args:
        t: Clip time for which the frame is requested.

    Returns:
        A ``(H, W, 3)`` ``uint8`` image filled with ``BG_COLOR`` plus the
        grids that are visible at time ``t``.
    """
    canvas = np.empty((H, W, 3), dtype=np.uint8)
    canvas[...] = BG_COLOR
    reveal = min(t / (STEP_DUR * 0.7), 1.0)

    pitch = 36  # px between the origins of neighbouring cells
    side = pitch - 2  # painted square is 2 px smaller, leaving a visible gap
    top = 180  # y of the first cell row, shared by all grids
    cells_per_axis = 7
    kernel = 3  # taps per axis

    # Each entry: (dilation rate, x of the grid's left edge).
    for index, (rate, left) in enumerate(((1, 60), (2, 420), (3, 820))):
        # Thresholds only grow with the index, so the first hidden grid
        # means every later grid is hidden as well.
        if reveal < index * 0.3:
            break

        # Dim background cells: positions the dilated kernel skips.
        for row in range(cells_per_axis):
            y0 = top + row * pitch
            for col in range(cells_per_axis):
                x0 = left + col * pitch
                canvas[y0 : y0 + side, x0 : x0 + side] = (35, 40, 55)

        # Kernel taps sit at multiples of the dilation rate on both axes.
        for tap_row in range(0, kernel * rate, rate):
            y0 = top + tap_row * pitch
            for tap_col in range(0, kernel * rate, rate):
                x0 = left + tap_col * pitch
                columns = slice(x0, x0 + side)
                canvas[y0 : y0 + side, columns] = (70, 130, 200)
                canvas[y0 : y0 + 2, columns] = (120, 180, 255)
                canvas[y0 + side - 2 : y0 + side, columns] = (120, 180, 255)

    return canvas


def _make_aspp_frame(t: float) -> np.ndarray:
    """Draw one frame of the ASPP block diagram animation.

    The input box is always present. Up to five branch boxes (each with a
    short arrow on its left) appear one after another as the reveal value
    grows. Once the reveal value exceeds 0.6 the concat bar and the
    connectors from the visible branches are added, and above 0.8 the final
    conv box with its arrow is drawn. The reveal value rises linearly with
    ``t`` and is capped at 1.0 after 70% of ``STEP_DUR``.

    Args:
        t: Clip time for which the frame is requested.

    Returns:
        A ``(H, W, 3)`` ``uint8`` image filled with ``BG_COLOR`` plus the
        diagram elements visible at time ``t``.
    """
    canvas = np.empty((H, W, 3), dtype=np.uint8)
    canvas[...] = BG_COLOR
    reveal = min(t / (STEP_DUR * 0.7), 1.0)

    arrow_color = (150, 150, 170)

    # Input box with lighter top and bottom edges.
    input_cols = slice(50, 130)
    canvas[250:330, input_cols] = (70, 130, 200)
    for edge_rows in (slice(250, 252), slice(328, 330)):
        canvas[edge_rows, input_cols] = (120, 180, 255)

    # Every branch box shares the same left edge and size; only the top
    # edge and fill colour differ.
    box_left, box_w, box_h = 200, 100, 40
    box_cols = slice(box_left, box_left + box_w)
    branch_specs = (
        (170, (80, 200, 120)),  # 1x1 conv
        (250, (200, 160, 80)),  # rate=6
        (330, (200, 120, 60)),  # rate=12
        (410, (180, 100, 80)),  # rate=18
        (490, (160, 80, 160)),  # GAP
    )
    # Clamp at zero so a negative count can never act as a "from the end"
    # slice bound.
    shown_count = max(min(int(reveal * 5) + 1, 5), 0)
    shown = branch_specs[:shown_count]

    for box_top, fill in shown:
        canvas[box_top : box_top + box_h, box_cols] = fill
        # Top edge is the fill colour brightened by 50 per channel.
        canvas[box_top : box_top + 2, box_cols] = tuple(
            min(channel + 50, 255) for channel in fill
        )
        mid_y = box_top + box_h // 2
        canvas[mid_y - 1 : mid_y + 2, 133:197] = arrow_color

    if reveal > 0.6:
        # Concat bar with highlighted top and bottom edges.
        concat_cols = slice(380, 420)
        canvas[250:530, concat_cols] = (50, 60, 80)
        for edge_rows in (slice(250, 252), slice(528, 530)):
            canvas[edge_rows, concat_cols] = (200, 200, 100)
        # Connector from each visible branch towards the concat bar.
        for box_top, _fill in shown:
            mid_y = box_top + box_h // 2
            canvas[mid_y - 1 : mid_y + 2, box_left + box_w + 3 : 378] = arrow_color

    if reveal > 0.8:
        # Final conv box and the arrow that feeds it.
        conv_cols = slice(450, 550)
        canvas[350:420, conv_cols] = (100, 200, 100)
        for edge_rows in (slice(350, 352), slice(418, 420)):
            canvas[edge_rows, conv_cols] = (150, 230, 150)
        canvas[388:391, 423:448] = arrow_color

    return canvas


def _deeplab_demo() -> list[CompositeVideoClip]:
    """Build the two DeepLab slides: dilated convolution, then ASPP.

    Each slide pairs an animated background clip (rendered by
    ``_make_dilated_frame`` or ``_make_aspp_frame``) with a list of label
    tuples and hands both to ``_compose_slide``. Both slides last
    ``STEP_DUR + 1``.

    Returns:
        The two composed slides, dilated-convolution slide first.
    """
    slide_len = STEP_DUR + 1
    # Label tuples are forwarded to _compose_slide unchanged; presumably
    # (text, font size, colour, font, (x, y)) -- confirm against the helper.

    # --- Slide 1: regular vs dilated convolution ---------------------
    dilated_bg = VideoClip(_make_dilated_frame, duration=slide_len).with_fps(FPS)

    # Three captions per grid: heading above, two notes below.
    grid_captions = []
    for left, tint, heading, field, note in (
        (60, "#64B5F6", "rate=1 (zwykła)", "RF = 3x3", "9 wag, kontekst 3px"),
        (420, "#FFE082", "rate=2 (dilated)", "RF = 5x5", "9 wag, kontekst 5px!"),
        (820, "#A5D6A7", "rate=3 (dilated)", "RF = 7x7", "9 wag, kontekst 7px!"),
    ):
        grid_captions += [
            (heading, 14, tint, FONT_B, (left, 160)),
            (field, 14, tint, FONT_R, (left, 440)),
            (note, 12, "#78909C", FONT_R, (left, 470)),
        ]

    dilated_labels = [
        ("DeepLab: Atrous (Dilated) Convolution", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: Zrozum dilated convolution — filtr z DZIURAMI",
            18,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        *grid_captions,
        (
            "Niebieski = pozycja wag filtra 3x3 | Szary = pominięte (dziury)",
            15,
            "#B0BEC5",
            FONT_R,
            (80, 510),
        ),
        (
            "TE SAME 9 wag → WIĘKSZE pole widzenia "
            "→ lepszy kontekst BEZ dodatkowych parametrów!",
            16,
            "white",
            FONT_R,
            (80, 550),
        ),
        (
            "Mnemonik: DZIURY w filtrze — à trous = z dziurami (fr.)",
            16,
            "#FFE082",
            FONT_R,
            (80, 600),
        ),
    ]
    dilated_slide = _compose_slide(dilated_bg, dilated_labels, slide_len)

    # --- Slide 2: ASPP module, step by step --------------------------
    aspp_bg = VideoClip(_make_aspp_frame, duration=slide_len).with_fps(FPS)

    # Names of the five branch boxes.
    box_names = [
        (text, 12, "white", FONT_R, spot)
        for text, spot in (
            ("Conv 1x1", (210, 178)),
            ("Dilated r=6", (205, 258)),
            ("Dilated r=12", (203, 338)),
            ("Dilated r=18", (203, 418)),
            ("GAP (global)", (205, 498)),
        )
    ]
    # Right-hand bullet list describing the context scale of each branch.
    scale_notes = [
        (text, 14, tint, FONT_R, (560, y))
        for text, tint, y in (
            ("  1x1: kontekst punktowy (piksel)", "#A5D6A7", 210),
            ("  r=6: kontekst lokalny (~13px)", "#FFE082", 245),
            ("  r=12: kontekst średni (~25px)", "#FFE082", 280),
            ("  r=18: kontekst szeroki (~37px)", "#FFE082", 315),
            ("  GAP: kontekst GLOBALNY (cały obraz)", "#CE93D8", 350),
        )
    ]

    aspp_labels = [
        (
            "DeepLab: ASPP (Atrous Spatial Pyramid Pooling)",
            24,
            "#FFE082",
            FONT_B,
            (80, 20),
        ),
        (
            "KROK 2: Multi-scale — analizuj obraz na WIELU skalach naraz",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        ("Wejście", 13, "#64B5F6", FONT_B, (55, 235)),
        *box_names,
        ("Concat", 13, "#FFE082", FONT_B, (381, 537)),
        ("Conv", 13, "#A5D6A7", FONT_B, (470, 425)),
        (
            "5 gałęzi RÓWNOLEGŁYCH → różne skale kontekstu:",
            16,
            "#B0BEC5",
            FONT_R,
            (550, 170),
        ),
        *scale_notes,
        ("Concat → 1x1 conv → mapa segmentacji", 16, "#A5D6A7", FONT_R, (550, 400)),
        (
            "Efekt: sieć widzi OD piksela DO całego obrazu naraz!",
            17,
            "white",
            FONT_R,
            (80, 600),
        ),
        (
            "Mnemonik: ASPP = Piramida z DZIURAMI, patrzy na 5 skal jednocześnie",
            15,
            "#FFE082",
            FONT_R,
            (80, 645),
        ),
    ]
    aspp_slide = _compose_slide(aspp_bg, aspp_labels, slide_len)

    return [dilated_slide, aspp_slide]