"""MoviePy visualization for PYTANIE 23: Image Segmentation.

Creates animated video demonstrating:
- What segmentation is (pixel-level classification)
- Thresholding / Otsu (bimodal histogram)
- Region Growing (BFS flood fill)
- Watershed (topographic flooding)
- U-Net encoder-decoder architecture
"""

from __future__ import annotations

import os
from pathlib import Path

import numpy as np

os.environ["FFMPEG_BINARY"] = "/usr/bin/ffmpeg"

from moviepy import (
    ColorClip,
    CompositeVideoClip,
    TextClip,
    VideoClip,
    concatenate_videoclips,
)
from moviepy.video.fx import FadeIn, FadeOut

# ── Constants ─────────────────────────────────────────────────────
# Output frame size in pixels; every slide and every numpy frame uses (H, W, 3).
W, H = 1280, 720
# Frame rate applied to the procedurally drawn clips via ``with_fps``.
FPS = 24
# Default duration (seconds) of a content slide; some demos use STEP_DUR + 1.
STEP_DUR = 7.0
# Default duration (seconds) of a title card built by ``_make_header``.
HEADER_DUR = 4.0
# Absolute paths to the bold / regular DejaVu Sans fonts handed to TextClip.
# NOTE(review): these paths are hard-coded; presumably they match the machine
# this script was written on — confirm before running elsewhere.
FONT_B = "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf"
FONT_R = "/usr/share/fonts/TTF/DejaVuSans.ttf"
# The rendered video goes to a "videos" directory next to this file.
OUTPUT_DIR = Path(__file__).resolve().parent / "videos"
# Import-time side effect: the output directory is created if missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT = str(OUTPUT_DIR / "q23_segmentation.mp4")

# RGB background colour shared by all slides.
BG_COLOR = (15, 20, 35)
# Module-level seeded random generator. It is not referenced by the functions
# visible in this part of the file (``_region_growing_demo`` creates its own).
rng = np.random.default_rng(42)


def _tc(**kwargs: object) -> TextClip:
    """Build a ``TextClip`` with a vertical margin derived from the font size.

    The margin is ``font_size // 3 + 2`` pixels (``font_size`` falls back to
    24 when the caller does not supply it) and is passed to ``TextClip`` as
    ``margin=(0, margin)``, replacing any ``margin`` the caller provided.
    All other keyword arguments are forwarded unchanged. The extra space is
    meant to keep the rendered text from being clipped.
    """
    font_size = int(kwargs.get("font_size", 24))
    vertical_margin = font_size // 3 + 2
    return TextClip(**{**kwargs, "margin": (0, vertical_margin)})


def _make_header(
    title: str, subtitle: str, duration: float = HEADER_DUR
) -> CompositeVideoClip:
    """Compose a title card: a large white title above a smaller blue subtitle.

    Args:
        title: Text rendered in the bold font at size 48, centred at y=260.
        subtitle: Text rendered in the regular font at size 24, centred at y=340.
        duration: Length of the card in seconds.

    Returns:
        A ``W`` x ``H`` composite on the shared background colour with a
        0.5 s fade-in and a 0.5 s fade-out.
    """
    # (text, font size, colour, font path, y position) for each text layer.
    text_specs = (
        (title, 48, "white", FONT_B, 260),
        (subtitle, 24, "#90CAF9", FONT_R, 340),
    )
    layers: list[VideoClip] = [
        ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
    ]
    for text, font_size, color, font, y in text_specs:
        text_layer = _tc(text=text, font_size=font_size, color=color, font=font)
        layers.append(
            text_layer.with_duration(duration).with_position(("center", y))
        )
    card = CompositeVideoClip(layers, size=(W, H))
    return card.with_effects([FadeIn(0.5), FadeOut(0.5)])


def _text_slide(
    lines: list[tuple[str, int, str, str, tuple[str | int, str | int]]],
    duration: float = STEP_DUR,
) -> CompositeVideoClip:
    """Build a text-only slide on the shared background.

    Args:
        lines: One ``(text, font_size, color, font, position)`` tuple per
            text element; elements are layered in the order given.
        duration: Length of the slide in seconds.

    Returns:
        A ``W`` x ``H`` composite with a 0.3 s fade-in and fade-out.
    """

    def render(
        spec: tuple[str, int, str, str, tuple[str | int, str | int]],
    ) -> VideoClip:
        """Turn one line spec into a positioned text clip."""
        text, font_size, color, font, pos = spec
        clip = _tc(text=text, font_size=font_size, color=color, font=font)
        return clip.with_duration(duration).with_position(pos)

    background = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
    layers: list[VideoClip] = [background]
    layers.extend(render(spec) for spec in lines)
    slide = CompositeVideoClip(layers, size=(W, H))
    return slide.with_effects([FadeIn(0.3), FadeOut(0.3)])


# ── Segmentation concept ─────────────────────────────────────────
def _segmentation_concept() -> list[CompositeVideoClip]:
    """Build the intro slide: an input "image" next to its segmentation map.

    Both pictures are 4 x 8 grids of square tiles that share one class
    layout; the left grid uses photo-like colours, the right grid uses one
    flat colour per class.

    Returns:
        A single-element list with the composed slide (0.3 s fades).
    """
    # Class of every tile, row by row (top to bottom).
    class_layout = (
        ("niebo",) * 8,
        ("niebo",) * 5 + ("drzewo",) * 3,
        ("drzewo",) * 3 + ("droga",) * 3 + ("drzewo",) * 2,
        ("droga",) * 3 + ("samochód",) * 2 + ("droga",) * 3,
    )
    # Colours for the left-hand "input image".
    photo_palette = {
        "niebo": (135, 206, 235),
        "drzewo": (34, 139, 34),
        "droga": (128, 128, 128),
        "samochód": (200, 50, 50),
    }
    # Colours for the right-hand segmentation map.
    label_palette = {
        "niebo": (100, 180, 255),
        "drzewo": (50, 200, 50),
        "droga": (180, 180, 180),
        "samochód": (255, 80, 80),
    }
    grid_top = 150
    tile = 40

    def paint_tiles(
        canvas: np.ndarray, left: int, palette: dict[str, tuple[int, int, int]]
    ) -> None:
        """Paint the class layout onto *canvas* starting at x=*left*."""
        for row_idx, row in enumerate(class_layout):
            y0 = grid_top + row_idx * tile
            for col_idx, cls in enumerate(row):
                x0 = left + col_idx * tile
                # Tiles are 2 px smaller than the pitch, leaving a thin gap.
                canvas[y0 : y0 + tile - 2, x0 : x0 + tile - 2] = palette[cls]

    def make_image_frame(_t: float) -> np.ndarray:
        """Render the (time-independent) pair of grids."""
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        paint_tiles(canvas, 100, photo_palette)
        paint_tiles(canvas, 600, label_palette)
        return canvas

    captions = [
        ("Obraz wejściowy", 22, "white", FONT_B, (170, 100)),
        ("Mapa segmentacji", 22, "white", FONT_B, (660, 100)),
        ("→", 50, "#FFE082", FONT_B, (450, 250)),
        ("Każdy piksel → etykieta klasy", 20, "#B0BEC5", FONT_R, (100, 420)),
        ("niebo  |  drzewo  |  droga  |  samochód", 18, "#90CAF9", FONT_R, (600, 420)),
        ("Segmentacja = klasyfikacja per-piksel", 24, "#FFE082", FONT_B, (100, 500)),
        (
            "Semantic: klasy bez instancji | Instance: rozróżnia obiekty | Panoptic: oba",
            16,
            "#78909C",
            FONT_R,
            (100, 560),
        ),
    ]
    layers: list[VideoClip] = [
        VideoClip(make_image_frame, duration=STEP_DUR).with_fps(FPS)
    ]
    layers.extend(
        _tc(text=text, font_size=size, color=color, font=font)
        .with_duration(STEP_DUR)
        .with_position(pos)
        for text, size, color, font, pos in captions
    )
    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── Thresholding / Otsu ───────────────────────────────────────────
def _thresholding_demo() -> list[CompositeVideoClip]:
    """Build the thresholding / Otsu slide.

    The slide shows a synthetic bimodal histogram (Gaussian humps centred on
    grey levels 60 and 190). A red threshold line sweeps from 60 to 190 over
    the first 70 % of the slide; bars below the threshold are blue, bars at
    or above it are orange-red.

    Returns:
        A single-element list with the composed slide (0.3 s fades).
    """
    bar_start_x = 80
    bar_y = 500  # baseline (bottom edge) of the histogram
    bar_w = 4  # horizontal pitch per grey level; bars are bar_w - 1 px wide
    below_color = (70, 130, 200)
    above_color = (200, 100, 80)
    line_color = (255, 80, 80)

    # Bar heights do not depend on time, so compute them once instead of
    # re-evaluating the Gaussians for every bar of every frame.
    levels = np.arange(256)
    bar_heights: list[int] = (
        (
            200 * np.exp(-((levels - 60) ** 2) / (2 * 20**2))
            + 150 * np.exp(-((levels - 190) ** 2) / (2 * 25**2))
        )
        .astype(int)
        .tolist()
    )

    def make_threshold_frame(t: float) -> np.ndarray:
        """Render the histogram with the threshold position at time *t*."""
        frame = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        threshold = int(60 + (190 - 60) * min(t / (STEP_DUR * 0.7), 1.0))

        # One pass colours each bar by the side of the threshold it is on.
        for level, bar_h in enumerate(bar_heights):
            x = bar_start_x + level * bar_w
            if bar_h <= 0 or x + bar_w >= W:
                continue
            frame[bar_y - bar_h : bar_y, x : x + bar_w - 1] = (
                below_color if level < threshold else above_color
            )

        # Draw the threshold line last so the bar that shares its columns
        # cannot paint over it.
        tx = bar_start_x + threshold * bar_w
        if tx < W:
            frame[bar_y - 250 : bar_y + 10, tx : tx + 3] = line_color

        return frame

    # NOTE(review): the captions below call the dark hump "piksele obiektu"
    # but also say "Piksel ≤ T → klasa 0 (tło)"; these two statements assign
    # the dark pixels to different classes — confirm which one is intended.
    labels = [
        ("Progowanie (Thresholding) z metodą Otsu", 28, "#FFE082", FONT_B, (80, 30)),
        (
            "Histogram jasności pikseli — dwumodalny (bimodal)",
            20,
            "#B0BEC5",
            FONT_R,
            (80, 80),
        ),
        ("Garb 1: piksele obiektu (ciemne ~60)", 16, "#64B5F6", FONT_R, (80, 120)),
        ("Garb 2: piksele tła (jasne ~190)", 16, "#EF9A9A", FONT_R, (80, 150)),
        (
            "Próg T (czerwona linia) dzieli piksele na 2 klasy",
            18,
            "white",
            FONT_R,
            (80, 540),
        ),
        (
            "Otsu: automatycznie testuje T=0..255, minimalizuje σ² wewnątrzklasową",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 580),
        ),
        (
            "Piksel ≤ T → klasa 0 (tło) | Piksel > T → klasa 1 (obiekt)",
            16,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
    ]
    hist_clip = VideoClip(make_threshold_frame, duration=STEP_DUR).with_fps(FPS)
    text_clips: list[VideoClip] = [hist_clip]
    for text, fs, color, font, pos in labels:
        text_clips.append(
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(STEP_DUR)
            .with_position(pos)
        )

    slide = CompositeVideoClip(text_clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── Region Growing ────────────────────────────────────────────────
def _region_growing_demo() -> list[CompositeVideoClip]:
    """Build the region-growing slide: a BFS flood fill from a seed pixel.

    A 10 x 10 grey-level grid with a bright rectangle on a dark background
    (plus seeded noise) is segmented by breadth-first search from the seed:
    a 4-neighbour joins the region when its value differs from the *seed's*
    value by less than the threshold. The animation reveals the visited
    cells in BFS order over the first 80 % of the slide.

    Returns:
        A single-element list with the composed slide (0.3 s fades).
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the front of the BFS frontier (list.pop(0) is O(n)).
    from collections import deque

    grid_size = 10
    cell_size = 40
    noise_rng = np.random.default_rng(42)  # fixed seed -> same picture every run

    # Dark background (60) with a bright rectangle (180); the noise below is
    # drawn from [-15, 14], so values end up in 45-74 and 165-194.
    grid = np.zeros((grid_size, grid_size), dtype=np.uint8)
    grid[:] = 60
    grid[2:7, 3:8] = 180
    noise = noise_rng.integers(-15, 15, (grid_size, grid_size))
    grid = np.clip(grid.astype(int) + noise, 0, 255).astype(np.uint8)

    # Breadth-first region growing from the seed.
    seed = (4, 5)
    threshold_val = 50
    seed_val = int(grid[seed])
    visited_order: list[tuple[int, int]] = []
    frontier = deque([seed])
    seen = {seed}
    while frontier:
        r, c = frontier.popleft()
        visited_order.append((r, c))
        for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
            nr, nc = r + dr, c + dc
            if not (0 <= nr < grid_size and 0 <= nc < grid_size):
                continue
            if (nr, nc) in seen:
                continue
            if abs(int(grid[nr, nc]) - seed_val) < threshold_val:
                seen.add((nr, nc))
                frontier.append((nr, nc))

    # Position of each region cell in the BFS order, so a frame can test
    # "already revealed?" in O(1) instead of scanning a list slice per cell.
    visit_rank = {cell: rank for rank, cell in enumerate(visited_order)}

    ox, oy = 100, 180  # top-left corner of the grid on screen
    region_color = (80, 200, 120)
    seed_color = (255, 200, 50)

    def make_region_frame(t: float) -> np.ndarray:
        """Render the grid with the cells revealed up to time *t*."""
        frame = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)

        progress = min(t / (STEP_DUR * 0.8), 1.0)
        n_visited = int(progress * len(visited_order))

        for r in range(grid_size):
            for c in range(grid_size):
                x = ox + c * cell_size
                y = oy + r * cell_size
                rank = visit_rank.get((r, c))
                if rank is not None and rank < n_visited:
                    color = region_color
                elif (r, c) == seed:
                    # Only reached before the first cell is revealed.
                    color = seed_color
                else:
                    val = grid[r, c]
                    color = (val, val, val)
                frame[y : y + cell_size - 2, x : x + cell_size - 2] = color

        # Outline the seed cell. The row offset uses oy (the original used
        # ox + 80, which matched oy only by coincidence).
        sx = ox + seed[1] * cell_size
        sy = oy + seed[0] * cell_size
        frame[sy : sy + cell_size, sx : sx + 2] = seed_color
        frame[sy : sy + cell_size, sx + cell_size - 2 : sx + cell_size] = seed_color
        frame[sy : sy + 2, sx : sx + cell_size] = seed_color
        frame[sy + cell_size - 2 : sy + cell_size, sx : sx + cell_size] = seed_color

        return frame

    labels = [
        ("Region Growing — rozrastanie regionu", 28, "#FFE082", FONT_B, (100, 30)),
        ("Seed (ziarno) → BFS do podobnych sąsiadów", 20, "#B0BEC5", FONT_R, (100, 80)),
        (
            "Żółty = seed | Zielony = region | Szary = nieodwiedzone",
            16,
            "#78909C",
            FONT_R,
            (100, 120),
        ),
        (
            "Sąsiad PODOBNY (|jasność - jasność_regionu| < próg) → dodaj do regionu",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 600),
        ),
        (
            "Algorytm zatrzymuje się gdy brak podobnych sąsiadów",
            16,
            "#90CAF9",
            FONT_R,
            (100, 640),
        ),
        (
            "Mnemonik: PLAMA atramentu — rozlewa się na podobne piksele",
            18,
            "#EF9A9A",
            FONT_R,
            (100, 670),
        ),
    ]
    region_clip = VideoClip(make_region_frame, duration=STEP_DUR).with_fps(FPS)
    text_clips: list[VideoClip] = [region_clip]
    for text, fs, color, font, pos in labels:
        text_clips.append(
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(STEP_DUR)
            .with_position(pos)
        )

    slide = CompositeVideoClip(text_clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── Watershed ─────────────────────────────────────────────────────
def _watershed_demo() -> list[CompositeVideoClip]:
    """Build the watershed slide: water rising over a 1-D terrain profile.

    The terrain is a flat plateau with two Gaussian valleys (around 25 % and
    75 % of its width). The water level climbs from 160 to 240 over the first
    70 % of the slide, and a red "dam" marker is drawn at the middle of the
    profile once the level has risen above its starting value.

    Returns:
        A single-element list with the composed slide (0.3 s fades).
    """
    ox, oy = 100, 450  # left edge and ground line of the cross-section
    terrain_w = 900
    terrain_points = 100

    # The profile is independent of time, so it is sampled once up front.
    xs = np.linspace(0, 1, terrain_points)
    bumps = (
        120 * np.exp(-((xs - 0.25) ** 2) / 0.005)
        + 80 * np.exp(-((xs - 0.75) ** 2) / 0.008)
        + 30
    )
    terrain = 250 - bumps  # inverted so the bumps become valleys
    # (screen x, screen y of the terrain surface) for every sample.
    profile = [
        (ox + int(frac * terrain_w), oy - int(height))
        for frac, height in zip(xs, terrain)
    ]
    ridge_x = ox + int(0.5 * terrain_w)

    def make_watershed_frame(t: float) -> np.ndarray:
        """Render terrain, water and dam marker for time *t*."""
        frame = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)

        water_level = int(160 + 80 * min(t / (STEP_DUR * 0.7), 1.0))
        water_y = oy - water_level

        for (x_left, y_left), (x_right, y_right) in zip(profile, profile[1:]):
            cols = slice(x_left, min(x_right + 1, W))

            # Terrain: from slightly above the higher end of the segment
            # down to the ground line.
            frame[min(y_left, y_right) - 5 : oy, cols] = (100, 80, 60)

            # Water: between the water surface and the terrain surface,
            # wherever the water surface lies above the terrain.
            if water_y < y_left:
                fill_top = max(water_y, 0)
                fill_bot = min(y_left, oy)
                if fill_top < fill_bot:
                    frame[fill_top:fill_bot, cols] = (70, 130, 220)

        # Dam marker at the middle of the profile.
        if water_level > 160:
            frame[water_y : oy - 140, ridge_x - 2 : ridge_x + 2] = (255, 80, 80)

        return frame

    captions = [
        ("Watershed — metoda zlewiska", 28, "#FFE082", FONT_B, (100, 20)),
        (
            "Obraz = mapa topograficzna (jasność = wysokość)",
            20,
            "#B0BEC5",
            FONT_R,
            (100, 65),
        ),
        (
            "Brązowy = teren (ciemne=doliny, jasne=szczyty)",
            16,
            "#8D6E63",
            FONT_R,
            (100, 100),
        ),
        ("Niebieski = woda zalewająca od minimów", 16, "#64B5F6", FONT_R, (100, 130)),
        (
            "Czerwony = TAMA (granica segmentu) — gdy woda z 2 dolin się spotka",
            16,
            "#EF9A9A",
            FONT_R,
            (100, 160),
        ),
        (
            "Problem: over-segmentation (za dużo regionów). Rozwiązanie: marker-controlled.",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 560),
        ),
        (
            "Mnemonik: ZALEWANIE terenu — granie gór = granice segmentów",
            18,
            "#FFE082",
            FONT_R,
            (100, 600),
        ),
    ]
    layers: list[VideoClip] = [
        VideoClip(make_watershed_frame, duration=STEP_DUR).with_fps(FPS)
    ]
    layers.extend(
        _tc(text=text, font_size=size, color=color, font=font)
        .with_duration(STEP_DUR)
        .with_position(pos)
        for text, size, color, font, pos in captions
    )
    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── U-Net Architecture ───────────────────────────────────────────
def _unet_demo() -> list[CompositeVideoClip]:
    """Build the U-Net slide: encoder, bottleneck, decoder and skip links.

    The diagram is revealed in stages over the first 60 % of the slide:
    four encoder blocks top-down, then the bottleneck, then four decoder
    blocks bottom-up. Each decoder block appears together with the dashed
    skip connection that reaches it from the encoder block on the same
    level, so a skip line never points at a decoder slot that is still
    empty.

    Returns:
        A single-element list with the composed slide (0.3 s fades).
    """
    # (width, height) of the block on each level, top level first. Encoder
    # and decoder blocks on the same level share size and y position.
    level_sizes = [(80, 120), (60, 100), (45, 80), (30, 60)]
    n_levels = len(level_sizes)
    enc_x = 150
    dec_x = 850
    y_offset = 120
    level_gap = 130
    arrow_color = (150, 150, 170)

    def draw_box(
        frame: np.ndarray,
        x: int,
        y: int,
        bw: int,
        bh: int,
        fill: tuple[int, int, int],
        edge: tuple[int, int, int],
    ) -> None:
        """Fill a rectangle and outline it with a 2 px border."""
        frame[y : y + bh, x : x + bw] = fill
        frame[y : y + 2, x : x + bw] = edge
        frame[y + bh - 2 : y + bh, x : x + bw] = edge
        frame[y : y + bh, x : x + 2] = edge
        frame[y : y + bh, x + bw - 2 : x + bw] = edge

    def make_unet_frame(t: float) -> np.ndarray:
        """Render the diagram with the stages revealed up to time *t*."""
        frame = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)

        progress = min(t / (STEP_DUR * 0.6), 1.0)
        # Ranges over 1..9; the value 9 (last decoder block) is reached
        # only once progress hits 1.0.
        n_blocks = int(progress * 8) + 1

        # Encoder: levels 0..3, revealed top-down.
        for level, (bw, bh) in enumerate(level_sizes):
            if level >= n_blocks:
                continue
            y = y_offset + level * level_gap
            draw_box(frame, enc_x, y, bw, bh, (70, 130, 200), (100, 180, 255))
            if level < n_levels - 1:
                # Down arrow towards the next encoder level.
                ax = enc_x + bw // 2
                ay = y + bh + 10
                frame[ay : ay + 20, ax - 1 : ax + 2] = arrow_color

        # Bottleneck (top and bottom borders only).
        bx, by = 500, y_offset + 3 * level_gap + 30
        if n_blocks > 4:
            frame[by : by + 50, bx : bx + 25] = (200, 100, 80)
            frame[by : by + 2, bx : bx + 25] = (255, 140, 100)
            frame[by + 48 : by + 50, bx : bx + 25] = (255, 140, 100)

        # Decoder: revealed bottom-up, so reveal step k sits on level 3 - k.
        for step in range(n_levels):
            if n_blocks <= 4 + step + 1:
                continue
            level = n_levels - 1 - step
            bw, bh = level_sizes[level]
            y = y_offset + level * level_gap
            draw_box(frame, dec_x, y, bw, bh, (80, 200, 120), (120, 230, 150))
            if step < n_levels - 1:
                # Up arrow towards the next decoder level.
                ax = dec_x + bw // 2
                ay = y - 30
                frame[ay : ay + 20, ax - 1 : ax + 2] = arrow_color

            # Dashed skip connection from the encoder block on this level.
            ey = y + bh // 2
            ex_end = enc_x + bw
            for dash_x in range(ex_end + 10, dec_x - 10, 15):
                frame[ey : ey + 2, dash_x : dash_x + 8] = (255, 200, 50)

        return frame

    dur = STEP_DUR + 1
    labels = [
        ("U-Net: Encoder-Decoder + Skip Connections", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Niebieski = Encoder (↓ zmniejsza rozdzielczość, wyciąga cechy)",
            16,
            "#64B5F6",
            FONT_R,
            (80, 65),
        ),
        (
            "Zielony = Decoder (↑ zwiększa rozdzielczość, odtwarza mapę)",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 90),
        ),
        (
            "Żółte przerywane = Skip connections (przenoszą detale z encodera)",
            16,
            "#FFE082",
            FONT_R,
            (80, 115),
        ),
        (
            "Czerwony = Bottleneck (najgłębsza warstwa, max abstrakcja)",
            16,
            "#EF9A9A",
            FONT_R,
            (450, 570),
        ),
        (
            "Kształt U: encoder ↓ decoder ↑, mosty pośrodku",
            18,
            "white",
            FONT_R,
            (80, 640),
        ),
        (
            "Concatenation: skip łączy kanały (więcej informacji niż dodawanie)",
            16,
            "#78909C",
            FONT_R,
            (80, 670),
        ),
    ]
    unet_clip = VideoClip(make_unet_frame, duration=dur).with_fps(FPS)
    text_clips: list[VideoClip] = [unet_clip]
    for text, fs, color, font, pos in labels:
        text_clips.append(
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(dur)
            .with_position(pos)
        )

    slide = CompositeVideoClip(text_clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]


# ── FCN Architecture ─────────────────────────────────────────────
def _fcn_demo() -> list[CompositeVideoClip]:
    """Build the two FCN slides.

    Slide 1 animates a classic CNN pipeline (top row) against the FCN
    pipeline (bottom row): the top row is revealed block by block, the FCN
    row starts once progress passes 0.4, and a red X is drawn over the
    Flatten/FC blocks once progress passes 0.6. Slide 2 is a text-only
    walk-through of FCN skip connections.

    Returns:
        The two slides in presentation order.
    """
    dur = STEP_DUR + 1
    top_y = 140
    bot_y = 380
    connector_color = (150, 150, 170)
    cross_color = (255, 80, 80)

    # (x, (width, height), fill colour) — the four blocks both rows share.
    backbone = [
        (80, (70, 50), (70, 130, 200)),
        (170, (50, 40), (50, 100, 160)),
        (240, (60, 50), (70, 130, 200)),
        (320, (40, 35), (50, 100, 160)),
    ]
    classic_tail = [
        (385, (55, 50), (160, 80, 60)),
        (465, (55, 50), (180, 60, 60)),
        (545, (80, 50), (200, 80, 80)),
    ]
    fcn_tail = [
        (385, (70, 50), (80, 200, 120)),
        (480, (75, 50), (200, 160, 80)),
        (580, (80, 50), (100, 200, 100)),
    ]
    blocks_classic = [
        ((x, top_y), size, color) for x, size, color in backbone + classic_tail
    ]
    blocks_fcn = [((x, bot_y), size, color) for x, size, color in backbone + fcn_tail]

    def draw_pipeline(frame: np.ndarray, blocks: list, visible: int) -> None:
        """Draw the first *visible* blocks of a row plus their connectors."""
        last = len(blocks) - 1
        for idx, ((bx, by), (bw, bh), color) in enumerate(blocks[:visible]):
            highlight = tuple(min(channel + 50, 255) for channel in color)
            frame[by : by + bh, bx : bx + bw] = color
            frame[by : by + 2, bx : bx + bw] = highlight
            frame[by + bh - 2 : by + bh, bx : bx + bw] = highlight
            if idx < last:
                # Short connector to the right of every block but the last.
                ax = bx + bw + 3
                ay = by + bh // 2
                frame[ay - 1 : ay + 2, ax : ax + 12] = connector_color

    def make_fcn_frame(t: float) -> np.ndarray:
        """Render both pipelines for time *t*."""
        frame = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        # Top row: classic CNN ending in a single label.
        draw_pipeline(frame, blocks_classic, min(int(progress * 7) + 1, 7))

        # Red X across the Flatten + FC blocks (5 px thick via the d offset).
        if progress > 0.6:
            for d in range(-2, 3):
                for step in range(50):
                    px = 385 + int(step * 135 / 50)
                    for py in (top_y + step + d, top_y + 50 - step + d):
                        if 0 <= py < H and 0 <= px < W:
                            frame[py, px] = cross_color

        # Bottom row: FCN pipeline, revealed over the remaining progress.
        if progress > 0.4:
            n_bot = min(int((progress - 0.4) / 0.6 * 7) + 1, 7)
            draw_pipeline(frame, blocks_fcn, n_bot)

        return frame

    # Small captions placed on the blocks: (text, x, colour).
    classic_captions = [
        ("Conv", 92, "white"),
        ("Pool", 178, "white"),
        ("Conv", 250, "white"),
        ("Pool", 325, "white"),
        ("Flatten", 390, "#EF9A9A"),
        ("FC", 480, "#EF9A9A"),
        ("1 label", 555, "#EF9A9A"),
    ]
    fcn_captions = [
        ("Conv", 92, "white"),
        ("Pool", 178, "white"),
        ("Conv", 250, "white"),
        ("Pool", 325, "white"),
        ("Conv1x1", 390, "#A5D6A7"),
        ("Upsample", 486, "#FFE082"),
        ("Mapa", 595, "#A5D6A7"),
    ]
    labels = [
        ("FCN: Fully Convolutional Network (2015)", 26, "#FFE082", FONT_B, (80, 20)),
        ("KROK 1: Zamień FC → Conv 1x1", 18, "#A5D6A7", FONT_R, (80, 60)),
        ("Klasyczny CNN:", 16, "#EF9A9A", FONT_B, (80, 105)),
        *[(text, 11, color, FONT_R, (x, 148)) for text, x, color in classic_captions],
        ("FCN:", 16, "#A5D6A7", FONT_B, (80, 350)),
        *[(text, 11, color, FONT_R, (x, 388)) for text, x, color in fcn_captions],
        (
            "FC: spłaszcza 3D→1D, wymusza stały rozmiar → 1 etykieta",
            16,
            "#EF9A9A",
            FONT_R,
            (80, 250),
        ),
        (
            "Conv1x1: działa per piksel x kanały → DOWOLNY rozmiar → mapa klasy",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 460),
        ),
        (
            "KROK 2: Skip connections — łączą wczesne detale z późną abstrakcją",
            17,
            "#64B5F6",
            FONT_R,
            (80, 510),
        ),
        (
            "Wczesne warstwy = krawędzie, tekstury | Późne = koncepty obiektów",
            15,
            "#78909C",
            FONT_R,
            (80, 545),
        ),
        (
            "FCN = PIERWSZA sieć end-to-end do segmentacji per-piksel!",
            18,
            "white",
            FONT_R,
            (80, 590),
        ),
        (
            "Mnemonik: FC → Conv 1x1 = otwieramy bramkę dla DOWOLNEGO rozmiaru",
            16,
            "#FFE082",
            FONT_R,
            (80, 640),
        ),
    ]
    layers: list[VideoClip] = [
        VideoClip(make_fcn_frame, duration=STEP_DUR + 1).with_fps(FPS)
    ]
    layers.extend(
        _tc(text=text, font_size=size, color=color, font=font)
        .with_duration(dur)
        .with_position(pos)
        for text, size, color, font, pos in labels
    )
    pipeline_slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )

    # Slide 2: text-only explanation of skip connections and FCN variants.
    skip_lines = [
        ("FCN: Skip Connections — krok po kroku", 26, "#FFE082", FONT_B, (80, 30)),
        (
            "1. Encoder zmniejsza: 224→112→56→28→14 (pooling)",
            18,
            "#64B5F6",
            FONT_R,
            (100, 100),
        ),
        (
            "   Każdy pooling traci detale przestrzenne (dokładne krawędzie)",
            15,
            "#78909C",
            FONT_R,
            (100, 135),
        ),
        (
            "2. Decoder powiększa: 14→28→56→112→224 (upsample/deconv)",
            18,
            "#A5D6A7",
            FONT_R,
            (100, 190),
        ),
        (
            "   Upsample ODGADUJE piksele — rozmyty wynik!",
            15,
            "#78909C",
            FONT_R,
            (100, 225),
        ),
        (
            "3. Skip connections: dodaj cechy z encodera do decodera",
            18,
            "#FFE082",
            FONT_R,
            (100, 280),
        ),
        (
            "   Wczesne cechy = GDZIE (precyzyjne krawędzie)",
            15,
            "#64B5F6",
            FONT_R,
            (100, 315),
        ),
        (
            "   Późne cechy = CO (abstrakcyjne koncepty)",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 345),
        ),
        (
            "   Skip = daje decoderowi OBA → ostry wynik!",
            15,
            "#FFE082",
            FONT_R,
            (100, 375),
        ),
        (
            "Warianty: FCN-32s (brak skip, rozmyty) → FCN-16s → FCN-8s (najlepszy)",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 440),
        ),
        (
            "FCN-32s: upsample 32x naraz → ROZMYTE granice",
            15,
            "#EF9A9A",
            FONT_R,
            (100, 485),
        ),
        (
            "FCN-16s: skip z pool4 + upsample 16x → lepiej",
            15,
            "#FFE082",
            FONT_R,
            (100, 520),
        ),
        (
            "FCN-8s:  skip z pool3+pool4 + upsample 8x → OSTRE granice!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 555),
        ),
        (
            "Im więcej skip connections → tym więcej detali z encodera → ostrzejszy wynik",
            17,
            "white",
            FONT_R,
            (80, 620),
        ),
    ]
    skip_slide = _text_slide(skip_lines, duration=STEP_DUR + 1)

    return [pipeline_slide, skip_slide]


# ── DeepLab Architecture ─────────────────────────────────────────
def _deeplab_demo() -> list[CompositeVideoClip]:
    """Build the two animated DeepLab slides.

    Slide 1 compares a regular 3x3 convolution with its dilated (atrous)
    variants at rate 2 and 3. Slide 2 reveals the ASPP module branch by
    branch, followed by the concatenation bar and the final convolution.

    Returns:
        Two composite clips, each ``STEP_DUR + 1`` seconds long, with a short
        fade in and fade out.
    """
    dur = STEP_DUR + 1
    slides = []

    def compose_slide(
        base: VideoClip, label_specs: list[tuple]
    ) -> CompositeVideoClip:
        """Overlay static text labels on an animated base clip."""
        layers: list[VideoClip] = [base]
        for text, fs, color, font, pos in label_specs:
            layers.append(
                _tc(text=text, font_size=fs, color=color, font=font)
                .with_duration(dur)
                .with_position(pos)
            )
        return CompositeVideoClip(layers, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )

    # Slide 1: Regular vs Dilated convolution
    def make_dilated_frame(t: float) -> np.ndarray:
        """Render the dilation grids that are visible at time ``t``."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        # The animation completes at 70% of STEP_DUR; afterwards the frame is
        # held still.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        cell = 36
        grid_size = 7
        gy = 180
        # (dilation rate, left edge of the grid in px), drawn left to right.
        grids = [(1, 60), (2, 420), (3, 820)]

        for gi, (rate, gx) in enumerate(grids):
            # One more grid becomes visible for every 0.3 of progress.
            if progress < gi * 0.3:
                break

            # Draw background grid
            for r in range(grid_size):
                for c in range(grid_size):
                    x = gx + c * cell
                    y = gy + r * cell
                    frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)

            # Highlight the 3x3 filter taps; neighbouring taps are ``rate``
            # cells apart, so rate=3 exactly fills the 7x7 grid.
            taps = range(0, 3 * rate, rate)
            for r in taps:
                for c in taps:
                    x = gx + c * cell
                    y = gy + r * cell
                    frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200)
                    frame[y : y + 2, x : x + cell - 2] = (120, 180, 255)
                    frame[y + cell - 4 : y + cell - 2, x : x + cell - 2] = (
                        120,
                        180,
                        255,
                    )

        return frame

    dil_clip = VideoClip(make_dilated_frame, duration=dur).with_fps(FPS)
    labels = [
        ("DeepLab: Atrous (Dilated) Convolution", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: Zrozum dilated convolution — filtr z DZIURAMI",
            18,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        ("rate=1 (zwykła)", 14, "#64B5F6", FONT_B, (60, 160)),
        ("RF = 3x3", 14, "#64B5F6", FONT_R, (60, 440)),
        ("9 wag, kontekst 3px", 12, "#78909C", FONT_R, (60, 470)),
        ("rate=2 (dilated)", 14, "#FFE082", FONT_B, (420, 160)),
        ("RF = 5x5", 14, "#FFE082", FONT_R, (420, 440)),
        ("9 wag, kontekst 5px!", 12, "#78909C", FONT_R, (420, 470)),
        ("rate=3 (dilated)", 14, "#A5D6A7", FONT_B, (820, 160)),
        ("RF = 7x7", 14, "#A5D6A7", FONT_R, (820, 440)),
        ("9 wag, kontekst 7px!", 12, "#78909C", FONT_R, (820, 470)),
        (
            "Niebieski = pozycja wag filtra 3x3 | Szary = pominięte (dziury)",
            15,
            "#B0BEC5",
            FONT_R,
            (80, 510),
        ),
        (
            "TE SAME 9 wag → WIĘKSZE pole widzenia → lepszy kontekst BEZ dodatkowych parametrów!",
            16,
            "white",
            FONT_R,
            (80, 550),
        ),
        (
            "Mnemonik: DZIURY w filtrze — à trous = z dziurami (fr.)",
            16,
            "#FFE082",
            FONT_R,
            (80, 600),
        ),
    ]
    slides.append(compose_slide(dil_clip, labels))

    # Slide 2: ASPP module step by step
    def make_aspp_frame(t: float) -> np.ndarray:
        """Render the ASPP diagram as far as it has been revealed at ``t``."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Input feature map on left
        frame[250:330, 50:130] = (70, 130, 200)
        frame[250:252, 50:130] = (120, 180, 255)
        frame[328:330, 50:130] = (120, 180, 255)

        # ASPP parallel branches, top to bottom:
        # ((x, y) of the top-left corner, (width, height), fill colour)
        branches = [
            ((200, 170), (100, 40), (80, 200, 120)),  # 1x1 conv
            ((200, 250), (100, 40), (200, 160, 80)),  # rate=6
            ((200, 330), (100, 40), (200, 120, 60)),  # rate=12
            ((200, 410), (100, 40), (180, 100, 80)),  # rate=18
            ((200, 490), (100, 40), (160, 80, 160)),  # GAP
        ]
        # One branch is visible from the start; the rest appear with progress.
        n_branches = min(int(progress * 5) + 1, 5)
        visible = branches[:n_branches]

        for (bx, by), (bw, bh), color in visible:
            frame[by : by + bh, bx : bx + bw] = color
            frame[by : by + 2, bx : bx + bw] = tuple(min(c + 50, 255) for c in color)
            # Arrow from input
            ay = by + bh // 2
            frame[ay - 1 : ay + 2, 133:197] = (150, 150, 170)

        # Concatenation box
        if progress > 0.6:
            # Span the bar from the top of the first branch to the bottom of
            # the last one so that every branch arrow actually lands on it.
            box_top = branches[0][0][1]
            box_bottom = branches[-1][0][1] + branches[-1][1][1]
            frame[box_top:box_bottom, 380:420] = (50, 60, 80)
            frame[box_top : box_top + 2, 380:420] = (200, 200, 100)
            frame[box_bottom - 2 : box_bottom, 380:420] = (200, 200, 100)
            # Arrows from branches to concat
            for (bx, by), (bw, bh), _color in visible:
                ay = by + bh // 2
                frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = (150, 150, 170)

        # Final conv after concat
        if progress > 0.8:
            frame[350:420, 450:550] = (100, 200, 100)
            frame[350:352, 450:550] = (150, 230, 150)
            frame[418:420, 450:550] = (150, 230, 150)
            # Arrow from concat
            frame[388:391, 423:448] = (150, 150, 170)

        return frame

    aspp_clip = VideoClip(make_aspp_frame, duration=dur).with_fps(FPS)
    labels2 = [
        (
            "DeepLab: ASPP (Atrous Spatial Pyramid Pooling)",
            24,
            "#FFE082",
            FONT_B,
            (80, 20),
        ),
        (
            "KROK 2: Multi-scale — analizuj obraz na WIELU skalach naraz",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        ("Wejście", 13, "#64B5F6", FONT_B, (55, 235)),
        ("Conv 1x1", 12, "white", FONT_R, (210, 178)),
        ("Dilated r=6", 12, "white", FONT_R, (205, 258)),
        ("Dilated r=12", 12, "white", FONT_R, (203, 338)),
        ("Dilated r=18", 12, "white", FONT_R, (203, 418)),
        ("GAP (global)", 12, "white", FONT_R, (205, 498)),
        ("Concat", 13, "#FFE082", FONT_B, (381, 537)),
        ("Conv", 13, "#A5D6A7", FONT_B, (470, 425)),
        (
            "5 gałęzi RÓWNOLEGŁYCH → różne skale kontekstu:",
            16,
            "#B0BEC5",
            FONT_R,
            (550, 170),
        ),
        ("  1x1: kontekst punktowy (piksel)", 14, "#A5D6A7", FONT_R, (560, 210)),
        ("  r=6: kontekst lokalny (~13px)", 14, "#FFE082", FONT_R, (560, 245)),
        ("  r=12: kontekst średni (~25px)", 14, "#FFE082", FONT_R, (560, 280)),
        ("  r=18: kontekst szeroki (~37px)", 14, "#FFE082", FONT_R, (560, 315)),
        ("  GAP: kontekst GLOBALNY (cały obraz)", 14, "#CE93D8", FONT_R, (560, 350)),
        ("Concat → 1x1 conv → mapa segmentacji", 16, "#A5D6A7", FONT_R, (550, 400)),
        (
            "Efekt: sieć widzi OD piksela DO całego obrazu naraz!",
            17,
            "white",
            FONT_R,
            (80, 600),
        ),
        (
            "Mnemonik: ASPP = Piramida z DZIURAMI, patrzy na 5 skal jednocześnie",
            15,
            "#FFE082",
            FONT_R,
            (80, 645),
        ),
    ]
    slides.append(compose_slide(aspp_clip, labels2))

    return slides


# ── Transformer Segmentation ────────────────────────────────────
def _transformer_seg_demo() -> list[CompositeVideoClip]:
    """Build the three transformer-segmentation slides.

    Slide 1 animates a CNN's local 3x3 receptive field next to a
    transformer's global attention; slide 2 is a text walkthrough of Q/K/V
    self-attention; slide 3 is a text summary of encoder-decoder designs.

    Returns:
        Three composite clips, each ``STEP_DUR + 1`` seconds long.
    """
    dur = STEP_DUR + 1
    slides = []

    # Slide 1: CNN local vs Transformer global
    def make_attention_frame(t: float) -> np.ndarray:
        """Render the local-vs-global comparison at time ``t``."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        # The animation completes at 70% of STEP_DUR; afterwards the frame is
        # held still.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        cell = 40
        grid_n = 6
        center_r, center_c = 2, 2  # focus cell, the same in both grids

        def fill_cell(
            gx: int, gy: int, row: int, col: int, color: tuple[int, int, int]
        ) -> None:
            """Paint one grid cell, leaving a 2 px gap to its neighbours."""
            x = gx + col * cell
            y = gy + row * cell
            frame[y : y + cell - 2, x : x + cell - 2] = color

        all_cells = [(r, c) for r in range(grid_n) for c in range(grid_n)]

        # LEFT: CNN — local receptive field
        lx, ly = 60, 200
        for r, c in all_cells:
            fill_cell(lx, ly, r, c, (35, 40, 55))

        # Highlight 3x3 kernel in CNN
        if progress > 0.2:
            for dr in (-1, 0, 1):
                for dc in (-1, 0, 1):
                    fill_cell(lx, ly, center_r + dr, center_c + dc, (70, 130, 200))
            # Center highlighted more
            fill_cell(lx, ly, center_r, center_c, (120, 180, 255))

        # RIGHT: Transformer — global attention
        rx, ry = 680, 200
        for r, c in all_cells:
            fill_cell(rx, ry, r, c, (35, 40, 55))

        # All cells connected to center
        if progress > 0.4:
            x0 = rx + center_c * cell + cell // 2
            y0 = ry + center_r * cell + cell // 2
            # Cells join the attention pattern in row-major order.
            n_connections = int(progress * grid_n * grid_n)
            attended = all_cells[:n_connections]

            # Pass 1: tint every attended cell. Color by "attention
            # strength" — closer = stronger.
            for r, c in attended:
                dist = abs(r - center_r) + abs(c - center_c)
                strength = max(30, 200 - dist * 30)
                fill_cell(rx, ry, r, c, (strength // 3, strength // 2, strength))

            # Pass 2: draw the dotted connection lines only after all cells
            # are tinted, so that no later cell fill paints over a line.
            for r, c in attended:
                x1 = rx + c * cell + cell // 2
                y1 = ry + r * cell + cell // 2
                steps = max(abs(x1 - x0), abs(y1 - y0))
                # steps == 0 (the center itself) gives an empty range.
                for s in range(0, steps, 3):
                    px = x0 + int((x1 - x0) * s / steps)
                    py = y0 + int((y1 - y0) * s / steps)
                    if 0 <= px < W - 1 and 0 <= py < H - 1:
                        frame[py, px] = (200, 180, 50)

            # Center highlighted strongly, on top of the converging lines
            fill_cell(rx, ry, center_r, center_c, (255, 200, 50))

        return frame

    att_clip = VideoClip(make_attention_frame, duration=dur).with_fps(FPS)
    labels = [
        ("Transformer: Self-Attention w segmentacji", 26, "#FFE082", FONT_B, (80, 20)),
        ("CNN = LOKALNY kontekst", 18, "#64B5F6", FONT_B, (60, 160)),
        ("Transformer = GLOBALNY kontekst", 18, "#FFE082", FONT_B, (680, 160)),
        ("Filtr 3x3 widzi", 14, "#64B5F6", FONT_R, (60, 460)),
        ("TYLKO 9 sąsiadów", 14, "#64B5F6", FONT_R, (60, 485)),
        ("Self-attention: każdy", 14, "#FFE082", FONT_R, (680, 460)),
        ("piksel widzi WSZYSTKIE!", 14, "#FFE082", FONT_R, (680, 485)),
        ("vs", 28, "#B0BEC5", FONT_B, (450, 300)),
    ]
    text_clips: list[VideoClip] = [att_clip]
    for text, fs, color, font, pos in labels:
        tc = (
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(dur)
            .with_position(pos)
        )
        text_clips.append(tc)
    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )

    # Slide 2: Self-attention Q/K/V step by step
    qkv_lines = [
        ("Self-Attention: Q / K / V krok po kroku", 26, "#FFE082", FONT_B, (80, 30)),
        ("Każdy piksel (token) tworzy 3 wektory:", 18, "#B0BEC5", FONT_R, (100, 100)),
        (
            "  Q (Query)  = 'czego szukam?' - pytanie piksela",
            17,
            "#64B5F6",
            FONT_R,
            (120, 145),
        ),
        (
            "  K (Key)    = 'co oferuj\u0119?' - odpowied\u017a piksela",
            17,
            "#A5D6A7",
            FONT_R,
            (120, 185),
        ),
        (
            "  V (Value)  = 'moja warto\u015b\u0107' - informacja do przekazania",
            17,
            "#FFE082",
            FONT_R,
            (120, 225),
        ),
        ("Algorytm attention:", 18, "#B0BEC5", FONT_R, (100, 285)),
        (
            "  1. Mnożenie Q x K\u1d40 → macierz NxN (kto ważny dla kogo)",
            16,
            "white",
            FONT_R,
            (120, 320),
        ),
        (
            "  2. Skalowanie: / √d (stabilność gradientów)",
            16,
            "white",
            FONT_R,
            (120, 355),
        ),
        (
            "  3. Softmax → wagi attention (sumują się do 1)",
            16,
            "white",
            FONT_R,
            (120, 390),
        ),
        (
            "  4. Mnożenie wag x V → ważona suma wartości",
            16,
            "white",
            FONT_R,
            (120, 425),
        ),
        (
            "Attention(Q,K,V) = softmax(Q · K\u1d40 / √d) · V",
            20,
            "#FFE082",
            FONT_B,
            (100, 480),
        ),
        (
            "Złożoność: O(n²) pamięci — n = liczba pikseli/tokenów",
            16,
            "#EF9A9A",
            FONT_R,
            (100, 535),
        ),
        (
            "Dlatego SegFormer używa efficient attention (liniowa złożoność)",
            15,
            "#78909C",
            FONT_R,
            (100, 570),
        ),
        (
            "SegFormer (2021): lightweight + hierarchiczny encoder",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 610),
        ),
        (
            "Mask2Former (2022): masked attention + unified (semantic+instance+panoptic)",
            16,
            "#CE93D8",
            FONT_R,
            (100, 645),
        ),
    ]
    slides.append(_text_slide(qkv_lines, duration=STEP_DUR + 1))

    # Slide 3: Encoder-Decoder in DL summary
    summary_lines = [
        (
            "Podsumowanie: Encoder-Decoder w segmentacji DL",
            24,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        ("Wspólna idea WSZYSTKICH sieci segmentacji:", 18, "#B0BEC5", FONT_R, (80, 90)),
        (
            "Encoder:  obraz → cechy (zmniejsza rozdzielczość, wyciąga CO)",
            16,
            "#64B5F6",
            FONT_R,
            (100, 140),
        ),
        (
            "Decoder:  cechy → mapa (zwiększa rozdzielczość, odtwarza GDZIE)",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 175),
        ),
        (
            "Skip:     przenosi detale z encodera do decodera",
            16,
            "#FFE082",
            FONT_R,
            (100, 210),
        ),
        ("", 10, "white", FONT_R, (100, 240)),
        (
            "FCN (2015):     Conv1x1 + skip → pierwsza end-to-end",
            16,
            "#64B5F6",
            FONT_R,
            (100, 275),
        ),
        (
            "U-Net (2015):   U-shape + skip concat → segmentacja medyczna",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 310),
        ),
        (
            "DeepLab (2018): dilated conv + ASPP → multi-scale kontekst",
            16,
            "#FFE082",
            FONT_R,
            (100, 345),
        ),
        (
            "SegFormer:      transformer encoder (globalny kontekst)",
            16,
            "#CE93D8",
            FONT_R,
            (100, 380),
        ),
        (
            "Mask2Former:    masked attention (unified, SOTA)",
            16,
            "#CE93D8",
            FONT_R,
            (100, 415),
        ),
        ("", 10, "white", FONT_R, (100, 440)),
        (
            "Ewolucja: więcej kontekstu + lepsze skip connections:",
            17,
            "white",
            FONT_R,
            (80, 465),
        ),
        (
            "  CNN lokal. → dilated (szersze RF) → transformer (global) → masked att.",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 505),
        ),
        (
            "  addition skip → concat skip → cross-attention skip",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 540),
        ),
        (
            "Metryki: mIoU (standard), Dice (medycyna), Focal Loss (imbalance)",
            16,
            "#90CAF9",
            FONT_R,
            (80, 590),
        ),
        (
            "Loss: Cross-Entropy per piksel + opcjonalnie Dice/Focal",
            15,
            "#78909C",
            FONT_R,
            (80, 625),
        ),
    ]
    slides.append(_text_slide(summary_lines, duration=STEP_DUR + 1))

    return slides


# ── Methods comparison ────────────────────────────────────────────
def _methods_comparison(duration: float = 10.0) -> CompositeVideoClip:
    """Build the static table slide comparing the segmentation methods.

    The table has a header row plus one row per method, laid out in four
    fixed-x columns (method, type, idea, mnemonic).

    Args:
        duration: Length of the slide in seconds, applied to the background,
            the title and every table cell.

    Returns:
        A composite clip of size ``(W, H)`` with a 0.5 s fade in and fade out.
    """
    bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
    title = (
        _tc(
            text="Porównanie metod segmentacji",
            font_size=36,
            color="white",
            font=FONT_B,
        )
        .with_duration(duration)
        .with_position(("center", 20))
    )

    rows = [
        ("Metoda", "Typ", "Idea", "Mnemonik"),
        ("Thresholding", "Klasyczna", "piksel > T → klasa 1", "PRÓG na bramce"),
        ("Otsu", "Klasyczna", "auto-próg, min σ²", "AUTO-bramkarz"),
        ("Region Growing", "Klasyczna", "BFS od seeda", "PLAMA atramentu"),
        ("Watershed", "Klasyczna", "zalewanie minimów", "ZALEWANIE terenu"),
        ("Mean Shift", "Klasyczna", "jądro → max gęstości", "KULKI do dołków"),
        ("U-Net", "Deep Learning", "encoder-decoder + skip", "Litera U + mosty"),
        ("DeepLab", "Deep Learning", "dilated conv + ASPP", "DZIURY w filtrze"),
    ]
    # Left edge of each column in px; the same for every row.
    col_x = (40, 210, 340, 660)

    clips: list[VideoClip] = [bg, title]
    for i, row in enumerate(rows):
        is_header = i == 0
        y_pos = 75 + i * 72
        for j, (x_pos, cell) in enumerate(zip(col_x, row)):
            if is_header:
                color = "#64B5F6"
            else:
                # The mnemonic column (the last one) is accented in yellow.
                color = "#E0E0E0" if j < 3 else "#FFE082"
            tc = (
                _tc(
                    text=cell,
                    font_size=18 if is_header else 16,
                    color=color,
                    font=FONT_B if is_header else FONT_R,
                )
                .with_duration(duration)
                .with_position((x_pos, y_pos))
            )
            clips.append(tc)

    return CompositeVideoClip(clips, size=(W, H)).with_effects(
        [FadeIn(0.5), FadeOut(0.5)]
    )


# ── Main ──────────────────────────────────────────────────────────
def main() -> None:
    """Assemble all sections in order and render the video to ``OUTPUT``."""
    # Each chapter is a header card followed by the slides of its builder:
    # (header title, header subtitle, slide builder)
    chapters = [
        # Concept
        ("Co to segmentacja?", "Etykieta klasy per piksel", _segmentation_concept),
        # Thresholding
        (
            "Progowanie + Otsu",
            "Najprostsza metoda — automatyczny próg",
            _thresholding_demo,
        ),
        # Region Growing
        ("Region Growing", "Seed → BFS do podobnych sąsiadów", _region_growing_demo),
        # Watershed
        ("Watershed", "Obraz jako mapa topograficzna", _watershed_demo),
        # FCN
        (
            "FCN (Deep Learning)",
            "Fully Convolutional Network — Conv 1x1",
            _fcn_demo,
        ),
        # U-Net
        (
            "U-Net (Deep Learning)",
            "Architektura encoder-decoder + skip concat",
            _unet_demo,
        ),
        # DeepLab
        (
            "DeepLab v3+ (Deep Learning)",
            "Dilated convolution + ASPP — multi-scale",
            _deeplab_demo,
        ),
        # Transformer segmentation
        (
            "Transformer (SegFormer, Mask2Former)",
            "Self-attention — globalny kontekst",
            _transformer_seg_demo,
        ),
    ]

    # Opening title card
    opening = _make_header(
        "Pytanie 23: Segmentacja obrazu",
        "Problem, strategie klasyczne i sieci neuronowe",
        duration=4.0,
    )
    timeline: list[VideoClip] = [opening]

    # The header is created before its slides are built, chapter by chapter,
    # so the builders run in exactly the order listed above.
    for heading, tagline, build_slides in chapters:
        timeline.append(_make_header(heading, tagline))
        timeline.extend(build_slides())

    # Comparison
    timeline.append(_methods_comparison())

    # Summary
    closing = _make_header(
        "Podsumowanie",
        "Klasyczne: próg/region/watershed | DL: FCN/U-Net/DeepLab/Transformer",
        duration=4.0,
    )
    timeline.append(closing)

    video = concatenate_videoclips(timeline, method="compose")
    video.write_videofile(
        OUTPUT,
        fps=FPS,
        codec="libx264",
        audio=False,
        preset="medium",
        threads=4,
    )
    print(f"Video saved to: {OUTPUT}")


# Render the video only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()