testsAndMisc-archive/python_pkg/praca_magisterska_video/visualize_q23.py

1627 lines
51 KiB
Python

"""MoviePy visualization for PYTANIE 23: Image Segmentation.
Creates animated video demonstrating:
- What segmentation is (pixel-level classification)
- Thresholding / Otsu (bimodal histogram)
- Region Growing (BFS flood fill)
- Watershed (topographic flooding)
- U-Net encoder-decoder architecture
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
import numpy as np
os.environ["FFMPEG_BINARY"] = "/usr/bin/ffmpeg"
from moviepy import (
ColorClip,
CompositeVideoClip,
TextClip,
VideoClip,
concatenate_videoclips,
)
from moviepy.video.fx import FadeIn, FadeOut
# ── Constants ─────────────────────────────────────────────────────
W, H = 1280, 720
FPS = 24
STEP_DUR = 7.0
HEADER_DUR = 4.0
FONT_B = "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf"
FONT_R = "/usr/share/fonts/TTF/DejaVuSans.ttf"
OUTPUT_DIR = Path(__file__).resolve().parent / "videos"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT = str(OUTPUT_DIR / "q23_segmentation.mp4")
logging.basicConfig(level=logging.INFO)
_logger = logging.getLogger(__name__)
BG_COLOR = (15, 20, 35)
rng = np.random.default_rng(42)
def _tc(**kwargs: object) -> TextClip:
"""TextClip wrapper that adds enough bottom margin to prevent clipping."""
fs = kwargs.get("font_size", 24)
m = int(fs) // 3 + 2
kwargs["margin"] = (0, m)
return TextClip(**kwargs)
def _make_header(
title: str, subtitle: str, duration: float = HEADER_DUR
) -> CompositeVideoClip:
bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
t = (
_tc(
text=title,
font_size=48,
color="white",
font=FONT_B,
)
.with_duration(duration)
.with_position(("center", 260))
)
s = (
_tc(
text=subtitle,
font_size=24,
color="#90CAF9",
font=FONT_R,
)
.with_duration(duration)
.with_position(("center", 340))
)
return CompositeVideoClip([bg, t, s], size=(W, H)).with_effects(
[FadeIn(0.5), FadeOut(0.5)]
)
def _text_slide(
lines: list[tuple[str, int, str, str, tuple[str | int, str | int]]],
duration: float = STEP_DUR,
) -> CompositeVideoClip:
"""Create a slide with multiple text elements."""
bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
clips: list[VideoClip] = [bg]
for text, font_size, color, font, pos in lines:
tc = (
_tc(
text=text,
font_size=font_size,
color=color,
font=font,
)
.with_duration(duration)
.with_position(pos)
)
clips.append(tc)
return CompositeVideoClip(clips, size=(W, H)).with_effects(
[FadeIn(0.3), FadeOut(0.3)]
)
def _compose_slide(
base_clip: VideoClip,
labels: list[tuple[str, int, str, str, tuple[int, int]]],
duration: float,
) -> CompositeVideoClip:
"""Overlay text labels on an animated base clip."""
text_clips: list[VideoClip] = [base_clip]
for text, fs, color, font, pos in labels:
tc = (
_tc(text=text, font_size=fs, color=color, font=font)
.with_duration(duration)
.with_position(pos)
)
text_clips.append(tc)
return CompositeVideoClip(text_clips, size=(W, H)).with_effects(
[FadeIn(0.3), FadeOut(0.3)]
)
# ── Segmentation concept ─────────────────────────────────────────
def _segmentation_concept() -> list[CompositeVideoClip]:
"""Show what segmentation is: pixel-level labeling."""
slides = []
# Synthetic image: grid of colored pixels
def make_image_frame(_t: float) -> np.ndarray:
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
# Draw a small "image" grid
grid_x, grid_y = 100, 150
cell = 40
# Sky (top rows)
colors_map = [
[(135, 206, 235)] * 8, # sky
[(135, 206, 235)] * 5 + [(34, 139, 34)] * 3, # sky + tree
[(34, 139, 34)] * 3
+ [(128, 128, 128)] * 3
+ [(34, 139, 34)] * 2, # tree+road+tree
[(128, 128, 128)] * 3
+ [(200, 50, 50)] * 2
+ [(128, 128, 128)] * 3, # road+car+road
]
labels_map = [
["niebo"] * 8,
["niebo"] * 5 + ["drzewo"] * 3,
["drzewo"] * 3 + ["droga"] * 3 + ["drzewo"] * 2,
["droga"] * 3 + ["samochód"] * 2 + ["droga"] * 3,
]
label_colors = {
"niebo": (100, 180, 255),
"drzewo": (50, 200, 50),
"droga": (180, 180, 180),
"samochód": (255, 80, 80),
}
for r, row in enumerate(colors_map):
for c, col in enumerate(row):
y = grid_y + r * cell
x = grid_x + c * cell
frame[y : y + cell - 2, x : x + cell - 2] = col
# Draw segmentation map on the right
seg_x = 600
for r, row in enumerate(labels_map):
for c, lab in enumerate(row):
y = grid_y + r * cell
x = seg_x + c * cell
frame[y : y + cell - 2, x : x + cell - 2] = label_colors[lab]
return frame
image_clip = VideoClip(make_image_frame, duration=STEP_DUR).with_fps(FPS)
labels_text = [
("Obraz wejściowy", 22, "white", FONT_B, (170, 100)),
("Mapa segmentacji", 22, "white", FONT_B, (660, 100)),
("", 50, "#FFE082", FONT_B, (450, 250)),
("Każdy piksel → etykieta klasy", 20, "#B0BEC5", FONT_R, (100, 420)),
("niebo | drzewo | droga | samochód", 18, "#90CAF9", FONT_R, (600, 420)),
("Segmentacja = klasyfikacja per-piksel", 24, "#FFE082", FONT_B, (100, 500)),
(
"Semantic: klasy bez instancji | Instance: "
"rozróżnia obiekty | Panoptic: oba",
16,
"#78909C",
FONT_R,
(100, 560),
),
]
clips: list[VideoClip] = [image_clip]
for text, fs, color, font, pos in labels_text:
tc = (
_tc(text=text, font_size=fs, color=color, font=font)
.with_duration(STEP_DUR)
.with_position(pos)
)
clips.append(tc)
slides.append(
CompositeVideoClip(clips, size=(W, H)).with_effects([FadeIn(0.3), FadeOut(0.3)])
)
return slides
# ── Thresholding / Otsu ───────────────────────────────────────────
def _thresholding_demo() -> list[CompositeVideoClip]:
"""Animate thresholding and Otsu concept."""
slides = []
# Show histogram & threshold
def make_threshold_frame(t: float) -> np.ndarray:
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
# Draw bimodal histogram bars
bar_start_x = 80
bar_y = 500
bar_w = 4
for i in range(256):
# Bimodal: peaks at 60 and 190
h1 = 200 * np.exp(-((i - 60) ** 2) / (2 * 20**2))
h2 = 150 * np.exp(-((i - 190) ** 2) / (2 * 25**2))
bar_h = int(h1 + h2)
x = bar_start_x + i * bar_w
if x + bar_w < W:
frame[bar_y - bar_h : bar_y, x : x + bar_w - 1] = (150, 150, 170)
# Animated threshold line
threshold = int(60 + (190 - 60) * min(t / (STEP_DUR * 0.7), 1.0))
tx = bar_start_x + threshold * bar_w
if tx < W:
frame[bar_y - 250 : bar_y + 10, tx : tx + 3] = (255, 80, 80)
# Color the two sides
for i in range(threshold):
x = bar_start_x + i * bar_w
h1 = 200 * np.exp(-((i - 60) ** 2) / (2 * 20**2))
h2 = 150 * np.exp(-((i - 190) ** 2) / (2 * 25**2))
bar_h = int(h1 + h2)
if x + bar_w < W and bar_h > 0:
frame[bar_y - bar_h : bar_y, x : x + bar_w - 1] = (70, 130, 200)
for i in range(threshold, 256):
x = bar_start_x + i * bar_w
h1 = 200 * np.exp(-((i - 60) ** 2) / (2 * 20**2))
h2 = 150 * np.exp(-((i - 190) ** 2) / (2 * 25**2))
bar_h = int(h1 + h2)
if x + bar_w < W and bar_h > 0:
frame[bar_y - bar_h : bar_y, x : x + bar_w - 1] = (200, 100, 80)
return frame
hist_clip = VideoClip(make_threshold_frame, duration=STEP_DUR).with_fps(FPS)
text_clips: list[VideoClip] = [hist_clip]
labels = [
("Progowanie (Thresholding) z metodą Otsu", 28, "#FFE082", FONT_B, (80, 30)),
(
"Histogram jasności pikseli — dwumodalny (bimodal)",
20,
"#B0BEC5",
FONT_R,
(80, 80),
),
("Garb 1: piksele obiektu (ciemne ~60)", 16, "#64B5F6", FONT_R, (80, 120)),
("Garb 2: piksele tła (jasne ~190)", 16, "#EF9A9A", FONT_R, (80, 150)),
(
"Próg T (czerwona linia) dzieli piksele na 2 klasy",
18,
"white",
FONT_R,
(80, 540),
),
(
"Otsu: automatycznie testuje T=0..255, minimalizuje σ² wewnątrzklasową",
16,
"#A5D6A7",
FONT_R,
(80, 580),
),
(
"Piksel ≤ T → klasa 0 (tło) | Piksel > T → klasa 1 (obiekt)",
16,
"#78909C",
FONT_R,
(80, 620),
),
]
for text, fs, color, font, pos in labels:
tc = (
_tc(text=text, font_size=fs, color=color, font=font)
.with_duration(STEP_DUR)
.with_position(pos)
)
text_clips.append(tc)
slides.append(
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
[FadeIn(0.3), FadeOut(0.3)]
)
)
return slides
# ── Region Growing ────────────────────────────────────────────────
def _region_growing_demo() -> list[CompositeVideoClip]:
"""Animate region growing BFS from a seed pixel."""
slides = []
grid_size = 10
cell_size = 40
rng = np.random.default_rng(42)
# Create a simple grid: dark region (30-80) and bright region (160-220)
grid = np.zeros((grid_size, grid_size), dtype=np.uint8)
grid[:] = 60 # dark background
grid[2:7, 3:8] = 180 # bright rectangle
# Add some noise
noise = rng.integers(-15, 15, (grid_size, grid_size))
grid = np.clip(grid.astype(int) + noise, 0, 255).astype(np.uint8)
# BFS steps from seed (4, 5)
seed = (4, 5)
threshold_val = 50
visited_order: list[tuple[int, int]] = []
queue = [seed]
visited_set = {seed}
while queue:
r, c = queue.pop(0)
visited_order.append((r, c))
for dr, dc in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
nr, nc = r + dr, c + dc
if (
0 <= nr < grid_size
and 0 <= nc < grid_size
and (nr, nc) not in visited_set
) and abs(int(grid[nr, nc]) - int(grid[seed])) < threshold_val:
visited_set.add((nr, nc))
queue.append((nr, nc))
def make_region_frame(t: float) -> np.ndarray:
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
ox, oy = 100, 180
# How many cells to show as visited
progress = min(t / (STEP_DUR * 0.8), 1.0)
n_visited = int(progress * len(visited_order))
for r in range(grid_size):
for c in range(grid_size):
x = ox + c * cell_size
y = oy + r * cell_size
val = grid[r, c]
color = (val, val, val)
# Highlight visited
if (r, c) in visited_order[:n_visited]:
color = (80, 200, 120) # green for region
elif (r, c) == seed:
color = (255, 200, 50) # yellow seed
frame[y : y + cell_size - 2, x : x + cell_size - 2] = color
# Show value
# (drawn as a simple marker since we can't render text in numpy easily)
# Mark the seed with a bright border
sx = ox + seed[1] * cell_size
sy = ox + seed[0] * cell_size + 80
frame[sy : sy + cell_size, sx : sx + 2] = (255, 200, 50)
frame[sy : sy + cell_size, sx + cell_size - 2 : sx + cell_size] = (255, 200, 50)
frame[sy : sy + 2, sx : sx + cell_size] = (255, 200, 50)
frame[sy + cell_size - 2 : sy + cell_size, sx : sx + cell_size] = (255, 200, 50)
return frame
region_clip = VideoClip(make_region_frame, duration=STEP_DUR).with_fps(FPS)
text_clips: list[VideoClip] = [region_clip]
labels = [
("Region Growing — rozrastanie regionu", 28, "#FFE082", FONT_B, (100, 30)),
("Seed (ziarno) → BFS do podobnych sąsiadów", 20, "#B0BEC5", FONT_R, (100, 80)),
(
"Żółty = seed | Zielony = region | Szary = nieodwiedzone",
16,
"#78909C",
FONT_R,
(100, 120),
),
(
"Sąsiad PODOBNY (|jasność - jasność_regionu| < próg) → dodaj do regionu",
16,
"#A5D6A7",
FONT_R,
(100, 600),
),
(
"Algorytm zatrzymuje się gdy brak podobnych sąsiadów",
16,
"#90CAF9",
FONT_R,
(100, 640),
),
(
"Mnemonik: PLAMA atramentu — rozlewa się na podobne piksele",
18,
"#EF9A9A",
FONT_R,
(100, 670),
),
]
for text, fs, color, font, pos in labels:
tc = (
_tc(text=text, font_size=fs, color=color, font=font)
.with_duration(STEP_DUR)
.with_position(pos)
)
text_clips.append(tc)
slides.append(
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
[FadeIn(0.3), FadeOut(0.3)]
)
)
return slides
# ── Watershed ─────────────────────────────────────────────────────
def _watershed_demo() -> list[CompositeVideoClip]:
"""Animate watershed flooding concept."""
slides = []
def make_watershed_frame(t: float) -> np.ndarray:
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
# Draw terrain profile (1D cross-section)
ox, oy = 100, 450
terrain_w = 900
terrain_points = 100
xs = np.linspace(0, 1, terrain_points)
# Two valleys with a ridge
terrain = (
120 * np.exp(-((xs - 0.25) ** 2) / 0.005)
+ 80 * np.exp(-((xs - 0.75) ** 2) / 0.008)
+ 30
)
terrain = 250 - terrain # invert for visual (valleys at bottom)
# Water level rises over time
water_level = int(160 + 80 * min(t / (STEP_DUR * 0.7), 1.0))
for i in range(terrain_points - 1):
x1 = ox + int(xs[i] * terrain_w)
x2 = ox + int(xs[i + 1] * terrain_w)
y1 = oy - int(terrain[i])
y2 = oy - int(terrain[i + 1])
# Fill terrain
for x in range(x1, min(x2 + 1, W)):
top = min(y1, y2) - 5
frame[top:oy, x : x + 1] = (100, 80, 60)
# Fill water
water_y = oy - water_level
for x in range(x1, min(x2 + 1, W)):
t_y = oy - int(terrain[i])
if water_y < t_y:
# Water fills below terrain surface
fill_top = max(water_y, 0)
fill_bot = min(t_y, oy)
if fill_top < fill_bot:
frame[fill_top:fill_bot, x : x + 1] = (70, 130, 220)
# Dam marker at ridge
ridge_x = ox + int(0.5 * terrain_w)
dam_visible_threshold = 160
if water_level > dam_visible_threshold:
frame[oy - water_level : oy - 140, ridge_x - 2 : ridge_x + 2] = (
255,
80,
80,
)
return frame
ws_clip = VideoClip(make_watershed_frame, duration=STEP_DUR).with_fps(FPS)
text_clips: list[VideoClip] = [ws_clip]
labels = [
("Watershed — metoda zlewiska", 28, "#FFE082", FONT_B, (100, 20)),
(
"Obraz = mapa topograficzna (jasność = wysokość)",
20,
"#B0BEC5",
FONT_R,
(100, 65),
),
(
"Brązowy = teren (ciemne=doliny, jasne=szczyty)",
16,
"#8D6E63",
FONT_R,
(100, 100),
),
("Niebieski = woda zalewająca od minimów", 16, "#64B5F6", FONT_R, (100, 130)),
(
"Czerwony = TAMA (granica segmentu) — gdy woda z 2 dolin się spotka",
16,
"#EF9A9A",
FONT_R,
(100, 160),
),
(
"Problem: over-segmentation "
"(za dużo regionów). "
"Rozwiązanie: marker-controlled.",
16,
"#A5D6A7",
FONT_R,
(100, 560),
),
(
"Mnemonik: ZALEWANIE terenu — granie gór = granice segmentów",
18,
"#FFE082",
FONT_R,
(100, 600),
),
]
for text, fs, color, font, pos in labels:
tc = (
_tc(text=text, font_size=fs, color=color, font=font)
.with_duration(STEP_DUR)
.with_position(pos)
)
text_clips.append(tc)
slides.append(
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
[FadeIn(0.3), FadeOut(0.3)]
)
)
return slides
# ── U-Net Architecture ───────────────────────────────────────────
def _draw_unet_skips(
frame: np.ndarray,
enc_positions: list[tuple[int, int, int, int]],
n_blocks: int,
dec_x: int,
skip_threshold: int,
) -> None:
"""Draw horizontal dashed skip-connection lines."""
if n_blocks <= skip_threshold:
return
for i in range(min(n_blocks - 5, 4)):
ey = enc_positions[i][1] + enc_positions[i][3] // 2
ex_end = enc_positions[i][0] + enc_positions[i][2]
for dash_x in range(ex_end + 10, dec_x - 10, 15):
frame[ey : ey + 2, dash_x : dash_x + 8] = (255, 200, 50)
def _make_unet_frame(t: float) -> np.ndarray:
"""Render a single U-Net animation frame."""
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
enc_sizes = [(80, 120), (60, 100), (45, 80), (30, 60)]
dec_sizes = list(reversed(enc_sizes))
enc_x = 150
dec_x = 850
progress = min(t / (STEP_DUR * 0.6), 1.0)
n_blocks = int(progress * 8) + 1
enc_positions: list[tuple[int, int, int, int]] = []
y_offset = 120
for i, (bw, bh) in enumerate(enc_sizes):
x = enc_x
y = y_offset + i * 130
enc_positions.append((x, y, bw, bh))
if i < n_blocks:
frame[y : y + bh, x : x + bw] = (70, 130, 200)
frame[y : y + 2, x : x + bw] = (100, 180, 255)
frame[y + bh - 2 : y + bh, x : x + bw] = (100, 180, 255)
frame[y : y + bh, x : x + 2] = (100, 180, 255)
frame[y : y + bh, x + bw - 2 : x + bw] = (100, 180, 255)
if i < len(enc_sizes) - 1:
ax = x + bw // 2
ay = y + bh + 10
frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170)
bx, by = 500, y_offset + 3 * 130 + 30
encoder_count = 4
if n_blocks > encoder_count:
frame[by : by + 50, bx : bx + 25] = (200, 100, 80)
frame[by : by + 2, bx : bx + 25] = (255, 140, 100)
frame[by + 48 : by + 50, bx : bx + 25] = (255, 140, 100)
for i, (bw, bh) in enumerate(dec_sizes):
x = dec_x
y = y_offset + (3 - i) * 130
if n_blocks > 4 + i + 1:
frame[y : y + bh, x : x + bw] = (80, 200, 120)
frame[y : y + 2, x : x + bw] = (120, 230, 150)
frame[y + bh - 2 : y + bh, x : x + bw] = (120, 230, 150)
frame[y : y + bh, x : x + 2] = (120, 230, 150)
frame[y : y + bh, x + bw - 2 : x + bw] = (120, 230, 150)
if i < len(dec_sizes) - 1:
ax = x + bw // 2
ay = y - 30
frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170)
skip_threshold = 5
_draw_unet_skips(frame, enc_positions, n_blocks, dec_x, skip_threshold)
return frame
def _unet_demo() -> list[CompositeVideoClip]:
"""Animate U-Net encoder-decoder architecture."""
dur = STEP_DUR + 1
unet_clip = VideoClip(_make_unet_frame, duration=dur).with_fps(FPS)
labels = [
("U-Net: Encoder-Decoder + Skip Connections", 28, "#FFE082", FONT_B, (80, 20)),
(
"Niebieski = Encoder (↓ zmniejsza rozdzielczość, wyciąga cechy)",
16,
"#64B5F6",
FONT_R,
(80, 65),
),
(
"Zielony = Decoder (↑ zwiększa rozdzielczość, odtwarza mapę)",
16,
"#A5D6A7",
FONT_R,
(80, 90),
),
(
"Żółte przerywane = Skip connections (przenoszą detale z encodera)",
16,
"#FFE082",
FONT_R,
(80, 115),
),
(
"Czerwony = Bottleneck (najgłębsza warstwa, max abstrakcja)",
16,
"#EF9A9A",
FONT_R,
(450, 570),
),
(
"Kształt U: encoder ↓ decoder ↑, mosty pośrodku",
18,
"white",
FONT_R,
(80, 640),
),
(
"Concatenation: skip łączy kanały (więcej informacji niż dodawanie)",
16,
"#78909C",
FONT_R,
(80, 670),
),
]
return [_compose_slide(unet_clip, labels, dur)]
# ── FCN Architecture ─────────────────────────────────────────────
def _draw_pipeline_blocks(
frame: np.ndarray,
blocks: list[
tuple[tuple[int, int], tuple[int, int], tuple[int, int, int]]
],
n_visible: int,
arrow_limit: int,
) -> None:
"""Draw coloured blocks with connecting arrows."""
for i, ((bx, by), (bw, bh), color) in enumerate(blocks):
if i < n_visible:
frame[by : by + bh, bx : bx + bw] = color
frame[by : by + 2, bx : bx + bw] = tuple(
min(c + 50, 255) for c in color
)
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
min(c + 50, 255) for c in color
)
if i < arrow_limit:
ax = bx + bw + 3
ay = by + bh // 2
frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170)
def _draw_red_cross(
frame: np.ndarray,
x_start: int,
width: int,
top_y: int,
height: int,
) -> None:
"""Draw a red X across the given rectangle."""
for d in range(-2, 3):
for step in range(height):
x1 = x_start + int(step * width / height)
y1 = top_y + step + d
if 0 <= y1 < H and 0 <= x1 < W:
frame[y1, x1] = (255, 80, 80)
y2 = top_y + height - step + d
if 0 <= y2 < H and 0 <= x1 < W:
frame[y2, x1] = (255, 80, 80)
def _make_fcn_frame(t: float) -> np.ndarray:
"""Render a single FCN comparison frame."""
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
progress = min(t / (STEP_DUR * 0.8), 1.0)
top_y = 140
blocks_classic = [
((80, top_y), (70, 50), (70, 130, 200)),
((170, top_y), (50, 40), (50, 100, 160)),
((240, top_y), (60, 50), (70, 130, 200)),
((320, top_y), (40, 35), (50, 100, 160)),
((385, top_y), (55, 50), (160, 80, 60)),
((465, top_y), (55, 50), (180, 60, 60)),
((545, top_y), (80, 50), (200, 80, 80)),
]
n_top = min(int(progress * 7) + 1, 7)
arrow_limit = 6
_draw_pipeline_blocks(frame, blocks_classic, n_top, arrow_limit)
cross_phase = 0.6
if progress > cross_phase:
_draw_red_cross(frame, 385, 135, top_y, 50)
bot_y = 380
blocks_fcn = [
((80, bot_y), (70, 50), (70, 130, 200)),
((170, bot_y), (50, 40), (50, 100, 160)),
((240, bot_y), (60, 50), (70, 130, 200)),
((320, bot_y), (40, 35), (50, 100, 160)),
((385, bot_y), (70, 50), (80, 200, 120)),
((480, bot_y), (75, 50), (200, 160, 80)),
((580, bot_y), (80, 50), (100, 200, 100)),
]
fcn_phase = 0.4
if progress > fcn_phase:
n_bot = min(int((progress - fcn_phase) / 0.6 * 7) + 1, 7)
_draw_pipeline_blocks(frame, blocks_fcn, n_bot, arrow_limit)
return frame
def _fcn_demo() -> list[CompositeVideoClip]:
"""Animate FCN step-by-step: FC → Conv 1x1 transformation."""
dur = STEP_DUR + 1
fcn_clip = VideoClip(_make_fcn_frame, duration=dur).with_fps(FPS)
labels = [
("FCN: Fully Convolutional Network (2015)", 26, "#FFE082", FONT_B, (80, 20)),
("KROK 1: Zamień FC → Conv 1x1", 18, "#A5D6A7", FONT_R, (80, 60)),
("Klasyczny CNN:", 16, "#EF9A9A", FONT_B, (80, 105)),
("Conv", 11, "white", FONT_R, (92, 148)),
("Pool", 11, "white", FONT_R, (178, 148)),
("Conv", 11, "white", FONT_R, (250, 148)),
("Pool", 11, "white", FONT_R, (325, 148)),
("Flatten", 11, "#EF9A9A", FONT_R, (390, 148)),
("FC", 11, "#EF9A9A", FONT_R, (480, 148)),
("1 label", 11, "#EF9A9A", FONT_R, (555, 148)),
("FCN:", 16, "#A5D6A7", FONT_B, (80, 350)),
("Conv", 11, "white", FONT_R, (92, 388)),
("Pool", 11, "white", FONT_R, (178, 388)),
("Conv", 11, "white", FONT_R, (250, 388)),
("Pool", 11, "white", FONT_R, (325, 388)),
("Conv1x1", 11, "#A5D6A7", FONT_R, (390, 388)),
("Upsample", 11, "#FFE082", FONT_R, (486, 388)),
("Mapa", 11, "#A5D6A7", FONT_R, (595, 388)),
(
"FC: spłaszcza 3D→1D, wymusza stały rozmiar → 1 etykieta",
16,
"#EF9A9A",
FONT_R,
(80, 250),
),
(
"Conv1x1: działa per piksel x kanały → DOWOLNY rozmiar → mapa klasy",
16,
"#A5D6A7",
FONT_R,
(80, 460),
),
(
"KROK 2: Skip connections — łączą wczesne detale z późną abstrakcją",
17,
"#64B5F6",
FONT_R,
(80, 510),
),
(
"Wczesne warstwy = krawędzie, tekstury | Późne = koncepty obiektów",
15,
"#78909C",
FONT_R,
(80, 545),
),
(
"FCN = PIERWSZA sieć end-to-end do segmentacji per-piksel!",
18,
"white",
FONT_R,
(80, 590),
),
(
"Mnemonik: FC → Conv 1x1 = otwieramy bramkę dla DOWOLNEGO rozmiaru",
16,
"#FFE082",
FONT_R,
(80, 640),
),
]
slides = [_compose_slide(fcn_clip, labels, dur)]
# Slide 2: FCN skip connections step by step
skip_lines = [
("FCN: Skip Connections — krok po kroku", 26, "#FFE082", FONT_B, (80, 30)),
(
"1. Encoder zmniejsza: 224→112→56→28→14 (pooling)",
18,
"#64B5F6",
FONT_R,
(100, 100),
),
(
" Każdy pooling traci detale przestrzenne (dokładne krawędzie)",
15,
"#78909C",
FONT_R,
(100, 135),
),
(
"2. Decoder powiększa: 14→28→56→112→224 (upsample/deconv)",
18,
"#A5D6A7",
FONT_R,
(100, 190),
),
(
" Upsample ODGADUJE piksele — rozmyty wynik!",
15,
"#78909C",
FONT_R,
(100, 225),
),
(
"3. Skip connections: dodaj cechy z encodera do decodera",
18,
"#FFE082",
FONT_R,
(100, 280),
),
(
" Wczesne cechy = GDZIE (precyzyjne krawędzie)",
15,
"#64B5F6",
FONT_R,
(100, 315),
),
(
" Późne cechy = CO (abstrakcyjne koncepty)",
15,
"#A5D6A7",
FONT_R,
(100, 345),
),
(
" Skip = daje decoderowi OBA → ostry wynik!",
15,
"#FFE082",
FONT_R,
(100, 375),
),
(
"Warianty: FCN-32s (brak skip, rozmyty) → FCN-16s → FCN-8s (najlepszy)",
16,
"#B0BEC5",
FONT_R,
(80, 440),
),
(
"FCN-32s: upsample 32x naraz → ROZMYTE granice",
15,
"#EF9A9A",
FONT_R,
(100, 485),
),
(
"FCN-16s: skip z pool4 + upsample 16x → lepiej",
15,
"#FFE082",
FONT_R,
(100, 520),
),
(
"FCN-8s: skip z pool3+pool4 + upsample 8x → OSTRE granice!",
15,
"#A5D6A7",
FONT_R,
(100, 555),
),
(
"Im więcej skip connections → tym więcej "
"detali z encodera → ostrzejszy wynik",
17,
"white",
FONT_R,
(80, 620),
),
]
slides.append(_text_slide(skip_lines, duration=STEP_DUR + 1))
return slides
# ── DeepLab Architecture ─────────────────────────────────────────
def _make_dilated_frame(t: float) -> np.ndarray:
"""Render a dilated convolution comparison frame."""
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
progress = min(t / (STEP_DUR * 0.7), 1.0)
cell = 36
grids = [
(
"rate=1",
60,
[
(0, 0),
(0, 1),
(0, 2),
(1, 0),
(1, 1),
(1, 2),
(2, 0),
(2, 1),
(2, 2),
],
),
(
"rate=2",
420,
[
(0, 0),
(0, 2),
(0, 4),
(2, 0),
(2, 2),
(2, 4),
(4, 0),
(4, 2),
(4, 4),
],
),
(
"rate=3",
820,
[
(0, 0),
(0, 3),
(0, 6),
(3, 0),
(3, 3),
(3, 6),
(6, 0),
(6, 3),
(6, 6),
],
),
]
for gi, (_label, gx, positions) in enumerate(grids):
if progress < gi * 0.3:
break
gy = 180
grid_size = 7
for r in range(grid_size):
for c in range(grid_size):
x = gx + c * cell
y = gy + r * cell
frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)
for r, c in positions:
x = gx + c * cell
y = gy + r * cell
frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200)
frame[y : y + 2, x : x + cell - 2] = (120, 180, 255)
frame[y + cell - 4 : y + cell - 2, x : x + cell - 2] = (120, 180, 255)
return frame
def _make_aspp_frame(t: float) -> np.ndarray:
"""Render a single ASPP module animation frame."""
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
progress = min(t / (STEP_DUR * 0.7), 1.0)
frame[250:330, 50:130] = (70, 130, 200)
frame[250:252, 50:130] = (120, 180, 255)
frame[328:330, 50:130] = (120, 180, 255)
branches = [
("1x1 conv", 250, (200, 170), (100, 40), (80, 200, 120)),
("rate=6", 310, (200, 250), (100, 40), (200, 160, 80)),
("rate=12", 370, (200, 330), (100, 40), (200, 120, 60)),
("rate=18", 430, (200, 410), (100, 40), (180, 100, 80)),
("GAP", 490, (200, 490), (100, 40), (160, 80, 160)),
]
n_branches = min(int(progress * 5) + 1, 5)
for i, (_lbl, _h, (bx, by), (bw, bh), color) in enumerate(branches):
if i < n_branches:
frame[by : by + bh, bx : bx + bw] = color
frame[by : by + 2, bx : bx + bw] = tuple(
min(c + 50, 255) for c in color
)
ay = by + bh // 2
frame[ay - 1 : ay + 2, 133:197] = (150, 150, 170)
concat_phase = 0.6
if progress > concat_phase:
frame[250:530, 380:420] = (50, 60, 80)
frame[250:252, 380:420] = (200, 200, 100)
frame[528:530, 380:420] = (200, 200, 100)
for i, (_lbl, _h, (bx, by), (bw, bh), _c) in enumerate(branches):
if i < n_branches:
ay = by + bh // 2
frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = (150, 150, 170)
final_conv_phase = 0.8
if progress > final_conv_phase:
frame[350:420, 450:550] = (100, 200, 100)
frame[350:352, 450:550] = (150, 230, 150)
frame[418:420, 450:550] = (150, 230, 150)
frame[388:391, 423:448] = (150, 150, 170)
return frame
def _deeplab_demo() -> list[CompositeVideoClip]:
"""Animate DeepLab: dilated convolution + ASPP step by step."""
dur = STEP_DUR + 1
# Slide 1: Regular vs Dilated convolution
dil_clip = VideoClip(_make_dilated_frame, duration=dur).with_fps(FPS)
labels = [
("DeepLab: Atrous (Dilated) Convolution", 26, "#FFE082", FONT_B, (80, 20)),
(
"KROK 1: Zrozum dilated convolution — filtr z DZIURAMI",
18,
"#A5D6A7",
FONT_R,
(80, 60),
),
("rate=1 (zwykła)", 14, "#64B5F6", FONT_B, (60, 160)),
("RF = 3x3", 14, "#64B5F6", FONT_R, (60, 440)),
("9 wag, kontekst 3px", 12, "#78909C", FONT_R, (60, 470)),
("rate=2 (dilated)", 14, "#FFE082", FONT_B, (420, 160)),
("RF = 5x5", 14, "#FFE082", FONT_R, (420, 440)),
("9 wag, kontekst 5px!", 12, "#78909C", FONT_R, (420, 470)),
("rate=3 (dilated)", 14, "#A5D6A7", FONT_B, (820, 160)),
("RF = 7x7", 14, "#A5D6A7", FONT_R, (820, 440)),
("9 wag, kontekst 7px!", 12, "#78909C", FONT_R, (820, 470)),
(
"Niebieski = pozycja wag filtra 3x3 | Szary = pominięte (dziury)",
15,
"#B0BEC5",
FONT_R,
(80, 510),
),
(
"TE SAME 9 wag → WIĘKSZE pole widzenia "
"→ lepszy kontekst BEZ dodatkowych parametrów!",
16,
"white",
FONT_R,
(80, 550),
),
(
"Mnemonik: DZIURY w filtrze — à trous = z dziurami (fr.)",
16,
"#FFE082",
FONT_R,
(80, 600),
),
]
slides = [_compose_slide(dil_clip, labels, dur)]
# Slide 2: ASPP module step by step
aspp_clip = VideoClip(_make_aspp_frame, duration=dur).with_fps(FPS)
labels2 = [
(
"DeepLab: ASPP (Atrous Spatial Pyramid Pooling)",
24,
"#FFE082",
FONT_B,
(80, 20),
),
(
"KROK 2: Multi-scale — analizuj obraz na WIELU skalach naraz",
17,
"#A5D6A7",
FONT_R,
(80, 60),
),
("Wejście", 13, "#64B5F6", FONT_B, (55, 235)),
("Conv 1x1", 12, "white", FONT_R, (210, 178)),
("Dilated r=6", 12, "white", FONT_R, (205, 258)),
("Dilated r=12", 12, "white", FONT_R, (203, 338)),
("Dilated r=18", 12, "white", FONT_R, (203, 418)),
("GAP (global)", 12, "white", FONT_R, (205, 498)),
("Concat", 13, "#FFE082", FONT_B, (381, 537)),
("Conv", 13, "#A5D6A7", FONT_B, (470, 425)),
(
"5 gałęzi RÓWNOLEGŁYCH → różne skale kontekstu:",
16,
"#B0BEC5",
FONT_R,
(550, 170),
),
(" 1x1: kontekst punktowy (piksel)", 14, "#A5D6A7", FONT_R, (560, 210)),
(" r=6: kontekst lokalny (~13px)", 14, "#FFE082", FONT_R, (560, 245)),
(" r=12: kontekst średni (~25px)", 14, "#FFE082", FONT_R, (560, 280)),
(" r=18: kontekst szeroki (~37px)", 14, "#FFE082", FONT_R, (560, 315)),
(" GAP: kontekst GLOBALNY (cały obraz)", 14, "#CE93D8", FONT_R, (560, 350)),
("Concat → 1x1 conv → mapa segmentacji", 16, "#A5D6A7", FONT_R, (550, 400)),
(
"Efekt: sieć widzi OD piksela DO całego obrazu naraz!",
17,
"white",
FONT_R,
(80, 600),
),
(
"Mnemonik: ASPP = Piramida z DZIURAMI, patrzy na 5 skal jednocześnie",
15,
"#FFE082",
FONT_R,
(80, 645),
),
]
slides.append(_compose_slide(aspp_clip, labels2, dur))
return slides
# ── Transformer Segmentation ────────────────────────────────────
def _draw_base_grid(
frame: np.ndarray, gx: int, gy: int, grid_n: int, cell: int,
) -> None:
"""Draw an empty grid of cells."""
for r in range(grid_n):
for c in range(grid_n):
x = gx + c * cell
y = gy + r * cell
frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)
def _draw_cnn_kernel(
frame: np.ndarray, lx: int, ly: int, cell: int, progress: float,
) -> None:
"""Highlight a 3x3 CNN kernel on the grid."""
cnn_phase = 0.2
if progress <= cnn_phase:
return
cx, cy = 2, 2
for dr in range(-1, 2):
for dc in range(-1, 2):
r, c = cy + dr, cx + dc
x = lx + c * cell
y = ly + r * cell
frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200)
x = lx + cx * cell
y = ly + cy * cell
frame[y : y + cell - 2, x : x + cell - 2] = (120, 180, 255)
def _draw_conn_line(
frame: np.ndarray, x0: int, y0: int, x1: int, y1: int,
) -> None:
"""Draw a dashed connection line between two points."""
steps = max(abs(x1 - x0), abs(y1 - y0))
if steps <= 0:
return
for s in range(0, steps, 3):
px = x0 + int((x1 - x0) * s / steps)
py = y0 + int((y1 - y0) * s / steps)
if 0 <= px < W - 1 and 0 <= py < H - 1:
frame[py : py + 1, px : px + 1] = (200, 180, 50)
def _draw_attention_connections(
frame: np.ndarray,
origin: tuple[int, int],
grid_n: int,
cell: int,
progress: float,
) -> None:
"""Draw transformer self-attention connections on the grid."""
rx, ry = origin
transformer_phase = 0.4
if progress <= transformer_phase:
return
cx_t, cy_t = 2, 2
x0 = rx + cx_t * cell + cell // 2
y0 = ry + cy_t * cell + cell // 2
n_connections = int(progress * 36)
conn_idx = 0
for r in range(grid_n):
for c in range(grid_n):
conn_idx += 1
if conn_idx > n_connections:
break
x = rx + c * cell
y = ry + r * cell
dist = abs(r - cy_t) + abs(c - cx_t)
strength = max(30, 200 - dist * 30)
frame[y : y + cell - 2, x : x + cell - 2] = (
strength // 3,
strength // 2,
strength,
)
_draw_conn_line(frame, x0, y0, x + cell // 2, y + cell // 2)
else:
continue
break
x = rx + cx_t * cell
y = ry + cy_t * cell
frame[y : y + cell - 2, x : x + cell - 2] = (255, 200, 50)
def _make_attention_frame(t: float) -> np.ndarray:
"""Render a CNN-vs-Transformer attention comparison frame."""
frame = np.zeros((H, W, 3), dtype=np.uint8)
frame[:] = BG_COLOR
progress = min(t / (STEP_DUR * 0.7), 1.0)
cell = 40
grid_n = 6
lx, ly = 60, 200
_draw_base_grid(frame, lx, ly, grid_n, cell)
_draw_cnn_kernel(frame, lx, ly, cell, progress)
rx, ry = 680, 200
_draw_base_grid(frame, rx, ry, grid_n, cell)
_draw_attention_connections(frame, (rx, ry), grid_n, cell, progress)
return frame
def _transformer_seg_demo() -> list[CompositeVideoClip]:
"""Animate transformer-based segmentation: self-attention concept."""
dur = STEP_DUR + 1
# Slide 1: CNN local vs Transformer global
att_clip = VideoClip(_make_attention_frame, duration=dur).with_fps(FPS)
labels = [
("Transformer: Self-Attention w segmentacji", 26, "#FFE082", FONT_B, (80, 20)),
("CNN = LOKALNY kontekst", 18, "#64B5F6", FONT_B, (60, 160)),
("Transformer = GLOBALNY kontekst", 18, "#FFE082", FONT_B, (680, 160)),
("Filtr 3x3 widzi", 14, "#64B5F6", FONT_R, (60, 460)),
("TYLKO 9 sąsiadów", 14, "#64B5F6", FONT_R, (60, 485)),
("Self-attention: każdy", 14, "#FFE082", FONT_R, (680, 460)),
("piksel widzi WSZYSTKIE!", 14, "#FFE082", FONT_R, (680, 485)),
("vs", 28, "#B0BEC5", FONT_B, (450, 300)),
]
slides = [_compose_slide(att_clip, labels, dur)]
# Slide 2: Self-attention Q/K/V step by step
qkv_lines = [
("Self-Attention: Q / K / V krok po kroku", 26, "#FFE082", FONT_B, (80, 30)),
("Każdy piksel (token) tworzy 3 wektory:", 18, "#B0BEC5", FONT_R, (100, 100)),
(
" Q (Query) = 'czego szukam?' - pytanie piksela",
17,
"#64B5F6",
FONT_R,
(120, 145),
),
(
" K (Key) = 'co oferuj\u0119?' - odpowied\u017a piksela",
17,
"#A5D6A7",
FONT_R,
(120, 185),
),
(
" V (Value) = 'moja warto\u015b\u0107' - informacja do przekazania",
17,
"#FFE082",
FONT_R,
(120, 225),
),
("Algorytm attention:", 18, "#B0BEC5", FONT_R, (100, 285)),
(
" 1. Mnożenie Q x K\u1d40 → macierz NxN (kto ważny dla kogo)",
16,
"white",
FONT_R,
(120, 320),
),
(
" 2. Skalowanie: / √d (stabilność gradientów)",
16,
"white",
FONT_R,
(120, 355),
),
(
" 3. Softmax → wagi attention (sumują się do 1)",
16,
"white",
FONT_R,
(120, 390),
),
(
" 4. Mnożenie wag x V → ważona suma wartości",
16,
"white",
FONT_R,
(120, 425),
),
(
"Attention(Q,K,V) = softmax(Q · K\u1d40 / √d) · V",
20,
"#FFE082",
FONT_B,
(100, 480),
),
(
"Złożoność: O(n²) pamięci — n = liczba pikseli/tokenów",
16,
"#EF9A9A",
FONT_R,
(100, 535),
),
(
"Dlatego SegFormer używa efficient attention (liniowa złożoność)",
15,
"#78909C",
FONT_R,
(100, 570),
),
(
"SegFormer (2021): lightweight + hierarchiczny encoder",
16,
"#A5D6A7",
FONT_R,
(100, 610),
),
(
"Mask2Former (2022): masked attention + "
"unified (semantic+instance+panoptic)",
16,
"#CE93D8",
FONT_R,
(100, 645),
),
]
slides.append(_text_slide(qkv_lines, duration=STEP_DUR + 1))
# Slide 3: Encoder-Decoder in DL summary
summary_lines = [
(
"Podsumowanie: Encoder-Decoder w segmentacji DL",
24,
"#FFE082",
FONT_B,
(80, 30),
),
("Wspólna idea WSZYSTKICH sieci segmentacji:", 18, "#B0BEC5", FONT_R, (80, 90)),
(
"Encoder: obraz → cechy (zmniejsza rozdzielczość, wyciąga CO)",
16,
"#64B5F6",
FONT_R,
(100, 140),
),
(
"Decoder: cechy → mapa (zwiększa rozdzielczość, odtwarza GDZIE)",
16,
"#A5D6A7",
FONT_R,
(100, 175),
),
(
"Skip: przenosi detale z encodera do decodera",
16,
"#FFE082",
FONT_R,
(100, 210),
),
("", 10, "white", FONT_R, (100, 240)),
(
"FCN (2015): Conv1x1 + skip → pierwsza end-to-end",
16,
"#64B5F6",
FONT_R,
(100, 275),
),
(
"U-Net (2015): U-shape + skip concat → segmentacja medyczna",
16,
"#A5D6A7",
FONT_R,
(100, 310),
),
(
"DeepLab (2018): dilated conv + ASPP → multi-scale kontekst",
16,
"#FFE082",
FONT_R,
(100, 345),
),
(
"SegFormer: transformer encoder (globalny kontekst)",
16,
"#CE93D8",
FONT_R,
(100, 380),
),
(
"Mask2Former: masked attention (unified, SOTA)",
16,
"#CE93D8",
FONT_R,
(100, 415),
),
("", 10, "white", FONT_R, (100, 440)),
(
"Ewolucja: więcej kontekstu + lepsze skip connections:",
17,
"white",
FONT_R,
(80, 465),
),
(
" CNN lokal. → dilated (szersze RF) → transformer (global) → masked att.",
16,
"#B0BEC5",
FONT_R,
(80, 505),
),
(
" addition skip → concat skip → cross-attention skip",
16,
"#B0BEC5",
FONT_R,
(80, 540),
),
(
"Metryki: mIoU (standard), Dice (medycyna), Focal Loss (imbalance)",
16,
"#90CAF9",
FONT_R,
(80, 590),
),
(
"Loss: Cross-Entropy per piksel + opcjonalnie Dice/Focal",
15,
"#78909C",
FONT_R,
(80, 625),
),
]
slides.append(_text_slide(summary_lines, duration=STEP_DUR + 1))
return slides
# ── Methods comparison ────────────────────────────────────────────
def _methods_comparison() -> CompositeVideoClip:
bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(10.0)
title = (
_tc(
text="Porównanie metod segmentacji",
font_size=36,
color="white",
font=FONT_B,
)
.with_duration(10.0)
.with_position(("center", 20))
)
rows = [
("Metoda", "Typ", "Idea", "Mnemonik"),
("Thresholding", "Klasyczna", "piksel > T → klasa 1", "PRÓG na bramce"),
("Otsu", "Klasyczna", "auto-próg, min σ²", "AUTO-bramkarz"),
("Region Growing", "Klasyczna", "BFS od seeda", "PLAMA atramentu"),
("Watershed", "Klasyczna", "zalewanie minimów", "ZALEWANIE terenu"),
("Mean Shift", "Klasyczna", "jądro → max gęstości", "KULKI do dołków"),
("U-Net", "Deep Learning", "encoder-decoder + skip", "Litera U + mosty"),
("DeepLab", "Deep Learning", "dilated conv + ASPP", "DZIURY w filtrze"),
]
clips: list[VideoClip] = [bg, title]
mnemonic_col = 3
for i, row in enumerate(rows):
y_pos = 75 + i * 72
col_x = [40, 210, 340, 660]
for j, cell in enumerate(row):
fs = 16 if i > 0 else 18
color = (
"#64B5F6" if i == 0
else ("#E0E0E0" if j < mnemonic_col else "#FFE082")
)
tc = (
_tc(
text=cell,
font_size=fs,
color=color,
font=FONT_B if i == 0 else FONT_R,
)
.with_duration(10.0)
.with_position((col_x[j], y_pos))
)
clips.append(tc)
return CompositeVideoClip(clips, size=(W, H)).with_effects(
[FadeIn(0.5), FadeOut(0.5)]
)
# ── Main ──────────────────────────────────────────────────────────
def main() -> None:
"""Generate the Q23 segmentation visualization video."""
sections: list[VideoClip] = []
sections.append(
_make_header(
"Pytanie 23: Segmentacja obrazu",
"Problem, strategie klasyczne i sieci neuronowe",
duration=4.0,
)
)
# Concept
sections.append(_make_header("Co to segmentacja?", "Etykieta klasy per piksel"))
sections.extend(_segmentation_concept())
# Thresholding
sections.append(
_make_header("Progowanie + Otsu", "Najprostsza metoda — automatyczny próg")
)
sections.extend(_thresholding_demo())
# Region Growing
sections.append(_make_header("Region Growing", "Seed → BFS do podobnych sąsiadów"))
sections.extend(_region_growing_demo())
# Watershed
sections.append(_make_header("Watershed", "Obraz jako mapa topograficzna"))
sections.extend(_watershed_demo())
# FCN
sections.append(
_make_header("FCN (Deep Learning)", "Fully Convolutional Network — Conv 1x1")
)
sections.extend(_fcn_demo())
# U-Net
sections.append(
_make_header(
"U-Net (Deep Learning)", "Architektura encoder-decoder + skip concat"
)
)
sections.extend(_unet_demo())
# DeepLab
sections.append(
_make_header(
"DeepLab v3+ (Deep Learning)", "Dilated convolution + ASPP — multi-scale"
)
)
sections.extend(_deeplab_demo())
# Transformer segmentation
sections.append(
_make_header(
"Transformer (SegFormer, Mask2Former)", "Self-attention — globalny kontekst"
)
)
sections.extend(_transformer_seg_demo())
# Comparison
sections.append(_methods_comparison())
# Summary
sections.append(
_make_header(
"Podsumowanie",
"Klasyczne: próg/region/watershed | DL: FCN/U-Net/DeepLab/Transformer",
duration=4.0,
)
)
final = concatenate_videoclips(sections, method="compose")
final.write_videofile(
OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4
)
_logger.info("Video saved to: %s", OUTPUT)
if __name__ == "__main__":
main()