testsAndMisc-archive/python_pkg/praca_magisterska_video/visualize_q24.py

1892 lines
64 KiB
Python

"""MoviePy visualization for PYTANIE 24: Object Detection.
Creates animated video demonstrating:
- What detection is (bounding box + class + confidence)
- HOG + SVM pipeline (gradient → histogram → classify)
- Viola-Jones (Haar features, integral image, cascade)
- R-CNN evolution (R-CNN → Fast → Faster)
- YOLO one-stage detection
- Building a detector from a classifier
"""
from __future__ import annotations
import os
from pathlib import Path
import numpy as np
os.environ["FFMPEG_BINARY"] = "/usr/bin/ffmpeg"
from moviepy import (
ColorClip,
CompositeVideoClip,
TextClip,
VideoClip,
concatenate_videoclips,
)
from moviepy.video.fx import FadeIn, FadeOut
# ── Constants ─────────────────────────────────────────────────────
# Frame size in pixels; every generated frame array has shape (H, W, 3).
W, H = 1280, 720
# Frame rate applied to the procedurally drawn VideoClips.
FPS = 24
# Base length (seconds) of one animated slide; some slides use STEP_DUR + 1.
STEP_DUR = 7.0
# Default length (seconds) of a section header card.
HEADER_DUR = 4.0
# Font files handed to TextClip (bold / regular).
# NOTE(review): hard-coded system paths — confirm they exist on the render host.
FONT_B = "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf"
FONT_R = "/usr/share/fonts/TTF/DejaVuSans.ttf"
# "videos/" directory next to this script; created as an import-time side effect.
OUTPUT_DIR = Path(__file__).resolve().parent / "videos"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# Presumably the path of the rendered video — its use is outside this section.
OUTPUT = str(OUTPUT_DIR / "q24_object_detection.mp4")
# RGB colour every frame is filled with before drawing.
BG_COLOR = (15, 20, 35)
def _tc(**kwargs: object) -> TextClip:
    """Create a ``TextClip`` with a font-size-dependent margin against clipping.

    All keyword arguments are forwarded to ``TextClip`` unchanged, except
    ``margin``, which is always overwritten with ``(0, font_size // 3 + 2)``.
    When ``font_size`` is not supplied, a size of 24 is assumed for the
    margin computation only.
    """
    font_size = int(kwargs.get("font_size", 24))
    kwargs["margin"] = (0, font_size // 3 + 2)
    return TextClip(**kwargs)
def _make_header(
    title: str, subtitle: str, duration: float = HEADER_DUR
) -> CompositeVideoClip:
    """Compose a section title card on the plain background colour.

    Args:
        title: Heading text, rendered white in the bold font (size 48) and
            horizontally centred at y=260.
        subtitle: Caption text, rendered light blue in the regular font
            (size 24) and horizontally centred at y=340.
        duration: Length of the card in seconds.

    Returns:
        A full-frame composite clip with a 0.5 s fade-in and fade-out.
    """
    backdrop = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)

    heading = _tc(text=title, font_size=48, color="white", font=FONT_B)
    heading = heading.with_duration(duration).with_position(("center", 260))

    caption = _tc(text=subtitle, font_size=24, color="#90CAF9", font=FONT_R)
    caption = caption.with_duration(duration).with_position(("center", 340))

    card = CompositeVideoClip([backdrop, heading, caption], size=(W, H))
    return card.with_effects([FadeIn(0.5), FadeOut(0.5)])
# ── Detection concept ────────────────────────────────────────────
def _detection_concept() -> list[CompositeVideoClip]:
    """Build the intro slide showing what a detector outputs.

    The frame contains a mock scene with three coloured rectangles standing in
    for a car, a person and a tree, an outline around each of them, and three
    panels on the right for the classification / detection / segmentation
    comparison.  Captions are overlaid as text clips.

    Returns:
        A one-element list with the composed slide (``STEP_DUR`` seconds long,
        0.3 s fade in and out).
    """
    # Solid rectangles, painted in this order: (row span, column span, RGB).
    filled_areas = (
        ((140, 500), (100, 700), (40, 50, 70)),  # scene backdrop
        ((350, 430), (150, 320), (180, 60, 60)),  # "car"
        ((280, 440), (450, 520), (60, 120, 180)),  # "person"
        ((200, 400), (580, 650), (40, 130, 50)),  # "tree"
    )
    # Bounding-box outlines, 2 px outside each object: (rows, columns, RGB).
    outlines = (
        ((348, 432), (148, 322), (255, 80, 80)),  # car
        ((278, 442), (448, 522), (80, 180, 255)),  # person
        ((198, 402), (578, 652), (80, 220, 100)),  # tree
    )
    # Row spans of the three comparison panels on the right-hand side.
    panel_rows = ((180, 260), (290, 370), (400, 480))

    def stroke_box(canvas: np.ndarray, rows, cols, colour, passes: int = 3) -> None:
        """Draw a rectangle outline, widening it by one pixel on each pass."""
        for grow in range(passes):
            y0, y1 = rows[0] - grow, rows[1] + grow
            x0, x1 = cols[0] - grow, cols[1] + grow
            canvas[y0:y1, x0 : x0 + 2] = colour  # left edge
            canvas[y0:y1, x1 - 2 : x1] = colour  # right edge
            canvas[y0 : y0 + 2, x0:x1] = colour  # top edge
            canvas[y1 - 2 : y1, x0:x1] = colour  # bottom edge

    def render_scene(_t: float) -> np.ndarray:
        # The picture does not animate, so the timestamp is ignored.
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        for (r0, r1), (c0, c1), colour in filled_areas:
            canvas[r0:r1, c0:c1] = colour
        for rows, cols, colour in outlines:
            stroke_box(canvas, rows, cols, colour)
        for r0, r1 in panel_rows:
            canvas[r0:r1, 800:1150] = (35, 45, 65)
        return canvas

    scene = VideoClip(render_scene, duration=STEP_DUR).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("Detekcja obiektów — co to jest?", 28, "#FFE082", FONT_B, (100, 20)),
        ("Wynik: (klasa, bounding box, pewność)", 20, "#B0BEC5", FONT_R, (100, 65)),
        ("samochód 95%", 14, "#EF9A9A", FONT_B, (150, 340)),
        ("osoba 88%", 14, "#64B5F6", FONT_B, (450, 268)),
        ("drzewo 72%", 14, "#A5D6A7", FONT_B, (580, 188)),
        ("Klasyfikacja: cały obraz → 1 etykieta", 15, "#78909C", FONT_R, (810, 210)),
        ("Detekcja: bbox + klasa + pewność", 15, "#FFE082", FONT_R, (810, 320)),
        ("Segmentacja: maska per piksel", 15, "#78909C", FONT_R, (810, 430)),
        ("← granulacja rośnie →", 14, "#90CAF9", FONT_R, (810, 520)),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(STEP_DUR)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([scene, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── HOG + SVM pipeline ───────────────────────────────────────────
def _hog_svm_demo() -> list[CompositeVideoClip]:
    """Build the slide that walks through the HOG + SVM pipeline.

    Six stage boxes appear left to right as time advances (all are visible
    once 80 % of ``STEP_DUR`` has elapsed).  After 20 % progress a three-pixel
    grey strip is drawn at the bottom as the gradient example referenced by
    the captions.

    Returns:
        A one-element list with the composed slide (``STEP_DUR`` seconds long,
        0.3 s fade in and out).
    """
    # (name, top-left (x, y), (width, height), fill RGB); names are not drawn
    # here and serve as documentation of each box.
    stage_specs = [
        ("Gradient", (80, 250), (130, 80), (100, 160, 220)),
        ("Orientacja", (260, 250), (130, 80), (80, 180, 140)),
        ("Komórki 8x8", (440, 250), (130, 80), (200, 160, 80)),
        ("Bloki 2x2", (620, 250), (130, 80), (200, 120, 60)),
        ("Normalizacja", (800, 250), (130, 80), (180, 100, 80)),
        ("SVM", (980, 250), (130, 80), (220, 80, 80)),
    ]
    final_index = len(stage_specs) - 1

    def render_pipeline(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        # At least one stage is always shown; slicing tolerates the overshoot
        # when progress reaches 1.0.
        revealed = int(progress * len(stage_specs)) + 1
        for idx, (_name, (left, top), (wid, hgt), fill) in enumerate(
            stage_specs[:revealed]
        ):
            right, bottom = left + wid, top + hgt
            edge = tuple(min(channel + 60, 255) for channel in fill)
            canvas[top:bottom, left:right] = fill
            canvas[top : top + 2, left:right] = edge
            canvas[bottom - 2 : bottom, left:right] = edge
            # Short connector towards the next stage (none after the last).
            if idx < final_index:
                arrow_x = right + 5
                arrow_y = top + hgt // 2
                canvas[arrow_y - 1 : arrow_y + 2, arrow_x : arrow_x + 20] = (
                    150,
                    150,
                    170,
                )

        # Three grey squares illustrating the [50][50][200] gradient example.
        if progress > 0.2:
            strip_x, strip_y = 100, 430
            for slot, grey in enumerate([50, 50, 200]):
                x0 = strip_x + slot * 50
                canvas[strip_y : strip_y + 40, x0 : x0 + 40] = (grey, grey, grey)
        return canvas

    pipeline = VideoClip(render_pipeline, duration=STEP_DUR).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("HOG + SVM — pipeline detekcji pieszych", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Mnemonik: GOKBN = Gradienty→Orientacja→Komórki→Bloki→Normalizacja",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 65),
        ),
        ("Gradient: siła i kierunek zmiany jasności", 14, "#64B5F6", FONT_R, (80, 95)),
        (
            "Histogram: 9 binów (0°-180°, co 20°) per komórka 8x8",
            14,
            "#78909C",
            FONT_R,
            (80, 120),
        ),
        (
            "[50][50][200] → Gx = 200-50 = 150 = silna krawędź!",
            16,
            "#EF9A9A",
            FONT_R,
            (80, 490),
        ),
        (
            "Wektor HOG (3780 cech) → SVM: pieszy (+1) / tło (-1)",
            16,
            "white",
            FONT_R,
            (80, 540),
        ),
        (
            "Sliding window 64x128 przesuwa się po obrazie → NMS → wynik",
            16,
            "#90CAF9",
            FONT_R,
            (80, 580),
        ),
        (
            "SVM = LINIA MAKSYMALNEGO ODDECHU (max margines, support vectors)",
            16,
            "#FFE082",
            FONT_R,
            (80, 620),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(STEP_DUR)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([pipeline, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── Viola-Jones ───────────────────────────────────────────────────
def _viola_jones_demo() -> list[CompositeVideoClip]:
    """Build the slide illustrating the Viola-Jones classifier cascade.

    The cascade is drawn as a funnel of five progressively narrower bars that
    appear top to bottom as time advances (all visible once 80 % of
    ``STEP_DUR`` has elapsed).  Every bar except the first gets a short red
    "rejected" marker on its left, and consecutive bars are joined by a
    vertical connector.

    Returns:
        A one-element list with the composed slide (``STEP_DUR`` seconds long,
        0.3 s fade in and out).
    """

    def make_cascade_frame(t: float) -> np.ndarray:
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        stages = 5
        start_width = 1000
        x_center = W // 2
        for i in range(stages):
            # Stage i becomes visible only after progress exceeds i / stages;
            # later stages cannot be visible either, so stop here.
            if progress * stages - i <= 0:
                break

            # Each stage is 18 % of the initial width narrower than the last.
            width = int(start_width * (1 - i * 0.18))
            y = 150 + i * 100
            h_box = 60
            x1 = x_center - width // 2

            # Stage bar with slightly brighter fill for deeper stages.
            frame[y : y + h_box, x1 : x1 + width] = (
                50 + i * 10,
                60 + i * 10,
                80 + i * 10,
            )
            # Top and bottom borders.
            border = (100 + i * 20, 130 + i * 15, 200)
            frame[y : y + 2, x1 : x1 + width] = border
            frame[y + h_box - 2 : y + h_box, x1 : x1 + width] = border

            # Connector down to the next stage.
            if i < stages - 1:
                frame[y + h_box + 5 : y + h_box + 25, x_center - 1 : x_center + 2] = (
                    150,
                    150,
                    170,
                )
            # Red "rejected" marker to the left of every stage but the first.
            if i > 0:
                rx = x1 - 30
                ry = y + h_box // 2
                frame[ry - 1 : ry + 2, rx : rx + 25] = (200, 80, 80)
        return frame

    cascade_clip = VideoClip(make_cascade_frame, duration=STEP_DUR).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    labels = [
        (
            "Viola-Jones — kaskada klasyfikatorów (2001)",
            28,
            "#FFE082",
            FONT_B,
            (80, 20),
        ),
        (
            "3 innowacje: HIC = Haar + Integral Image + Cascade",
            20,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("Etap 1: 2 cechy Haar", 14, "#64B5F6", FONT_R, (170, 170)),
        ("Etap 2: 10 cech", 14, "#64B5F6", FONT_R, (210, 270)),
        ("Etap 3: 25 cech", 14, "#64B5F6", FONT_R, (240, 370)),
        ("Etap 4: 50 cech", 14, "#64B5F6", FONT_R, (260, 470)),
        ("→ TWARZ!", 16, "#A5D6A7", FONT_B, (590, 560)),
        (
            "SITO: 99% okien odpada w pierwszych 3 etapach → REAL-TIME!",
            16,
            "#EF9A9A",
            FONT_R,
            (80, 620),
        ),
        (
            "Haar: kontrast jasna/ciemna | Integral Image: suma prostokąta O(1) = 4 odczyty",
            14,
            "#78909C",
            FONT_R,
            (80, 655),
        ),
        ("odrzucone →", 12, "#EF9A9A", FONT_R, (60, 275)),
        ("odrzucone →", 12, "#EF9A9A", FONT_R, (60, 375)),
    ]
    text_clips: list[VideoClip] = [cascade_clip]
    for text, fs, color, font, pos in labels:
        text_clips.append(
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(STEP_DUR)
            .with_position(pos)
        )

    slide = CompositeVideoClip(text_clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── R-CNN Evolution ───────────────────────────────────────────────
def _rcnn_evolution() -> list[CompositeVideoClip]:
    """Build the slide comparing R-CNN, Fast R-CNN and Faster R-CNN.

    Each model is drawn as one horizontal row of stage boxes joined by
    connectors.  Rows appear one after another as time advances (all three
    are visible once 80 % of ``STEP_DUR`` has elapsed).

    Returns:
        A one-element list with the composed slide (``STEP_DUR + 1`` seconds
        long, 0.3 s fade in and out).
    """
    duration = STEP_DUR + 1

    # Per model: (name, row top y, stages, speed note).  Each stage is
    # (label, (x, y offset), (width, height), fill RGB).  Names, stage labels
    # and speed notes are not drawn from here; the captions below carry the
    # visible text.
    #
    # The row tops follow the captions, which sit at row top + 30 (model name)
    # and row top + 50 (speed note): 50 -> 80/100, 300 -> 330/350,
    # 550 -> 580/600.  The Faster R-CNN row previously reused 300 and was
    # painted over the Fast R-CNN row.
    models = [
        (
            "R-CNN (2014)",
            50,
            [
                ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                ("2000x\nCNN", (350, 150), (80, 50), (180, 60, 60)),
                ("2000x\nSVM", (480, 150), (80, 50), (180, 60, 60)),
                ("NMS", (610, 150), (60, 50), (100, 140, 100)),
            ],
            "50 sec/obraz!",
        ),
        (
            "Fast R-CNN (2015)",
            300,
            [
                ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                ("1x CNN\n(cały obraz)", (350, 150), (100, 50), (80, 140, 200)),
                ("ROI Pool\n(2000)", (500, 150), (90, 50), (200, 160, 80)),
                ("FC", (640, 150), (50, 50), (100, 140, 100)),
            ],
            "2 sec/obraz",
        ),
        (
            "Faster R-CNN (2015)",
            550,
            [
                ("CNN\nbackbone", (200, 150), (90, 50), (80, 140, 200)),
                ("RPN\n(~300)", (340, 150), (80, 50), (200, 120, 60)),
                ("ROI Pool", (470, 150), (80, 50), (200, 160, 80)),
                ("FC", (600, 150), (50, 50), (100, 140, 100)),
            ],
            "0.2 sec → 5 fps!",
        ),
    ]

    def make_evolution_frame(t: float) -> np.ndarray:
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        # Always show at least the first row; slicing tolerates the overshoot
        # when progress reaches 1.0.
        n_models = int(progress * 3) + 1
        for _name, base_y, stages, _speed in models[:n_models]:
            for _label, (bx, by_off), (bw, bh), color in stages:
                # Stage offsets are expressed relative to a nominal y of 150.
                by = base_y + by_off - 150
                highlight = tuple(min(c + 50, 255) for c in color)
                frame[by : by + bh, bx : bx + bw] = color
                frame[by : by + 2, bx : bx + bw] = highlight
                frame[by + bh - 2 : by + bh, bx : bx + bw] = highlight
            # Connectors between consecutive stages, centred on the row.
            ay = base_y + 25
            for current, following in zip(stages, stages[1:]):
                sx = current[1][0] + current[2][0]
                ex = following[1][0]
                frame[ay - 1 : ay + 2, sx + 3 : ex - 3] = (150, 150, 170)
        return frame

    evo_clip = VideoClip(make_evolution_frame, duration=duration).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    labels = [
        ("Ewolucja R-CNN — CORAZ MNIEJ MARNOWANIA", 28, "#FFE082", FONT_B, (80, 20)),
        ("R-CNN (2014)", 20, "#EF9A9A", FONT_B, (50, 80)),
        ("50 sec/obraz (2000x forward pass!)", 14, "#EF9A9A", FONT_R, (720, 100)),
        ("Fast R-CNN (2015)", 20, "#64B5F6", FONT_B, (50, 330)),
        ("2 sec/obraz (CNN raz + ROI Pool)", 14, "#64B5F6", FONT_R, (720, 350)),
        ("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)),
        ("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)),
        (
            "Kluczowe innowacje: ROI Pooling → stały rozmiar | RPN → propozycje w sieci",
            14,
            "#78909C",
            FONT_R,
            (80, 660),
        ),
    ]
    text_clips: list[VideoClip] = [evo_clip]
    for text, fs, color, font, pos in labels:
        text_clips.append(
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(duration)
            .with_position(pos)
        )

    slide = CompositeVideoClip(text_clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── R-CNN Detailed Pipeline ──────────────────────────────────────
def _rcnn_detailed() -> list[CompositeVideoClip]:
    """Build the slide that details the original R-CNN pipeline step by step.

    Five step boxes are stacked vertically on the left and appear one by one
    as time advances.  After 20 % progress, up to eight pseudo-random
    rectangles are outlined to the right to suggest overlapping region
    proposals; the generator is re-seeded on every frame so the rectangles
    stay in place between frames.

    Returns:
        A one-element list with the composed slide (``STEP_DUR + 1`` seconds
        long, 0.3 s fade in and out).
    """
    clip_length = STEP_DUR + 1

    # (top-left (x, y), (width, height), fill RGB, description); descriptions
    # are not drawn from here.
    step_specs = [
        ((80, 130), (200, 55), (120, 100, 60), "1. Selective Search"),
        ((80, 230), (200, 55), (180, 60, 60), "2. Wytnij 2000 regionów"),
        ((80, 330), (200, 55), (70, 130, 200), "3. CNN per region"),
        ((80, 430), (200, 55), (200, 100, 80), "4. SVM klasyfikuje"),
        ((80, 530), (200, 55), (100, 180, 100), "5. Bbox regresja + NMS"),
    ]

    def outline(canvas: np.ndarray, top, bottom, left, right, colour) -> None:
        """Draw a rectangle outline in two passes, one pixel wider each pass."""
        for grow in range(2):
            y0, y1 = top - grow, bottom + grow
            x0, x1 = left - grow, right + grow
            canvas[y0:y1, x0 : x0 + 2] = colour
            canvas[y0:y1, x1 - 2 : x1] = colour
            canvas[y0 : y0 + 2, x0:x1] = colour
            canvas[y1 - 2 : y1, x0:x1] = colour

    def render_steps(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        shown = min(int(progress * 5) + 1, 5)
        for idx, ((left, top), (wid, hgt), fill, _caption) in enumerate(
            step_specs[:shown]
        ):
            right, bottom = left + wid, top + hgt
            edge = tuple(min(channel + 50, 255) for channel in fill)
            canvas[top:bottom, left:right] = fill
            canvas[top : top + 2, left:right] = edge
            canvas[bottom - 2 : bottom, left:right] = edge
            # Vertical connector to the next step (none below the last one).
            if idx < 4:
                mid_x = left + wid // 2
                start_y = bottom + 5
                canvas[start_y : start_y + 20, mid_x - 1 : mid_x + 2] = (
                    150,
                    150,
                    170,
                )

        # Overlapping "region proposals"; a fixed seed keeps them stable.
        if progress > 0.2:
            rng = np.random.default_rng(42)
            count = min(int((progress - 0.2) * 15), 8)
            for k in range(count):
                # Draw order (x, y, width, height) must stay as is so that the
                # random stream yields the same rectangles.
                box_x = 500 + rng.integers(-30, 100)
                box_y = 200 + rng.integers(-20, 120)
                box_w = 60 + rng.integers(0, 80)
                box_h = 50 + rng.integers(0, 70)
                tint = (80 + k * 15, 100 + k * 10, 60 + k * 20)
                outline(canvas, box_y, box_y + box_h, box_x, box_x + box_w, tint)
        return canvas

    pipeline = VideoClip(render_steps, duration=clip_length).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("R-CNN: krok po kroku (2014, Girshick)", 26, "#FFE082", FONT_B, (80, 20)),
        ("Pipeline detekcji two-stage", 16, "#B0BEC5", FONT_R, (80, 60)),
        ("Selective Search", 11, "white", FONT_R, (105, 145)),
        ("2000 regionów", 11, "white", FONT_R, (105, 245)),
        ("CNN per region", 11, "white", FONT_R, (105, 345)),
        ("SVM klasyfikuje", 11, "white", FONT_R, (105, 445)),
        ("Regresja + NMS", 11, "white", FONT_R, (105, 545)),
        ("~2000 propozycji regionów", 14, "#78909C", FONT_R, (500, 155)),
        ("(inteligentne łączenie", 13, "#78909C", FONT_R, (500, 180)),
        ("podobnych fragmentów)", 13, "#78909C", FONT_R, (500, 200)),
        ("Problem: 2000 x CNN forward pass", 16, "#EF9A9A", FONT_R, (400, 400)),
        ("= 50 SEKUND na obraz!", 18, "#EF9A9A", FONT_B, (400, 430)),
        ("CNN liczy cechy per region OSOBNO", 14, "#EF9A9A", FONT_R, (400, 470)),
        (
            "→ regiony się nakładają → obliczenia się powtarzają!",
            14,
            "#EF9A9A",
            FONT_R,
            (400, 495),
        ),
        (
            "Rozwiązanie: CNN raz na cały obraz → Fast R-CNN →",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 620),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(clip_length)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([pipeline, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── ROI Pooling ──────────────────────────────────────────────────
def _roi_pooling_demo() -> list[CompositeVideoClip]:
    """Build the slide that illustrates ROI pooling.

    Left: an 8x8 grid of cells with deterministic pseudo-feature values and a
    yellow frame marking a 4x4 region of interest.  After 30 % progress an
    arrow and the 3x3 max-pooled version of that region appear; after 60 % a
    second arrow and a box standing for the fully connected layer.

    Returns:
        A one-element list with the composed slide (``STEP_DUR + 1`` seconds
        long, 0.3 s fade in and out).
    """
    clip_length = STEP_DUR + 1

    def feature_at(row: int, col: int) -> int:
        """Deterministic stand-in for a feature-map activation (30..69)."""
        return 30 + ((row * 7 + col * 13 + 42) % 40)

    def render_pooling(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Feature map: 8x8 cells of 30 px with a 1 px gap between cells.
        map_x, map_y = 60, 180
        cell_px = 30
        cells_per_side = 8
        for row in range(cells_per_side):
            for col in range(cells_per_side):
                x0 = map_x + col * cell_px
                y0 = map_y + row * cell_px
                level = feature_at(row, col)
                canvas[y0 : y0 + cell_px - 1, x0 : x0 + cell_px - 1] = (
                    level,
                    level + 10,
                    level + 20,
                )

        # Region of interest in cell coordinates (end-exclusive).
        first_row, first_col = 2, 1
        end_row, end_col = 6, 5
        yellow = (255, 200, 50)
        for grow in range(3):
            top = map_y + first_row * cell_px - grow
            bottom = map_y + end_row * cell_px + grow
            left = map_x + first_col * cell_px - grow
            right = map_x + end_col * cell_px + grow
            canvas[top:bottom, left : left + 2] = yellow
            canvas[top:bottom, right - 2 : right] = yellow
            canvas[top : top + 2, left:right] = yellow
            canvas[bottom - 2 : bottom, left:right] = yellow

        if progress > 0.3:
            # Arrow from the feature map to the pooled grid.
            canvas[300:303, 310:380] = (150, 150, 170)

            # Pooled output: the ROI split into 3x3 bins, max taken per bin.
            grid_x, grid_y = 400, 220
            bin_px = 50
            bins = 3
            span_rows = end_row - first_row
            span_cols = end_col - first_col
            for out_row in range(bins):
                for out_col in range(bins):
                    x0 = grid_x + out_col * bin_px
                    y0 = grid_y + out_row * bin_px
                    # Source cells covered by this bin (integer bin edges).
                    rows = range(
                        first_row + out_row * span_rows // bins,
                        first_row + (out_row + 1) * span_rows // bins,
                    )
                    cols = range(
                        first_col + out_col * span_cols // bins,
                        first_col + (out_col + 1) * span_cols // bins,
                    )
                    peak = max(
                        (feature_at(sr, sc) for sr in rows for sc in cols),
                        default=0,
                    )
                    canvas[y0 : y0 + bin_px - 2, x0 : x0 + bin_px - 2] = (
                        peak,
                        peak + 20,
                        peak + 40,
                    )
                    green = (80, 200, 120)
                    canvas[y0 : y0 + 2, x0 : x0 + bin_px - 2] = green
                    canvas[
                        y0 + bin_px - 4 : y0 + bin_px - 2, x0 : x0 + bin_px - 2
                    ] = green

        if progress > 0.6:
            # Arrow to, and box for, the fully connected layer.
            canvas[300:303, 560:630] = (150, 150, 170)
            canvas[270:340, 650:730] = (200, 100, 80)
            canvas[270:272, 650:730] = (240, 140, 120)
            canvas[338:340, 650:730] = (240, 140, 120)
        return canvas

    pooling = VideoClip(render_pooling, duration=clip_length).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: CNN raz na CAŁY obraz → feature mapa",
            17,
            "#64B5F6",
            FONT_R,
            (80, 60),
        ),
        (
            "KROK 2: Wytnij ROI z feature mapy (nie z obrazu!)",
            17,
            "#FFE082",
            FONT_R,
            (80, 90),
        ),
        (
            "KROK 3: Siatkuj ROI na 3x3 → max pool per komórka → stały rozmiar",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 120),
        ),
        ("Feature mapa", 14, "#64B5F6", FONT_B, (60, 160)),
        ("ROI (żółta ramka)", 13, "#FFE082", FONT_R, (60, 440)),
        ("ROI Pool 3x3", 14, "#A5D6A7", FONT_B, (400, 195)),
        ("(max z komórki)", 13, "#78909C", FONT_R, (400, 380)),
        ("FC", 14, "white", FONT_B, (670, 280)),
        (
            "Problem: ROI mają RÓŻNE rozmiary, FC wymaga STAŁEGO",
            15,
            "#B0BEC5",
            FONT_R,
            (80, 500),
        ),
        (
            "ROI Pooling: dzieli ROI na siatkę, max pool → STAŁY rozmiar!",
            16,
            "white",
            FONT_R,
            (80, 535),
        ),
        (
            "Fast R-CNN: CNN raz → 1 feature mapa → ROI Pool 2000 regionów → 25x szybciej!",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 580),
        ),
        (
            "(R-CNN: 2000x CNN = 50s | Fast R-CNN: 1xCNN + ROI Pool = 2s)",
            15,
            "#EF9A9A",
            FONT_R,
            (80, 620),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(clip_length)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([pooling, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── RPN + Anchor Boxes ───────────────────────────────────────────
def _rpn_anchors_demo() -> list[CompositeVideoClip]:
    """Build the two slides on anchor boxes and the Region Proposal Network.

    Slide 1 draws a 7x7 grid of cells with a highlighted centre point and
    reveals nine anchor outlines (three sizes, three shapes) around it as time
    advances.  Slide 2 is a text-only summary produced by ``_text_slide``.

    Returns:
        A two-element list: the animated anchor slide followed by the text
        slide, each ``STEP_DUR + 1`` seconds long.
    """
    clip_length = STEP_DUR + 1

    # Anchor half-extents and outline colour: (half width, half height, RGB).
    anchor_shapes = [
        (30, 30, (200, 80, 80)),  # small 1:1
        (20, 40, (200, 60, 60)),  # small 1:2
        (40, 20, (180, 60, 60)),  # small 2:1
        (60, 60, (80, 200, 80)),  # medium 1:1
        (40, 80, (60, 180, 60)),  # medium 1:2
        (80, 40, (60, 160, 60)),  # medium 2:1
        (90, 90, (80, 80, 200)),  # large 1:1
        (60, 120, (60, 60, 180)),  # large 1:2
        (120, 60, (60, 60, 160)),  # large 2:1
    ]

    def render_anchors(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Location on the "feature map" that all anchors are centred on.
        centre_x, centre_y = 350, 360

        # Backdrop grid: 7x7 cells of 60 px with a 1 px gap.
        pitch = 60
        for row in range(-3, 4):
            for col in range(-3, 4):
                x0 = centre_x + col * pitch - pitch // 2
                y0 = centre_y + row * pitch - pitch // 2
                canvas[y0 : y0 + pitch - 1, x0 : x0 + pitch - 1] = (30, 35, 48)

        # Yellow marker for the centre point.
        canvas[centre_y - 5 : centre_y + 5, centre_x - 5 : centre_x + 5] = (
            255,
            200,
            50,
        )

        # Reveal anchors one at a time; at least one is always visible.
        visible = min(int(progress * 9) + 1, 9)
        for half_w, half_h, tint in anchor_shapes[:visible]:
            left = max(0, centre_x - half_w)
            top = max(0, centre_y - half_h)
            right = min(W - 1, centre_x + half_w)
            bottom = min(H - 1, centre_y + half_h)
            for grow in range(2):
                y0, y1 = top - grow, bottom + grow
                x0, x1 = left - grow, right + grow
                canvas[y0:y1, x0 : x0 + 2] = tint
                canvas[y0:y1, x1 - 2 : x1] = tint
                canvas[y0 : y0 + 2, x0:x1] = tint
                canvas[y1 - 2 : y1, x0:x1] = tint
        return canvas

    anchors = VideoClip(render_anchors, duration=clip_length).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("Anchor Boxes + RPN (Faster R-CNN)", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: Anchory = predefiniowane kształty w każdej pozycji",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        (
            "3 rozmiary x 3 proporcje = 9 anchorów per punkt",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 90),
        ),
        ("Małe (1:1, 1:2, 2:1)", 14, "#EF9A9A", FONT_R, (750, 170)),
        ("Średnie (1:1, 1:2, 2:1)", 14, "#A5D6A7", FONT_R, (750, 210)),
        ("Duże (1:1, 1:2, 2:1)", 14, "#64B5F6", FONT_R, (750, 250)),
        ("Żółty punkt = pozycja", 14, "#FFE082", FONT_R, (750, 310)),
        ("na feature mapie", 14, "#FFE082", FONT_R, (750, 335)),
        ("Sieć NIE predykuje bbox od zera!", 16, "white", FONT_R, (80, 530)),
        (
            "Predykuje OFFSET od najbliższego anchora: (Δx, Δy, Δw, Δh)",
            16,
            "#FFE082",
            FONT_R,
            (80, 565),
        ),
        (
            "+ P(obiekt) = 'czy w tym anchorze jest coś?'",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 600),
        ),
        (
            "Mnemonik: Anchor = KOTWICA — sieć dopasowuje bbox do kotwicy",
            15,
            "#78909C",
            FONT_R,
            (80, 645),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(clip_length)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]
    anchor_slide = CompositeVideoClip([anchors, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )

    # Text-only second slide; empty strings are passed through as given.
    rpn_summary = [
        (
            "RPN: Region Proposal Network — krok po kroku",
            24,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        (
            "Zastępuje Selective Search SIECIĄ NEURONOWĄ (end-to-end!)",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 85),
        ),
        ("", 10, "white", FONT_R, (80, 110)),
        (
            "1. Backbone (ResNet) przetwarza obraz → feature mapa [40x60x256]",
            16,
            "#64B5F6",
            FONT_R,
            (100, 140),
        ),
        (
            "2. Filtr 3x3 przesuwa się po feature mapie",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 180),
        ),
        (
            "3. W KAŻDEJ pozycji (x,y) rozważ k=9 anchorów:",
            16,
            "#FFE082",
            FONT_R,
            (100, 220),
        ),
        (" → P(obiekt) — 'czy tu jest coś?'", 15, "white", FONT_R, (120, 255)),
        (" → (Δx, Δy, Δw, Δh) — poprawka pozycji", 15, "white", FONT_R, (120, 285)),
        (
            "4. 40x60 pozycji x 9 anchorów = 21 600 kandydatów!",
            16,
            "#EF9A9A",
            FONT_R,
            (100, 325),
        ),
        (
            "5. Weź ~300 z najwyższym P(obiekt) → ROI Pool → FC",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 365),
        ),
        ("", 10, "white", FONT_R, (100, 395)),
        ("Porównanie generowania propozycji:", 17, "white", FONT_B, (80, 420)),
        (
            " Selective Search: ~2000 regionów, osobny algorytm, ~2 sec",
            15,
            "#EF9A9A",
            FONT_R,
            (100, 460),
        ),
        (
            " RPN: ~300 regionów, W SIECI, ~10 ms → 200x szybciej!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 495),
        ),
        ("", 10, "white", FONT_R, (100, 520)),
        (
            "Faster R-CNN = Backbone + RPN + ROI Pool + FC — WSZYSTKO end-to-end",
            17,
            "#FFE082",
            FONT_R,
            (80, 545),
        ),
        (
            "→ 5 fps (0.2 sec/obraz) vs R-CNN 50 sec = 250x szybciej!",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 585),
        ),
        (
            "Wciąż two-stage: (1) RPN generuje propozycje, (2) FC klasyfikuje",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]
    summary_slide = _text_slide(rpn_summary, duration=STEP_DUR + 1)

    return [anchor_slide, summary_slide]
# ── YOLO ──────────────────────────────────────────────────────────
def _yolo_demo() -> list[CompositeVideoClip]:
    """Build the slide that explains YOLO's grid-based detection.

    A 420x420 mock image with two coloured rectangles ("car" and "person") is
    overlaid with a 7x7 grid.  As time advances the grid cell holding each
    object's centre is brightened (after 30 % and 50 % progress) and finally
    both bounding boxes are outlined (after 60 %).

    Returns:
        A one-element list with the composed slide (``STEP_DUR`` seconds long,
        0.3 s fade in and out).
    """

    def render_grid(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        left, top = 100, 140
        side = 420
        divisions = 7
        pitch = side // divisions

        def brighten_cell(col: int, row: int) -> None:
            """Lighten one grid cell by 40 per channel, saturating at 255."""
            x0 = left + col * pitch
            y0 = top + row * pitch
            patch = canvas[y0 : y0 + pitch, x0 : x0 + pitch]
            canvas[y0 : y0 + pitch, x0 : x0 + pitch] = np.clip(
                patch.astype(int) + 40, 0, 255
            ).astype(np.uint8)

        def outline(y_top: int, y_bottom: int, x_left: int, x_right: int, colour) -> None:
            """Draw a rectangle outline in two passes, one pixel wider each."""
            for grow in range(2):
                y0, y1 = y_top - grow, y_bottom + grow
                x0, x1 = x_left - grow, x_right + grow
                canvas[y0:y1, x0 : x0 + 2] = colour
                canvas[y0:y1, x1 - 2 : x1] = colour
                canvas[y0 : y0 + 2, x0:x1] = colour
                canvas[y1 - 2 : y1, x0:x1] = colour

        # Mock image and the two objects inside it.
        canvas[top : top + side, left : left + side] = (50, 55, 70)
        canvas[top + 80 : top + 200, left + 50 : left + 180] = (180, 60, 60)  # "car"
        canvas[top + 150 : top + 350, left + 250 : left + 330] = (
            60,
            120,
            180,
        )  # "person"

        # Grid lines, including the closing line on the far edges.
        for k in range(divisions + 1):
            shift = k * pitch
            canvas[top : top + side, left + shift : left + shift + 1] = (100, 100, 120)
            canvas[top + shift : top + shift + 1, left : left + side] = (100, 100, 120)

        # Cells containing the object centres, given as (column, row).
        if progress > 0.3:
            brighten_cell(1, 2)  # car centre lies in column 1, row 2
        if progress > 0.5:
            brighten_cell(4, 4)  # person centre lies in column 4, row 4

        # Predicted boxes, drawn 2 px outside each object.
        if progress > 0.6:
            outline(top + 78, top + 202, left + 48, left + 182, (255, 80, 80))
            outline(top + 148, top + 352, left + 248, left + 332, (80, 180, 255))
        return canvas

    grid = VideoClip(render_grid, duration=STEP_DUR).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("YOLO — You Only Look Once", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Jednoetapowy detektor: siatka SxS → wszystkie detekcje naraz!",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("Siatka 7x7 = 49 komórek", 16, "#64B5F6", FONT_R, (600, 180)),
        ("Każda komórka predykuje:", 16, "white", FONT_R, (600, 220)),
        (" • B bbox (x, y, w, h, conf)", 14, "#B0BEC5", FONT_R, (600, 255)),
        (" • C klas (prawdopodobieństwa)", 14, "#B0BEC5", FONT_R, (600, 285)),
        ("Komórka odpowiada za obiekt", 14, "#A5D6A7", FONT_R, (600, 325)),
        ("którego ŚRODEK w niej wpada", 14, "#A5D6A7", FONT_R, (600, 350)),
        ("45-155 fps! (vs 5 fps Faster R-CNN)", 18, "#EF9A9A", FONT_B, (600, 400)),
        (
            "Jedno przejście przez sieć → WSZYSTKIE detekcje naraz → NMS → wynik",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
        (
            "Two-stage (R-CNN): propozycje+klasyfikacja | One-stage (YOLO): bez propozycji!",
            14,
            "#90CAF9",
            FONT_R,
            (80, 655),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(STEP_DUR)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([grid, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── YOLO Architecture Detail ──────────────────────────────────────
def _yolo_architecture() -> list[CompositeVideoClip]:
    """Build the slide that breaks down the YOLO architecture.

    Five blocks (image, backbone, neck, head, output tensor) appear left to
    right as time advances.  After 60 % progress a simplified 4x4 grid with
    one highlighted cell is drawn on the right to represent the output grid.

    Returns:
        A one-element list with the composed slide (``STEP_DUR + 1`` seconds
        long, 0.3 s fade in and out).
    """
    clip_length = STEP_DUR + 1

    # (top-left (x, y), (width, height), fill RGB, name); names are not drawn
    # from here.
    block_specs = [
        ((60, 280), (100, 80), (50, 70, 90), "Obraz"),
        ((200, 280), (100, 80), (70, 130, 200), "Backbone"),
        ((340, 280), (100, 80), (200, 160, 80), "Neck"),
        ((480, 280), (100, 80), (200, 100, 60), "Head"),
        ((620, 280), (160, 80), (80, 200, 120), "SxSx(B*5+C)"),
    ]

    def render_architecture(t: float) -> np.ndarray:
        canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        shown = min(int(progress * 5) + 1, 5)
        for idx, ((left, top), (wid, hgt), fill, _name) in enumerate(
            block_specs[:shown]
        ):
            right, bottom = left + wid, top + hgt
            edge = tuple(min(channel + 50, 255) for channel in fill)
            canvas[top:bottom, left:right] = fill
            canvas[top : top + 2, left:right] = edge
            canvas[bottom - 2 : bottom, left:right] = edge
            # Connector to the next block (none after the last one).
            if idx < 4:
                arrow_x = right + 5
                arrow_y = top + hgt // 2
                canvas[arrow_y - 1 : arrow_y + 2, arrow_x : arrow_x + 25] = (
                    150,
                    150,
                    170,
                )

        # Simplified output grid (4x4 instead of 7x7) with one cell lit up.
        if progress > 0.6:
            grid_x, grid_y = 850, 180
            grid_px = 120
            cells = 4
            cell_px = grid_px // cells
            for row in range(cells):
                for col in range(cells):
                    x0 = grid_x + col * cell_px
                    y0 = grid_y + row * cell_px
                    canvas[y0 : y0 + cell_px - 1, x0 : x0 + cell_px - 1] = (40, 50, 65)
            canvas[
                grid_y + cell_px : grid_y + 2 * cell_px - 1,
                grid_x + cell_px : grid_x + 2 * cell_px - 1,
            ] = (80, 200, 120)
        return canvas

    architecture = VideoClip(render_architecture, duration=clip_length).with_fps(FPS)

    # (text, font size, colour, font, top-left position)
    captions = [
        ("YOLO: Architektura — krok po kroku", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "One-stage: JEDEN forward pass → WSZYSTKIE detekcje naraz",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 60),
        ),
        ("Obraz", 13, "white", FONT_R, (85, 295)),
        ("Backbone", 13, "white", FONT_R, (215, 295)),
        ("(ResNet/", 11, "#78909C", FONT_R, (210, 370)),
        ("Darknet)", 11, "#78909C", FONT_R, (210, 390)),
        ("Neck", 13, "white", FONT_R, (365, 295)),
        ("(FPN/", 11, "#78909C", FONT_R, (360, 370)),
        ("PANet)", 11, "#78909C", FONT_R, (360, 390)),
        ("Head", 13, "white", FONT_R, (505, 295)),
        ("(conv)", 11, "#78909C", FONT_R, (500, 370)),
        ("Tensor wyjścia", 13, "#A5D6A7", FONT_R, (640, 295)),
        ("Każda komórka SxS predykuje:", 15, "#FFE082", FONT_R, (830, 320)),
        (" B bbox x (x,y,w,h,conf)", 13, "#B0BEC5", FONT_R, (830, 350)),
        (" + C klas (prob.)", 13, "#B0BEC5", FONT_R, (830, 375)),
        ("= SxSx(Bx5+C) tensor", 13, "#A5D6A7", FONT_R, (830, 400)),
        ("Np. 7x7x(2x5+20) = 7x7x30", 13, "#78909C", FONT_R, (830, 430)),
        (
            "Two-stage (R-CNN): (1) propozycje → (2) klasyfikacja = 2 przejścia",
            15,
            "#EF9A9A",
            FONT_R,
            (80, 470),
        ),
        (
            "One-stage (YOLO): siatka → predykcja all-in-one = 1 przejście!",
            15,
            "#A5D6A7",
            FONT_R,
            (80, 505),
        ),
        (
            "Ewolucja YOLO: v1(2016)→v3→v5→v8(2023, anchor-free, SOTA)",
            16,
            "#FFE082",
            FONT_R,
            (80, 555),
        ),
        (
            "SSD (2016): multi-scale feature maps → lepsza detekcja małych obiektów",
            15,
            "#64B5F6",
            FONT_R,
            (80, 595),
        ),
        (
            "FPN: łączy wczesne warstwy (małe obiekty) + późne (duże obiekty)",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]
    overlays = [
        _tc(text=txt, font_size=size, color=tint, font=face)
        .with_duration(clip_length)
        .with_position(xy)
        for txt, size, tint, face, xy in captions
    ]

    slide = CompositeVideoClip([architecture, *overlays], size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── DETR ──────────────────────────────────────────────────────────
def _detr_demo() -> list[CompositeVideoClip]:
    """Build the three DETR slides.

    The first slide overlays captions on an animated pipeline diagram, the
    second explains how Hungarian matching replaces NMS, and the third
    summarises two-stage, one-stage and transformer detectors.
    """
    slide_dur = STEP_DUR + 1
    arrow_rgb = (150, 150, 170)
    # Pipeline stages as ((x, y), (width, height), fill colour), listed in the
    # order they are revealed: image, backbone, encoder, decoder, predictions.
    stages = [
        ((50, 260), (80, 60), (50, 70, 90)),
        ((170, 260), (90, 60), (70, 130, 200)),
        ((300, 260), (110, 60), (200, 120, 60)),
        ((450, 260), (110, 60), (200, 80, 160)),
        ((600, 260), (120, 60), (80, 200, 120)),
    ]

    def lighten(rgb: tuple[int, ...], amount: int) -> tuple[int, ...]:
        """Brighten every channel by ``amount``, saturating at 255."""
        return tuple(min(channel + amount, 255) for channel in rgb)

    def render_pipeline(t: float) -> np.ndarray:
        """Draw the pipeline diagram as it looks ``t`` seconds into the slide."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        # The reveal completes after 70% of STEP_DUR and then holds.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        shown = min(int(progress * 5) + 1, 5)
        for idx, ((left, top), (width, height), fill) in enumerate(stages[:shown]):
            right, bottom = left + width, top + height
            rim = lighten(fill, 50)
            canvas[top:bottom, left:right] = fill
            canvas[top : top + 2, left:right] = rim
            canvas[bottom - 2 : bottom, left:right] = rim
            if idx < 4:
                # Connector towards the next stage; the last stage has none.
                arrow_x = right + 5
                arrow_y = top + height // 2
                canvas[arrow_y - 1 : arrow_y + 2, arrow_x : arrow_x + 25] = arrow_rgb

        if progress > 0.5:
            # Column of six query slots; the first three use the brighter fill.
            slot_x, first_slot_y, slot_w = 800, 140, 130
            for slot in range(6):
                slot_y = first_slot_y + slot * 50
                fill = (80, 180, 120) if slot < 3 else (60, 50, 50)
                canvas[slot_y : slot_y + 35, slot_x : slot_x + slot_w] = fill
                canvas[slot_y : slot_y + 1, slot_x : slot_x + slot_w] = lighten(
                    fill, 40
                )
            # Connector between the last pipeline stage and the query column.
            canvas[285:288, 723:798] = arrow_rgb
        return canvas

    # Slide 1: animated pipeline with static captions on top.
    pipeline_clip = VideoClip(render_pipeline, duration=slide_dur).with_fps(FPS)
    overlay = [
        ("DETR: DEtection TRansformer (2020)", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "Radykalnie prostszy pipeline: BEZ anchorów, BEZ NMS!",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 60),
        ),
        ("Obraz", 12, "white", FONT_R, (65, 275)),
        ("Backbone", 12, "white", FONT_R, (185, 275)),
        ("Transformer", 12, "white", FONT_R, (310, 275)),
        ("Encoder", 12, "white", FONT_R, (325, 295)),
        ("Transformer", 12, "white", FONT_R, (460, 275)),
        ("Decoder", 12, "white", FONT_R, (478, 295)),
        ("N predykcji", 12, "white", FONT_R, (615, 275)),
        ("Object Queries:", 14, "#FFE082", FONT_B, (800, 115)),
        ("samochód 95%", 11, "white", FONT_R, (810, 148)),
        ("pies 88%", 11, "white", FONT_R, (810, 198)),
        ("rower 72%", 11, "white", FONT_R, (810, 248)),
        ("brak", 11, "#78909C", FONT_R, (810, 298)),
        ("brak", 11, "#78909C", FONT_R, (810, 348)),
        ("brak", 11, "#78909C", FONT_R, (810, 398)),
        ("100 wyuczonych queries", 13, "#FFE082", FONT_R, (800, 440)),
        ("→ każdy 'szuka' obiektu", 13, "#FFE082", FONT_R, (800, 465)),
    ]
    layers: list[VideoClip] = [pipeline_clip]
    layers.extend(
        _tc(text=caption, font_size=size, color=colour, font=face)
        .with_duration(slide_dur)
        .with_position(xy)
        for caption, size, colour, face, xy in overlay
    )
    pipeline_slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )

    # Slide 2: why DETR needs no NMS (Hungarian matching).
    matching_lines = [
        ("DETR: Dlaczego bez NMS? — krok po kroku", 24, "#FFE082", FONT_B, (80, 30)),
        (
            "Problem NMS: duplikaty detekcji → ręcznie usuwaj post-hoc",
            16,
            "#EF9A9A",
            FONT_R,
            (80, 90),
        ),
        (
            "DETR rozwiązanie: Hungarian matching (dopasowanie węgierskie)",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 130),
        ),
        ("", 10, "white", FONT_R, (80, 155)),
        ("Jak to działa podczas TRENINGU:", 17, "white", FONT_B, (80, 180)),
        (" 1. Sieć daje N=100 predykcji (queries)", 15, "#64B5F6", FONT_R, (100, 220)),
        (
            " 2. Na obrazie jest np. 5 obiektów (ground truth)",
            15,
            "#64B5F6",
            FONT_R,
            (100, 255),
        ),
        (
            " 3. Hungarian matching: optymalne dopasowanie 1:1",
            15,
            "#FFE082",
            FONT_R,
            (100, 290),
        ),
        (
            " → query_1 ↔ gt_samochód (najlepsze dopasowanie)",
            14,
            "#A5D6A7",
            FONT_R,
            (120, 325),
        ),
        (" → query_7 ↔ gt_pies", 14, "#A5D6A7", FONT_R, (120, 355)),
        (" → query_3 ↔ gt_rower", 14, "#A5D6A7", FONT_R, (120, 385)),
        (
            " → pozostałe 97 queries ↔ klasa 'brak obiektu'",
            14,
            "#78909C",
            FONT_R,
            (120, 415),
        ),
        (
            " 4. Każdy obiekt ma DOKŁADNIE 1 predykcję → BRAK duplikatów!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 455),
        ),
        ("", 10, "white", FONT_R, (100, 475)),
        (
            "Self-attention w encoderze: cechy obrazu 'rozmawiają' ze sobą",
            15,
            "#64B5F6",
            FONT_R,
            (80, 500),
        ),
        (
            "Cross-attention w decoderze: queries 'pytają' cechy obrazu",
            15,
            "#CE93D8",
            FONT_R,
            (80, 535),
        ),
        (
            "→ query 'rozumie' który fragment obrazu to 'jego' obiekt",
            15,
            "#FFE082",
            FONT_R,
            (80, 570),
        ),
        (
            "DETR = Detekcja Eliminująca Trikowe Redundancje (NMS, anchory)",
            16,
            "#FFE082",
            FONT_R,
            (80, 620),
        ),
        (
            "Wada: wolniejszy trening (O(n²) attention) | Zaleta: prostszy pipeline!",
            15,
            "#78909C",
            FONT_R,
            (80, 660),
        ),
    ]
    matching_slide = _text_slide(matching_lines, duration=STEP_DUR + 1)

    # Slide 3: two-stage vs one-stage vs transformer recap.
    recap_lines = [
        (
            "Podsumowanie: Two-stage vs One-stage vs Transformer",
            22,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        ("", 10, "white", FONT_R, (80, 55)),
        ("TWO-STAGE (R-CNN family):", 18, "#EF9A9A", FONT_B, (80, 90)),
        (
            " (1) Generuj propozycje → (2) Klasyfikuj per region",
            15,
            "white",
            FONT_R,
            (100, 125),
        ),
        (
            " + Wysoka precyzja | - Wolniejsze (2 przejścia)",
            15,
            "#78909C",
            FONT_R,
            (100, 155),
        ),
        (
            " R-CNN → Fast R-CNN → Faster R-CNN (0.2s)",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 185),
        ),
        ("", 10, "white", FONT_R, (80, 210)),
        ("ONE-STAGE (YOLO, SSD):", 18, "#A5D6A7", FONT_B, (80, 240)),
        (
            " Siatka → predykcja all-in-one (1 przejście)",
            15,
            "white",
            FONT_R,
            (100, 275),
        ),
        (
            " + Bardzo szybkie (45-155 fps) | - Historycznie mniej precyzyjne",
            15,
            "#78909C",
            FONT_R,
            (100, 305),
        ),
        (
            " YOLOv8 (2023): anchor-free, dorównuje two-stage!",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 335),
        ),
        ("", 10, "white", FONT_R, (80, 360)),
        ("TRANSFORMER (DETR):", 18, "#CE93D8", FONT_B, (80, 390)),
        (
            " Object queries + self-attention (globalny kontekst)",
            15,
            "white",
            FONT_R,
            (100, 425),
        ),
        (
            " + Brak NMS/anchorów | - Wolniejszy trening (O(n²))",
            15,
            "#78909C",
            FONT_R,
            (100, 455),
        ),
        (
            " Hungarian matching → 1:1 obiekt↔predykcja → brak duplikatów",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 485),
        ),
        ("", 10, "white", FONT_R, (80, 510)),
        (
            "Trend: coraz prostsze pipeline, mniej ręcznych komponentów",
            17,
            "white",
            FONT_R,
            (80, 540),
        ),
        (
            " R-CNN (SS+CNN+SVM+NMS) → YOLO (backbone+head+NMS) → DETR (backbone+transformer)",
            14,
            "#90CAF9",
            FONT_R,
            (80, 580),
        ),
        (
            "Metryki: mAP@0.5 (standard), mAP@0.5:0.95 (surowsza), IoU do dopasowania",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]
    recap_slide = _text_slide(recap_lines, duration=STEP_DUR + 1)

    return [pipeline_slide, matching_slide, recap_slide]
# ── NMS + IoU ─────────────────────────────────────────────────────
def _nms_iou_demo() -> list[CompositeVideoClip]:
    """Build the single slide that illustrates NMS and IoU.

    The left side shows overlapping detection outlines, two of which are
    dimmed part-way through the animation; the right side shows two
    overlapping squares with their intersection highlighted.
    """
    dimmed_rgb = (60, 40, 40)

    def render_nms(t: float) -> np.ndarray:
        """Draw the NMS/IoU illustration as it looks ``t`` seconds in."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        ox, oy = 100, 200
        obj_w, obj_h = 150, 120
        # (x, y, width, height, confidence, outline colour).  The first three
        # overlap each other; the last one sits far away from them.
        detections = [
            (ox, oy, obj_w, obj_h, 0.95, (255, 80, 80)),
            (ox + 15, oy - 10, obj_w + 10, obj_h + 5, 0.90, (200, 60, 60)),
            (ox - 10, oy + 5, obj_w - 5, obj_h + 10, 0.85, (160, 50, 50)),
            (ox + 350, oy + 50, 100, 100, 0.40, (80, 180, 255)),
        ]
        suppression_visible = progress > 0.4
        for idx, (left, top, width, height, _score, rgb) in enumerate(detections):
            # Once suppression is shown, detections 1 and 2 switch to the
            # dimmed outline colour; detections 0 and 3 keep their own.
            if suppression_visible and 0 < idx < 3:
                outline = dimmed_rgb
            else:
                outline = rgb
            right, bottom = left + width, top + height
            for grow in range(2):
                # Left, right, top and bottom edges of the outline.
                canvas[top - grow : bottom + grow, left - grow : left - grow + 2] = (
                    outline
                )
                canvas[top - grow : bottom + grow, right + grow - 2 : right + grow] = (
                    outline
                )
                canvas[top - grow : top - grow + 2, left - grow : right + grow] = (
                    outline
                )
                canvas[bottom + grow - 2 : bottom + grow, left - grow : right + grow] = (
                    outline
                )

        # IoU illustration: square A, square B shifted by 40 px, and then the
        # overlap painted last so it sits on top of both.
        iou_x, iou_y = 700, 200
        canvas[iou_y : iou_y + 100, iou_x : iou_x + 100] = (80, 80, 200)
        canvas[iou_y + 40 : iou_y + 140, iou_x + 40 : iou_x + 140] = (200, 80, 80)
        canvas[iou_y + 40 : iou_y + 100, iou_x + 40 : iou_x + 100] = (200, 150, 200)
        return canvas

    captions = [
        ("NMS (Non-Maximum Suppression) + IoU", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "NMS = Najlepszy Ma Się dobrze — zachowaj najlepszą, usuń duplikaty",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("conf=0.95 ✓", 14, "#A5D6A7", FONT_B, (100, 340)),
        ("0.90 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 365)),
        ("0.85 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 390)),
        ("0.40 ✓ INNY obiekt", 13, "#64B5F6", FONT_R, (100, 420)),
        ("IoU = Intersection over Union", 18, "#FFE082", FONT_B, (700, 160)),
        ("IoU = pole(∩) / pole(AUB)", 16, "white", FONT_R, (700, 380)),
        ("Fioletowy = intersection", 14, "#CE93D8", FONT_R, (700, 410)),
        ("IoU > 0.5 → TEN SAM obiekt → usuń", 14, "#EF9A9A", FONT_R, (700, 440)),
        ("IoU < 0.5 → INNY obiekt → zachowaj", 14, "#A5D6A7", FONT_R, (700, 470)),
        (
            "DETR: jedyny detektor BEZ NMS (Hungarian matching zamiast tego)",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
    ]

    layers: list[VideoClip] = [
        VideoClip(render_nms, duration=STEP_DUR).with_fps(FPS)
    ]
    layers.extend(
        _tc(text=caption, font_size=size, color=colour, font=face)
        .with_duration(STEP_DUR)
        .with_position(xy)
        for caption, size, colour, face, xy in captions
    )
    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── Detector from Classifier ─────────────────────────────────────
def _detector_from_classifier() -> list[CompositeVideoClip]:
    """Build one text slide per way of turning a classifier into a detector.

    Every slide carries its own title and colour-coded bullet points,
    followed by the same two-line "SRF" mnemonic footer.
    """
    # (slide title, [(bullet text, colour), ...], mnemonic tag)
    approaches = [
        (
            "Podejście 1: Sliding Window (NAJWOLNIEJSZE)",
            [
                ("Okno przesuwa się po obrazie w wielu skalach", "#B0BEC5"),
                ("Każde okno → klasyfikator (np. ResNet) → klasa + pewność", "#B0BEC5"),
                ("~18 000 okien x 10ms = ~3 minuty na obraz!", "#EF9A9A"),
                ("Mnemonik: WYCINAJ i PYTAJ — jak wycinanie ciasteczek", "#FFE082"),
            ],
            "SRF",
        ),
        (
            "Podejście 2: Region Proposals (= R-CNN)",
            [
                ("Selective Search → ~2000 inteligentnych regionów", "#B0BEC5"),
                ("Każdy region → CNN → wektor cech → SVM klasyfikuje", "#B0BEC5"),
                ("~2000 x 10ms = ~20 sec — 9x szybciej!", "#64B5F6"),
                (
                    "Mnemonik: INTELIGENTNE CIĘCIE — wytnij tylko tam gdzie wiśnie",
                    "#FFE082",
                ),
            ],
            "SRF",
        ),
        (
            "Podejście 3: Fine-tune backbone (NAJLEPSZE)",
            [
                (
                    "Pretrained backbone (ResNet) → odetnij FC → dodaj detection head",
                    "#B0BEC5",
                ),
                (
                    "Detection head = głowica klasyfikacji + głowica regresji bbox",
                    "#B0BEC5",
                ),
                ("~0.2 sec/obraz, najlepsza jakość (mAP ~42%)", "#A5D6A7"),
                ("Mnemonik: PRZESZCZEP GŁOWY — ten sam silnik, nowa głowa", "#FFE082"),
            ],
            "SRF",
        ),
    ]
    # Footer shared by every slide, so it is built once.
    footer = [
        (
            "Detektor z klasyfikatora: SRF = Sliding → Region → Fine-tune",
            16,
            "#78909C",
            FONT_R,
            (80, 520),
        ),
        (
            "= Szukaj Ręcznie, Finalnie optymalizuj!",
            16,
            "#90CAF9",
            FONT_R,
            (80, 550),
        ),
    ]

    slides = []
    for heading, bullets, _tag in approaches:
        # Bullets are stacked 50 px apart, starting at y=220.
        bullet_rows = [
            (bullet, 18, colour, FONT_R, (100, 220 + row * 50))
            for row, (bullet, colour) in enumerate(bullets)
        ]
        slide_lines = [
            (heading, 24, "#FFE082", FONT_B, (80, 140)),
            *bullet_rows,
            *footer,
        ]
        slides.append(_text_slide(slide_lines, duration=STEP_DUR))
    return slides
def _text_slide(
    lines: list[tuple[str, int, str, str, tuple[str | int, str | int]]],
    duration: float = STEP_DUR,
) -> CompositeVideoClip:
    """Compose a static text slide on the plain background colour.

    Args:
        lines: One ``(text, font_size, color, font, position)`` tuple per
            text row.  ``position`` is handed to ``with_position`` as-is.
            Rows whose text is empty are treated as pure spacers and are
            not rendered.
        duration: How long the slide (and every row on it) lasts, in seconds.

    Returns:
        A ``W`` x ``H`` composite of the background plus one text clip per
        non-empty row, with a 0.3 s fade-in and fade-out.
    """
    bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
    clips: list[VideoClip] = [bg]
    for text, font_size, color, font, pos in lines:
        if not text:
            # An empty string has no glyphs to draw, so a clip for it would
            # only add a zero-width layer to composite on every frame.
            continue
        tc = (
            _tc(
                text=text,
                font_size=font_size,
                color=color,
                font=font,
            )
            .with_duration(duration)
            .with_position(pos)
        )
        clips.append(tc)
    return CompositeVideoClip(clips, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
# ── Methods comparison ────────────────────────────────────────────
def _methods_comparison() -> CompositeVideoClip:
    """Build the 10-second table slide comparing the detectors.

    The first table row is the header and is rendered bold, slightly larger
    and in its own colour; every following row describes one detector.
    """
    table_dur = 10.0
    column_x = (40, 200, 280, 400, 530)
    first_row_y, row_pitch = 75, 72

    table = [
        ("Model", "Rok", "Typ", "Szybkość", "Kluczowe"),
        ("HOG+SVM", "2005", "Klasyczny", "~1 fps", "Gradient histogramy"),
        ("Viola-Jones", "2001", "Klasyczny", "30+ fps", "Haar+Cascade"),
        ("R-CNN", "2014", "Two-stage", "50 sec!", "CNN per region"),
        ("Fast R-CNN", "2015", "Two-stage", "2 sec", "ROI Pooling"),
        ("Faster R-CNN", "2015", "Two-stage", "5 fps", "RPN w sieci"),
        ("YOLO", "2016", "One-stage", "45+ fps", "Siatka SxS"),
        ("DETR", "2020", "Transformer", "~40 fps", "Bez NMS!"),
    ]

    backdrop = ColorClip(size=(W, H), color=BG_COLOR).with_duration(table_dur)
    heading = (
        _tc(
            text="Porównanie detektorów",
            font_size=36,
            color="white",
            font=FONT_B,
        )
        .with_duration(table_dur)
        .with_position(("center", 20))
    )

    layers: list[VideoClip] = [backdrop, heading]
    for row_no, cells in enumerate(table):
        # Styling is decided once per row: header row vs. data rows.
        if row_no == 0:
            size, colour, face = 18, "#64B5F6", FONT_B
        else:
            size, colour, face = 16, "#E0E0E0", FONT_R
        y = first_row_y + row_no * row_pitch
        layers.extend(
            _tc(text=cell, font_size=size, color=colour, font=face)
            .with_duration(table_dur)
            .with_position((x, y))
            for x, cell in zip(column_x, cells)
        )

    return CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.5), FadeOut(0.5)]
    )
# ── Main ──────────────────────────────────────────────────────────
def main() -> None:
    """Assemble every section in order and write the Q24 video to ``OUTPUT``.

    The timeline is: intro header, then one header plus its slides per
    chapter, then the comparison table, then the closing summary header.
    """
    # (header title, header subtitle, function that builds the chapter slides)
    chapters = [
        (
            "Co to detekcja?",
            "Lokalizacja (bbox) + klasyfikacja (klasa)",
            _detection_concept,
        ),
        (
            "HOG + SVM (2005)",
            "Klasyczny pipeline — gradient histogramy",
            _hog_svm_demo,
        ),
        (
            "Viola-Jones (2001)",
            "Haar features + Integral Image + Cascade",
            _viola_jones_demo,
        ),
        ("Ewolucja R-CNN", "R-CNN → Fast R-CNN → Faster R-CNN", _rcnn_evolution),
        (
            "R-CNN: krok po kroku",
            "Selective Search → 2000xCNN → SVM → NMS",
            _rcnn_detailed,
        ),
        (
            "ROI Pooling (Fast R-CNN)",
            "CNN raz + ROI Pool → 25x szybciej",
            _roi_pooling_demo,
        ),
        (
            "RPN + Anchor Boxes",
            "Faster R-CNN: propozycje W SIECI",
            _rpn_anchors_demo,
        ),
        (
            "YOLO (2016)",
            "You Only Look Once — jednoetapowy detektor",
            _yolo_demo,
        ),
        (
            "YOLO: Architektura",
            "Backbone → Neck → Head → tensor SxS",
            _yolo_architecture,
        ),
        ("DETR (2020)", "Transformer: bez NMS, bez anchorów!", _detr_demo),
        ("NMS + IoU", "Post-processing — usuwanie duplikatów", _nms_iou_demo),
        (
            "Detektor z klasyfikatora",
            "3 podejścia: Sliding → Region → Fine-tune",
            _detector_from_classifier,
        ),
    ]

    timeline: list[VideoClip] = [
        _make_header(
            "Pytanie 24: Detekcja obiektów",
            "Problem, metody klasyczne, deep learning",
            duration=4.0,
        )
    ]
    for heading, tagline, build_slides in chapters:
        timeline.append(_make_header(heading, tagline))
        timeline.extend(build_slides())

    # Comparison table, then the closing summary card.
    timeline.append(_methods_comparison())
    timeline.append(
        _make_header(
            "Podsumowanie",
            "Klasyczne: HOG+SVM, Viola-Jones | DL: R-CNN, YOLO, DETR",
            duration=4.0,
        )
    )

    video = concatenate_videoclips(timeline, method="compose")
    video.write_videofile(
        OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4
    )
    print(f"Video saved to: {OUTPUT}")
# Generate the video only when this file is executed as a script.
if __name__ == "__main__":
    main()