"""MoviePy visualization for PYTANIE 24: Object Detection. Creates animated video demonstrating: - What detection is (bounding box + class + confidence) - HOG + SVM pipeline (gradient → histogram → classify) - Viola-Jones (Haar features, integral image, cascade) - R-CNN evolution (R-CNN → Fast → Faster) - YOLO one-stage detection - Building a detector from a classifier """ from __future__ import annotations import os from pathlib import Path import numpy as np os.environ["FFMPEG_BINARY"] = "/usr/bin/ffmpeg" from moviepy import ( ColorClip, CompositeVideoClip, TextClip, VideoClip, concatenate_videoclips, ) from moviepy.video.fx import FadeIn, FadeOut # ── Constants ───────────────────────────────────────────────────── W, H = 1280, 720 FPS = 24 STEP_DUR = 7.0 HEADER_DUR = 4.0 FONT_B = "/usr/share/fonts/TTF/DejaVuSans-Bold.ttf" FONT_R = "/usr/share/fonts/TTF/DejaVuSans.ttf" OUTPUT_DIR = Path(__file__).resolve().parent / "videos" OUTPUT_DIR.mkdir(parents=True, exist_ok=True) OUTPUT = str(OUTPUT_DIR / "q24_object_detection.mp4") BG_COLOR = (15, 20, 35) def _tc(**kwargs: object) -> TextClip: """TextClip wrapper that adds enough bottom margin to prevent clipping.""" fs = kwargs.get("font_size", 24) m = int(fs) // 3 + 2 kwargs["margin"] = (0, m) return TextClip(**kwargs) def _make_header( title: str, subtitle: str, duration: float = HEADER_DUR ) -> CompositeVideoClip: bg = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration) t = ( _tc( text=title, font_size=48, color="white", font=FONT_B, ) .with_duration(duration) .with_position(("center", 260)) ) s = ( _tc( text=subtitle, font_size=24, color="#90CAF9", font=FONT_R, ) .with_duration(duration) .with_position(("center", 340)) ) return CompositeVideoClip([bg, t, s], size=(W, H)).with_effects( [FadeIn(0.5), FadeOut(0.5)] ) # ── Detection concept ──────────────────────────────────────────── def _detection_concept() -> list[CompositeVideoClip]: """Show what detection is: bounding box + class + confidence.""" slides = 
[] def make_det_frame(_t: float) -> np.ndarray: frame = np.zeros((H, W, 3), dtype=np.uint8) frame[:] = BG_COLOR # Draw a "scene" with colored rectangles representing objects # Sky background area frame[140:500, 100:700] = (40, 50, 70) # "Car" object frame[350:430, 150:320] = (180, 60, 60) # "Person" object frame[280:440, 450:520] = (60, 120, 180) # "Tree" object frame[200:400, 580:650] = (40, 130, 50) # Bounding boxes (with labels drawn as colored borders) # Car bbox for thickness in range(3): t = thickness frame[348 - t : 432 + t, 148 - t : 148 - t + 2] = (255, 80, 80) frame[348 - t : 432 + t, 322 + t - 2 : 322 + t] = (255, 80, 80) frame[348 - t : 348 - t + 2, 148 - t : 322 + t] = (255, 80, 80) frame[432 + t - 2 : 432 + t, 148 - t : 322 + t] = (255, 80, 80) # Person bbox for thickness in range(3): t = thickness frame[278 - t : 442 + t, 448 - t : 448 - t + 2] = (80, 180, 255) frame[278 - t : 442 + t, 522 + t - 2 : 522 + t] = (80, 180, 255) frame[278 - t : 278 - t + 2, 448 - t : 522 + t] = (80, 180, 255) frame[442 + t - 2 : 442 + t, 448 - t : 522 + t] = (80, 180, 255) # Tree bbox for thickness in range(3): t = thickness frame[198 - t : 402 + t, 578 - t : 578 - t + 2] = (80, 220, 100) frame[198 - t : 402 + t, 652 + t - 2 : 652 + t] = (80, 220, 100) frame[198 - t : 198 - t + 2, 578 - t : 652 + t] = (80, 220, 100) frame[402 + t - 2 : 402 + t, 578 - t : 652 + t] = (80, 220, 100) # Comparison boxes on right side # Classification frame[180:260, 800:1150] = (35, 45, 65) # Detection frame[290:370, 800:1150] = (35, 45, 65) # Segmentation frame[400:480, 800:1150] = (35, 45, 65) return frame det_clip = VideoClip(make_det_frame, duration=STEP_DUR).with_fps(FPS) text_clips: list[VideoClip] = [det_clip] labels = [ ("Detekcja obiektów — co to jest?", 28, "#FFE082", FONT_B, (100, 20)), ("Wynik: (klasa, bounding box, pewność)", 20, "#B0BEC5", FONT_R, (100, 65)), ("samochód 95%", 14, "#EF9A9A", FONT_B, (150, 340)), ("osoba 88%", 14, "#64B5F6", FONT_B, (450, 268)), ("drzewo 72%", 14, 
"#A5D6A7", FONT_B, (580, 188)), ("Klasyfikacja: cały obraz → 1 etykieta", 15, "#78909C", FONT_R, (810, 210)), ("Detekcja: bbox + klasa + pewność", 15, "#FFE082", FONT_R, (810, 320)), ("Segmentacja: maska per piksel", 15, "#78909C", FONT_R, (810, 430)), ("← granulacja rośnie →", 14, "#90CAF9", FONT_R, (810, 520)), ] for text, fs, color, font, pos in labels: tc = ( _tc(text=text, font_size=fs, color=color, font=font) .with_duration(STEP_DUR) .with_position(pos) ) text_clips.append(tc) slides.append( CompositeVideoClip(text_clips, size=(W, H)).with_effects( [FadeIn(0.3), FadeOut(0.3)] ) ) return slides # ── HOG + SVM pipeline ─────────────────────────────────────────── def _hog_svm_demo() -> list[CompositeVideoClip]: """Animate HOG feature computation and SVM classification.""" slides = [] def make_hog_frame(t: float) -> np.ndarray: frame = np.zeros((H, W, 3), dtype=np.uint8) frame[:] = BG_COLOR progress = min(t / (STEP_DUR * 0.8), 1.0) # Pipeline stages as boxes with arrows stages = [ ("Gradient", (80, 250), (130, 80), (100, 160, 220)), ("Orientacja", (260, 250), (130, 80), (80, 180, 140)), ("Komórki 8x8", (440, 250), (130, 80), (200, 160, 80)), ("Bloki 2x2", (620, 250), (130, 80), (200, 120, 60)), ("Normalizacja", (800, 250), (130, 80), (180, 100, 80)), ("SVM", (980, 250), (130, 80), (220, 80, 80)), ] n_active = int(progress * len(stages)) + 1 for i, (_label, (sx, sy), (sw, sh), color) in enumerate(stages): if i < n_active: frame[sy : sy + sh, sx : sx + sw] = color # Border frame[sy : sy + 2, sx : sx + sw] = tuple( min(c + 60, 255) for c in color ) frame[sy + sh - 2 : sy + sh, sx : sx + sw] = tuple( min(c + 60, 255) for c in color ) # Arrow to next if i < len(stages) - 1: ax = sx + sw + 5 ay = sy + sh // 2 frame[ay - 1 : ay + 2, ax : ax + 20] = (150, 150, 170) # Show gradient computation example at bottom if progress > 0.2: # Mini pixel grid showing gradient computation gx, gy = 100, 430 pixels = [50, 50, 200] for idx, val in enumerate(pixels): x = gx + idx * 50 
frame[gy : gy + 40, x : x + 40] = (val, val, val) return frame hog_clip = VideoClip(make_hog_frame, duration=STEP_DUR).with_fps(FPS) text_clips: list[VideoClip] = [hog_clip] labels = [ ("HOG + SVM — pipeline detekcji pieszych", 28, "#FFE082", FONT_B, (80, 20)), ( "Mnemonik: GOKBN = Gradienty→Orientacja→Komórki→Bloki→Normalizacja", 16, "#A5D6A7", FONT_R, (80, 65), ), ("Gradient: siła i kierunek zmiany jasności", 14, "#64B5F6", FONT_R, (80, 95)), ( "Histogram: 9 binów (0°-180°, co 20°) per komórka 8x8", 14, "#78909C", FONT_R, (80, 120), ), ( "[50][50][200] → Gx = 200-50 = 150 = silna krawędź!", 16, "#EF9A9A", FONT_R, (80, 490), ), ( "Wektor HOG (3780 cech) → SVM: pieszy (+1) / tło (-1)", 16, "white", FONT_R, (80, 540), ), ( "Sliding window 64x128 przesuwa się po obrazie → NMS → wynik", 16, "#90CAF9", FONT_R, (80, 580), ), ( "SVM = LINIA MAKSYMALNEGO ODDECHU (max margines, support vectors)", 16, "#FFE082", FONT_R, (80, 620), ), ] for text, fs, color, font, pos in labels: tc = ( _tc(text=text, font_size=fs, color=color, font=font) .with_duration(STEP_DUR) .with_position(pos) ) text_clips.append(tc) slides.append( CompositeVideoClip(text_clips, size=(W, H)).with_effects( [FadeIn(0.3), FadeOut(0.3)] ) ) return slides # ── Viola-Jones ─────────────────────────────────────────────────── def _viola_jones_demo() -> list[CompositeVideoClip]: """Animate Viola-Jones cascade concept.""" slides = [] def make_cascade_frame(t: float) -> np.ndarray: frame = np.zeros((H, W, 3), dtype=np.uint8) frame[:] = BG_COLOR progress = min(t / (STEP_DUR * 0.8), 1.0) # Draw cascade "funnel" — stages filtering out non-faces stages = 5 start_width = 1000 start_count = 10000 x_center = W // 2 for i in range(stages): stage_progress = min(progress * stages - i, 1.0) if stage_progress <= 0: break width = int(start_width * (1 - i * 0.18)) int(start_count * (0.3**i)) y = 150 + i * 100 h_box = 60 # Stage box x1 = x_center - width // 2 frame[y : y + h_box, x1 : x1 + width] = ( 50 + i * 10, 60 + i * 10, 80 
def _rcnn_evolution() -> list[CompositeVideoClip]:
    """Animate the R-CNN → Fast R-CNN → Faster R-CNN evolution.

    Three rows of pipeline boxes are revealed one after another.  Each row
    sits next to its own captions: the model name at ``base_y + 30`` and the
    speed note at ``base_y + 50``.

    Fix: the Faster R-CNN row previously reused ``base_y=300`` (the Fast
    R-CNN row), so it was drawn on top of the second row while its captions
    at y=580/600 had no boxes beside them.  It now uses ``base_y=550``.

    Returns:
        A one-element list containing the composed slide.
    """
    slides = []
    dur = STEP_DUR + 1
    arrow_color = (150, 150, 170)

    # (name, base_y, stages, speed); each stage is
    # (label, (x, y_offset), (width, height), fill).  Names, labels and speed
    # strings are not drawn here — the text comes from ``labels`` below.
    # Hoisted out of the frame callback because it never changes.
    models = [
        (
            "R-CNN (2014)",
            50,
            [
                ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                ("2000x\nCNN", (350, 150), (80, 50), (180, 60, 60)),
                ("2000x\nSVM", (480, 150), (80, 50), (180, 60, 60)),
                ("NMS", (610, 150), (60, 50), (100, 140, 100)),
            ],
            "50 sec/obraz!",
        ),
        (
            "Fast R-CNN (2015)",
            300,
            [
                ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                ("1x CNN\n(cały obraz)", (350, 150), (100, 50), (80, 140, 200)),
                ("ROI Pool\n(2000)", (500, 150), (90, 50), (200, 160, 80)),
                ("FC", (640, 150), (50, 50), (100, 140, 100)),
            ],
            "2 sec/obraz",
        ),
        (
            "Faster R-CNN (2015)",
            550,  # was 300, which overlapped the Fast R-CNN row
            [
                ("CNN\nbackbone", (200, 150), (90, 50), (80, 140, 200)),
                ("RPN\n(~300)", (340, 150), (80, 50), (200, 120, 60)),
                ("ROI Pool", (470, 150), (80, 50), (200, 160, 80)),
                ("FC", (600, 150), (50, 50), (100, 140, 100)),
            ],
            "0.2 sec → 5 fps!",
        ),
    ]

    def make_evolution_frame(t: float) -> np.ndarray:
        """Render the rows visible at time ``t`` (seconds)."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        # The reveal completes at 80 % of STEP_DUR; the rest is a static hold.
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        # One more row per third of the progress; slicing caps it at all rows.
        n_models = int(progress * len(models)) + 1
        for _name, base_y, stages, _speed in models[:n_models]:
            for _label, (bx, by_off), (bw, bh), color in stages:
                # Stage offsets are authored relative to y=150.
                by = base_y + by_off - 150
                border = tuple(min(c + 50, 255) for c in color)
                frame[by : by + bh, bx : bx + bw] = color
                frame[by : by + 2, bx : bx + bw] = border
                frame[by + bh - 2 : by + bh, bx : bx + bw] = border

            # Horizontal connectors between consecutive stages of the row.
            ay = base_y + 25
            for current, following in zip(stages, stages[1:]):
                start_x = current[1][0] + current[2][0]
                end_x = following[1][0]
                frame[ay - 1 : ay + 2, start_x + 3 : end_x - 3] = arrow_color

        return frame

    evo_clip = VideoClip(make_evolution_frame, duration=dur).with_fps(FPS)
    text_clips: list[VideoClip] = [evo_clip]

    labels = [
        ("Ewolucja R-CNN — CORAZ MNIEJ MARNOWANIA", 28, "#FFE082", FONT_B, (80, 20)),
        ("R-CNN (2014)", 20, "#EF9A9A", FONT_B, (50, 80)),
        ("50 sec/obraz (2000x forward pass!)", 14, "#EF9A9A", FONT_R, (720, 100)),
        ("Fast R-CNN (2015)", 20, "#64B5F6", FONT_B, (50, 330)),
        ("2 sec/obraz (CNN raz + ROI Pool)", 14, "#64B5F6", FONT_R, (720, 350)),
        ("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)),
        ("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)),
        (
            "Kluczowe innowacje: ROI Pooling → stały rozmiar | RPN → propozycje w sieci",
            14,
            "#78909C",
            FONT_R,
            (80, 660),
        ),
    ]
    for text, fs, color, font, pos in labels:
        tc = (
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(dur)
            .with_position(pos)
        )
        text_clips.append(tc)

    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )
    return slides
Bbox regresja + NMS"), ] n_steps = min(int(progress * 5) + 1, 5) for i, ((bx, by), (bw, bh), color, _lbl) in enumerate(steps): if i < n_steps: frame[by : by + bh, bx : bx + bw] = color frame[by : by + 2, bx : bx + bw] = tuple( min(c + 50, 255) for c in color ) frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( min(c + 50, 255) for c in color ) # Arrow down if i < 4: ax = bx + bw // 2 ay = by + bh + 5 frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) # Illustration: many overlapping regions from Selective Search if progress > 0.2: rng_local = np.random.default_rng(42) n_boxes = min(int((progress - 0.2) * 15), 8) for i in range(n_boxes): rx = 500 + rng_local.integers(-30, 100) ry = 200 + rng_local.integers(-20, 120) rw = 60 + rng_local.integers(0, 80) rh = 50 + rng_local.integers(0, 70) c = (80 + i * 15, 100 + i * 10, 60 + i * 20) for tt in range(2): frame[ry - tt : ry + rh + tt, rx - tt : rx - tt + 2] = c frame[ry - tt : ry + rh + tt, rx + rw + tt - 2 : rx + rw + tt] = c frame[ry - tt : ry - tt + 2, rx - tt : rx + rw + tt] = c frame[ry + rh + tt - 2 : ry + rh + tt, rx - tt : rx + rw + tt] = c return frame rcnn_clip = VideoClip(make_rcnn_pipeline, duration=STEP_DUR + 1).with_fps(FPS) dur = STEP_DUR + 1 labels = [ ("R-CNN: krok po kroku (2014, Girshick)", 26, "#FFE082", FONT_B, (80, 20)), ("Pipeline detekcji two-stage", 16, "#B0BEC5", FONT_R, (80, 60)), ("Selective Search", 11, "white", FONT_R, (105, 145)), ("2000 regionów", 11, "white", FONT_R, (105, 245)), ("CNN per region", 11, "white", FONT_R, (105, 345)), ("SVM klasyfikuje", 11, "white", FONT_R, (105, 445)), ("Regresja + NMS", 11, "white", FONT_R, (105, 545)), ("~2000 propozycji regionów", 14, "#78909C", FONT_R, (500, 155)), ("(inteligentne łączenie", 13, "#78909C", FONT_R, (500, 180)), ("podobnych fragmentów)", 13, "#78909C", FONT_R, (500, 200)), ("Problem: 2000 x CNN forward pass", 16, "#EF9A9A", FONT_R, (400, 400)), ("= 50 SEKUND na obraz!", 18, "#EF9A9A", FONT_B, (400, 430)), ("CNN liczy cechy per 
def _roi_pooling_demo() -> list[CompositeVideoClip]:
    """Build the ROI Pooling slide (Fast R-CNN).

    The animation shows, left to right: an 8x8 feature map with a yellow ROI
    frame, a 3x3 max-pooled version of that ROI (after 30 % progress), and an
    "FC" box (after 60 % progress).  Static captions are layered on top.

    Returns:
        A one-element list containing the composed slide.
    """
    dur = STEP_DUR + 1
    arrow_color = (150, 150, 170)
    roi_color = (255, 200, 50)
    pooled_edge = (80, 200, 120)

    # Feature-map layout and the ROI expressed in cell coordinates.
    map_left, map_top = 60, 180
    cell_px = 30
    cells_per_side = 8
    roi_top, roi_left = 2, 1
    roi_bottom, roi_right = 6, 5

    def feature_value(row: int, col: int) -> int:
        """Deterministic pseudo-random intensity for one feature-map cell."""
        return 30 + ((row * 7 + col * 13 + 42) % 40)

    def make_roi_frame(t: float) -> np.ndarray:
        """Render the ROI pooling frame for time ``t`` (seconds)."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Left: feature map cells (1 px gap between neighbours).
        for row in range(cells_per_side):
            top = map_top + row * cell_px
            for col in range(cells_per_side):
                left = map_left + col * cell_px
                level = feature_value(row, col)
                canvas[top : top + cell_px - 1, left : left + cell_px - 1] = (
                    level,
                    level + 10,
                    level + 20,
                )

        # Yellow ROI frame, drawn as three nested outlines growing outwards.
        for grow in range(3):
            y_start = map_top + roi_top * cell_px - grow
            y_stop = map_top + roi_bottom * cell_px + grow
            x_start = map_left + roi_left * cell_px - grow
            x_stop = map_left + roi_right * cell_px + grow
            canvas[y_start:y_stop, x_start : x_start + 2] = roi_color
            canvas[y_start:y_stop, x_stop - 2 : x_stop] = roi_color
            canvas[y_start : y_start + 2, x_start:x_stop] = roi_color
            canvas[y_stop - 2 : y_stop, x_start:x_stop] = roi_color

        if progress > 0.3:
            # Arrow from the feature map to the pooled grid.
            canvas[300:303, 310:380] = arrow_color

            # Middle: ROI split into a 3x3 grid, each bin shows its maximum.
            grid_left, grid_top = 400, 220
            bin_px = 50
            bins = 3
            roi_rows = roi_bottom - roi_top
            roi_cols = roi_right - roi_left
            for bin_row in range(bins):
                first_row = roi_top + bin_row * roi_rows // bins
                end_row = roi_top + (bin_row + 1) * roi_rows // bins
                top = grid_top + bin_row * bin_px
                for bin_col in range(bins):
                    first_col = roi_left + bin_col * roi_cols // bins
                    end_col = roi_left + (bin_col + 1) * roi_cols // bins
                    left = grid_left + bin_col * bin_px

                    peak = max(
                        (
                            feature_value(src_row, src_col)
                            for src_row in range(first_row, end_row)
                            for src_col in range(first_col, end_col)
                        ),
                        default=0,
                    )

                    right = left + bin_px - 2
                    bottom = top + bin_px - 2
                    canvas[top:bottom, left:right] = (peak, peak + 20, peak + 40)
                    canvas[top : top + 2, left:right] = pooled_edge
                    canvas[bottom - 2 : bottom, left:right] = pooled_edge

        if progress > 0.6:
            # Arrow to the fully connected layer, then the FC box itself.
            canvas[300:303, 560:630] = arrow_color
            canvas[270:340, 650:730] = (200, 100, 80)
            canvas[270:272, 650:730] = (240, 140, 120)
            canvas[338:340, 650:730] = (240, 140, 120)

        return canvas

    captions = [
        ("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: CNN raz na CAŁY obraz → feature mapa",
            17,
            "#64B5F6",
            FONT_R,
            (80, 60),
        ),
        (
            "KROK 2: Wytnij ROI z feature mapy (nie z obrazu!)",
            17,
            "#FFE082",
            FONT_R,
            (80, 90),
        ),
        (
            "KROK 3: Siatkuj ROI na 3x3 → max pool per komórka → stały rozmiar",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 120),
        ),
        ("Feature mapa", 14, "#64B5F6", FONT_B, (60, 160)),
        ("ROI (żółta ramka)", 13, "#FFE082", FONT_R, (60, 440)),
        ("ROI Pool 3x3", 14, "#A5D6A7", FONT_B, (400, 195)),
        ("(max z komórki)", 13, "#78909C", FONT_R, (400, 380)),
        ("FC", 14, "white", FONT_B, (670, 280)),
        (
            "Problem: ROI mają RÓŻNE rozmiary, FC wymaga STAŁEGO",
            15,
            "#B0BEC5",
            FONT_R,
            (80, 500),
        ),
        (
            "ROI Pooling: dzieli ROI na siatkę, max pool → STAŁY rozmiar!",
            16,
            "white",
            FONT_R,
            (80, 535),
        ),
        (
            "Fast R-CNN: CNN raz → 1 feature mapa → ROI Pool 2000 regionów → 25x szybciej!",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 580),
        ),
        (
            "(R-CNN: 2000x CNN = 50s | Fast R-CNN: 1xCNN + ROI Pool = 2s)",
            15,
            "#EF9A9A",
            FONT_R,
            (80, 620),
        ),
    ]

    background = VideoClip(make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS)
    layers: list[VideoClip] = [background]
    for caption, size, colour, face, anchor in captions:
        clip = _tc(text=caption, font_size=size, color=colour, font=face)
        layers.append(clip.with_duration(dur).with_position(anchor))

    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
1).with_fps(FPS) dur = STEP_DUR + 1 labels = [ ("Anchor Boxes + RPN (Faster R-CNN)", 26, "#FFE082", FONT_B, (80, 20)), ( "KROK 1: Anchory = predefiniowane kształty w każdej pozycji", 17, "#A5D6A7", FONT_R, (80, 60), ), ( "3 rozmiary x 3 proporcje = 9 anchorów per punkt", 16, "#B0BEC5", FONT_R, (80, 90), ), ("Małe (1:1, 1:2, 2:1)", 14, "#EF9A9A", FONT_R, (750, 170)), ("Średnie (1:1, 1:2, 2:1)", 14, "#A5D6A7", FONT_R, (750, 210)), ("Duże (1:1, 1:2, 2:1)", 14, "#64B5F6", FONT_R, (750, 250)), ("Żółty punkt = pozycja", 14, "#FFE082", FONT_R, (750, 310)), ("na feature mapie", 14, "#FFE082", FONT_R, (750, 335)), ("Sieć NIE predykuje bbox od zera!", 16, "white", FONT_R, (80, 530)), ( "Predykuje OFFSET od najbliższego anchora: (Δx, Δy, Δw, Δh)", 16, "#FFE082", FONT_R, (80, 565), ), ( "+ P(obiekt) = 'czy w tym anchorze jest coś?'", 16, "#A5D6A7", FONT_R, (80, 600), ), ( "Mnemonik: Anchor = KOTWICA — sieć dopasowuje bbox do kotwicy", 15, "#78909C", FONT_R, (80, 645), ), ] text_clips: list[VideoClip] = [anch_clip] for text, fs, color, font, pos in labels: tc = ( _tc(text=text, font_size=fs, color=color, font=font) .with_duration(dur) .with_position(pos) ) text_clips.append(tc) slides.append( CompositeVideoClip(text_clips, size=(W, H)).with_effects( [FadeIn(0.3), FadeOut(0.3)] ) ) # Slide 2: RPN step by step rpn_lines = [ ( "RPN: Region Proposal Network — krok po kroku", 24, "#FFE082", FONT_B, (80, 30), ), ( "Zastępuje Selective Search SIECIĄ NEURONOWĄ (end-to-end!)", 17, "#B0BEC5", FONT_R, (80, 85), ), ("", 10, "white", FONT_R, (80, 110)), ( "1. Backbone (ResNet) przetwarza obraz → feature mapa [40x60x256]", 16, "#64B5F6", FONT_R, (100, 140), ), ( "2. Filtr 3x3 przesuwa się po feature mapie", 16, "#A5D6A7", FONT_R, (100, 180), ), ( "3. W KAŻDEJ pozycji (x,y) rozważ k=9 anchorów:", 16, "#FFE082", FONT_R, (100, 220), ), (" → P(obiekt) — 'czy tu jest coś?'", 15, "white", FONT_R, (120, 255)), (" → (Δx, Δy, Δw, Δh) — poprawka pozycji", 15, "white", FONT_R, (120, 285)), ( "4. 
def _yolo_demo() -> list[CompositeVideoClip]:
    """Animate the YOLO grid-detection concept.

    A 420x420 mock image with two objects is overlaid with a 7x7 grid.  The
    grid cell containing each object's centre is brightened (car after 30 %
    progress, person after 50 %), and after 60 % both predicted bounding
    boxes are drawn.

    The highlighted cells and the boxes are derived from the object
    rectangles instead of being hard-coded separately.  This reproduces the
    original pixels exactly and removes the stale "car centre ~ cell (1, 1)"
    note: the car's centre actually falls in column 1, row 2.

    Returns:
        A one-element list containing the composed slide.
    """
    slides = []

    img_x, img_y = 100, 140
    img_size = 420
    grid_n = 7
    cell = img_size // grid_n
    grid_color = (100, 100, 120)

    # ((left, top, right, bottom) relative to the image origin,
    #  fill colour, box colour, progress threshold for the cell highlight)
    objects = [
        ((50, 80, 180, 200), (180, 60, 60), (255, 80, 80), 0.3),  # "car"
        ((250, 150, 330, 350), (60, 120, 180), (80, 180, 255), 0.5),  # "person"
    ]

    def brighten_cell(frame: np.ndarray, col: int, row: int) -> None:
        """Add 40 to every channel of grid cell (col, row), clipped to 255."""
        x0 = img_x + col * cell
        y0 = img_y + row * cell
        region = frame[y0 : y0 + cell, x0 : x0 + cell]
        # Widen to int first so the addition cannot wrap around in uint8.
        frame[y0 : y0 + cell, x0 : x0 + cell] = np.clip(
            region.astype(int) + 40, 0, 255
        ).astype(np.uint8)

    def draw_box(
        frame: np.ndarray,
        top: int,
        bottom: int,
        left: int,
        right: int,
        color: tuple[int, int, int],
    ) -> None:
        """Draw a rectangular outline as two nested 2 px frames."""
        for grow in range(2):
            frame[top - grow : bottom + grow, left - grow : left - grow + 2] = color
            frame[top - grow : bottom + grow, right + grow - 2 : right + grow] = color
            frame[top - grow : top - grow + 2, left - grow : right + grow] = color
            frame[bottom + grow - 2 : bottom + grow, left - grow : right + grow] = color

    def make_yolo_frame(t: float) -> np.ndarray:
        """Render the YOLO frame for time ``t`` (seconds)."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Background "image" and the objects inside it.
        frame[img_y : img_y + img_size, img_x : img_x + img_size] = (50, 55, 70)
        for (left, top, right, bottom), fill, _box, _threshold in objects:
            frame[img_y + top : img_y + bottom, img_x + left : img_x + right] = fill

        # Grid lines (grid_n + 1 lines in each direction, 1 px wide).
        for i in range(grid_n + 1):
            offset = i * cell
            frame[
                img_y : img_y + img_size, img_x + offset : img_x + offset + 1
            ] = grid_color
            frame[
                img_y + offset : img_y + offset + 1, img_x : img_x + img_size
            ] = grid_color

        # Highlight the cell that contains each object's centre:
        # car → (col 1, row 2), person → (col 4, row 4).
        for (left, top, right, bottom), _fill, _box, threshold in objects:
            if progress > threshold:
                col = (left + right) // 2 // cell
                row = (top + bottom) // 2 // cell
                brighten_cell(frame, col, row)

        # Predicted boxes sit 2 px outside each object rectangle.
        if progress > 0.6:
            for (left, top, right, bottom), _fill, box_color, _threshold in objects:
                draw_box(
                    frame,
                    img_y + top - 2,
                    img_y + bottom + 2,
                    img_x + left - 2,
                    img_x + right + 2,
                    box_color,
                )

        return frame

    yolo_clip = VideoClip(make_yolo_frame, duration=STEP_DUR).with_fps(FPS)
    text_clips: list[VideoClip] = [yolo_clip]

    labels = [
        ("YOLO — You Only Look Once", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Jednoetapowy detektor: siatka SxS → wszystkie detekcje naraz!",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("Siatka 7x7 = 49 komórek", 16, "#64B5F6", FONT_R, (600, 180)),
        ("Każda komórka predykuje:", 16, "white", FONT_R, (600, 220)),
        (" • B bbox (x, y, w, h, conf)", 14, "#B0BEC5", FONT_R, (600, 255)),
        (" • C klas (prawdopodobieństwa)", 14, "#B0BEC5", FONT_R, (600, 285)),
        ("Komórka odpowiada za obiekt", 14, "#A5D6A7", FONT_R, (600, 325)),
        ("którego ŚRODEK w niej wpada", 14, "#A5D6A7", FONT_R, (600, 350)),
        ("45-155 fps! (vs 5 fps Faster R-CNN)", 18, "#EF9A9A", FONT_B, (600, 400)),
        (
            "Jedno przejście przez sieć → WSZYSTKIE detekcje naraz → NMS → wynik",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
        (
            "Two-stage (R-CNN): propozycje+klasyfikacja | One-stage (YOLO): bez propozycji!",
            14,
            "#90CAF9",
            FONT_R,
            (80, 655),
        ),
    ]
    for text, fs, color, font, pos in labels:
        tc = (
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(STEP_DUR)
            .with_position(pos)
        )
        text_clips.append(tc)

    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )
    return slides
r * gc frame[y : y + gc - 1, x : x + gc - 1] = (40, 50, 65) # Highlight one cell frame[gy + gc : gy + 2 * gc - 1, gx + gc : gx + 2 * gc - 1] = (80, 200, 120) return frame arch_clip = VideoClip(make_yolo_arch, duration=STEP_DUR + 1).with_fps(FPS) dur = STEP_DUR + 1 labels = [ ("YOLO: Architektura — krok po kroku", 26, "#FFE082", FONT_B, (80, 20)), ( "One-stage: JEDEN forward pass → WSZYSTKIE detekcje naraz", 17, "#B0BEC5", FONT_R, (80, 60), ), ("Obraz", 13, "white", FONT_R, (85, 295)), ("Backbone", 13, "white", FONT_R, (215, 295)), ("(ResNet/", 11, "#78909C", FONT_R, (210, 370)), ("Darknet)", 11, "#78909C", FONT_R, (210, 390)), ("Neck", 13, "white", FONT_R, (365, 295)), ("(FPN/", 11, "#78909C", FONT_R, (360, 370)), ("PANet)", 11, "#78909C", FONT_R, (360, 390)), ("Head", 13, "white", FONT_R, (505, 295)), ("(conv)", 11, "#78909C", FONT_R, (500, 370)), ("Tensor wyjścia", 13, "#A5D6A7", FONT_R, (640, 295)), ("Każda komórka SxS predykuje:", 15, "#FFE082", FONT_R, (830, 320)), (" B bbox x (x,y,w,h,conf)", 13, "#B0BEC5", FONT_R, (830, 350)), (" + C klas (prob.)", 13, "#B0BEC5", FONT_R, (830, 375)), ("= SxSx(Bx5+C) tensor", 13, "#A5D6A7", FONT_R, (830, 400)), ("Np. 
# ── DETR ──────────────────────────────────────────────────────────


def _detr_demo() -> list[CompositeVideoClip]:
    """Build the DETR section: one animated diagram and two text slides.

    Returns three composites, in playback order:

    1. a pipeline diagram whose stages appear one after another, with a
       column of object-query slots drawn once the animation is past its
       midpoint, overlaid with captions;
    2. a text slide on Hungarian matching as the replacement for NMS;
    3. a text slide comparing two-stage, one-stage and transformer detectors.
    """
    clip_len = STEP_DUR + 1
    arrow_rgb = (150, 150, 170)

    def _brighten(rgb: tuple[int, int, int], amount: int) -> tuple[int, ...]:
        # Edge highlights reuse the fill colour, lifted and capped at 255.
        return tuple(min(channel + amount, 255) for channel in rgb)

    # Slide 1: DETR pipeline
    def make_detr_frame(t: float) -> np.ndarray:
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        # Reaches 1.0 at 70% of STEP_DUR and stays there for the rest of the clip.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Image → Backbone → Encoder → Decoder → N predictions,
        # each stage given as (x, y, width, height, fill).
        stages = [
            (50, 260, 80, 60, (50, 70, 90)),
            (170, 260, 90, 60, (70, 130, 200)),
            (300, 260, 110, 60, (200, 120, 60)),
            (450, 260, 110, 60, (200, 80, 160)),
            (600, 260, 120, 60, (80, 200, 120)),
        ]
        shown = min(int(progress * 5) + 1, 5)
        for idx, (x, y, w, h, fill) in enumerate(stages[:shown]):
            rim = _brighten(fill, 50)
            canvas[y : y + h, x : x + w] = fill
            canvas[y : y + 2, x : x + w] = rim
            canvas[y + h - 2 : y + h, x : x + w] = rim
            if idx < 4:
                # Connector towards the next stage; the last stage has none.
                arrow_x = x + w + 5
                arrow_y = y + h // 2
                canvas[arrow_y - 1 : arrow_y + 2, arrow_x : arrow_x + 25] = arrow_rgb

        # Object queries illustration (right side)
        if progress > 0.5:
            left, top, width = 800, 140, 130
            for slot in range(6):
                row = top + slot * 50
                # The first three slots get the active colour, the rest the dim one.
                fill = (80, 180, 120) if slot < 3 else (60, 50, 50)
                canvas[row : row + 35, left : left + width] = fill
                canvas[row : row + 1, left : left + width] = _brighten(fill, 40)
            # Arrow from decoder to queries
            canvas[285:288, 723:798] = arrow_rgb
        return canvas

    diagram = VideoClip(make_detr_frame, duration=STEP_DUR + 1).with_fps(FPS)
    captions = [
        ("DETR: DEtection TRansformer (2020)", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "Radykalnie prostszy pipeline: BEZ anchorów, BEZ NMS!",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 60),
        ),
        ("Obraz", 12, "white", FONT_R, (65, 275)),
        ("Backbone", 12, "white", FONT_R, (185, 275)),
        ("Transformer", 12, "white", FONT_R, (310, 275)),
        ("Encoder", 12, "white", FONT_R, (325, 295)),
        ("Transformer", 12, "white", FONT_R, (460, 275)),
        ("Decoder", 12, "white", FONT_R, (478, 295)),
        ("N predykcji", 12, "white", FONT_R, (615, 275)),
        ("Object Queries:", 14, "#FFE082", FONT_B, (800, 115)),
        ("samochód 95%", 11, "white", FONT_R, (810, 148)),
        ("pies 88%", 11, "white", FONT_R, (810, 198)),
        ("rower 72%", 11, "white", FONT_R, (810, 248)),
        ("brak", 11, "#78909C", FONT_R, (810, 298)),
        ("brak", 11, "#78909C", FONT_R, (810, 348)),
        ("brak", 11, "#78909C", FONT_R, (810, 398)),
        ("100 wyuczonych queries", 13, "#FFE082", FONT_R, (800, 440)),
        ("→ każdy 'szuka' obiektu", 13, "#FFE082", FONT_R, (800, 465)),
    ]
    # The diagram is the bottom layer; captions are stacked on top in list order.
    layers: list[VideoClip] = [diagram]
    layers.extend(
        _tc(text=caption, font_size=size, color=rgb, font=face)
        .with_duration(clip_len)
        .with_position(xy)
        for caption, size, rgb, face, xy in captions
    )
    pipeline_slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )

    # Slide 2: Why no NMS + Hungarian matching
    detr_details = [
        ("DETR: Dlaczego bez NMS? — krok po kroku", 24, "#FFE082", FONT_B, (80, 30)),
        (
            "Problem NMS: duplikaty detekcji → ręcznie usuwaj post-hoc",
            16,
            "#EF9A9A",
            FONT_R,
            (80, 90),
        ),
        (
            "DETR rozwiązanie: Hungarian matching (dopasowanie węgierskie)",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 130),
        ),
        ("", 10, "white", FONT_R, (80, 155)),
        ("Jak to działa podczas TRENINGU:", 17, "white", FONT_B, (80, 180)),
        (" 1. Sieć daje N=100 predykcji (queries)", 15, "#64B5F6", FONT_R, (100, 220)),
        (
            " 2. Na obrazie jest np. 5 obiektów (ground truth)",
            15,
            "#64B5F6",
            FONT_R,
            (100, 255),
        ),
        (
            " 3. Hungarian matching: optymalne dopasowanie 1:1",
            15,
            "#FFE082",
            FONT_R,
            (100, 290),
        ),
        (
            " → query_1 ↔ gt_samochód (najlepsze dopasowanie)",
            14,
            "#A5D6A7",
            FONT_R,
            (120, 325),
        ),
        (" → query_7 ↔ gt_pies", 14, "#A5D6A7", FONT_R, (120, 355)),
        (" → query_3 ↔ gt_rower", 14, "#A5D6A7", FONT_R, (120, 385)),
        (
            " → pozostałe 97 queries ↔ klasa 'brak obiektu'",
            14,
            "#78909C",
            FONT_R,
            (120, 415),
        ),
        (
            " 4. Każdy obiekt ma DOKŁADNIE 1 predykcję → BRAK duplikatów!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 455),
        ),
        ("", 10, "white", FONT_R, (100, 475)),
        (
            "Self-attention w encoderze: cechy obrazu 'rozmawiają' ze sobą",
            15,
            "#64B5F6",
            FONT_R,
            (80, 500),
        ),
        (
            "Cross-attention w decoderze: queries 'pytają' cechy obrazu",
            15,
            "#CE93D8",
            FONT_R,
            (80, 535),
        ),
        (
            "→ query 'rozumie' który fragment obrazu to 'jego' obiekt",
            15,
            "#FFE082",
            FONT_R,
            (80, 570),
        ),
        (
            "DETR = Detekcja Eliminująca Trikowe Redundancje (NMS, anchory)",
            16,
            "#FFE082",
            FONT_R,
            (80, 620),
        ),
        (
            "Wada: wolniejszy trening (O(n²) attention) | Zaleta: prostszy pipeline!",
            15,
            "#78909C",
            FONT_R,
            (80, 660),
        ),
    ]

    # Slide 3: Two-stage vs One-stage vs Transformer summary
    summary_lines = [
        (
            "Podsumowanie: Two-stage vs One-stage vs Transformer",
            22,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        ("", 10, "white", FONT_R, (80, 55)),
        ("TWO-STAGE (R-CNN family):", 18, "#EF9A9A", FONT_B, (80, 90)),
        (
            " (1) Generuj propozycje → (2) Klasyfikuj per region",
            15,
            "white",
            FONT_R,
            (100, 125),
        ),
        (
            " + Wysoka precyzja | - Wolniejsze (2 przejścia)",
            15,
            "#78909C",
            FONT_R,
            (100, 155),
        ),
        (
            " R-CNN → Fast R-CNN → Faster R-CNN (0.2s)",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 185),
        ),
        ("", 10, "white", FONT_R, (80, 210)),
        ("ONE-STAGE (YOLO, SSD):", 18, "#A5D6A7", FONT_B, (80, 240)),
        (
            " Siatka → predykcja all-in-one (1 przejście)",
            15,
            "white",
            FONT_R,
            (100, 275),
        ),
        (
            " + Bardzo szybkie (45-155 fps) | - Historycznie mniej precyzyjne",
            15,
            "#78909C",
            FONT_R,
            (100, 305),
        ),
        (
            " YOLOv8 (2023): anchor-free, dorównuje two-stage!",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 335),
        ),
        ("", 10, "white", FONT_R, (80, 360)),
        ("TRANSFORMER (DETR):", 18, "#CE93D8", FONT_B, (80, 390)),
        (
            " Object queries + self-attention (globalny kontekst)",
            15,
            "white",
            FONT_R,
            (100, 425),
        ),
        (
            " + Brak NMS/anchorów | - Wolniejszy trening (O(n²))",
            15,
            "#78909C",
            FONT_R,
            (100, 455),
        ),
        (
            " Hungarian matching → 1:1 obiekt↔predykcja → brak duplikatów",
            15,
            "#B0BEC5",
            FONT_R,
            (100, 485),
        ),
        ("", 10, "white", FONT_R, (80, 510)),
        (
            "Trend: coraz prostsze pipeline, mniej ręcznych komponentów",
            17,
            "white",
            FONT_R,
            (80, 540),
        ),
        (
            " R-CNN (SS+CNN+SVM+NMS) → YOLO (backbone+head+NMS) → DETR (backbone+transformer)",
            14,
            "#90CAF9",
            FONT_R,
            (80, 580),
        ),
        (
            "Metryki: mAP@0.5 (standard), mAP@0.5:0.95 (surowsza), IoU do dopasowania",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]

    return [
        pipeline_slide,
        _text_slide(detr_details, duration=clip_len),
        _text_slide(summary_lines, duration=clip_len),
    ]
# ── NMS + IoU ─────────────────────────────────────────────────────


def _nms_iou_demo() -> list[CompositeVideoClip]:
    """Build the single NMS / IoU slide.

    The left half shows four detection outlines, two of which are repainted
    in a faded colour once the animation passes 40% progress. The right half
    shows two overlapping squares with their intersection highlighted.
    Captions are composited on top of the animated frame.
    """
    faded_rgb = (60, 40, 40)
    # (x, y, width, height, outline colour, dimmed once NMS has "run")
    detections = [
        (100, 200, 150, 120, (255, 80, 80), False),  # best, conf 0.95
        (115, 190, 160, 125, (200, 60, 60), True),  # conf 0.90
        (90, 205, 145, 130, (160, 50, 50), True),  # conf 0.85
        (450, 250, 100, 100, (80, 180, 255), False),  # different object, conf 0.40
    ]

    def _outline(canvas: np.ndarray, x: int, y: int, w: int, h: int, rgb) -> None:
        # Two passes, the second one pixel further out, thicken the border.
        for grow in (0, 1):
            top, bottom = y - grow, y + h + grow
            left, right = x - grow, x + w + grow
            canvas[top:bottom, left : left + 2] = rgb
            canvas[top:bottom, right - 2 : right] = rgb
            canvas[top : top + 2, left:right] = rgb
            canvas[bottom - 2 : bottom, left:right] = rgb

    def make_nms_frame(t: float) -> np.ndarray:
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.7), 1.0)
        suppression_done = progress > 0.4

        # Paint in list order: later outlines overwrite earlier ones where they cross.
        for x, y, w, h, rgb, is_duplicate in detections:
            paint = faded_rgb if (suppression_done and is_duplicate) else rgb
            _outline(canvas, x, y, w, h, paint)

        # IoU visualization on right side: box A, box B, then their overlap on top.
        canvas[200:300, 700:800] = (80, 80, 200)
        canvas[240:340, 740:840] = (200, 80, 80)
        canvas[240:300, 740:800] = (200, 150, 200)
        return canvas

    animation = VideoClip(make_nms_frame, duration=STEP_DUR).with_fps(FPS)
    captions = [
        ("NMS (Non-Maximum Suppression) + IoU", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "NMS = Najlepszy Ma Się dobrze — zachowaj najlepszą, usuń duplikaty",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("conf=0.95 ✓", 14, "#A5D6A7", FONT_B, (100, 340)),
        ("0.90 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 365)),
        ("0.85 ✗ IoU>0.5", 13, "#EF9A9A", FONT_R, (100, 390)),
        ("0.40 ✓ INNY obiekt", 13, "#64B5F6", FONT_R, (100, 420)),
        ("IoU = Intersection over Union", 18, "#FFE082", FONT_B, (700, 160)),
        ("IoU = pole(∩) / pole(AUB)", 16, "white", FONT_R, (700, 380)),
        ("Fioletowy = intersection", 14, "#CE93D8", FONT_R, (700, 410)),
        ("IoU > 0.5 → TEN SAM obiekt → usuń", 14, "#EF9A9A", FONT_R, (700, 440)),
        ("IoU < 0.5 → INNY obiekt → zachowaj", 14, "#A5D6A7", FONT_R, (700, 470)),
        (
            "DETR: jedyny detektor BEZ NMS (Hungarian matching zamiast tego)",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
    ]
    layers: list[VideoClip] = [animation]
    layers.extend(
        _tc(text=caption, font_size=size, color=rgb, font=face)
        .with_duration(STEP_DUR)
        .with_position(xy)
        for caption, size, rgb, face, xy in captions
    )
    slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [slide]
# ── Detector from Classifier ─────────────────────────────────────


def _detector_from_classifier() -> list[CompositeVideoClip]:
    """Build one text slide per approach to turning a classifier into a detector.

    Every slide has the approach title, its bullet points (one every 50 px
    starting at y=220) and the same two-line mnemonic footer.
    """
    bullet_grey = "#B0BEC5"
    mnemonic_yellow = "#FFE082"
    # Shared footer, identical on all three slides.
    footer = [
        (
            "Detektor z klasyfikatora: SRF = Sliding → Region → Fine-tune",
            16,
            "#78909C",
            FONT_R,
            (80, 520),
        ),
        (
            "= Szukaj Ręcznie, Finalnie optymalizuj!",
            16,
            "#90CAF9",
            FONT_R,
            (80, 550),
        ),
    ]
    # (slide title, [(bullet text, colour), ...])
    approaches = [
        (
            "Podejście 1: Sliding Window (NAJWOLNIEJSZE)",
            [
                ("Okno przesuwa się po obrazie w wielu skalach", bullet_grey),
                (
                    "Każde okno → klasyfikator (np. ResNet) → klasa + pewność",
                    bullet_grey,
                ),
                ("~18 000 okien x 10ms = ~3 minuty na obraz!", "#EF9A9A"),
                (
                    "Mnemonik: WYCINAJ i PYTAJ — jak wycinanie ciasteczek",
                    mnemonic_yellow,
                ),
            ],
        ),
        (
            "Podejście 2: Region Proposals (= R-CNN)",
            [
                ("Selective Search → ~2000 inteligentnych regionów", bullet_grey),
                ("Każdy region → CNN → wektor cech → SVM klasyfikuje", bullet_grey),
                ("~2000 x 10ms = ~20 sec — 9x szybciej!", "#64B5F6"),
                (
                    "Mnemonik: INTELIGENTNE CIĘCIE — wytnij tylko tam gdzie wiśnie",
                    mnemonic_yellow,
                ),
            ],
        ),
        (
            "Podejście 3: Fine-tune backbone (NAJLEPSZE)",
            [
                (
                    "Pretrained backbone (ResNet) → odetnij FC → dodaj detection head",
                    bullet_grey,
                ),
                (
                    "Detection head = głowica klasyfikacji + głowica regresji bbox",
                    bullet_grey,
                ),
                ("~0.2 sec/obraz, najlepsza jakość (mAP ~42%)", "#A5D6A7"),
                (
                    "Mnemonik: PRZESZCZEP GŁOWY — ten sam silnik, nowa głowa",
                    mnemonic_yellow,
                ),
            ],
        ),
    ]

    slides = []
    for heading, bullets in approaches:
        bullet_rows = [
            (f"• {text}", 18, rgb, FONT_R, (100, 220 + row * 50))
            for row, (text, rgb) in enumerate(bullets)
        ]
        page = [(heading, 24, "#FFE082", FONT_B, (80, 140)), *bullet_rows, *footer]
        slides.append(_text_slide(page, duration=STEP_DUR))
    return slides
def _text_slide(
    lines: list[tuple[str, int, str, str, tuple[str | int, str | int]]],
    duration: float = STEP_DUR,
) -> CompositeVideoClip:
    """Compose a static slide of text lines over the solid background.

    Args:
        lines: One ``(text, font_size, color, font, position)`` tuple per line;
            ``position`` is handed to ``with_position`` unchanged.
        duration: Length of the slide in seconds, applied to every layer.

    Returns:
        The background plus one text clip per entry, in list order, with a
        0.3 s fade-in and fade-out.
    """
    backdrop = ColorClip(size=(W, H), color=BG_COLOR).with_duration(duration)
    layers: list[VideoClip] = [backdrop]
    layers.extend(
        _tc(text=text, font_size=font_size, color=color, font=font)
        .with_duration(duration)
        .with_position(pos)
        for text, font_size, color, font, pos in lines
    )
    slide = CompositeVideoClip(layers, size=(W, H))
    return slide.with_effects([FadeIn(0.3), FadeOut(0.3)])


# ── Methods comparison ────────────────────────────────────────────


def _methods_comparison() -> CompositeVideoClip:
    """Build the 10-second comparison table of detectors.

    The first row is the header (bold, size 18, blue); the remaining rows use
    the regular font at size 16. Rows are 72 px apart starting at y=75 and the
    five columns sit at fixed x offsets.
    """
    show_for = 10.0
    column_x = (40, 200, 280, 400, 530)
    # (font size, colour, font) for the header row and for every other row.
    header_look = (18, "#64B5F6", FONT_B)
    body_look = (16, "#E0E0E0", FONT_R)
    table = [
        ("Model", "Rok", "Typ", "Szybkość", "Kluczowe"),
        ("HOG+SVM", "2005", "Klasyczny", "~1 fps", "Gradient histogramy"),
        ("Viola-Jones", "2001", "Klasyczny", "30+ fps", "Haar+Cascade"),
        ("R-CNN", "2014", "Two-stage", "50 sec!", "CNN per region"),
        ("Fast R-CNN", "2015", "Two-stage", "2 sec", "ROI Pooling"),
        ("Faster R-CNN", "2015", "Two-stage", "5 fps", "RPN w sieci"),
        ("YOLO", "2016", "One-stage", "45+ fps", "Siatka SxS"),
        ("DETR", "2020", "Transformer", "~40 fps", "Bez NMS!"),
    ]

    backdrop = ColorClip(size=(W, H), color=BG_COLOR).with_duration(show_for)
    heading = _tc(
        text="Porównanie detektorów",
        font_size=36,
        color="white",
        font=FONT_B,
    )
    heading = heading.with_duration(show_for).with_position(("center", 20))

    layers: list[VideoClip] = [backdrop, heading]
    for row_no, cells in enumerate(table):
        size, rgb, face = header_look if row_no == 0 else body_look
        top = 75 + row_no * 72
        for left, cell in zip(column_x, cells):
            label = _tc(text=cell, font_size=size, color=rgb, font=face)
            layers.append(label.with_duration(show_for).with_position((left, top)))

    table_clip = CompositeVideoClip(layers, size=(W, H))
    return table_clip.with_effects([FadeIn(0.5), FadeOut(0.5)])
# ── Main ──────────────────────────────────────────────────────────


def main() -> None:
    """Assemble every section in order and render the video to ``OUTPUT``.

    The sequence is: intro header, twelve chapters (each a header followed by
    its slides), the comparison table, and a closing summary header. The
    concatenated result is encoded with libx264 at ``FPS`` without audio, and
    the output path is printed afterwards.
    """
    # (header title, header subtitle, function producing the chapter's slides)
    chapters = [
        ("Co to detekcja?", "Lokalizacja (bbox) + klasyfikacja (klasa)", _detection_concept),
        ("HOG + SVM (2005)", "Klasyczny pipeline — gradient histogramy", _hog_svm_demo),
        ("Viola-Jones (2001)", "Haar features + Integral Image + Cascade", _viola_jones_demo),
        ("Ewolucja R-CNN", "R-CNN → Fast R-CNN → Faster R-CNN", _rcnn_evolution),
        ("R-CNN: krok po kroku", "Selective Search → 2000xCNN → SVM → NMS", _rcnn_detailed),
        ("ROI Pooling (Fast R-CNN)", "CNN raz + ROI Pool → 25x szybciej", _roi_pooling_demo),
        ("RPN + Anchor Boxes", "Faster R-CNN: propozycje W SIECI", _rpn_anchors_demo),
        ("YOLO (2016)", "You Only Look Once — jednoetapowy detektor", _yolo_demo),
        ("YOLO: Architektura", "Backbone → Neck → Head → tensor SxS", _yolo_architecture),
        ("DETR (2020)", "Transformer: bez NMS, bez anchorów!", _detr_demo),
        ("NMS + IoU", "Post-processing — usuwanie duplikatów", _nms_iou_demo),
        (
            "Detektor z klasyfikatora",
            "3 podejścia: Sliding → Region → Fine-tune",
            _detector_from_classifier,
        ),
    ]

    sections: list[VideoClip] = [
        _make_header(
            "Pytanie 24: Detekcja obiektów",
            "Problem, metody klasyczne, deep learning",
            duration=4.0,
        )
    ]
    for title, subtitle, build_slides in chapters:
        sections.append(_make_header(title, subtitle))
        sections.extend(build_slides())

    # Comparison table
    sections.append(_methods_comparison())

    # Summary
    sections.append(
        _make_header(
            "Podsumowanie",
            "Klasyczne: HOG+SVM, Viola-Jones | DL: R-CNN, YOLO, DETR",
            duration=4.0,
        )
    )

    final = concatenate_videoclips(sections, method="compose")
    final.write_videofile(
        OUTPUT,
        fps=FPS,
        codec="libx264",
        audio=False,
        preset="medium",
        threads=4,
    )
    print(f"Video saved to: {OUTPUT}")


if __name__ == "__main__":
    main()