testsAndMisc/python_pkg/praca_magisterska_video/_q24_rcnn.py
Krzysztof kuhy Rudnicki c985160d17 WIP: Enforce 500-line limit - split batch 1
Split 16+ files. 27 files still need splitting. See session notes.
2026-03-16 22:46:48 +01:00

406 lines
14 KiB
Python

"""R-CNN family: evolution, detailed pipeline, ROI pooling."""
from __future__ import annotations
from _q24_common import (
BG_COLOR,
FONT_B,
FONT_R,
FPS,
STEP_DUR,
H,
W,
_tc,
)
from moviepy import CompositeVideoClip, VideoClip
from moviepy.video.fx import FadeIn, FadeOut
import numpy as np
# ── R-CNN Evolution ───────────────────────────────────────────────
def _rcnn_evolution() -> list[CompositeVideoClip]:
    """Animate R-CNN → Fast R-CNN → Faster R-CNN evolution.

    Builds one slide in which each of the three detector pipelines is drawn
    as a row of coloured stage boxes joined by arrows. Rows are revealed one
    after another as the clip progresses, and static text labels are
    composited on top of the animated background.

    Returns:
        A one-element list holding the composite slide with fade-in and
        fade-out effects applied.
    """
    slides = []

    def make_evolution_frame(t: float) -> np.ndarray:
        """Render the background (stage boxes and arrows) at time ``t``."""
        frame = np.zeros((H, W, 3), dtype=np.uint8)
        frame[:] = BG_COLOR
        # The reveal finishes at 80% of STEP_DUR; afterwards it stays at 1.0.
        progress = min(t / (STEP_DUR * 0.8), 1.0)
        # Three rows: R-CNN, Fast R-CNN, Faster R-CNN.
        # Each entry: (name, base_y, stages, speed note); each stage:
        # (label, (x, y_offset), (width, height), colour). Names, stage
        # labels and speed notes are not drawn here — the visible text comes
        # from the text clips composited over this frame.
        models = [
            (
                "R-CNN (2014)",
                50,
                [
                    ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                    ("2000x\nCNN", (350, 150), (80, 50), (180, 60, 60)),
                    ("2000x\nSVM", (480, 150), (80, 50), (180, 60, 60)),
                    ("NMS", (610, 150), (60, 50), (100, 140, 100)),
                ],
                "50 sec/obraz!",
            ),
            (
                "Fast R-CNN (2015)",
                300,
                [
                    ("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
                    ("1x CNN\n(cały obraz)", (350, 150), (100, 50), (80, 140, 200)),
                    ("ROI Pool\n(2000)", (500, 150), (90, 50), (200, 160, 80)),
                    ("FC", (640, 150), (50, 50), (100, 140, 100)),
                ],
                "2 sec/obraz",
            ),
            (
                "Faster R-CNN (2015)",
                # Was 300, which painted this row on top of the Fast R-CNN
                # row. 550 keeps the 250 px row spacing and lines the boxes
                # up with the row's text labels at y=580 / y=600.
                550,
                [
                    ("CNN\nbackbone", (200, 150), (90, 50), (80, 140, 200)),
                    ("RPN\n(~300)", (340, 150), (80, 50), (200, 120, 60)),
                    ("ROI Pool", (470, 150), (80, 50), (200, 160, 80)),
                    ("FC", (600, 150), (50, 50), (100, 140, 100)),
                ],
                "0.2 sec → 5 fps!",
            ),
        ]
        # One row is visible from the start; another appears per third of
        # the progress. Slicing tolerates n_models exceeding len(models).
        n_models = int(progress * 3) + 1
        for _name, base_y, stages, _speed in models[:n_models]:
            for _label, (bx, by_off), (bw, bh), color in stages:
                # Stage offsets are authored relative to y=150.
                by = base_y + by_off - 150
                edge = tuple(min(c + 50, 255) for c in color)
                frame[by : by + bh, bx : bx + bw] = color
                # Lighter 2 px strips along the top and bottom edges.
                frame[by : by + 2, bx : bx + bw] = edge
                frame[by + bh - 2 : by + bh, bx : bx + bw] = edge
            # Arrows between stages: a 3 px horizontal bar from the right
            # edge of one box to the left edge of the next, 3 px gap each side.
            ay = base_y + 25
            for left, right in zip(stages, stages[1:]):
                sx = left[1][0] + left[2][0]
                ex = right[1][0]
                frame[ay - 1 : ay + 2, sx + 3 : ex - 3] = (150, 150, 170)
        return frame

    evo_clip = VideoClip(make_evolution_frame, duration=STEP_DUR + 1).with_fps(FPS)
    text_clips: list[VideoClip] = [evo_clip]
    # Each label: (text, font size, colour, font, (x, y) position).
    labels = [
        ("Ewolucja R-CNN — CORAZ MNIEJ MARNOWANIA", 28, "#FFE082", FONT_B, (80, 20)),
        ("R-CNN (2014)", 20, "#EF9A9A", FONT_B, (50, 80)),
        ("50 sec/obraz (2000x forward pass!)", 14, "#EF9A9A", FONT_R, (720, 100)),
        ("Fast R-CNN (2015)", 20, "#64B5F6", FONT_B, (50, 330)),
        ("2 sec/obraz (CNN raz + ROI Pool)", 14, "#64B5F6", FONT_R, (720, 350)),
        ("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)),
        ("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)),
        (
            "Kluczowe innowacje: ROI Pooling → stały rozmiar "
            "| RPN → propozycje w sieci",
            14,
            "#78909C",
            FONT_R,
            (80, 660),
        ),
    ]
    for text, fs, color, font, pos in labels:
        tc = (
            _tc(text=text, font_size=fs, color=color, font=font)
            .with_duration(STEP_DUR + 1)
            .with_position(pos)
        )
        text_clips.append(tc)
    slides.append(
        CompositeVideoClip(text_clips, size=(W, H)).with_effects(
            [FadeIn(0.3), FadeOut(0.3)]
        )
    )
    return slides
# ── R-CNN Detailed Pipeline ──────────────────────────────────────
def _rcnn_detailed() -> list[CompositeVideoClip]:
    """Build the slide that walks through the original R-CNN pipeline.

    The animated background shows five vertically stacked step boxes that
    appear one by one, joined by downward arrows, plus a growing cluster of
    overlapping rectangles illustrating region proposals. Static captions
    are composited on top.

    Returns:
        A one-element list holding the composite slide with fade-in and
        fade-out effects applied.
    """

    def make_rcnn_pipeline(t: float) -> np.ndarray:
        """Render the background (steps, arrows, proposal boxes) at ``t``."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR
        progress = min(t / (STEP_DUR * 0.8), 1.0)

        # (top-left corner, (width, height), fill colour, caption); the
        # caption is not drawn here, the visible text comes from text clips.
        pipeline = [
            ((80, 130), (200, 55), (120, 100, 60), "1. Selective Search"),
            ((80, 230), (200, 55), (180, 60, 60), "2. Wytnij 2000 regionów"),
            ((80, 330), (200, 55), (70, 130, 200), "3. CNN per region"),
            ((80, 430), (200, 55), (200, 100, 80), "4. SVM klasyfikuje"),
            ((80, 530), (200, 55), (100, 180, 100), "5. Bbox regresja + NMS"),
        ]
        shown = min(int(progress * 5) + 1, 5)
        steps_with_arrow = 4  # the last step has nothing below it
        arrow_rgb = (150, 150, 170)
        for idx, ((left, top), (width, height), fill, _caption) in enumerate(
            pipeline[:shown]
        ):
            rim = tuple(min(channel + 50, 255) for channel in fill)
            bottom = top + height
            right = left + width
            canvas[top:bottom, left:right] = fill
            canvas[top : top + 2, left:right] = rim
            canvas[bottom - 2 : bottom, left:right] = rim
            if idx < steps_with_arrow:
                # 3 px wide, 20 px tall bar starting 5 px below the box.
                mid_x = left + width // 2
                arrow_top = bottom + 5
                canvas[arrow_top : arrow_top + 20, mid_x - 1 : mid_x + 2] = arrow_rgb

        # Proposal illustration: up to 8 overlapping outlines, added as the
        # clip progresses. A fixed seed keeps every frame's layout identical.
        overlay_start = 0.2
        if progress > overlay_start:
            rng = np.random.default_rng(42)
            count = min(int((progress - overlay_start) * 15), 8)
            for k in range(count):
                # Draw order (x, y, w, h) must stay fixed to keep the layout.
                box_x = 500 + rng.integers(-30, 100)
                box_y = 200 + rng.integers(-20, 120)
                box_w = 60 + rng.integers(0, 80)
                box_h = 50 + rng.integers(0, 70)
                outline = (80 + k * 15, 100 + k * 10, 60 + k * 20)
                for grow in range(2):
                    x0, x1 = box_x - grow, box_x + box_w + grow
                    y0, y1 = box_y - grow, box_y + box_h + grow
                    canvas[y0:y1, x0 : x0 + 2] = outline
                    canvas[y0:y1, x1 - 2 : x1] = outline
                    canvas[y0 : y0 + 2, x0:x1] = outline
                    canvas[y1 - 2 : y1, x0:x1] = outline
        return canvas

    rcnn_clip = VideoClip(make_rcnn_pipeline, duration=STEP_DUR + 1).with_fps(FPS)
    dur = STEP_DUR + 1
    # Each caption: (text, font size, colour, font, (x, y) position).
    captions = [
        ("R-CNN: krok po kroku (2014, Girshick)", 26, "#FFE082", FONT_B, (80, 20)),
        ("Pipeline detekcji two-stage", 16, "#B0BEC5", FONT_R, (80, 60)),
        ("Selective Search", 11, "white", FONT_R, (105, 145)),
        ("2000 regionów", 11, "white", FONT_R, (105, 245)),
        ("CNN per region", 11, "white", FONT_R, (105, 345)),
        ("SVM klasyfikuje", 11, "white", FONT_R, (105, 445)),
        ("Regresja + NMS", 11, "white", FONT_R, (105, 545)),
        ("~2000 propozycji regionów", 14, "#78909C", FONT_R, (500, 155)),
        ("(inteligentne łączenie", 13, "#78909C", FONT_R, (500, 180)),
        ("podobnych fragmentów)", 13, "#78909C", FONT_R, (500, 200)),
        ("Problem: 2000 x CNN forward pass", 16, "#EF9A9A", FONT_R, (400, 400)),
        ("= 50 SEKUND na obraz!", 18, "#EF9A9A", FONT_B, (400, 430)),
        ("CNN liczy cechy per region OSOBNO", 14, "#EF9A9A", FONT_R, (400, 470)),
        (
            "→ regiony się nakładają → obliczenia się powtarzają!",
            14,
            "#EF9A9A",
            FONT_R,
            (400, 495),
        ),
        (
            "Rozwiązanie: CNN raz na cały obraz → Fast R-CNN →",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 620),
        ),
    ]
    text_clips: list[VideoClip] = [
        rcnn_clip,
        *(
            _tc(text=text, font_size=size, color=colour, font=font)
            .with_duration(dur)
            .with_position(pos)
            for text, size, colour, font, pos in captions
        ),
    ]
    composite = CompositeVideoClip(text_clips, size=(W, H))
    return [composite.with_effects([FadeIn(0.3), FadeOut(0.3)])]
# ── ROI Pooling ──────────────────────────────────────────────────
def _draw_roi_pool_grid(frame: np.ndarray) -> None:
    """Paint the 3x3 ROI-pooling output grid onto ``frame`` in place.

    The ROI spans feature-map rows 2..5 and columns 1..4. It is split into
    3x3 bins with integer division; each bin is coloured by the maximum of
    the synthetic feature values ``30 + ((row * 7 + col * 13 + 42) % 40)``
    over its source cells (0 if a bin were empty). Every output cell is a
    48x48 px square on a 50 px pitch with green strips at its top and bottom.

    Args:
        frame: Image array indexed as ``[y, x, channel]``; modified in place.
    """
    origin_x, origin_y = 400, 220
    pitch = 50
    bins = 3
    first_row, first_col = 2, 1
    end_row, end_col = 6, 5
    span_rows = end_row - first_row
    span_cols = end_col - first_col
    side = pitch - 2  # leaves a 2 px gap between neighbouring cells
    rim = (80, 200, 120)

    for cell in range(bins * bins):
        bin_r, bin_c = divmod(cell, bins)
        left = origin_x + bin_c * pitch
        top = origin_y + bin_r * pitch
        # Source cells of the feature map that fall into this bin.
        src_rows = range(
            first_row + bin_r * span_rows // bins,
            first_row + (bin_r + 1) * span_rows // bins,
        )
        src_cols = range(
            first_col + bin_c * span_cols // bins,
            first_col + (bin_c + 1) * span_cols // bins,
        )
        peak = max(
            (
                30 + ((fr * 7 + fc * 13 + 42) % 40)
                for fr in src_rows
                for fc in src_cols
            ),
            default=0,
        )
        frame[top : top + side, left : left + side] = (peak, peak + 20, peak + 40)
        frame[top : top + 2, left : left + side] = rim
        frame[top + side - 2 : top + side, left : left + side] = rim
def _make_roi_frame(t: float) -> np.ndarray:
    """Render the ROI-pooling animation background at time ``t``.

    Always draws an 8x8 feature-map grid with a yellow ROI outline. Once
    progress passes 0.3 it adds an arrow and the pooled 3x3 grid, and once
    it passes 0.6 a second arrow and the FC box.

    Args:
        t: Clip time; progress reaches 1.0 at 70% of ``STEP_DUR``.

    Returns:
        A ``(H, W, 3)`` ``uint8`` frame.
    """
    canvas = np.zeros((H, W, 3), dtype=np.uint8)
    canvas[:] = BG_COLOR
    progress = min(t / (STEP_DUR * 0.7), 1.0)

    # Left: the feature map, filled with deterministic pseudo-random shades.
    map_left, map_top = 60, 180
    cell_px = 30
    cells_per_side = 8
    for row in range(cells_per_side):
        top = map_top + row * cell_px
        for col in range(cells_per_side):
            left = map_left + col * cell_px
            shade = 30 + ((row * 7 + col * 13 + 42) % 40)
            # Cells are 29 px on a 30 px pitch, leaving a 1 px grid line.
            canvas[top : top + cell_px - 1, left : left + cell_px - 1] = (
                shade,
                shade + 10,
                shade + 20,
            )

    # ROI outline: three nested rectangles, each grown by one more pixel.
    highlight = (255, 200, 50)
    roi_first_row, roi_first_col = 2, 1
    roi_end_row, roi_end_col = 6, 5
    for grow in range(3):
        y0 = map_top + roi_first_row * cell_px - grow
        y1 = map_top + roi_end_row * cell_px + grow
        x0 = map_left + roi_first_col * cell_px - grow
        x1 = map_left + roi_end_col * cell_px + grow
        canvas[y0:y1, x0 : x0 + 2] = highlight
        canvas[y0:y1, x1 - 2 : x1] = highlight
        canvas[y0 : y0 + 2, x0:x1] = highlight
        canvas[y1 - 2 : y1, x0:x1] = highlight

    arrow_rgb = (150, 150, 170)
    pool_phase = 0.3
    fc_phase = 0.6
    if progress > pool_phase:
        # Arrow from the feature map, then the pooled 3x3 output grid.
        canvas[300:303, 310:380] = arrow_rgb
        _draw_roi_pool_grid(canvas)
    if progress > fc_phase:
        # Arrow to the FC layer, then the FC box with lighter rims.
        canvas[300:303, 560:630] = arrow_rgb
        canvas[270:340, 650:730] = (200, 100, 80)
        canvas[270:272, 650:730] = (240, 140, 120)
        canvas[338:340, 650:730] = (240, 140, 120)
    return canvas
def _roi_pooling_demo() -> list[CompositeVideoClip]:
    """Build the ROI Pooling slide (the key Fast R-CNN innovation).

    Wraps ``_make_roi_frame`` in a video clip and composites the
    explanatory captions over it.

    Returns:
        A one-element list holding the composite slide with fade-in and
        fade-out effects applied.
    """
    roi_clip = VideoClip(_make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS)
    dur = STEP_DUR + 1
    # Each caption: (text, font size, colour, font, (x, y) position).
    captions = [
        ("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: CNN raz na CAŁY obraz → feature mapa",
            17,
            "#64B5F6",
            FONT_R,
            (80, 60),
        ),
        (
            "KROK 2: Wytnij ROI z feature mapy (nie z obrazu!)",
            17,
            "#FFE082",
            FONT_R,
            (80, 90),
        ),
        (
            "KROK 3: Siatkuj ROI na 3x3 → max pool per komórka → stały rozmiar",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 120),
        ),
        ("Feature mapa", 14, "#64B5F6", FONT_B, (60, 160)),
        ("ROI (żółta ramka)", 13, "#FFE082", FONT_R, (60, 440)),
        ("ROI Pool 3x3", 14, "#A5D6A7", FONT_B, (400, 195)),
        ("(max z komórki)", 13, "#78909C", FONT_R, (400, 380)),
        ("FC", 14, "white", FONT_B, (670, 280)),
        (
            "Problem: ROI mają RÓŻNE rozmiary, FC wymaga STAŁEGO",
            15,
            "#B0BEC5",
            FONT_R,
            (80, 500),
        ),
        (
            "ROI Pooling: dzieli ROI na siatkę, max pool → STAŁY rozmiar!",
            16,
            "white",
            FONT_R,
            (80, 535),
        ),
        (
            "Fast R-CNN: CNN raz → 1 feature mapa → "
            "ROI Pool 2000 regionów → 25x szybciej!",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 580),
        ),
        (
            "(R-CNN: 2000x CNN = 50s | Fast R-CNN: 1xCNN + ROI Pool = 2s)",
            15,
            "#EF9A9A",
            FONT_R,
            (80, 620),
        ),
    ]
    text_clips: list[VideoClip] = [
        roi_clip,
        *(
            _tc(text=text, font_size=size, color=colour, font=font)
            .with_duration(dur)
            .with_position(pos)
            for text, size, colour, font, pos in captions
        ),
    ]
    composite = CompositeVideoClip(text_clips, size=(W, H))
    return [composite.with_effects([FadeIn(0.3), FadeOut(0.3)])]