mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 15:23:06 +02:00
406 lines
14 KiB
Python
406 lines
14 KiB
Python
|
|
"""R-CNN family: evolution, detailed pipeline, ROI pooling."""
|
||
|
|
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from _q24_common import (
|
||
|
|
BG_COLOR,
|
||
|
|
FONT_B,
|
||
|
|
FONT_R,
|
||
|
|
FPS,
|
||
|
|
STEP_DUR,
|
||
|
|
H,
|
||
|
|
W,
|
||
|
|
_tc,
|
||
|
|
)
|
||
|
|
from moviepy import CompositeVideoClip, VideoClip
|
||
|
|
from moviepy.video.fx import FadeIn, FadeOut
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
|
||
|
|
# ── R-CNN Evolution ───────────────────────────────────────────────
|
||
|
|
def _rcnn_evolution() -> list[CompositeVideoClip]:
|
||
|
|
"""Animate R-CNN → Fast R-CNN → Faster R-CNN evolution."""
|
||
|
|
slides = []
|
||
|
|
|
||
|
|
def make_evolution_frame(t: float) -> np.ndarray:
|
||
|
|
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||
|
|
frame[:] = BG_COLOR
|
||
|
|
|
||
|
|
progress = min(t / (STEP_DUR * 0.8), 1.0)
|
||
|
|
|
||
|
|
# Three rows: R-CNN, Fast R-CNN, Faster R-CNN
|
||
|
|
models = [
|
||
|
|
(
|
||
|
|
"R-CNN (2014)",
|
||
|
|
50,
|
||
|
|
[
|
||
|
|
("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
|
||
|
|
("2000x\nCNN", (350, 150), (80, 50), (180, 60, 60)),
|
||
|
|
("2000x\nSVM", (480, 150), (80, 50), (180, 60, 60)),
|
||
|
|
("NMS", (610, 150), (60, 50), (100, 140, 100)),
|
||
|
|
],
|
||
|
|
"50 sec/obraz!",
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"Fast R-CNN (2015)",
|
||
|
|
300,
|
||
|
|
[
|
||
|
|
("Selective\nSearch", (200, 150), (100, 50), (120, 100, 60)),
|
||
|
|
("1x CNN\n(cały obraz)", (350, 150), (100, 50), (80, 140, 200)),
|
||
|
|
("ROI Pool\n(2000)", (500, 150), (90, 50), (200, 160, 80)),
|
||
|
|
("FC", (640, 150), (50, 50), (100, 140, 100)),
|
||
|
|
],
|
||
|
|
"2 sec/obraz",
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"Faster R-CNN (2015)",
|
||
|
|
300,
|
||
|
|
[
|
||
|
|
("CNN\nbackbone", (200, 150), (90, 50), (80, 140, 200)),
|
||
|
|
("RPN\n(~300)", (340, 150), (80, 50), (200, 120, 60)),
|
||
|
|
("ROI Pool", (470, 150), (80, 50), (200, 160, 80)),
|
||
|
|
("FC", (600, 150), (50, 50), (100, 140, 100)),
|
||
|
|
],
|
||
|
|
"0.2 sec → 5 fps!",
|
||
|
|
),
|
||
|
|
]
|
||
|
|
|
||
|
|
n_models = int(progress * 3) + 1
|
||
|
|
|
||
|
|
for mi, (_name, base_y, stages, _speed) in enumerate(models):
|
||
|
|
if mi >= n_models:
|
||
|
|
break
|
||
|
|
for _label, (bx, by_off), (bw, bh), color in stages:
|
||
|
|
by = base_y + by_off - 150
|
||
|
|
frame[by : by + bh, bx : bx + bw] = color
|
||
|
|
frame[by : by + 2, bx : bx + bw] = tuple(
|
||
|
|
min(c + 50, 255) for c in color
|
||
|
|
)
|
||
|
|
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||
|
|
min(c + 50, 255) for c in color
|
||
|
|
)
|
||
|
|
|
||
|
|
# Arrows between stages
|
||
|
|
for si in range(len(stages) - 1):
|
||
|
|
sx = stages[si][1][0] + stages[si][2][0]
|
||
|
|
ex = stages[si + 1][1][0]
|
||
|
|
ay = base_y + 25
|
||
|
|
frame[ay - 1 : ay + 2, sx + 3 : ex - 3] = (150, 150, 170)
|
||
|
|
|
||
|
|
return frame
|
||
|
|
|
||
|
|
evo_clip = VideoClip(make_evolution_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||
|
|
text_clips: list[VideoClip] = [evo_clip]
|
||
|
|
labels = [
|
||
|
|
("Ewolucja R-CNN — CORAZ MNIEJ MARNOWANIA", 28, "#FFE082", FONT_B, (80, 20)),
|
||
|
|
("R-CNN (2014)", 20, "#EF9A9A", FONT_B, (50, 80)),
|
||
|
|
("50 sec/obraz (2000x forward pass!)", 14, "#EF9A9A", FONT_R, (720, 100)),
|
||
|
|
("Fast R-CNN (2015)", 20, "#64B5F6", FONT_B, (50, 330)),
|
||
|
|
("2 sec/obraz (CNN raz + ROI Pool)", 14, "#64B5F6", FONT_R, (720, 350)),
|
||
|
|
("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)),
|
||
|
|
("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)),
|
||
|
|
(
|
||
|
|
"Kluczowe innowacje: ROI Pooling → stały rozmiar "
|
||
|
|
"| RPN → propozycje w sieci",
|
||
|
|
14,
|
||
|
|
"#78909C",
|
||
|
|
FONT_R,
|
||
|
|
(80, 660),
|
||
|
|
),
|
||
|
|
]
|
||
|
|
for text, fs, color, font, pos in labels:
|
||
|
|
tc = (
|
||
|
|
_tc(text=text, font_size=fs, color=color, font=font)
|
||
|
|
.with_duration(STEP_DUR + 1)
|
||
|
|
.with_position(pos)
|
||
|
|
)
|
||
|
|
text_clips.append(tc)
|
||
|
|
|
||
|
|
slides.append(
|
||
|
|
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||
|
|
[FadeIn(0.3), FadeOut(0.3)]
|
||
|
|
)
|
||
|
|
)
|
||
|
|
return slides
|
||
|
|
|
||
|
|
|
||
|
|
# ── R-CNN Detailed Pipeline ──────────────────────────────────────
|
||
|
|
def _rcnn_detailed() -> list[CompositeVideoClip]:
|
||
|
|
"""Animate R-CNN step-by-step pipeline in detail."""
|
||
|
|
slides = []
|
||
|
|
|
||
|
|
# Slide 1: R-CNN pipeline step by step
|
||
|
|
def make_rcnn_pipeline(t: float) -> np.ndarray:
|
||
|
|
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||
|
|
frame[:] = BG_COLOR
|
||
|
|
progress = min(t / (STEP_DUR * 0.8), 1.0)
|
||
|
|
|
||
|
|
# Step boxes arranged vertically with arrows
|
||
|
|
steps = [
|
||
|
|
((80, 130), (200, 55), (120, 100, 60), "1. Selective Search"),
|
||
|
|
((80, 230), (200, 55), (180, 60, 60), "2. Wytnij 2000 regionów"),
|
||
|
|
((80, 330), (200, 55), (70, 130, 200), "3. CNN per region"),
|
||
|
|
((80, 430), (200, 55), (200, 100, 80), "4. SVM klasyfikuje"),
|
||
|
|
((80, 530), (200, 55), (100, 180, 100), "5. Bbox regresja + NMS"),
|
||
|
|
]
|
||
|
|
n_steps = min(int(progress * 5) + 1, 5)
|
||
|
|
for i, ((bx, by), (bw, bh), color, _lbl) in enumerate(steps):
|
||
|
|
if i < n_steps:
|
||
|
|
frame[by : by + bh, bx : bx + bw] = color
|
||
|
|
frame[by : by + 2, bx : bx + bw] = tuple(
|
||
|
|
min(c + 50, 255) for c in color
|
||
|
|
)
|
||
|
|
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||
|
|
min(c + 50, 255) for c in color
|
||
|
|
)
|
||
|
|
# Arrow down
|
||
|
|
arrow_limit = 4
|
||
|
|
if i < arrow_limit:
|
||
|
|
ax = bx + bw // 2
|
||
|
|
ay = by + bh + 5
|
||
|
|
frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170)
|
||
|
|
|
||
|
|
# Illustration: many overlapping regions from Selective Search
|
||
|
|
overlay_phase = 0.2
|
||
|
|
if progress > overlay_phase:
|
||
|
|
rng_local = np.random.default_rng(42)
|
||
|
|
n_boxes = min(int((progress - 0.2) * 15), 8)
|
||
|
|
for i in range(n_boxes):
|
||
|
|
rx = 500 + rng_local.integers(-30, 100)
|
||
|
|
ry = 200 + rng_local.integers(-20, 120)
|
||
|
|
rw = 60 + rng_local.integers(0, 80)
|
||
|
|
rh = 50 + rng_local.integers(0, 70)
|
||
|
|
c = (80 + i * 15, 100 + i * 10, 60 + i * 20)
|
||
|
|
for tt in range(2):
|
||
|
|
frame[ry - tt : ry + rh + tt, rx - tt : rx - tt + 2] = c
|
||
|
|
frame[ry - tt : ry + rh + tt, rx + rw + tt - 2 : rx + rw + tt] = c
|
||
|
|
frame[ry - tt : ry - tt + 2, rx - tt : rx + rw + tt] = c
|
||
|
|
frame[ry + rh + tt - 2 : ry + rh + tt, rx - tt : rx + rw + tt] = c
|
||
|
|
|
||
|
|
return frame
|
||
|
|
|
||
|
|
rcnn_clip = VideoClip(make_rcnn_pipeline, duration=STEP_DUR + 1).with_fps(FPS)
|
||
|
|
dur = STEP_DUR + 1
|
||
|
|
labels = [
|
||
|
|
("R-CNN: krok po kroku (2014, Girshick)", 26, "#FFE082", FONT_B, (80, 20)),
|
||
|
|
("Pipeline detekcji two-stage", 16, "#B0BEC5", FONT_R, (80, 60)),
|
||
|
|
("Selective Search", 11, "white", FONT_R, (105, 145)),
|
||
|
|
("2000 regionów", 11, "white", FONT_R, (105, 245)),
|
||
|
|
("CNN per region", 11, "white", FONT_R, (105, 345)),
|
||
|
|
("SVM klasyfikuje", 11, "white", FONT_R, (105, 445)),
|
||
|
|
("Regresja + NMS", 11, "white", FONT_R, (105, 545)),
|
||
|
|
("~2000 propozycji regionów", 14, "#78909C", FONT_R, (500, 155)),
|
||
|
|
("(inteligentne łączenie", 13, "#78909C", FONT_R, (500, 180)),
|
||
|
|
("podobnych fragmentów)", 13, "#78909C", FONT_R, (500, 200)),
|
||
|
|
("Problem: 2000 x CNN forward pass", 16, "#EF9A9A", FONT_R, (400, 400)),
|
||
|
|
("= 50 SEKUND na obraz!", 18, "#EF9A9A", FONT_B, (400, 430)),
|
||
|
|
("CNN liczy cechy per region OSOBNO", 14, "#EF9A9A", FONT_R, (400, 470)),
|
||
|
|
(
|
||
|
|
"→ regiony się nakładają → obliczenia się powtarzają!",
|
||
|
|
14,
|
||
|
|
"#EF9A9A",
|
||
|
|
FONT_R,
|
||
|
|
(400, 495),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"Rozwiązanie: CNN raz na cały obraz → Fast R-CNN →",
|
||
|
|
16,
|
||
|
|
"#A5D6A7",
|
||
|
|
FONT_R,
|
||
|
|
(80, 620),
|
||
|
|
),
|
||
|
|
]
|
||
|
|
text_clips: list[VideoClip] = [rcnn_clip]
|
||
|
|
for text, fs, color, font, pos in labels:
|
||
|
|
tc = (
|
||
|
|
_tc(text=text, font_size=fs, color=color, font=font)
|
||
|
|
.with_duration(dur)
|
||
|
|
.with_position(pos)
|
||
|
|
)
|
||
|
|
text_clips.append(tc)
|
||
|
|
slides.append(
|
||
|
|
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||
|
|
[FadeIn(0.3), FadeOut(0.3)]
|
||
|
|
)
|
||
|
|
)
|
||
|
|
|
||
|
|
return slides
|
||
|
|
|
||
|
|
|
||
|
|
# ── ROI Pooling ──────────────────────────────────────────────────
|
||
|
|
|
||
|
|
|
||
|
|
def _draw_roi_pool_grid(frame: np.ndarray) -> None:
|
||
|
|
"""Draw the 3x3 ROI pool grid with max-pooled feature values."""
|
||
|
|
out_x, out_y = 400, 220
|
||
|
|
out_cell = 50
|
||
|
|
out_n = 3
|
||
|
|
roi_r1, roi_c1 = 2, 1
|
||
|
|
roi_r2, roi_c2 = 6, 5
|
||
|
|
roi_h = roi_r2 - roi_r1
|
||
|
|
roi_w = roi_c2 - roi_c1
|
||
|
|
for r in range(out_n):
|
||
|
|
for c in range(out_n):
|
||
|
|
x = out_x + c * out_cell
|
||
|
|
y = out_y + r * out_cell
|
||
|
|
|
||
|
|
# Compute the max from corresponding region
|
||
|
|
src_r1 = roi_r1 + r * roi_h // out_n
|
||
|
|
src_r2 = roi_r1 + (r + 1) * roi_h // out_n
|
||
|
|
src_c1 = roi_c1 + c * roi_w // out_n
|
||
|
|
src_c2 = roi_c1 + (c + 1) * roi_w // out_n
|
||
|
|
max_val = 0
|
||
|
|
for sr in range(src_r1, src_r2):
|
||
|
|
for sc in range(src_c1, src_c2):
|
||
|
|
v = 30 + ((sr * 7 + sc * 13 + 42) % 40)
|
||
|
|
max_val = max(max_val, v)
|
||
|
|
|
||
|
|
frame[y : y + out_cell - 2, x : x + out_cell - 2] = (
|
||
|
|
max_val,
|
||
|
|
max_val + 20,
|
||
|
|
max_val + 40,
|
||
|
|
)
|
||
|
|
frame[y : y + 2, x : x + out_cell - 2] = (80, 200, 120)
|
||
|
|
frame[y + out_cell - 4 : y + out_cell - 2, x : x + out_cell - 2] = (
|
||
|
|
80,
|
||
|
|
200,
|
||
|
|
120,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
def _make_roi_frame(t: float) -> np.ndarray:
|
||
|
|
"""Render a single frame for the ROI pooling animation."""
|
||
|
|
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||
|
|
frame[:] = BG_COLOR
|
||
|
|
progress = min(t / (STEP_DUR * 0.7), 1.0)
|
||
|
|
|
||
|
|
# Left: feature map with ROI highlighted
|
||
|
|
fm_x, fm_y = 60, 180
|
||
|
|
fm_cell = 30
|
||
|
|
fm_grid = 8
|
||
|
|
for r in range(fm_grid):
|
||
|
|
for c in range(fm_grid):
|
||
|
|
x = fm_x + c * fm_cell
|
||
|
|
y = fm_y + r * fm_cell
|
||
|
|
# Random-looking feature values
|
||
|
|
val = 30 + ((r * 7 + c * 13 + 42) % 40)
|
||
|
|
frame[y : y + fm_cell - 1, x : x + fm_cell - 1] = (
|
||
|
|
val,
|
||
|
|
val + 10,
|
||
|
|
val + 20,
|
||
|
|
)
|
||
|
|
|
||
|
|
# ROI region highlighted
|
||
|
|
roi_r1, roi_c1 = 2, 1
|
||
|
|
roi_r2, roi_c2 = 6, 5
|
||
|
|
for tt in range(3):
|
||
|
|
ry1 = fm_y + roi_r1 * fm_cell - tt
|
||
|
|
ry2 = fm_y + roi_r2 * fm_cell + tt
|
||
|
|
rx1 = fm_x + roi_c1 * fm_cell - tt
|
||
|
|
rx2 = fm_x + roi_c2 * fm_cell + tt
|
||
|
|
frame[ry1:ry2, rx1 : rx1 + 2] = (255, 200, 50)
|
||
|
|
frame[ry1:ry2, rx2 - 2 : rx2] = (255, 200, 50)
|
||
|
|
frame[ry1 : ry1 + 2, rx1:rx2] = (255, 200, 50)
|
||
|
|
frame[ry2 - 2 : ry2, rx1:rx2] = (255, 200, 50)
|
||
|
|
|
||
|
|
# Arrow
|
||
|
|
arrow_phase = 0.3
|
||
|
|
if progress > arrow_phase:
|
||
|
|
frame[300:303, 310:380] = (150, 150, 170)
|
||
|
|
|
||
|
|
# Middle: ROI divided into 3x3 grid (output_size)
|
||
|
|
grid_phase = 0.3
|
||
|
|
if progress > grid_phase:
|
||
|
|
_draw_roi_pool_grid(frame)
|
||
|
|
|
||
|
|
# Arrow to FC
|
||
|
|
fc_phase = 0.6
|
||
|
|
if progress > fc_phase:
|
||
|
|
frame[300:303, 560:630] = (150, 150, 170)
|
||
|
|
# FC box
|
||
|
|
frame[270:340, 650:730] = (200, 100, 80)
|
||
|
|
frame[270:272, 650:730] = (240, 140, 120)
|
||
|
|
frame[338:340, 650:730] = (240, 140, 120)
|
||
|
|
|
||
|
|
return frame
|
||
|
|
|
||
|
|
|
||
|
|
def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||
|
|
"""Animate ROI Pooling: key Fast R-CNN innovation."""
|
||
|
|
slides = []
|
||
|
|
|
||
|
|
roi_clip = VideoClip(_make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||
|
|
dur = STEP_DUR + 1
|
||
|
|
labels = [
|
||
|
|
("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)),
|
||
|
|
(
|
||
|
|
"KROK 1: CNN raz na CAŁY obraz → feature mapa",
|
||
|
|
17,
|
||
|
|
"#64B5F6",
|
||
|
|
FONT_R,
|
||
|
|
(80, 60),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"KROK 2: Wytnij ROI z feature mapy (nie z obrazu!)",
|
||
|
|
17,
|
||
|
|
"#FFE082",
|
||
|
|
FONT_R,
|
||
|
|
(80, 90),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"KROK 3: Siatkuj ROI na 3x3 → max pool per komórka → stały rozmiar",
|
||
|
|
17,
|
||
|
|
"#A5D6A7",
|
||
|
|
FONT_R,
|
||
|
|
(80, 120),
|
||
|
|
),
|
||
|
|
("Feature mapa", 14, "#64B5F6", FONT_B, (60, 160)),
|
||
|
|
("ROI (żółta ramka)", 13, "#FFE082", FONT_R, (60, 440)),
|
||
|
|
("ROI Pool 3x3", 14, "#A5D6A7", FONT_B, (400, 195)),
|
||
|
|
("(max z komórki)", 13, "#78909C", FONT_R, (400, 380)),
|
||
|
|
("FC", 14, "white", FONT_B, (670, 280)),
|
||
|
|
(
|
||
|
|
"Problem: ROI mają RÓŻNE rozmiary, FC wymaga STAŁEGO",
|
||
|
|
15,
|
||
|
|
"#B0BEC5",
|
||
|
|
FONT_R,
|
||
|
|
(80, 500),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"ROI Pooling: dzieli ROI na siatkę, max pool → STAŁY rozmiar!",
|
||
|
|
16,
|
||
|
|
"white",
|
||
|
|
FONT_R,
|
||
|
|
(80, 535),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"Fast R-CNN: CNN raz → 1 feature mapa → "
|
||
|
|
"ROI Pool 2000 regionów → 25x szybciej!",
|
||
|
|
16,
|
||
|
|
"#A5D6A7",
|
||
|
|
FONT_R,
|
||
|
|
(80, 580),
|
||
|
|
),
|
||
|
|
(
|
||
|
|
"(R-CNN: 2000x CNN = 50s | Fast R-CNN: 1xCNN + ROI Pool = 2s)",
|
||
|
|
15,
|
||
|
|
"#EF9A9A",
|
||
|
|
FONT_R,
|
||
|
|
(80, 620),
|
||
|
|
),
|
||
|
|
]
|
||
|
|
text_clips: list[VideoClip] = [roi_clip]
|
||
|
|
for text, fs, color, font, pos in labels:
|
||
|
|
tc = (
|
||
|
|
_tc(text=text, font_size=fs, color=color, font=font)
|
||
|
|
.with_duration(dur)
|
||
|
|
.with_position(pos)
|
||
|
|
)
|
||
|
|
text_clips.append(tc)
|
||
|
|
slides.append(
|
||
|
|
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||
|
|
[FadeIn(0.3), FadeOut(0.3)]
|
||
|
|
)
|
||
|
|
)
|
||
|
|
return slides
|