mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 15:23:03 +02:00
Split 16+ files. 27 files still need splitting. See session notes.
384 lines
12 KiB
Python
384 lines
12 KiB
Python
"""RPN anchor boxes and YOLO grid detection."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from _q24_common import (
|
|
BG_COLOR,
|
|
FONT_B,
|
|
FONT_R,
|
|
FPS,
|
|
STEP_DUR,
|
|
H,
|
|
W,
|
|
_tc,
|
|
_text_slide,
|
|
)
|
|
from moviepy import CompositeVideoClip, VideoClip
|
|
from moviepy.video.fx import FadeIn, FadeOut
|
|
import numpy as np
|
|
|
|
|
|
# ── RPN + Anchor Boxes ───────────────────────────────────────────
|
|
def _rpn_anchors_demo() -> list[CompositeVideoClip]:
    """Build the Faster R-CNN slides: anchor-box animation, then RPN steps.

    Returns:
        A two-element list. The first element composites the animated anchor
        frame with its caption overlays and applies fade-in/fade-out; the
        second is whatever ``_text_slide`` returns for the RPN walkthrough.
    """
    slide_len = STEP_DUR + 1

    def stroke_box(
        canvas: np.ndarray,
        left: int,
        top: int,
        right: int,
        bottom: int,
        rgb: tuple[int, int, int],
    ) -> None:
        """Paint a rectangle outline, stroking each edge at two 1-px offsets."""
        for grow in (0, 1):
            rows = slice(top - grow, bottom + grow)
            cols = slice(left - grow, right + grow)
            canvas[rows, left - grow : left - grow + 2] = rgb
            canvas[rows, right + grow - 2 : right + grow] = rgb
            canvas[top - grow : top - grow + 2, cols] = rgb
            canvas[bottom + grow - 2 : bottom + grow, cols] = rgb

    # ---- Slide 1: the anchor-box idea ------------------------------------
    def make_anchors_frame(t: float) -> np.ndarray:
        """Render the anchor animation frame for time ``t`` (seconds)."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR

        # Animation finishes at 70% of STEP_DUR and then holds the last state.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Pixel position of the highlighted feature-map location.
        cx, cy = 350, 360

        # 7x7 block of tiles centred on (cx, cy); each tile is drawn one pixel
        # short of the cell size so a thin gap separates neighbours.
        cell = 60
        half = cell // 2
        for row in range(-3, 4):
            top = cy + row * cell - half
            for col in range(-3, 4):
                left = cx + col * cell - half
                canvas[top : top + cell - 1, left : left + cell - 1] = (30, 35, 48)

        # 10x10 marker on the centre point.
        canvas[cy - 5 : cy + 5, cx - 5 : cx + 5] = (255, 200, 50)

        # (half-width, half-height, RGB) per anchor: 3 sizes x 3 ratios = 9.
        anchor_specs = [
            (30, 30, (200, 80, 80)),  # small 1:1
            (20, 40, (200, 60, 60)),  # small 1:2
            (40, 20, (180, 60, 60)),  # small 2:1
            (60, 60, (80, 200, 80)),  # medium 1:1
            (40, 80, (60, 180, 60)),  # medium 1:2
            (80, 40, (60, 160, 60)),  # medium 2:1
            (90, 90, (80, 80, 200)),  # large 1:1
            (60, 120, (60, 60, 180)),  # large 1:2
            (120, 60, (60, 60, 160)),  # large 2:1
        ]

        # Reveal the anchors one by one as the animation advances; pairing
        # with range() keeps a non-positive count from drawing anything.
        visible = min(int(progress * 9) + 1, 9)
        for _, (half_w, half_h, rgb) in zip(range(visible), anchor_specs):
            stroke_box(
                canvas,
                max(0, cx - half_w),
                max(0, cy - half_h),
                min(W - 1, cx + half_w),
                min(H - 1, cy + half_h),
                rgb,
            )

        return canvas

    # (text, font size, colour, font, (x, y) position) for every overlay.
    captions = [
        ("Anchor Boxes + RPN (Faster R-CNN)", 26, "#FFE082", FONT_B, (80, 20)),
        (
            "KROK 1: Anchory = predefiniowane kształty w każdej pozycji",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 60),
        ),
        (
            "3 rozmiary x 3 proporcje = 9 anchorów per punkt",
            16,
            "#B0BEC5",
            FONT_R,
            (80, 90),
        ),
        ("Małe (1:1, 1:2, 2:1)", 14, "#EF9A9A", FONT_R, (750, 170)),
        ("Średnie (1:1, 1:2, 2:1)", 14, "#A5D6A7", FONT_R, (750, 210)),
        ("Duże (1:1, 1:2, 2:1)", 14, "#64B5F6", FONT_R, (750, 250)),
        ("Żółty punkt = pozycja", 14, "#FFE082", FONT_R, (750, 310)),
        ("na feature mapie", 14, "#FFE082", FONT_R, (750, 335)),
        ("Sieć NIE predykuje bbox od zera!", 16, "white", FONT_R, (80, 530)),
        (
            "Predykuje OFFSET od najbliższego anchora: (Δx, Δy, Δw, Δh)",
            16,
            "#FFE082",
            FONT_R,
            (80, 565),
        ),
        (
            "+ P(obiekt) = 'czy w tym anchorze jest coś?'",
            16,
            "#A5D6A7",
            FONT_R,
            (80, 600),
        ),
        (
            "Mnemonik: Anchor = KOTWICA — sieć dopasowuje bbox do kotwicy",
            15,
            "#78909C",
            FONT_R,
            (80, 645),
        ),
    ]

    # The animated frame is the bottom layer; captions are stacked on top.
    layers: list[VideoClip] = [
        VideoClip(make_anchors_frame, duration=slide_len).with_fps(FPS)
    ]
    for text, size, color, font, pos in captions:
        caption = _tc(text=text, font_size=size, color=color, font=font)
        caption = caption.with_duration(slide_len)
        layers.append(caption.with_position(pos))

    anchor_slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )

    # ---- Slide 2: RPN step by step ---------------------------------------
    # Same tuple layout as above; the empty-text entries are kept verbatim.
    rpn_lines = [
        (
            "RPN: Region Proposal Network — krok po kroku",
            24,
            "#FFE082",
            FONT_B,
            (80, 30),
        ),
        (
            "Zastępuje Selective Search SIECIĄ NEURONOWĄ (end-to-end!)",
            17,
            "#B0BEC5",
            FONT_R,
            (80, 85),
        ),
        ("", 10, "white", FONT_R, (80, 110)),
        (
            "1. Backbone (ResNet) przetwarza obraz → feature mapa [40x60x256]",
            16,
            "#64B5F6",
            FONT_R,
            (100, 140),
        ),
        (
            "2. Filtr 3x3 przesuwa się po feature mapie",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 180),
        ),
        (
            "3. W KAŻDEJ pozycji (x,y) rozważ k=9 anchorów:",
            16,
            "#FFE082",
            FONT_R,
            (100, 220),
        ),
        (" → P(obiekt) — 'czy tu jest coś?'", 15, "white", FONT_R, (120, 255)),
        (" → (Δx, Δy, Δw, Δh) — poprawka pozycji", 15, "white", FONT_R, (120, 285)),
        (
            "4. 40x60 pozycji x 9 anchorów = 21 600 kandydatów!",
            16,
            "#EF9A9A",
            FONT_R,
            (100, 325),
        ),
        (
            "5. Weź ~300 z najwyższym P(obiekt) → ROI Pool → FC",
            16,
            "#A5D6A7",
            FONT_R,
            (100, 365),
        ),
        ("", 10, "white", FONT_R, (100, 395)),
        ("Porównanie generowania propozycji:", 17, "white", FONT_B, (80, 420)),
        (
            " Selective Search: ~2000 regionów, osobny algorytm, ~2 sec",
            15,
            "#EF9A9A",
            FONT_R,
            (100, 460),
        ),
        (
            " RPN: ~300 regionów, W SIECI, ~10 ms → 200x szybciej!",
            15,
            "#A5D6A7",
            FONT_R,
            (100, 495),
        ),
        ("", 10, "white", FONT_R, (100, 520)),
        (
            "Faster R-CNN = Backbone + RPN + ROI Pool + FC — WSZYSTKO end-to-end",
            17,
            "#FFE082",
            FONT_R,
            (80, 545),
        ),
        (
            "→ 5 fps (0.2 sec/obraz) vs R-CNN 50 sec = 250x szybciej!",
            17,
            "#A5D6A7",
            FONT_R,
            (80, 585),
        ),
        (
            "Wciąż two-stage: (1) RPN generuje propozycje, (2) FC klasyfikuje",
            15,
            "#78909C",
            FONT_R,
            (80, 630),
        ),
    ]
    rpn_slide = _text_slide(rpn_lines, duration=slide_len)

    return [anchor_slide, rpn_slide]
|
|
|
|
|
|
# ── YOLO ──────────────────────────────────────────────────────────
|
|
def _yolo_demo() -> list[CompositeVideoClip]:
    """Build the single YOLO slide: a gridded mock image plus caption overlays.

    Returns:
        A one-element list holding the composite of the animated grid frame
        and its captions, with fade-in/fade-out applied.
    """
    # Geometry of the mock image and its grid, in frame pixels.
    img_x, img_y = 100, 140
    img_size = 420
    grid_n = 7
    cell = img_size // grid_n
    img_right = img_x + img_size
    img_bottom = img_y + img_size
    grid_rgb = (100, 100, 120)

    def stroke_box(
        canvas: np.ndarray,
        left: int,
        top: int,
        right: int,
        bottom: int,
        rgb: tuple[int, int, int],
    ) -> None:
        """Paint a rectangle outline, stroking each edge at two 1-px offsets."""
        for grow in (0, 1):
            rows = slice(top - grow, bottom + grow)
            cols = slice(left - grow, right + grow)
            canvas[rows, left - grow : left - grow + 2] = rgb
            canvas[rows, right + grow - 2 : right + grow] = rgb
            canvas[top - grow : top - grow + 2, cols] = rgb
            canvas[bottom + grow - 2 : bottom + grow, cols] = rgb

    def lighten_cell(canvas: np.ndarray, col: int, row: int) -> None:
        """Add 40 to every channel of one grid cell, saturating at 255."""
        left = img_x + col * cell
        top = img_y + row * cell
        patch = canvas[top : top + cell, left : left + cell]
        # Widen to int before adding so the sum cannot wrap around in uint8.
        canvas[top : top + cell, left : left + cell] = np.clip(
            patch.astype(int) + 40, 0, 255
        ).astype(np.uint8)

    def make_yolo_frame(t: float) -> np.ndarray:
        """Render the YOLO grid frame for time ``t`` (seconds)."""
        canvas = np.zeros((H, W, 3), dtype=np.uint8)
        canvas[:] = BG_COLOR

        # Animation finishes at 70% of STEP_DUR and then holds the last state.
        progress = min(t / (STEP_DUR * 0.7), 1.0)

        # Flat backdrop standing in for the input image.
        canvas[img_y:img_bottom, img_x:img_right] = (50, 55, 70)

        # Two solid rectangles standing in for objects.
        canvas[img_y + 80 : img_y + 200, img_x + 50 : img_x + 180] = (
            180,
            60,
            60,
        )  # "car"
        canvas[img_y + 150 : img_y + 350, img_x + 250 : img_x + 330] = (
            60,
            120,
            180,
        )  # "person"

        # One-pixel grid lines, including the closing right and bottom edges.
        for k in range(grid_n + 1):
            offset = k * cell
            canvas[img_y:img_bottom, img_x + offset : img_x + offset + 1] = grid_rgb
            canvas[img_y + offset : img_y + offset + 1, img_x:img_right] = grid_rgb

        # Phase 1: brighten the cell at column 1, row 2 (the "car" cell).
        if progress > 0.3:
            lighten_cell(canvas, 1, 2)

        # Phase 2: brighten the cell at column 4, row 4 (the "person" cell).
        if progress > 0.5:
            lighten_cell(canvas, 4, 4)

        # Phase 3: outline both objects, 2 px outside each filled rectangle.
        if progress > 0.6:
            stroke_box(
                canvas,
                img_x + 48,
                img_y + 78,
                img_x + 182,
                img_y + 202,
                (255, 80, 80),
            )
            stroke_box(
                canvas,
                img_x + 248,
                img_y + 148,
                img_x + 332,
                img_y + 352,
                (80, 180, 255),
            )

        return canvas

    # The animated frame is the bottom layer; captions are stacked on top.
    layers: list[VideoClip] = [
        VideoClip(make_yolo_frame, duration=STEP_DUR).with_fps(FPS)
    ]

    # (text, font size, colour, font, (x, y) position) for every overlay.
    captions = [
        ("YOLO — You Only Look Once", 28, "#FFE082", FONT_B, (80, 20)),
        (
            "Jednoetapowy detektor: siatka SxS → wszystkie detekcje naraz!",
            18,
            "#B0BEC5",
            FONT_R,
            (80, 65),
        ),
        ("Siatka 7x7 = 49 komórek", 16, "#64B5F6", FONT_R, (600, 180)),
        ("Każda komórka predykuje:", 16, "white", FONT_R, (600, 220)),
        (" • B bbox (x, y, w, h, conf)", 14, "#B0BEC5", FONT_R, (600, 255)),
        (" • C klas (prawdopodobieństwa)", 14, "#B0BEC5", FONT_R, (600, 285)),
        ("Komórka odpowiada za obiekt", 14, "#A5D6A7", FONT_R, (600, 325)),
        ("którego ŚRODEK w niej wpada", 14, "#A5D6A7", FONT_R, (600, 350)),
        ("45-155 fps! (vs 5 fps Faster R-CNN)", 18, "#EF9A9A", FONT_B, (600, 400)),
        (
            "Jedno przejście przez sieć → WSZYSTKIE detekcje naraz → NMS → wynik",
            14,
            "#78909C",
            FONT_R,
            (80, 620),
        ),
        (
            "Two-stage (R-CNN): propozycje+klasyfikacja "
            "| One-stage (YOLO): bez propozycji!",
            14,
            "#90CAF9",
            FONT_R,
            (80, 655),
        ),
    ]
    for text, size, color, font, pos in captions:
        caption = _tc(text=text, font_size=size, color=color, font=font)
        caption = caption.with_duration(STEP_DUR)
        layers.append(caption.with_position(pos))

    yolo_slide = CompositeVideoClip(layers, size=(W, H)).with_effects(
        [FadeIn(0.3), FadeOut(0.3)]
    )
    return [yolo_slide]
|