mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 13:43:02 +02:00
refactor(praca): fix ruff violations in visualize scripts
This commit is contained in:
parent
be31e9abd7
commit
47c7679222
@ -6,6 +6,8 @@ on a small example graph, rendering each algorithm step by step.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -33,6 +35,9 @@ OUTPUT_DIR = Path(__file__).resolve().parent / "videos"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
OUTPUT = str(OUTPUT_DIR / "q02_shortest_path.mp4")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
# Graph definition
|
||||
NODE_POS = {"S": (250, 280), "A": (550, 180), "B": (550, 450), "C": (850, 320)}
|
||||
EDGES_DIJKSTRA = [
|
||||
@ -101,13 +106,13 @@ def _draw_circle(
|
||||
|
||||
def _draw_line(
|
||||
frame: np.ndarray,
|
||||
x1: int,
|
||||
y1: int,
|
||||
x2: int,
|
||||
y2: int,
|
||||
start: tuple[int, int],
|
||||
end: tuple[int, int],
|
||||
color: tuple[int, ...],
|
||||
thickness: int = 2,
|
||||
) -> None:
|
||||
x1, y1 = start
|
||||
x2, y2 = end
|
||||
length = max(int(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)), 1)
|
||||
for i in range(length):
|
||||
frac = i / length
|
||||
@ -122,13 +127,13 @@ def _draw_line(
|
||||
|
||||
def _draw_arrow(
|
||||
frame: np.ndarray,
|
||||
x1: int,
|
||||
y1: int,
|
||||
x2: int,
|
||||
y2: int,
|
||||
start: tuple[int, int],
|
||||
end: tuple[int, int],
|
||||
color: tuple[int, ...],
|
||||
thickness: int = 2,
|
||||
) -> None:
|
||||
x1, y1 = start
|
||||
x2, y2 = end
|
||||
r = 32
|
||||
length = max(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2), 1)
|
||||
ddx = (x2 - x1) / length
|
||||
@ -137,14 +142,14 @@ def _draw_arrow(
|
||||
sy = int(y1 + ddy * r)
|
||||
ex = int(x2 - ddx * r)
|
||||
ey = int(y2 - ddy * r)
|
||||
_draw_line(frame, sx, sy, ex, ey, color, thickness)
|
||||
_draw_line(frame, (sx, sy), (ex, ey), color, thickness)
|
||||
angle = np.arctan2(ey - sy, ex - sx)
|
||||
arrow_len = 12
|
||||
for side in [-1, 1]:
|
||||
a = angle + np.pi + side * 0.4
|
||||
ax = int(ex + arrow_len * np.cos(a))
|
||||
ay = int(ey + arrow_len * np.sin(a))
|
||||
_draw_line(frame, ex, ey, ax, ay, color, thickness)
|
||||
_draw_line(frame, (ex, ey), (ax, ay), color, thickness)
|
||||
|
||||
|
||||
def _render_graph(
|
||||
@ -163,7 +168,7 @@ def _render_graph(
|
||||
sx, sy = nodes[src]
|
||||
dx, dy = nodes[dst]
|
||||
ec = COL_EDGE_ACT if active_edge == (src, dst) else COL_EDGE
|
||||
_draw_arrow(frame, sx, sy, dx, dy, ec, thickness=2)
|
||||
_draw_arrow(frame, (sx, sy), (dx, dy), ec, thickness=2)
|
||||
|
||||
for name, (x, y) in nodes.items():
|
||||
if name == current:
|
||||
@ -184,19 +189,32 @@ def _render_graph(
|
||||
return frame
|
||||
|
||||
|
||||
@dataclass
|
||||
class _StepConfig:
|
||||
"""Configuration for a single algorithm visualization step."""
|
||||
|
||||
nodes: dict[str, tuple[int, int]]
|
||||
edges: list[tuple[str, str, int]]
|
||||
distances: dict[str, str]
|
||||
current: str | None = None
|
||||
visited: set[str] | None = None
|
||||
active_edge: tuple[str, str] | None = None
|
||||
step_text: str = ""
|
||||
algo_name: str = ""
|
||||
|
||||
|
||||
def _make_step(
|
||||
nodes: dict[str, tuple[int, int]],
|
||||
edges: list[tuple[str, str, int]],
|
||||
distances: dict[str, str],
|
||||
current: str | None = None,
|
||||
visited: set[str] | None = None,
|
||||
active_edge: tuple[str, str] | None = None,
|
||||
step_text: str = "",
|
||||
algo_name: str = "",
|
||||
cfg: _StepConfig,
|
||||
duration: float = STEP_DUR,
|
||||
) -> CompositeVideoClip:
|
||||
if visited is None:
|
||||
visited = set()
|
||||
nodes = cfg.nodes
|
||||
edges = cfg.edges
|
||||
distances = cfg.distances
|
||||
current = cfg.current
|
||||
visited = cfg.visited if cfg.visited is not None else set()
|
||||
active_edge = cfg.active_edge
|
||||
step_text = cfg.step_text
|
||||
algo_name = cfg.algo_name
|
||||
|
||||
graph_frame = _render_graph(nodes, edges, distances, current, visited, active_edge)
|
||||
|
||||
@ -305,6 +323,7 @@ def _dijkstra_steps() -> list[CompositeVideoClip]:
|
||||
e = EDGES_DIJKSTRA
|
||||
return [
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": INF, "B": INF, "C": INF},
|
||||
@ -312,7 +331,9 @@ def _dijkstra_steps() -> list[CompositeVideoClip]:
|
||||
step_text="Inicjalizacja: d[S]=0, reszta=∞. Wybierz S (min d).",
|
||||
algo_name="Algorytm Dijkstry",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": INF},
|
||||
@ -321,7 +342,9 @@ def _dijkstra_steps() -> list[CompositeVideoClip]:
|
||||
step_text="Relaksacja S→A: d[A]=0+2=2. S→B: d[B]=0+5=5.",
|
||||
algo_name="Algorytm Dijkstry",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
@ -331,25 +354,36 @@ def _dijkstra_steps() -> list[CompositeVideoClip]:
|
||||
step_text="Zamknij S. Min=A(2). Relaksacja A→C: d[C]=2+3=5.",
|
||||
algo_name="Algorytm Dijkstry",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
current="B",
|
||||
visited={"S", "A"},
|
||||
active_edge=("B", "A"),
|
||||
step_text="Zamknij A. Min=B(5). B→A: 5+1=6>2, nie zmieniaj. B→C: 5+6=11>5.",
|
||||
step_text=(
|
||||
"Zamknij A. Min=B(5). B→A: 5+1=6>2, "
|
||||
"nie zmieniaj. B→C: 5+6=11>5."
|
||||
),
|
||||
algo_name="Algorytm Dijkstry",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
current="C",
|
||||
visited={"S", "A", "B"},
|
||||
step_text="Zamknij B. Min=C(5). Koniec! Wynik: d={S:0, A:2, B:5, C:5}.",
|
||||
step_text=(
|
||||
"Zamknij B. Min=C(5). Koniec! "
|
||||
"Wynik: d={S:0, A:2, B:5, C:5}."
|
||||
),
|
||||
algo_name="Dijkstra -- WYNIK",
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -358,43 +392,68 @@ def _bellman_ford_steps() -> list[CompositeVideoClip]:
|
||||
e = EDGES_BF
|
||||
return [
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": INF, "B": INF, "C": INF},
|
||||
step_text="Bellman-Ford: relaksuj WSZYSTKIE krawędzie V-1=3 razy. Ujemne wagi OK!",
|
||||
step_text=(
|
||||
"Bellman-Ford: relaksuj WSZYSTKIE "
|
||||
"krawędzie V-1=3 razy. Ujemne wagi OK!"
|
||||
),
|
||||
algo_name="Algorytm Bellmana-Forda",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
active_edge=("S", "A"),
|
||||
step_text="Iteracja 1: S→A:2, A→C:5, S→B:5. Potem B→A: 5+(-4)=1 < 2 → A=1!",
|
||||
step_text=(
|
||||
"Iteracja 1: S→A:2, A→C:5, S→B:5. "
|
||||
"Potem B→A: 5+(-4)=1 < 2 → A=1!"
|
||||
),
|
||||
algo_name="Bellman-Ford -- iteracja 1",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "1", "B": "5", "C": "5"},
|
||||
active_edge=("B", "A"),
|
||||
step_text="B→A z ujemną wagą -4: d[A] poprawione z 2 na 1! (Dijkstra by to pominął!)",
|
||||
step_text=(
|
||||
"B→A z ujemną wagą -4: d[A] poprawione "
|
||||
"z 2 na 1! (Dijkstra by to pominął!)"
|
||||
),
|
||||
algo_name="Bellman-Ford -- ujemna waga",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "1", "B": "5", "C": "4"},
|
||||
active_edge=("A", "C"),
|
||||
step_text="Iteracja 2: A→C: 1+3=4 < 5 → C=4. Propagacja poprawionego A.",
|
||||
step_text=(
|
||||
"Iteracja 2: A→C: 1+3=4 < 5 → C=4. "
|
||||
"Propagacja poprawionego A."
|
||||
),
|
||||
algo_name="Bellman-Ford -- iteracja 2",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "1", "B": "5", "C": "4"},
|
||||
step_text="Iteracja 3: brak zmian. V-ta iteracja: brak popraw → brak cyklu ujemnego.",
|
||||
step_text=(
|
||||
"Iteracja 3: brak zmian. V-ta iteracja: "
|
||||
"brak popraw → brak cyklu ujemnego."
|
||||
),
|
||||
algo_name="Bellman-Ford -- WYNIK, O(V*E)",
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -403,41 +462,61 @@ def _astar_steps() -> list[CompositeVideoClip]:
|
||||
e = EDGES_DIJKSTRA
|
||||
return [
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": INF, "B": INF, "C": INF},
|
||||
current="S",
|
||||
step_text="A*: f(n)=g(n)+h(n). Cel=C. h(S)=5, h(A)=3, h(B)=4, h(C)=0. f(S)=0+5=5.",
|
||||
step_text=(
|
||||
"A*: f(n)=g(n)+h(n). Cel=C. "
|
||||
"h(S)=5, h(A)=3, h(B)=4, h(C)=0. f(S)=0+5=5."
|
||||
),
|
||||
algo_name="Algorytm A*",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": INF},
|
||||
current="S",
|
||||
active_edge=("S", "A"),
|
||||
step_text="Relaksuj S: A(g=2,f=2+3=5), B(g=5,f=5+4=9). Min f → A(5).",
|
||||
step_text=(
|
||||
"Relaksuj S: A(g=2,f=2+3=5), "
|
||||
"B(g=5,f=5+4=9). Min f → A(5)."
|
||||
),
|
||||
algo_name="A* -- rozwijanie S",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
current="A",
|
||||
visited={"S"},
|
||||
active_edge=("A", "C"),
|
||||
step_text="Rozwiń A(f=5): A→C: g=2+3=5, f=5+0=5. Min f → C(5) = CEL!",
|
||||
step_text=(
|
||||
"Rozwiń A(f=5): A→C: g=2+3=5, "
|
||||
"f=5+0=5. Min f → C(5) = CEL!"
|
||||
),
|
||||
algo_name="A* -- rozwijanie A",
|
||||
),
|
||||
),
|
||||
_make_step(
|
||||
_StepConfig(
|
||||
n,
|
||||
e,
|
||||
{"S": "0", "A": "2", "B": "5", "C": "5"},
|
||||
current="C",
|
||||
visited={"S", "A"},
|
||||
step_text="Dotarliśmy do C! Koszt=5. A* NIE przetwarza B (3 vs 4 w Dijkstrze).",
|
||||
step_text=(
|
||||
"Dotarliśmy do C! Koszt=5. "
|
||||
"A* NIE przetwarza B (3 vs 4 w Dijkstrze)."
|
||||
),
|
||||
algo_name="A* -- cel osiągnięty!",
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -523,7 +602,7 @@ def main() -> None:
|
||||
final.write_videofile(
|
||||
OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4
|
||||
)
|
||||
print(f"Video saved to: {OUTPUT}")
|
||||
_logger.info("Video saved to: %s", OUTPUT)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -10,6 +10,7 @@ Creates animated video demonstrating:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -37,6 +38,9 @@ OUTPUT_DIR = Path(__file__).resolve().parent / "videos"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
OUTPUT = str(OUTPUT_DIR / "q23_segmentation.mp4")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
BG_COLOR = (15, 20, 35)
|
||||
rng = np.random.default_rng(42)
|
||||
|
||||
@ -102,6 +106,25 @@ def _text_slide(
|
||||
)
|
||||
|
||||
|
||||
def _compose_slide(
    base_clip: VideoClip,
    labels: list[tuple[str, int, str, str, tuple[int, int]]],
    duration: float,
) -> CompositeVideoClip:
    """Overlay text labels on an animated base clip.

    Args:
        base_clip: Clip placed first in the composite's layer list.
        labels: ``(text, font_size, color, font, position)`` tuples; each one
            becomes a text clip built with ``_tc``.
        duration: Duration given to every label clip.

    Returns:
        A ``(W, H)`` composite of the base clip followed by all label clips,
        with ``FadeIn(0.3)`` and ``FadeOut(0.3)`` effects applied.
    """
    label_clips = [
        _tc(text=text, font_size=font_size, color=color, font=font)
        .with_duration(duration)
        .with_position(position)
        for text, font_size, color, font, position in labels
    ]
    composite = CompositeVideoClip([base_clip, *label_clips], size=(W, H))
    return composite.with_effects([FadeIn(0.3), FadeOut(0.3)])
|
||||
|
||||
|
||||
# ── Segmentation concept ─────────────────────────────────────────
|
||||
def _segmentation_concept() -> list[CompositeVideoClip]:
|
||||
"""Show what segmentation is: pixel-level labeling."""
|
||||
@ -164,7 +187,8 @@ def _segmentation_concept() -> list[CompositeVideoClip]:
|
||||
("niebo | drzewo | droga | samochód", 18, "#90CAF9", FONT_R, (600, 420)),
|
||||
("Segmentacja = klasyfikacja per-piksel", 24, "#FFE082", FONT_B, (100, 500)),
|
||||
(
|
||||
"Semantic: klasy bez instancji | Instance: rozróżnia obiekty | Panoptic: oba",
|
||||
"Semantic: klasy bez instancji | Instance: "
|
||||
"rozróżnia obiekty | Panoptic: oba",
|
||||
16,
|
||||
"#78909C",
|
||||
FONT_R,
|
||||
@ -459,7 +483,8 @@ def _watershed_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
# Dam marker at ridge
|
||||
ridge_x = ox + int(0.5 * terrain_w)
|
||||
if water_level > 160:
|
||||
dam_visible_threshold = 160
|
||||
if water_level > dam_visible_threshold:
|
||||
frame[oy - water_level : oy - 140, ridge_x - 2 : ridge_x + 2] = (
|
||||
255,
|
||||
80,
|
||||
@ -495,7 +520,9 @@ def _watershed_demo() -> list[CompositeVideoClip]:
|
||||
(100, 160),
|
||||
),
|
||||
(
|
||||
"Problem: over-segmentation (za dużo regionów). Rozwiązanie: marker-controlled.",
|
||||
"Problem: over-segmentation "
|
||||
"(za dużo regionów). "
|
||||
"Rozwiązanie: marker-controlled.",
|
||||
16,
|
||||
"#A5D6A7",
|
||||
FONT_R,
|
||||
@ -526,84 +553,84 @@ def _watershed_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
|
||||
# ── U-Net Architecture ───────────────────────────────────────────
|
||||
def _unet_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate U-Net encoder-decoder architecture."""
|
||||
slides = []
|
||||
def _draw_unet_skips(
|
||||
frame: np.ndarray,
|
||||
enc_positions: list[tuple[int, int, int, int]],
|
||||
n_blocks: int,
|
||||
dec_x: int,
|
||||
skip_threshold: int,
|
||||
) -> None:
|
||||
"""Draw horizontal dashed skip-connection lines."""
|
||||
if n_blocks <= skip_threshold:
|
||||
return
|
||||
for i in range(min(n_blocks - 5, 4)):
|
||||
ey = enc_positions[i][1] + enc_positions[i][3] // 2
|
||||
ex_end = enc_positions[i][0] + enc_positions[i][2]
|
||||
for dash_x in range(ex_end + 10, dec_x - 10, 15):
|
||||
frame[ey : ey + 2, dash_x : dash_x + 8] = (255, 200, 50)
|
||||
|
||||
def make_unet_frame(t: float) -> np.ndarray:
|
||||
|
||||
def _make_unet_frame(t: float) -> np.ndarray:
    """Render a single U-Net animation frame.

    The diagram is revealed progressively: ``progress`` reaches 1.0 at 60% of
    ``STEP_DUR`` and maps to ``n_blocks`` in the range 1..9. Encoder blocks
    appear first, then the bottleneck (``n_blocks > 4``), then decoder blocks,
    and finally the skip connections drawn by ``_draw_unet_skips``.

    Args:
        t: Time within the clip.

    Returns:
        An ``(H, W, 3)`` uint8 frame filled with ``BG_COLOR`` plus the diagram.
    """
    frame = np.zeros((H, W, 3), dtype=np.uint8)
    frame[:] = BG_COLOR

    # U-shape: four encoder blocks shrinking downwards on the left, the same
    # sizes mirrored upwards on the right for the decoder.
    enc_sizes = [(80, 120), (60, 100), (45, 80), (30, 60)]
    dec_sizes = list(reversed(enc_sizes))
    enc_x = 150
    dec_x = 850
    y_offset = 120
    arrow_color = (150, 150, 170)

    progress = min(t / (STEP_DUR * 0.6), 1.0)
    n_blocks = int(progress * 8) + 1

    # Encoder column. Positions are recorded for every block (even those not
    # yet visible) because the skip-connection helper indexes into them.
    enc_border = (100, 180, 255)
    enc_positions: list[tuple[int, int, int, int]] = []
    for i, (bw, bh) in enumerate(enc_sizes):
        x = enc_x
        y = y_offset + i * 130
        enc_positions.append((x, y, bw, bh))
        if i >= n_blocks:
            continue
        frame[y : y + bh, x : x + bw] = (70, 130, 200)
        frame[y : y + 2, x : x + bw] = enc_border
        frame[y + bh - 2 : y + bh, x : x + bw] = enc_border
        frame[y : y + bh, x : x + 2] = enc_border
        frame[y : y + bh, x + bw - 2 : x + bw] = enc_border
        # NOTE(review): arrow assumed to be drawn only for visible blocks;
        # the nesting was reconstructed from a view without indentation.
        if i < len(enc_sizes) - 1:
            ax = x + bw // 2
            ay = y + bh + 10
            frame[ay : ay + 20, ax - 1 : ax + 2] = arrow_color

    # Bottleneck, shown once all four encoder blocks are visible.
    bx, by = 500, y_offset + 3 * 130 + 30
    encoder_count = 4
    if n_blocks > encoder_count:
        frame[by : by + 50, bx : bx + 25] = (200, 100, 80)
        frame[by : by + 2, bx : bx + 25] = (255, 140, 100)
        frame[by + 48 : by + 50, bx : bx + 25] = (255, 140, 100)

    # Decoder column, built bottom-up. The previously collected
    # ``dec_positions`` list was never read and has been dropped.
    dec_border = (120, 230, 150)
    for i, (bw, bh) in enumerate(dec_sizes):
        x = dec_x
        y = y_offset + (3 - i) * 130
        if n_blocks <= 4 + i + 1:
            continue
        frame[y : y + bh, x : x + bw] = (80, 200, 120)
        frame[y : y + 2, x : x + bw] = dec_border
        frame[y + bh - 2 : y + bh, x : x + bw] = dec_border
        frame[y : y + bh, x : x + 2] = dec_border
        frame[y : y + bh, x + bw - 2 : x + bw] = dec_border
        # NOTE(review): same nesting assumption as for the encoder arrows.
        if i < len(dec_sizes) - 1:
            ax = x + bw // 2
            ay = y - 30
            frame[ay : ay + 20, ax - 1 : ax + 2] = arrow_color

    # Skip connections (horizontal dashed lines).
    skip_threshold = 5
    _draw_unet_skips(frame, enc_positions, n_blocks, dec_x, skip_threshold)

    return frame
|
||||
|
||||
unet_clip = VideoClip(make_unet_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
text_clips: list[VideoClip] = [unet_clip]
|
||||
|
||||
def _unet_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate U-Net encoder-decoder architecture."""
|
||||
dur = STEP_DUR + 1
|
||||
unet_clip = VideoClip(_make_unet_frame, duration=dur).with_fps(FPS)
|
||||
labels = [
|
||||
("U-Net: Encoder-Decoder + Skip Connections", 28, "#FFE082", FONT_B, (80, 20)),
|
||||
(
|
||||
@ -649,34 +676,59 @@ def _unet_demo() -> list[CompositeVideoClip]:
|
||||
(80, 670),
|
||||
),
|
||||
]
|
||||
for text, fs, color, font, pos in labels:
|
||||
tc = (
|
||||
_tc(text=text, font_size=fs, color=color, font=font)
|
||||
.with_duration(STEP_DUR + 1)
|
||||
.with_position(pos)
|
||||
)
|
||||
text_clips.append(tc)
|
||||
|
||||
slides.append(
|
||||
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||||
[FadeIn(0.3), FadeOut(0.3)]
|
||||
)
|
||||
)
|
||||
return slides
|
||||
return [_compose_slide(unet_clip, labels, dur)]
|
||||
|
||||
|
||||
# ── FCN Architecture ─────────────────────────────────────────────
|
||||
def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate FCN step-by-step: FC → Conv 1x1 transformation."""
|
||||
slides = []
|
||||
def _draw_pipeline_blocks(
|
||||
frame: np.ndarray,
|
||||
blocks: list[
|
||||
tuple[tuple[int, int], tuple[int, int], tuple[int, int, int]]
|
||||
],
|
||||
n_visible: int,
|
||||
arrow_limit: int,
|
||||
) -> None:
|
||||
"""Draw coloured blocks with connecting arrows."""
|
||||
for i, ((bx, by), (bw, bh), color) in enumerate(blocks):
|
||||
if i < n_visible:
|
||||
frame[by : by + bh, bx : bx + bw] = color
|
||||
frame[by : by + 2, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
if i < arrow_limit:
|
||||
ax = bx + bw + 3
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170)
|
||||
|
||||
# Slide 1: Classic CNN vs FCN pipeline comparison
|
||||
def make_fcn_frame(t: float) -> np.ndarray:
|
||||
|
||||
def _draw_red_cross(
|
||||
frame: np.ndarray,
|
||||
x_start: int,
|
||||
width: int,
|
||||
top_y: int,
|
||||
height: int,
|
||||
) -> None:
|
||||
"""Draw a red X across the given rectangle."""
|
||||
for d in range(-2, 3):
|
||||
for step in range(height):
|
||||
x1 = x_start + int(step * width / height)
|
||||
y1 = top_y + step + d
|
||||
if 0 <= y1 < H and 0 <= x1 < W:
|
||||
frame[y1, x1] = (255, 80, 80)
|
||||
y2 = top_y + height - step + d
|
||||
if 0 <= y2 < H and 0 <= x1 < W:
|
||||
frame[y2, x1] = (255, 80, 80)
|
||||
|
||||
|
||||
def _make_fcn_frame(t: float) -> np.ndarray:
|
||||
"""Render a single FCN comparison frame."""
|
||||
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||||
frame[:] = BG_COLOR
|
||||
progress = min(t / (STEP_DUR * 0.8), 1.0)
|
||||
|
||||
# TOP: Classic CNN → FC → 1 label
|
||||
top_y = 140
|
||||
blocks_classic = [
|
||||
((80, top_y), (70, 50), (70, 130, 200)),
|
||||
@ -688,33 +740,13 @@ def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
((545, top_y), (80, 50), (200, 80, 80)),
|
||||
]
|
||||
n_top = min(int(progress * 7) + 1, 7)
|
||||
for i, ((bx, by), (bw, bh), color) in enumerate(blocks_classic):
|
||||
if i < n_top:
|
||||
frame[by : by + bh, bx : bx + bw] = color
|
||||
frame[by : by + 2, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
if i < 6:
|
||||
ax = bx + bw + 3
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170)
|
||||
arrow_limit = 6
|
||||
_draw_pipeline_blocks(frame, blocks_classic, n_top, arrow_limit)
|
||||
|
||||
# Red X over Flatten+FC when FCN appears
|
||||
if progress > 0.6:
|
||||
for d in range(-2, 3):
|
||||
for step in range(50):
|
||||
x1 = 385 + int(step * 135 / 50)
|
||||
y1 = top_y + step + d
|
||||
if 0 <= y1 < H and 0 <= x1 < W:
|
||||
frame[y1, x1] = (255, 80, 80)
|
||||
y2 = top_y + 50 - step + d
|
||||
if 0 <= y2 < H and 0 <= x1 < W:
|
||||
frame[y2, x1] = (255, 80, 80)
|
||||
cross_phase = 0.6
|
||||
if progress > cross_phase:
|
||||
_draw_red_cross(frame, 385, 135, top_y, 50)
|
||||
|
||||
# BOTTOM: FCN pipeline
|
||||
bot_y = 380
|
||||
blocks_fcn = [
|
||||
((80, bot_y), (70, 50), (70, 130, 200)),
|
||||
@ -725,26 +757,18 @@ def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
((480, bot_y), (75, 50), (200, 160, 80)),
|
||||
((580, bot_y), (80, 50), (100, 200, 100)),
|
||||
]
|
||||
if progress > 0.4:
|
||||
n_bot = min(int((progress - 0.4) / 0.6 * 7) + 1, 7)
|
||||
for i, ((bx, by), (bw, bh), color) in enumerate(blocks_fcn):
|
||||
if i < n_bot:
|
||||
frame[by : by + bh, bx : bx + bw] = color
|
||||
frame[by : by + 2, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
if i < 6:
|
||||
ax = bx + bw + 3
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170)
|
||||
fcn_phase = 0.4
|
||||
if progress > fcn_phase:
|
||||
n_bot = min(int((progress - fcn_phase) / 0.6 * 7) + 1, 7)
|
||||
_draw_pipeline_blocks(frame, blocks_fcn, n_bot, arrow_limit)
|
||||
|
||||
return frame
|
||||
|
||||
fcn_clip = VideoClip(make_fcn_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
|
||||
def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate FCN step-by-step: FC → Conv 1x1 transformation."""
|
||||
dur = STEP_DUR + 1
|
||||
fcn_clip = VideoClip(_make_fcn_frame, duration=dur).with_fps(FPS)
|
||||
labels = [
|
||||
("FCN: Fully Convolutional Network (2015)", 26, "#FFE082", FONT_B, (80, 20)),
|
||||
("KROK 1: Zamień FC → Conv 1x1", 18, "#A5D6A7", FONT_R, (80, 60)),
|
||||
@ -807,19 +831,7 @@ def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
(80, 640),
|
||||
),
|
||||
]
|
||||
text_clips: list[VideoClip] = [fcn_clip]
|
||||
for text, fs, color, font, pos in labels:
|
||||
tc = (
|
||||
_tc(text=text, font_size=fs, color=color, font=font)
|
||||
.with_duration(dur)
|
||||
.with_position(pos)
|
||||
)
|
||||
text_clips.append(tc)
|
||||
slides.append(
|
||||
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||||
[FadeIn(0.3), FadeOut(0.3)]
|
||||
)
|
||||
)
|
||||
slides = [_compose_slide(fcn_clip, labels, dur)]
|
||||
|
||||
# Slide 2: FCN skip connections step by step
|
||||
skip_lines = [
|
||||
@ -909,7 +921,8 @@ def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
(100, 555),
|
||||
),
|
||||
(
|
||||
"Im więcej skip connections → tym więcej detali z encodera → ostrzejszy wynik",
|
||||
"Im więcej skip connections → tym więcej "
|
||||
"detali z encodera → ostrzejszy wynik",
|
||||
17,
|
||||
"white",
|
||||
FONT_R,
|
||||
@ -922,18 +935,13 @@ def _fcn_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
|
||||
# ── DeepLab Architecture ─────────────────────────────────────────
|
||||
def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate DeepLab: dilated convolution + ASPP step by step."""
|
||||
slides = []
|
||||
|
||||
# Slide 1: Regular vs Dilated convolution
|
||||
def make_dilated_frame(t: float) -> np.ndarray:
|
||||
def _make_dilated_frame(t: float) -> np.ndarray:
|
||||
"""Render a dilated convolution comparison frame."""
|
||||
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||||
frame[:] = BG_COLOR
|
||||
progress = min(t / (STEP_DUR * 0.7), 1.0)
|
||||
|
||||
cell = 36
|
||||
# Draw three grids side by side for rate=1, rate=2, rate=3
|
||||
grids = [
|
||||
(
|
||||
"rate=1",
|
||||
@ -987,14 +995,11 @@ def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
break
|
||||
gy = 180
|
||||
grid_size = 7
|
||||
# Draw background grid
|
||||
for r in range(grid_size):
|
||||
for c in range(grid_size):
|
||||
x = gx + c * cell
|
||||
y = gy + r * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)
|
||||
|
||||
# Highlight filter positions
|
||||
for r, c in positions:
|
||||
x = gx + c * cell
|
||||
y = gy + r * cell
|
||||
@ -1004,8 +1009,60 @@ def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
return frame
|
||||
|
||||
dil_clip = VideoClip(make_dilated_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
|
||||
def _make_aspp_frame(t: float) -> np.ndarray:
    """Render a single ASPP module animation frame.

    ``progress`` reaches 1.0 at 70% of ``STEP_DUR``. The input block is always
    shown; the five parallel branches appear one by one; the concatenation
    column appears once ``progress > 0.6`` and the final conv block once
    ``progress > 0.8``.

    Args:
        t: Time within the clip.

    Returns:
        An ``(H, W, 3)`` uint8 frame filled with ``BG_COLOR`` plus the diagram.
    """
    frame = np.zeros((H, W, 3), dtype=np.uint8)
    frame[:] = BG_COLOR
    progress = min(t / (STEP_DUR * 0.7), 1.0)
    arrow_color = (150, 150, 170)

    # Input feature map on the left, with top and bottom highlight edges.
    frame[250:330, 50:130] = (70, 130, 200)
    frame[250:252, 50:130] = (120, 180, 255)
    frame[328:330, 50:130] = (120, 180, 255)

    # Parallel branches as ((x, y), (width, height), color).
    branches = [
        ((200, 170), (100, 40), (80, 200, 120)),  # 1x1 conv
        ((200, 250), (100, 40), (200, 160, 80)),  # rate=6
        ((200, 330), (100, 40), (200, 120, 60)),  # rate=12
        ((200, 410), (100, 40), (180, 100, 80)),  # rate=18
        ((200, 490), (100, 40), (160, 80, 160)),  # GAP
    ]
    n_branches = min(int(progress * 5) + 1, 5)
    visible = [b for i, b in enumerate(branches) if i < n_branches]

    for (bx, by), (bw, bh), color in visible:
        frame[by : by + bh, bx : bx + bw] = color
        frame[by : by + 2, bx : bx + bw] = tuple(min(c + 50, 255) for c in color)
        # Arrow from the input block to this branch.
        ay = by + bh // 2
        frame[ay - 1 : ay + 2, 133:197] = arrow_color

    # Concatenation column plus arrows from each visible branch into it.
    concat_phase = 0.6
    if progress > concat_phase:
        frame[250:530, 380:420] = (50, 60, 80)
        frame[250:252, 380:420] = (200, 200, 100)
        frame[528:530, 380:420] = (200, 200, 100)
        for (bx, by), (bw, bh), _color in visible:
            ay = by + bh // 2
            frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = arrow_color

    # Final conv block after the concatenation, with its incoming arrow.
    final_conv_phase = 0.8
    if progress > final_conv_phase:
        frame[350:420, 450:550] = (100, 200, 100)
        frame[350:352, 450:550] = (150, 230, 150)
        frame[418:420, 450:550] = (150, 230, 150)
        frame[388:391, 423:448] = arrow_color

    return frame
|
||||
|
||||
|
||||
def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate DeepLab: dilated convolution + ASPP step by step."""
|
||||
dur = STEP_DUR + 1
|
||||
|
||||
# Slide 1: Regular vs Dilated convolution
|
||||
dil_clip = VideoClip(_make_dilated_frame, duration=dur).with_fps(FPS)
|
||||
labels = [
|
||||
("DeepLab: Atrous (Dilated) Convolution", 26, "#FFE082", FONT_B, (80, 20)),
|
||||
(
|
||||
@ -1032,7 +1089,8 @@ def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
(80, 510),
|
||||
),
|
||||
(
|
||||
"TE SAME 9 wag → WIĘKSZE pole widzenia → lepszy kontekst BEZ dodatkowych parametrów!",
|
||||
"TE SAME 9 wag → WIĘKSZE pole widzenia "
|
||||
"→ lepszy kontekst BEZ dodatkowych parametrów!",
|
||||
16,
|
||||
"white",
|
||||
FONT_R,
|
||||
@ -1046,72 +1104,10 @@ def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
(80, 600),
|
||||
),
|
||||
]
|
||||
text_clips: list[VideoClip] = [dil_clip]
|
||||
for text, fs, color, font, pos in labels:
|
||||
tc = (
|
||||
_tc(text=text, font_size=fs, color=color, font=font)
|
||||
.with_duration(dur)
|
||||
.with_position(pos)
|
||||
)
|
||||
text_clips.append(tc)
|
||||
slides.append(
|
||||
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||||
[FadeIn(0.3), FadeOut(0.3)]
|
||||
)
|
||||
)
|
||||
slides = [_compose_slide(dil_clip, labels, dur)]
|
||||
|
||||
# Slide 2: ASPP module step by step
|
||||
def make_aspp_frame(t: float) -> np.ndarray:
|
||||
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||||
frame[:] = BG_COLOR
|
||||
progress = min(t / (STEP_DUR * 0.7), 1.0)
|
||||
|
||||
# Input feature map on left
|
||||
frame[250:330, 50:130] = (70, 130, 200)
|
||||
frame[250:252, 50:130] = (120, 180, 255)
|
||||
frame[328:330, 50:130] = (120, 180, 255)
|
||||
|
||||
# ASPP parallel branches
|
||||
branches = [
|
||||
("1x1 conv", 250, (200, 170), (100, 40), (80, 200, 120)),
|
||||
("rate=6", 310, (200, 250), (100, 40), (200, 160, 80)),
|
||||
("rate=12", 370, (200, 330), (100, 40), (200, 120, 60)),
|
||||
("rate=18", 430, (200, 410), (100, 40), (180, 100, 80)),
|
||||
("GAP", 490, (200, 490), (100, 40), (160, 80, 160)),
|
||||
]
|
||||
n_branches = min(int(progress * 5) + 1, 5)
|
||||
for i, (_lbl, _h, (bx, by), (bw, bh), color) in enumerate(branches):
|
||||
if i < n_branches:
|
||||
frame[by : by + bh, bx : bx + bw] = color
|
||||
frame[by : by + 2, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
# Arrow from input
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, 133:197] = (150, 150, 170)
|
||||
|
||||
# Concatenation box
|
||||
if progress > 0.6:
|
||||
frame[250:530, 380:420] = (50, 60, 80)
|
||||
frame[250:252, 380:420] = (200, 200, 100)
|
||||
frame[528:530, 380:420] = (200, 200, 100)
|
||||
# Arrows from branches to concat
|
||||
for i, (_lbl, _h, (bx, by), (bw, bh), _c) in enumerate(branches):
|
||||
if i < n_branches:
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = (150, 150, 170)
|
||||
|
||||
# Final conv after concat
|
||||
if progress > 0.8:
|
||||
frame[350:420, 450:550] = (100, 200, 100)
|
||||
frame[350:352, 450:550] = (150, 230, 150)
|
||||
frame[418:420, 450:550] = (150, 230, 150)
|
||||
# Arrow from concat
|
||||
frame[388:391, 423:448] = (150, 150, 170)
|
||||
|
||||
return frame
|
||||
|
||||
aspp_clip = VideoClip(make_aspp_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
aspp_clip = VideoClip(_make_aspp_frame, duration=dur).with_fps(FPS)
|
||||
labels2 = [
|
||||
(
|
||||
"DeepLab: ASPP (Atrous Spatial Pyramid Pooling)",
|
||||
@ -1163,71 +1159,69 @@ def _deeplab_demo() -> list[CompositeVideoClip]:
|
||||
(80, 645),
|
||||
),
|
||||
]
|
||||
text_clips2: list[VideoClip] = [aspp_clip]
|
||||
for text, fs, color, font, pos in labels2:
|
||||
tc = (
|
||||
_tc(text=text, font_size=fs, color=color, font=font)
|
||||
.with_duration(dur)
|
||||
.with_position(pos)
|
||||
)
|
||||
text_clips2.append(tc)
|
||||
slides.append(
|
||||
CompositeVideoClip(text_clips2, size=(W, H)).with_effects(
|
||||
[FadeIn(0.3), FadeOut(0.3)]
|
||||
)
|
||||
)
|
||||
slides.append(_compose_slide(aspp_clip, labels2, dur))
|
||||
|
||||
return slides
|
||||
|
||||
|
||||
# ── Transformer Segmentation ────────────────────────────────────
|
||||
def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate transformer-based segmentation: self-attention concept."""
|
||||
slides = []
|
||||
|
||||
# Slide 1: CNN local vs Transformer global
|
||||
def make_attention_frame(t: float) -> np.ndarray:
|
||||
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||||
frame[:] = BG_COLOR
|
||||
progress = min(t / (STEP_DUR * 0.7), 1.0)
|
||||
|
||||
cell = 40
|
||||
grid_n = 6
|
||||
|
||||
# LEFT: CNN — local receptive field
|
||||
lx, ly = 60, 200
|
||||
def _draw_base_grid(
|
||||
frame: np.ndarray, gx: int, gy: int, grid_n: int, cell: int,
|
||||
) -> None:
|
||||
"""Draw an empty grid of cells."""
|
||||
for r in range(grid_n):
|
||||
for c in range(grid_n):
|
||||
x = lx + c * cell
|
||||
y = ly + r * cell
|
||||
x = gx + c * cell
|
||||
y = gy + r * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)
|
||||
|
||||
# Highlight 3x3 kernel in CNN
|
||||
if progress > 0.2:
|
||||
cx, cy = 2, 2 # center cell
|
||||
|
||||
def _draw_cnn_kernel(
|
||||
frame: np.ndarray, lx: int, ly: int, cell: int, progress: float,
|
||||
) -> None:
|
||||
"""Highlight a 3x3 CNN kernel on the grid."""
|
||||
cnn_phase = 0.2
|
||||
if progress <= cnn_phase:
|
||||
return
|
||||
cx, cy = 2, 2
|
||||
for dr in range(-1, 2):
|
||||
for dc in range(-1, 2):
|
||||
r, c = cy + dr, cx + dc
|
||||
x = lx + c * cell
|
||||
y = ly + r * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200)
|
||||
# Center highlighted more
|
||||
x = lx + cx * cell
|
||||
y = ly + cy * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (120, 180, 255)
|
||||
|
||||
# RIGHT: Transformer — global attention
|
||||
rx, ry = 680, 200
|
||||
for r in range(grid_n):
|
||||
for c in range(grid_n):
|
||||
x = rx + c * cell
|
||||
y = ry + r * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55)
|
||||
|
||||
# All cells connected to center
|
||||
if progress > 0.4:
|
||||
def _draw_conn_line(
    frame: np.ndarray, x0: int, y0: int, x1: int, y1: int,
) -> None:
    """Draw a dotted connection line from (x0, y0) towards (x1, y1).

    Every third step along the dominant axis gets a single pixel in the
    fixed colour (200, 180, 50). Points failing the bounds test against
    the module-level ``W`` and ``H`` are skipped. The end point itself is
    never plotted because the step range stops before ``steps``.
    """
    delta_x = x1 - x0
    delta_y = y1 - y0
    steps = max(abs(delta_x), abs(delta_y))
    if steps <= 0:
        # Start and end coincide: nothing to draw.
        return

    for step in range(0, steps, 3):
        col = x0 + int(delta_x * step / steps)
        row = y0 + int(delta_y * step / steps)
        inside = 0 <= col < W - 1 and 0 <= row < H - 1
        if not inside:
            continue
        frame[row : row + 1, col : col + 1] = (200, 180, 50)
|
||||
|
||||
|
||||
def _draw_attention_connections(
|
||||
frame: np.ndarray,
|
||||
origin: tuple[int, int],
|
||||
grid_n: int,
|
||||
cell: int,
|
||||
progress: float,
|
||||
) -> None:
|
||||
"""Draw transformer self-attention connections on the grid."""
|
||||
rx, ry = origin
|
||||
transformer_phase = 0.4
|
||||
if progress <= transformer_phase:
|
||||
return
|
||||
cx_t, cy_t = 2, 2
|
||||
# Center cell
|
||||
x0 = rx + cx_t * cell + cell // 2
|
||||
y0 = ry + cy_t * cell + cell // 2
|
||||
n_connections = int(progress * 36)
|
||||
@ -1239,7 +1233,6 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
break
|
||||
x = rx + c * cell
|
||||
y = ry + r * cell
|
||||
# Color by "attention strength" — closer = stronger
|
||||
dist = abs(r - cy_t) + abs(c - cx_t)
|
||||
strength = max(30, 200 - dist * 30)
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (
|
||||
@ -1247,28 +1240,41 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
strength // 2,
|
||||
strength,
|
||||
)
|
||||
# Draw connection line
|
||||
x1 = x + cell // 2
|
||||
y1 = y + cell // 2
|
||||
steps = max(abs(x1 - x0), abs(y1 - y0))
|
||||
if steps > 0:
|
||||
for s in range(0, steps, 3):
|
||||
px = x0 + int((x1 - x0) * s / steps)
|
||||
py = y0 + int((y1 - y0) * s / steps)
|
||||
if 0 <= px < W - 1 and 0 <= py < H - 1:
|
||||
frame[py : py + 1, px : px + 1] = (200, 180, 50)
|
||||
_draw_conn_line(frame, x0, y0, x + cell // 2, y + cell // 2)
|
||||
else:
|
||||
continue
|
||||
break
|
||||
# Center highlighted strongly
|
||||
x = rx + cx_t * cell
|
||||
y = ry + cy_t * cell
|
||||
frame[y : y + cell - 2, x : x + cell - 2] = (255, 200, 50)
|
||||
|
||||
|
||||
def _make_attention_frame(t: float) -> np.ndarray:
    """Render one frame comparing CNN and Transformer receptive fields.

    The frame is filled with ``BG_COLOR``. A 6x6 grid with the CNN kernel
    highlight is drawn at (60, 200), and a second 6x6 grid with the
    attention connections at (680, 200).

    Args:
        t: Time passed to the frame function; it is mapped to a progress
            value capped at 1.0 via ``t / (STEP_DUR * 0.7)``.

    Returns:
        The rendered ``(H, W, 3)`` uint8 image.
    """
    canvas = np.full((H, W, 3), BG_COLOR, dtype=np.uint8)
    progress = min(t / (STEP_DUR * 0.7), 1.0)

    cell = 40
    grid_n = 6
    cnn_origin = (60, 200)
    attn_origin = (680, 200)

    # Left panel: local context of a convolution kernel.
    _draw_base_grid(canvas, cnn_origin[0], cnn_origin[1], grid_n, cell)
    _draw_cnn_kernel(canvas, cnn_origin[0], cnn_origin[1], cell, progress)

    # Right panel: attention connections drawn over a second grid.
    _draw_base_grid(canvas, attn_origin[0], attn_origin[1], grid_n, cell)
    _draw_attention_connections(canvas, attn_origin, grid_n, cell, progress)

    return canvas
|
||||
|
||||
att_clip = VideoClip(make_attention_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
|
||||
def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate transformer-based segmentation: self-attention concept."""
|
||||
dur = STEP_DUR + 1
|
||||
|
||||
# Slide 1: CNN local vs Transformer global
|
||||
att_clip = VideoClip(_make_attention_frame, duration=dur).with_fps(FPS)
|
||||
labels = [
|
||||
("Transformer: Self-Attention w segmentacji", 26, "#FFE082", FONT_B, (80, 20)),
|
||||
("CNN = LOKALNY kontekst", 18, "#64B5F6", FONT_B, (60, 160)),
|
||||
@ -1279,19 +1285,7 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
("piksel widzi WSZYSTKIE!", 14, "#FFE082", FONT_R, (680, 485)),
|
||||
("vs", 28, "#B0BEC5", FONT_B, (450, 300)),
|
||||
]
|
||||
text_clips: list[VideoClip] = [att_clip]
|
||||
for text, fs, color, font, pos in labels:
|
||||
tc = (
|
||||
_tc(text=text, font_size=fs, color=color, font=font)
|
||||
.with_duration(dur)
|
||||
.with_position(pos)
|
||||
)
|
||||
text_clips.append(tc)
|
||||
slides.append(
|
||||
CompositeVideoClip(text_clips, size=(W, H)).with_effects(
|
||||
[FadeIn(0.3), FadeOut(0.3)]
|
||||
)
|
||||
)
|
||||
slides = [_compose_slide(att_clip, labels, dur)]
|
||||
|
||||
# Slide 2: Self-attention Q/K/V step by step
|
||||
qkv_lines = [
|
||||
@ -1376,7 +1370,8 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]:
|
||||
(100, 610),
|
||||
),
|
||||
(
|
||||
"Mask2Former (2022): masked attention + unified (semantic+instance+panoptic)",
|
||||
"Mask2Former (2022): masked attention + "
|
||||
"unified (semantic+instance+panoptic)",
|
||||
16,
|
||||
"#CE93D8",
|
||||
FONT_R,
|
||||
@ -1520,12 +1515,16 @@ def _methods_comparison() -> CompositeVideoClip:
|
||||
]
|
||||
|
||||
clips: list[VideoClip] = [bg, title]
|
||||
mnemonic_col = 3
|
||||
for i, row in enumerate(rows):
|
||||
y_pos = 75 + i * 72
|
||||
col_x = [40, 210, 340, 660]
|
||||
for j, cell in enumerate(row):
|
||||
fs = 16 if i > 0 else 18
|
||||
color = "#64B5F6" if i == 0 else ("#E0E0E0" if j < 3 else "#FFE082")
|
||||
color = (
|
||||
"#64B5F6" if i == 0
|
||||
else ("#E0E0E0" if j < mnemonic_col else "#FFE082")
|
||||
)
|
||||
tc = (
|
||||
_tc(
|
||||
text=cell,
|
||||
@ -1620,7 +1619,7 @@ def main() -> None:
|
||||
final.write_videofile(
|
||||
OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4
|
||||
)
|
||||
print(f"Video saved to: {OUTPUT}")
|
||||
_logger.info("Video saved to: %s", OUTPUT)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -11,6 +11,7 @@ Creates animated video demonstrating:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
@ -40,6 +41,8 @@ OUTPUT = str(OUTPUT_DIR / "q24_object_detection.mp4")
|
||||
|
||||
BG_COLOR = (15, 20, 35)
|
||||
|
||||
_logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _tc(**kwargs: object) -> TextClip:
|
||||
"""TextClip wrapper that adds enough bottom margin to prevent clipping."""
|
||||
@ -203,7 +206,8 @@ def _hog_svm_demo() -> list[CompositeVideoClip]:
|
||||
frame[ay - 1 : ay + 2, ax : ax + 20] = (150, 150, 170)
|
||||
|
||||
# Show gradient computation example at bottom
|
||||
if progress > 0.2:
|
||||
gradient_phase = 0.2
|
||||
if progress > gradient_phase:
|
||||
# Mini pixel grid showing gradient computation
|
||||
gx, gy = 100, 430
|
||||
pixels = [50, 50, 200]
|
||||
@ -366,7 +370,8 @@ def _viola_jones_demo() -> list[CompositeVideoClip]:
|
||||
(80, 620),
|
||||
),
|
||||
(
|
||||
"Haar: kontrast jasna/ciemna | Integral Image: suma prostokąta O(1) = 4 odczyty",
|
||||
"Haar: kontrast jasna/ciemna | Integral Image: "
|
||||
"suma prostokąta O(1) = 4 odczyty",
|
||||
14,
|
||||
"#78909C",
|
||||
FONT_R,
|
||||
@ -474,7 +479,8 @@ def _rcnn_evolution() -> list[CompositeVideoClip]:
|
||||
("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)),
|
||||
("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)),
|
||||
(
|
||||
"Kluczowe innowacje: ROI Pooling → stały rozmiar | RPN → propozycje w sieci",
|
||||
"Kluczowe innowacje: ROI Pooling → stały rozmiar "
|
||||
"| RPN → propozycje w sieci",
|
||||
14,
|
||||
"#78909C",
|
||||
FONT_R,
|
||||
@ -527,13 +533,15 @@ def _rcnn_detailed() -> list[CompositeVideoClip]:
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
# Arrow down
|
||||
if i < 4:
|
||||
arrow_limit = 4
|
||||
if i < arrow_limit:
|
||||
ax = bx + bw // 2
|
||||
ay = by + bh + 5
|
||||
frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170)
|
||||
|
||||
# Illustration: many overlapping regions from Selective Search
|
||||
if progress > 0.2:
|
||||
overlay_phase = 0.2
|
||||
if progress > overlay_phase:
|
||||
rng_local = np.random.default_rng(42)
|
||||
n_boxes = min(int((progress - 0.2) * 15), 8)
|
||||
for i in range(n_boxes):
|
||||
@ -599,11 +607,48 @@ def _rcnn_detailed() -> list[CompositeVideoClip]:
|
||||
|
||||
|
||||
# ── ROI Pooling ──────────────────────────────────────────────────
|
||||
def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate ROI Pooling: key Fast R-CNN innovation."""
|
||||
slides = []
|
||||
|
||||
def make_roi_frame(t: float) -> np.ndarray:
|
||||
|
||||
def _draw_roi_pool_grid(frame: np.ndarray) -> None:
|
||||
"""Draw the 3x3 ROI pool grid with max-pooled feature values."""
|
||||
out_x, out_y = 400, 220
|
||||
out_cell = 50
|
||||
out_n = 3
|
||||
roi_r1, roi_c1 = 2, 1
|
||||
roi_r2, roi_c2 = 6, 5
|
||||
roi_h = roi_r2 - roi_r1
|
||||
roi_w = roi_c2 - roi_c1
|
||||
for r in range(out_n):
|
||||
for c in range(out_n):
|
||||
x = out_x + c * out_cell
|
||||
y = out_y + r * out_cell
|
||||
|
||||
# Compute the max from corresponding region
|
||||
src_r1 = roi_r1 + r * roi_h // out_n
|
||||
src_r2 = roi_r1 + (r + 1) * roi_h // out_n
|
||||
src_c1 = roi_c1 + c * roi_w // out_n
|
||||
src_c2 = roi_c1 + (c + 1) * roi_w // out_n
|
||||
max_val = 0
|
||||
for sr in range(src_r1, src_r2):
|
||||
for sc in range(src_c1, src_c2):
|
||||
v = 30 + ((sr * 7 + sc * 13 + 42) % 40)
|
||||
max_val = max(max_val, v)
|
||||
|
||||
frame[y : y + out_cell - 2, x : x + out_cell - 2] = (
|
||||
max_val,
|
||||
max_val + 20,
|
||||
max_val + 40,
|
||||
)
|
||||
frame[y : y + 2, x : x + out_cell - 2] = (80, 200, 120)
|
||||
frame[y + out_cell - 4 : y + out_cell - 2, x : x + out_cell - 2] = (
|
||||
80,
|
||||
200,
|
||||
120,
|
||||
)
|
||||
|
||||
|
||||
def _make_roi_frame(t: float) -> np.ndarray:
|
||||
"""Render a single frame for the ROI pooling animation."""
|
||||
frame = np.zeros((H, W, 3), dtype=np.uint8)
|
||||
frame[:] = BG_COLOR
|
||||
progress = min(t / (STEP_DUR * 0.7), 1.0)
|
||||
@ -638,46 +683,18 @@ def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||||
frame[ry2 - 2 : ry2, rx1:rx2] = (255, 200, 50)
|
||||
|
||||
# Arrow
|
||||
if progress > 0.3:
|
||||
arrow_phase = 0.3
|
||||
if progress > arrow_phase:
|
||||
frame[300:303, 310:380] = (150, 150, 170)
|
||||
|
||||
# Middle: ROI divided into 3x3 grid (output_size)
|
||||
if progress > 0.3:
|
||||
out_x, out_y = 400, 220
|
||||
out_cell = 50
|
||||
out_n = 3
|
||||
roi_h = roi_r2 - roi_r1
|
||||
roi_w = roi_c2 - roi_c1
|
||||
for r in range(out_n):
|
||||
for c in range(out_n):
|
||||
x = out_x + c * out_cell
|
||||
y = out_y + r * out_cell
|
||||
|
||||
# Compute the max from corresponding region
|
||||
src_r1 = roi_r1 + r * roi_h // out_n
|
||||
src_r2 = roi_r1 + (r + 1) * roi_h // out_n
|
||||
src_c1 = roi_c1 + c * roi_w // out_n
|
||||
src_c2 = roi_c1 + (c + 1) * roi_w // out_n
|
||||
max_val = 0
|
||||
for sr in range(src_r1, src_r2):
|
||||
for sc in range(src_c1, src_c2):
|
||||
v = 30 + ((sr * 7 + sc * 13 + 42) % 40)
|
||||
max_val = max(max_val, v)
|
||||
|
||||
frame[y : y + out_cell - 2, x : x + out_cell - 2] = (
|
||||
max_val,
|
||||
max_val + 20,
|
||||
max_val + 40,
|
||||
)
|
||||
frame[y : y + 2, x : x + out_cell - 2] = (80, 200, 120)
|
||||
frame[y + out_cell - 4 : y + out_cell - 2, x : x + out_cell - 2] = (
|
||||
80,
|
||||
200,
|
||||
120,
|
||||
)
|
||||
grid_phase = 0.3
|
||||
if progress > grid_phase:
|
||||
_draw_roi_pool_grid(frame)
|
||||
|
||||
# Arrow to FC
|
||||
if progress > 0.6:
|
||||
fc_phase = 0.6
|
||||
if progress > fc_phase:
|
||||
frame[300:303, 560:630] = (150, 150, 170)
|
||||
# FC box
|
||||
frame[270:340, 650:730] = (200, 100, 80)
|
||||
@ -686,7 +703,12 @@ def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
return frame
|
||||
|
||||
roi_clip = VideoClip(make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
|
||||
def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||||
"""Animate ROI Pooling: key Fast R-CNN innovation."""
|
||||
slides = []
|
||||
|
||||
roi_clip = VideoClip(_make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS)
|
||||
dur = STEP_DUR + 1
|
||||
labels = [
|
||||
("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)),
|
||||
@ -731,7 +753,8 @@ def _roi_pooling_demo() -> list[CompositeVideoClip]:
|
||||
(80, 535),
|
||||
),
|
||||
(
|
||||
"Fast R-CNN: CNN raz → 1 feature mapa → ROI Pool 2000 regionów → 25x szybciej!",
|
||||
"Fast R-CNN: CNN raz → 1 feature mapa → "
|
||||
"ROI Pool 2000 regionów → 25x szybciej!",
|
||||
16,
|
||||
"#A5D6A7",
|
||||
FONT_R,
|
||||
@ -788,7 +811,6 @@ def _rpn_anchors_demo() -> list[CompositeVideoClip]:
|
||||
|
||||
# Draw anchors around center: 3 sizes x 3 ratios = 9
|
||||
anchor_specs = [
|
||||
# (half_w, half_h, color)
|
||||
(30, 30, (200, 80, 80)), # small 1:1
|
||||
(20, 40, (200, 60, 60)), # small 1:2
|
||||
(40, 20, (180, 60, 60)), # small 2:1
|
||||
@ -1014,7 +1036,8 @@ def _yolo_demo() -> list[CompositeVideoClip]:
|
||||
frame[y : y + 1, img_x : img_x + img_size] = (100, 100, 120)
|
||||
|
||||
# Highlight cells containing object centers
|
||||
if progress > 0.3:
|
||||
car_phase = 0.3
|
||||
if progress > car_phase:
|
||||
# Car center ~ cell (1, 1)
|
||||
cx, cy = 1, 2
|
||||
hx = img_x + cx * cell
|
||||
@ -1023,7 +1046,8 @@ def _yolo_demo() -> list[CompositeVideoClip]:
|
||||
frame[hy : hy + cell, hx : hx + cell].astype(int) + 40, 0, 255
|
||||
).astype(np.uint8)
|
||||
|
||||
if progress > 0.5:
|
||||
person_phase = 0.5
|
||||
if progress > person_phase:
|
||||
# Person center ~ cell (4, 4)
|
||||
cx, cy = 4, 4
|
||||
hx = img_x + cx * cell
|
||||
@ -1033,7 +1057,8 @@ def _yolo_demo() -> list[CompositeVideoClip]:
|
||||
).astype(np.uint8)
|
||||
|
||||
# Bounding boxes predictions from cells
|
||||
if progress > 0.6:
|
||||
bbox_phase = 0.6
|
||||
if progress > bbox_phase:
|
||||
# Car bbox
|
||||
for tt in range(2):
|
||||
frame[
|
||||
@ -1100,7 +1125,8 @@ def _yolo_demo() -> list[CompositeVideoClip]:
|
||||
(80, 620),
|
||||
),
|
||||
(
|
||||
"Two-stage (R-CNN): propozycje+klasyfikacja | One-stage (YOLO): bez propozycji!",
|
||||
"Two-stage (R-CNN): propozycje+klasyfikacja "
|
||||
"| One-stage (YOLO): bez propozycji!",
|
||||
14,
|
||||
"#90CAF9",
|
||||
FONT_R,
|
||||
@ -1152,13 +1178,15 @@ def _yolo_architecture() -> list[CompositeVideoClip]:
|
||||
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
if i < 4:
|
||||
arrow_limit = 4
|
||||
if i < arrow_limit:
|
||||
ax = bx + bw + 5
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, ax : ax + 25] = (150, 150, 170)
|
||||
|
||||
# Output tensor breakdown (right side)
|
||||
if progress > 0.6:
|
||||
tensor_phase = 0.6
|
||||
if progress > tensor_phase:
|
||||
# Show SxS grid
|
||||
gx, gy = 850, 180
|
||||
gs = 120
|
||||
@ -1282,18 +1310,21 @@ def _detr_demo() -> list[CompositeVideoClip]:
|
||||
frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple(
|
||||
min(c + 50, 255) for c in color
|
||||
)
|
||||
if i < 4:
|
||||
arrow_limit = 4
|
||||
if i < arrow_limit:
|
||||
ax = bx + bw + 5
|
||||
ay = by + bh // 2
|
||||
frame[ay - 1 : ay + 2, ax : ax + 25] = (150, 150, 170)
|
||||
|
||||
# Object queries illustration (right side)
|
||||
if progress > 0.5:
|
||||
query_phase = 0.5
|
||||
if progress > query_phase:
|
||||
qx, qy = 800, 140
|
||||
for i in range(6):
|
||||
y = qy + i * 50
|
||||
w = 130
|
||||
active = i < 3
|
||||
active_limit = 3
|
||||
active = i < active_limit
|
||||
color = (80, 180, 120) if active else (60, 50, 50)
|
||||
frame[y : y + 35, qx : qx + w] = color
|
||||
frame[y : y + 1, qx : qx + w] = tuple(min(c + 40, 255) for c in color)
|
||||
@ -1528,7 +1559,8 @@ def _detr_demo() -> list[CompositeVideoClip]:
|
||||
(80, 540),
|
||||
),
|
||||
(
|
||||
" R-CNN (SS+CNN+SVM+NMS) → YOLO (backbone+head+NMS) → DETR (backbone+transformer)",
|
||||
" R-CNN (SS+CNN+SVM+NMS) → YOLO "
|
||||
"(backbone+head+NMS) → DETR (backbone+transformer)",
|
||||
14,
|
||||
"#90CAF9",
|
||||
FONT_R,
|
||||
@ -1572,15 +1604,18 @@ def _nms_iou_demo() -> list[CompositeVideoClip]:
|
||||
boxes.append((ox + 350, oy + 50, 100, 100, 0.40, (80, 180, 255)))
|
||||
|
||||
for i, (bx, by, bw, bh, _conf, color) in enumerate(boxes):
|
||||
if progress > 0.4 and i > 0 and i < 3:
|
||||
dc = color
|
||||
nms_phase = 0.4
|
||||
nms_limit = 3
|
||||
if progress > nms_phase and i > 0 and i < nms_limit:
|
||||
# After NMS, these get removed (shown as faded/crossed)
|
||||
color = (60, 40, 40)
|
||||
dc = (60, 40, 40)
|
||||
|
||||
for tt in range(2):
|
||||
frame[by - tt : by + bh + tt, bx - tt : bx - tt + 2] = color
|
||||
frame[by - tt : by + bh + tt, bx + bw + tt - 2 : bx + bw + tt] = color
|
||||
frame[by - tt : by - tt + 2, bx - tt : bx + bw + tt] = color
|
||||
frame[by + bh + tt - 2 : by + bh + tt, bx - tt : bx + bw + tt] = color
|
||||
frame[by - tt : by + bh + tt, bx - tt : bx - tt + 2] = dc
|
||||
frame[by - tt : by + bh + tt, bx + bw + tt - 2 : bx + bw + tt] = dc
|
||||
frame[by - tt : by - tt + 2, bx - tt : bx + bw + tt] = dc
|
||||
frame[by + bh + tt - 2 : by + bh + tt, bx - tt : bx + bw + tt] = dc
|
||||
|
||||
# IoU visualization on right side
|
||||
iou_x, iou_y = 700, 200
|
||||
@ -1884,7 +1919,7 @@ def main() -> None:
|
||||
final.write_videofile(
|
||||
OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4
|
||||
)
|
||||
print(f"Video saved to: {OUTPUT}")
|
||||
_logger.info("Video saved to: %s", OUTPUT)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Loading…
Reference in New Issue
Block a user