diff --git a/python_pkg/praca_magisterska_video/visualize_q02.py b/python_pkg/praca_magisterska_video/visualize_q02.py index fd5e859..717cac0 100644 --- a/python_pkg/praca_magisterska_video/visualize_q02.py +++ b/python_pkg/praca_magisterska_video/visualize_q02.py @@ -6,6 +6,8 @@ on a small example graph, rendering each algorithm step by step. from __future__ import annotations +from dataclasses import dataclass +import logging import os from pathlib import Path @@ -33,6 +35,9 @@ OUTPUT_DIR = Path(__file__).resolve().parent / "videos" OUTPUT_DIR.mkdir(parents=True, exist_ok=True) OUTPUT = str(OUTPUT_DIR / "q02_shortest_path.mp4") +logging.basicConfig(level=logging.INFO) +_logger = logging.getLogger(__name__) + # Graph definition NODE_POS = {"S": (250, 280), "A": (550, 180), "B": (550, 450), "C": (850, 320)} EDGES_DIJKSTRA = [ @@ -101,13 +106,13 @@ def _draw_circle( def _draw_line( frame: np.ndarray, - x1: int, - y1: int, - x2: int, - y2: int, + start: tuple[int, int], + end: tuple[int, int], color: tuple[int, ...], thickness: int = 2, ) -> None: + x1, y1 = start + x2, y2 = end length = max(int(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)), 1) for i in range(length): frac = i / length @@ -122,13 +127,13 @@ def _draw_line( def _draw_arrow( frame: np.ndarray, - x1: int, - y1: int, - x2: int, - y2: int, + start: tuple[int, int], + end: tuple[int, int], color: tuple[int, ...], thickness: int = 2, ) -> None: + x1, y1 = start + x2, y2 = end r = 32 length = max(np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2), 1) ddx = (x2 - x1) / length @@ -137,14 +142,14 @@ def _draw_arrow( sy = int(y1 + ddy * r) ex = int(x2 - ddx * r) ey = int(y2 - ddy * r) - _draw_line(frame, sx, sy, ex, ey, color, thickness) + _draw_line(frame, (sx, sy), (ex, ey), color, thickness) angle = np.arctan2(ey - sy, ex - sx) arrow_len = 12 for side in [-1, 1]: a = angle + np.pi + side * 0.4 ax = int(ex + arrow_len * np.cos(a)) ay = int(ey + arrow_len * np.sin(a)) - _draw_line(frame, ex, ey, ax, ay, color, thickness) + _draw_line(frame, (ex, ey), (ax, ay), color, thickness) def _render_graph( @@ -163,7 +168,7 @@ def _render_graph( sx, sy = nodes[src] dx, dy = nodes[dst] ec = COL_EDGE_ACT if active_edge == (src, dst) else COL_EDGE - _draw_arrow(frame, sx, sy, dx, dy, ec, thickness=2) + _draw_arrow(frame, (sx, sy), (dx, dy), ec, thickness=2) for name, (x, y) in nodes.items(): if name == current: @@ -184,19 +189,32 @@ def _render_graph( return frame +@dataclass +class _StepConfig: + """Configuration for a single algorithm visualization step.""" + + nodes: dict[str, tuple[int, int]] + edges: list[tuple[str, str, int]] + distances: dict[str, str] + current: str | None = None + visited: set[str] | None = None + active_edge: tuple[str, str] | None = None + step_text: str = "" + algo_name: str = "" + + def _make_step( - nodes: dict[str, tuple[int, int]], - edges: list[tuple[str, str, int]], - distances: dict[str, str], - current: str | None = None, - visited: set[str] | None = None, - active_edge: tuple[str, str] | None = None, - step_text: str = "", - algo_name: str = "", + cfg: _StepConfig, duration: float = STEP_DUR, ) -> CompositeVideoClip: - if visited is None: - visited = set() + nodes = cfg.nodes + edges = cfg.edges + distances = cfg.distances + current = cfg.current + visited = cfg.visited if cfg.visited is not None else set() + active_edge = cfg.active_edge + step_text = cfg.step_text + algo_name = cfg.algo_name graph_frame = _render_graph(nodes, edges, distances, current, visited, active_edge) @@ -305,50 +323,66 @@ def _dijkstra_steps() -> list[CompositeVideoClip]: e = EDGES_DIJKSTRA return [ _make_step( - n, - e, - {"S": "0", "A": INF, "B": INF, "C": INF}, - current="S", - step_text="Inicjalizacja: d[S]=0, reszta=∞. Wybierz S (min d).", - algo_name="Algorytm Dijkstry", + _StepConfig( + n, + e, + {"S": "0", "A": INF, "B": INF, "C": INF}, + current="S", + step_text="Inicjalizacja: d[S]=0, reszta=∞. Wybierz S (min d).", + algo_name="Algorytm Dijkstry", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": INF}, - current="S", - active_edge=("S", "A"), - step_text="Relaksacja S→A: d[A]=0+2=2. S→B: d[B]=0+5=5.", - algo_name="Algorytm Dijkstry", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": INF}, + current="S", + active_edge=("S", "A"), + step_text="Relaksacja S→A: d[A]=0+2=2. S→B: d[B]=0+5=5.", + algo_name="Algorytm Dijkstry", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - current="A", - visited={"S"}, - active_edge=("A", "C"), - step_text="Zamknij S. Min=A(2). Relaksacja A→C: d[C]=2+3=5.", - algo_name="Algorytm Dijkstry", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + current="A", + visited={"S"}, + active_edge=("A", "C"), + step_text="Zamknij S. Min=A(2). Relaksacja A→C: d[C]=2+3=5.", + algo_name="Algorytm Dijkstry", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - current="B", - visited={"S", "A"}, - active_edge=("B", "A"), - step_text="Zamknij A. Min=B(5). B→A: 5+1=6>2, nie zmieniaj. B→C: 5+6=11>5.", - algo_name="Algorytm Dijkstry", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + current="B", + visited={"S", "A"}, + active_edge=("B", "A"), + step_text=( + "Zamknij A. Min=B(5). B→A: 5+1=6>2, " + "nie zmieniaj. B→C: 5+6=11>5." + ), + algo_name="Algorytm Dijkstry", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - current="C", - visited={"S", "A", "B"}, - step_text="Zamknij B. Min=C(5). Koniec! Wynik: d={S:0, A:2, B:5, C:5}.", - algo_name="Dijkstra -- WYNIK", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + current="C", + visited={"S", "A", "B"}, + step_text=( + "Zamknij B. Min=C(5). Koniec! " + "Wynik: d={S:0, A:2, B:5, C:5}." + ), + algo_name="Dijkstra -- WYNIK", + ), ), ] @@ -358,42 +392,67 @@ def _bellman_ford_steps() -> list[CompositeVideoClip]: e = EDGES_BF return [ _make_step( - n, - e, - {"S": "0", "A": INF, "B": INF, "C": INF}, - step_text="Bellman-Ford: relaksuj WSZYSTKIE krawędzie V-1=3 razy. Ujemne wagi OK!", - algo_name="Algorytm Bellmana-Forda", + _StepConfig( + n, + e, + {"S": "0", "A": INF, "B": INF, "C": INF}, + step_text=( + "Bellman-Ford: relaksuj WSZYSTKIE " + "krawędzie V-1=3 razy. Ujemne wagi OK!" + ), + algo_name="Algorytm Bellmana-Forda", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - active_edge=("S", "A"), - step_text="Iteracja 1: S→A:2, A→C:5, S→B:5. Potem B→A: 5+(-4)=1 < 2 → A=1!", - algo_name="Bellman-Ford -- iteracja 1", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + active_edge=("S", "A"), + step_text=( + "Iteracja 1: S→A:2, A→C:5, S→B:5. " + "Potem B→A: 5+(-4)=1 < 2 → A=1!" + ), + algo_name="Bellman-Ford -- iteracja 1", + ), ), _make_step( - n, - e, - {"S": "0", "A": "1", "B": "5", "C": "5"}, - active_edge=("B", "A"), - step_text="B→A z ujemną wagą -4: d[A] poprawione z 2 na 1! (Dijkstra by to pominął!)", - algo_name="Bellman-Ford -- ujemna waga", + _StepConfig( + n, + e, + {"S": "0", "A": "1", "B": "5", "C": "5"}, + active_edge=("B", "A"), + step_text=( + "B→A z ujemną wagą -4: d[A] poprawione " + "z 2 na 1! (Dijkstra by to pominął!)" + ), + algo_name="Bellman-Ford -- ujemna waga", + ), ), _make_step( - n, - e, - {"S": "0", "A": "1", "B": "5", "C": "4"}, - active_edge=("A", "C"), - step_text="Iteracja 2: A→C: 1+3=4 < 5 → C=4. Propagacja poprawionego A.", - algo_name="Bellman-Ford -- iteracja 2", + _StepConfig( + n, + e, + {"S": "0", "A": "1", "B": "5", "C": "4"}, + active_edge=("A", "C"), + step_text=( + "Iteracja 2: A→C: 1+3=4 < 5 → C=4. " + "Propagacja poprawionego A." + ), + algo_name="Bellman-Ford -- iteracja 2", + ), ), _make_step( - n, - e, - {"S": "0", "A": "1", "B": "5", "C": "4"}, - step_text="Iteracja 3: brak zmian. V-ta iteracja: brak popraw → brak cyklu ujemnego.", - algo_name="Bellman-Ford -- WYNIK, O(V*E)", + _StepConfig( + n, + e, + {"S": "0", "A": "1", "B": "5", "C": "4"}, + step_text=( + "Iteracja 3: brak zmian. V-ta iteracja: " + "brak popraw → brak cyklu ujemnego." + ), + algo_name="Bellman-Ford -- WYNIK, O(V*E)", + ), ), ] @@ -403,40 +462,60 @@ def _astar_steps() -> list[CompositeVideoClip]: e = EDGES_DIJKSTRA return [ _make_step( - n, - e, - {"S": "0", "A": INF, "B": INF, "C": INF}, - current="S", - step_text="A*: f(n)=g(n)+h(n). Cel=C. h(S)=5, h(A)=3, h(B)=4, h(C)=0. f(S)=0+5=5.", - algo_name="Algorytm A*", + _StepConfig( + n, + e, + {"S": "0", "A": INF, "B": INF, "C": INF}, + current="S", + step_text=( + "A*: f(n)=g(n)+h(n). Cel=C. " + "h(S)=5, h(A)=3, h(B)=4, h(C)=0. f(S)=0+5=5." + ), + algo_name="Algorytm A*", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": INF}, - current="S", - active_edge=("S", "A"), - step_text="Relaksuj S: A(g=2,f=2+3=5), B(g=5,f=5+4=9). Min f → A(5).", - algo_name="A* -- rozwijanie S", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": INF}, + current="S", + active_edge=("S", "A"), + step_text=( + "Relaksuj S: A(g=2,f=2+3=5), " + "B(g=5,f=5+4=9). Min f → A(5)." + ), + algo_name="A* -- rozwijanie S", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - current="A", - visited={"S"}, - active_edge=("A", "C"), - step_text="Rozwiń A(f=5): A→C: g=2+3=5, f=5+0=5. Min f → C(5) = CEL!", - algo_name="A* -- rozwijanie A", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + current="A", + visited={"S"}, + active_edge=("A", "C"), + step_text=( + "Rozwiń A(f=5): A→C: g=2+3=5, " + "f=5+0=5. Min f → C(5) = CEL!" + ), + algo_name="A* -- rozwijanie A", + ), ), _make_step( - n, - e, - {"S": "0", "A": "2", "B": "5", "C": "5"}, - current="C", - visited={"S", "A"}, - step_text="Dotarliśmy do C! Koszt=5. A* NIE przetwarza B (3 vs 4 w Dijkstrze).", - algo_name="A* -- cel osiągnięty!", + _StepConfig( + n, + e, + {"S": "0", "A": "2", "B": "5", "C": "5"}, + current="C", + visited={"S", "A"}, + step_text=( + "Dotarliśmy do C! Koszt=5. " + "A* NIE przetwarza B (3 vs 4 w Dijkstrze)." + ), + algo_name="A* -- cel osiągnięty!", + ), ), ] @@ -523,7 +602,7 @@ def main() -> None: final.write_videofile( OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4 ) - print(f"Video saved to: {OUTPUT}") + _logger.info("Video saved to: %s", OUTPUT) if __name__ == "__main__": diff --git a/python_pkg/praca_magisterska_video/visualize_q23.py b/python_pkg/praca_magisterska_video/visualize_q23.py index 9981934..91894e8 100644 --- a/python_pkg/praca_magisterska_video/visualize_q23.py +++ b/python_pkg/praca_magisterska_video/visualize_q23.py @@ -10,6 +10,7 @@ Creates animated video demonstrating: from __future__ import annotations +import logging import os from pathlib import Path @@ -37,6 +38,9 @@ OUTPUT_DIR = Path(__file__).resolve().parent / "videos" OUTPUT_DIR.mkdir(parents=True, exist_ok=True) OUTPUT = str(OUTPUT_DIR / "q23_segmentation.mp4") +logging.basicConfig(level=logging.INFO) +_logger = logging.getLogger(__name__) + BG_COLOR = (15, 20, 35) rng = np.random.default_rng(42) @@ -102,6 +106,25 @@ def _text_slide( ) +def _compose_slide( + base_clip: VideoClip, + labels: list[tuple[str, int, str, str, tuple[int, int]]], + duration: float, +) -> CompositeVideoClip: + """Overlay text labels on an animated base clip.""" + text_clips: list[VideoClip] = [base_clip] + for text, fs, color, font, pos in labels: + tc = ( + _tc(text=text, font_size=fs, color=color, font=font) + .with_duration(duration) + .with_position(pos) + ) + text_clips.append(tc) + return CompositeVideoClip(text_clips, size=(W, H)).with_effects( + [FadeIn(0.3), FadeOut(0.3)] + ) + + # ── Segmentation concept ───────────────────────────────────────── def _segmentation_concept() -> list[CompositeVideoClip]: """Show what segmentation is: pixel-level labeling.""" @@ -164,7 +187,8 @@ def _segmentation_concept() -> list[CompositeVideoClip]: ("niebo | drzewo | droga | samochód", 18, "#90CAF9", FONT_R, (600, 420)), ("Segmentacja = klasyfikacja per-piksel", 24, "#FFE082", FONT_B, (100, 500)), ( - "Semantic: klasy bez instancji | Instance: rozróżnia obiekty | Panoptic: oba", + "Semantic: klasy bez instancji | Instance: " + "rozróżnia obiekty | Panoptic: oba", 16, "#78909C", FONT_R, @@ -459,7 +483,8 @@ def _watershed_demo() -> list[CompositeVideoClip]: # Dam marker at ridge ridge_x = ox + int(0.5 * terrain_w) - if water_level > 160: + dam_visible_threshold = 160 + if water_level > dam_visible_threshold: frame[oy - water_level : oy - 140, ridge_x - 2 : ridge_x + 2] = ( 255, 80, @@ -495,7 +520,9 @@ def _watershed_demo() -> list[CompositeVideoClip]: (100, 160), ), ( - "Problem: over-segmentation (za dużo regionów). Rozwiązanie: marker-controlled.", + "Problem: over-segmentation " + "(za dużo regionów). " + "Rozwiązanie: marker-controlled.", 16, "#A5D6A7", FONT_R, @@ -526,84 +553,84 @@ def _watershed_demo() -> list[CompositeVideoClip]: # ── U-Net Architecture ─────────────────────────────────────────── +def _draw_unet_skips( + frame: np.ndarray, + enc_positions: list[tuple[int, int, int, int]], + n_blocks: int, + dec_x: int, + skip_threshold: int, +) -> None: + """Draw horizontal dashed skip-connection lines.""" + if n_blocks <= skip_threshold: + return + for i in range(min(n_blocks - 5, 4)): + ey = enc_positions[i][1] + enc_positions[i][3] // 2 + ex_end = enc_positions[i][0] + enc_positions[i][2] + for dash_x in range(ex_end + 10, dec_x - 10, 15): + frame[ey : ey + 2, dash_x : dash_x + 8] = (255, 200, 50) + + +def _make_unet_frame(t: float) -> np.ndarray: + """Render a single U-Net animation frame.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + + enc_sizes = [(80, 120), (60, 100), (45, 80), (30, 60)] + dec_sizes = list(reversed(enc_sizes)) + enc_x = 150 + dec_x = 850 + + progress = min(t / (STEP_DUR * 0.6), 1.0) + n_blocks = int(progress * 8) + 1 + + enc_positions: list[tuple[int, int, int, int]] = [] + y_offset = 120 + for i, (bw, bh) in enumerate(enc_sizes): + x = enc_x + y = y_offset + i * 130 + enc_positions.append((x, y, bw, bh)) + if i < n_blocks: + frame[y : y + bh, x : x + bw] = (70, 130, 200) + frame[y : y + 2, x : x + bw] = (100, 180, 255) + frame[y + bh - 2 : y + bh, x : x + bw] = (100, 180, 255) + frame[y : y + bh, x : x + 2] = (100, 180, 255) + frame[y : y + bh, x + bw - 2 : x + bw] = (100, 180, 255) + if i < len(enc_sizes) - 1: + ax = x + bw // 2 + ay = y + bh + 10 + frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) + + bx, by = 500, y_offset + 3 * 130 + 30 + encoder_count = 4 + if n_blocks > encoder_count: + frame[by : by + 50, bx : bx + 25] = (200, 100, 80) + frame[by : by + 2, bx : bx + 25] = (255, 140, 100) + frame[by + 48 : by + 50, bx : bx + 25] = (255, 140, 100) + + for i, (bw, bh) in enumerate(dec_sizes): + x = dec_x + y = y_offset + (3 - i) * 130 + if n_blocks > 4 + i + 1: + frame[y : y + bh, x : x + bw] = (80, 200, 120) + frame[y : y + 2, x : x + bw] = (120, 230, 150) + frame[y + bh - 2 : y + bh, x : x + bw] = (120, 230, 150) + frame[y : y + bh, x : x + 2] = (120, 230, 150) + frame[y : y + bh, x + bw - 2 : x + bw] = (120, 230, 150) + if i < len(dec_sizes) - 1: + ax = x + bw // 2 + ay = y - 30 + frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) + + skip_threshold = 5 + _draw_unet_skips(frame, enc_positions, n_blocks, dec_x, skip_threshold) + + return frame + + def _unet_demo() -> list[CompositeVideoClip]: """Animate U-Net encoder-decoder architecture.""" - slides = [] - - def make_unet_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - - # Draw U-shape: encoder blocks going down, decoder going up - # Encoder: 4 blocks getting smaller - enc_sizes = [(80, 120), (60, 100), (45, 80), (30, 60)] - dec_sizes = list(reversed(enc_sizes)) - enc_x = 150 - dec_x = 850 - - progress = min(t / (STEP_DUR * 0.6), 1.0) - n_blocks = int(progress * 8) + 1 # 1 to 8 - - enc_positions = [] - y_offset = 120 - for i, (bw, bh) in enumerate(enc_sizes): - x = enc_x - y = y_offset + i * 130 - enc_positions.append((x, y, bw, bh)) - if i < n_blocks: - # Draw encoder block - frame[y : y + bh, x : x + bw] = (70, 130, 200) - # Border - frame[y : y + 2, x : x + bw] = (100, 180, 255) - frame[y + bh - 2 : y + bh, x : x + bw] = (100, 180, 255) - frame[y : y + bh, x : x + 2] = (100, 180, 255) - frame[y : y + bh, x + bw - 2 : x + bw] = (100, 180, 255) - - # Down arrow - if i < len(enc_sizes) - 1: - ax = x + bw // 2 - ay = y + bh + 10 - frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) - - # Bottleneck - bx, by = 500, y_offset + 3 * 130 + 30 - if n_blocks > 4: - frame[by : by + 50, bx : bx + 25] = (200, 100, 80) - frame[by : by + 2, bx : bx + 25] = (255, 140, 100) - frame[by + 48 : by + 50, bx : bx + 25] = (255, 140, 100) - - # Decoder - dec_positions = [] - for i, (bw, bh) in enumerate(dec_sizes): - x = dec_x - y = y_offset + (3 - i) * 130 - dec_positions.append((x, y, bw, bh)) - if n_blocks > 4 + i + 1: - frame[y : y + bh, x : x + bw] = (80, 200, 120) - frame[y : y + 2, x : x + bw] = (120, 230, 150) - frame[y + bh - 2 : y + bh, x : x + bw] = (120, 230, 150) - frame[y : y + bh, x : x + 2] = (120, 230, 150) - frame[y : y + bh, x + bw - 2 : x + bw] = (120, 230, 150) - - # Up arrow - if i < len(dec_sizes) - 1: - ax = x + bw // 2 - ay = y - 30 - frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) - - # Skip connections (horizontal dashed lines) - if n_blocks > 5: - for i in range(min(n_blocks - 5, 4)): - ey = enc_positions[i][1] + enc_positions[i][3] // 2 - ex_end = enc_positions[i][0] + enc_positions[i][2] - dx_start = dec_x - for dash_x in range(ex_end + 10, dx_start - 10, 15): - frame[ey : ey + 2, dash_x : dash_x + 8] = (255, 200, 50) - - return frame - - unet_clip = VideoClip(make_unet_frame, duration=STEP_DUR + 1).with_fps(FPS) - text_clips: list[VideoClip] = [unet_clip] + dur = STEP_DUR + 1 + unet_clip = VideoClip(_make_unet_frame, duration=dur).with_fps(FPS) labels = [ ("U-Net: Encoder-Decoder + Skip Connections", 28, "#FFE082", FONT_B, (80, 20)), ( @@ -649,102 +676,99 @@ def _unet_demo() -> list[CompositeVideoClip]: (80, 670), ), ] - for text, fs, color, font, pos in labels: - tc = ( - _tc(text=text, font_size=fs, color=color, font=font) - .with_duration(STEP_DUR + 1) - .with_position(pos) - ) - text_clips.append(tc) - - slides.append( - CompositeVideoClip(text_clips, size=(W, H)).with_effects( - [FadeIn(0.3), FadeOut(0.3)] - ) - ) - return slides + return [_compose_slide(unet_clip, labels, dur)] # ── FCN Architecture ───────────────────────────────────────────── +def _draw_pipeline_blocks( + frame: np.ndarray, + blocks: list[ + tuple[tuple[int, int], tuple[int, int], tuple[int, int, int]] + ], + n_visible: int, + arrow_limit: int, +) -> None: + """Draw coloured blocks with connecting arrows.""" + for i, ((bx, by), (bw, bh), color) in enumerate(blocks): + if i < n_visible: + frame[by : by + bh, bx : bx + bw] = color + frame[by : by + 2, bx : bx + bw] = tuple( + min(c + 50, 255) for c in color + ) + frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( + min(c + 50, 255) for c in color + ) + if i < arrow_limit: + ax = bx + bw + 3 + ay = by + bh // 2 + frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170) + + +def _draw_red_cross( + frame: np.ndarray, + x_start: int, + width: int, + top_y: int, + height: int, +) -> None: + """Draw a red X across the given rectangle.""" + for d in range(-2, 3): + for step in range(height): + x1 = x_start + int(step * width / height) + y1 = top_y + step + d + if 0 <= y1 < H and 0 <= x1 < W: + frame[y1, x1] = (255, 80, 80) + y2 = top_y + height - step + d + if 0 <= y2 < H and 0 <= x1 < W: + frame[y2, x1] = (255, 80, 80) + + +def _make_fcn_frame(t: float) -> np.ndarray: + """Render a single FCN comparison frame.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + progress = min(t / (STEP_DUR * 0.8), 1.0) + + top_y = 140 + blocks_classic = [ + ((80, top_y), (70, 50), (70, 130, 200)), + ((170, top_y), (50, 40), (50, 100, 160)), + ((240, top_y), (60, 50), (70, 130, 200)), + ((320, top_y), (40, 35), (50, 100, 160)), + ((385, top_y), (55, 50), (160, 80, 60)), + ((465, top_y), (55, 50), (180, 60, 60)), + ((545, top_y), (80, 50), (200, 80, 80)), + ] + n_top = min(int(progress * 7) + 1, 7) + arrow_limit = 6 + _draw_pipeline_blocks(frame, blocks_classic, n_top, arrow_limit) + + cross_phase = 0.6 + if progress > cross_phase: + _draw_red_cross(frame, 385, 135, top_y, 50) + + bot_y = 380 + blocks_fcn = [ + ((80, bot_y), (70, 50), (70, 130, 200)), + ((170, bot_y), (50, 40), (50, 100, 160)), + ((240, bot_y), (60, 50), (70, 130, 200)), + ((320, bot_y), (40, 35), (50, 100, 160)), + ((385, bot_y), (70, 50), (80, 200, 120)), + ((480, bot_y), (75, 50), (200, 160, 80)), + ((580, bot_y), (80, 50), (100, 200, 100)), + ] + fcn_phase = 0.4 + if progress > fcn_phase: + n_bot = min(int((progress - fcn_phase) / 0.6 * 7) + 1, 7) + _draw_pipeline_blocks(frame, blocks_fcn, n_bot, arrow_limit) + + return frame + + def _fcn_demo() -> list[CompositeVideoClip]: """Animate FCN step-by-step: FC → Conv 1x1 transformation.""" - slides = [] - - # Slide 1: Classic CNN vs FCN pipeline comparison - def make_fcn_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - progress = min(t / (STEP_DUR * 0.8), 1.0) - - # TOP: Classic CNN → FC → 1 label - top_y = 140 - blocks_classic = [ - ((80, top_y), (70, 50), (70, 130, 200)), - ((170, top_y), (50, 40), (50, 100, 160)), - ((240, top_y), (60, 50), (70, 130, 200)), - ((320, top_y), (40, 35), (50, 100, 160)), - ((385, top_y), (55, 50), (160, 80, 60)), - ((465, top_y), (55, 50), (180, 60, 60)), - ((545, top_y), (80, 50), (200, 80, 80)), - ] - n_top = min(int(progress * 7) + 1, 7) - for i, ((bx, by), (bw, bh), color) in enumerate(blocks_classic): - if i < n_top: - frame[by : by + bh, bx : bx + bw] = color - frame[by : by + 2, bx : bx + bw] = tuple( - min(c + 50, 255) for c in color - ) - frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( - min(c + 50, 255) for c in color - ) - if i < 6: - ax = bx + bw + 3 - ay = by + bh // 2 - frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170) - - # Red X over Flatten+FC when FCN appears - if progress > 0.6: - for d in range(-2, 3): - for step in range(50): - x1 = 385 + int(step * 135 / 50) - y1 = top_y + step + d - if 0 <= y1 < H and 0 <= x1 < W: - frame[y1, x1] = (255, 80, 80) - y2 = top_y + 50 - step + d - if 0 <= y2 < H and 0 <= x1 < W: - frame[y2, x1] = (255, 80, 80) - - # BOTTOM: FCN pipeline - bot_y = 380 - blocks_fcn = [ - ((80, bot_y), (70, 50), (70, 130, 200)), - ((170, bot_y), (50, 40), (50, 100, 160)), - ((240, bot_y), (60, 50), (70, 130, 200)), - ((320, bot_y), (40, 35), (50, 100, 160)), - ((385, bot_y), (70, 50), (80, 200, 120)), - ((480, bot_y), (75, 50), (200, 160, 80)), - ((580, bot_y), (80, 50), (100, 200, 100)), - ] - if progress > 0.4: - n_bot = min(int((progress - 0.4) / 0.6 * 7) + 1, 7) - for i, ((bx, by), (bw, bh), color) in enumerate(blocks_fcn): - if i < n_bot: - frame[by : by + bh, bx : bx + bw] = color - frame[by : by + 2, bx : bx + bw] = tuple( - min(c + 50, 255) for c in color - ) - frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( - min(c + 50, 255) for c in color - ) - if i < 6: - ax = bx + bw + 3 - ay = by + bh // 2 - frame[ay - 1 : ay + 2, ax : ax + 12] = (150, 150, 170) - - return frame - - fcn_clip = VideoClip(make_fcn_frame, duration=STEP_DUR + 1).with_fps(FPS) dur = STEP_DUR + 1 + fcn_clip = VideoClip(_make_fcn_frame, duration=dur).with_fps(FPS) labels = [ ("FCN: Fully Convolutional Network (2015)", 26, "#FFE082", FONT_B, (80, 20)), ("KROK 1: Zamień FC → Conv 1x1", 18, "#A5D6A7", FONT_R, (80, 60)), @@ -807,19 +831,7 @@ def _fcn_demo() -> list[CompositeVideoClip]: (80, 640), ), ] - text_clips: list[VideoClip] = [fcn_clip] - for text, fs, color, font, pos in labels: - tc = ( - _tc(text=text, font_size=fs, color=color, font=font) - .with_duration(dur) - .with_position(pos) - ) - text_clips.append(tc) - slides.append( - CompositeVideoClip(text_clips, size=(W, H)).with_effects( - [FadeIn(0.3), FadeOut(0.3)] - ) - ) + slides = [_compose_slide(fcn_clip, labels, dur)] # Slide 2: FCN skip connections step by step skip_lines = [ @@ -909,7 +921,8 @@ def _fcn_demo() -> list[CompositeVideoClip]: (100, 555), ), ( - "Im więcej skip connections → tym więcej detali z encodera → ostrzejszy wynik", + "Im więcej skip connections → tym więcej " + "detali z encodera → ostrzejszy wynik", 17, "white", FONT_R, @@ -922,90 +935,134 @@ def _fcn_demo() -> list[CompositeVideoClip]: # ── DeepLab Architecture ───────────────────────────────────────── -def _deeplab_demo() -> list[CompositeVideoClip]: - """Animate DeepLab: dilated convolution + ASPP step by step.""" - slides = [] +def _make_dilated_frame(t: float) -> np.ndarray: + """Render a dilated convolution comparison frame.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + progress = min(t / (STEP_DUR * 0.7), 1.0) - # Slide 1: Regular vs Dilated convolution - def make_dilated_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - progress = min(t / (STEP_DUR * 0.7), 1.0) + cell = 36 + grids = [ + ( + "rate=1", + 60, + [ + (0, 0), + (0, 1), + (0, 2), + (1, 0), + (1, 1), + (1, 2), + (2, 0), + (2, 1), + (2, 2), + ], + ), + ( + "rate=2", + 420, + [ + (0, 0), + (0, 2), + (0, 4), + (2, 0), + (2, 2), + (2, 4), + (4, 0), + (4, 2), + (4, 4), + ], + ), + ( + "rate=3", + 820, + [ + (0, 0), + (0, 3), + (0, 6), + (3, 0), + (3, 3), + (3, 6), + (6, 0), + (6, 3), + (6, 6), + ], + ), + ] - cell = 36 - # Draw three grids side by side for rate=1, rate=2, rate=3 - grids = [ - ( - "rate=1", - 60, - [ - (0, 0), - (0, 1), - (0, 2), - (1, 0), - (1, 1), - (1, 2), - (2, 0), - (2, 1), - (2, 2), - ], - ), - ( - "rate=2", - 420, - [ - (0, 0), - (0, 2), - (0, 4), - (2, 0), - (2, 2), - (2, 4), - (4, 0), - (4, 2), - (4, 4), - ], - ), - ( - "rate=3", - 820, - [ - (0, 0), - (0, 3), - (0, 6), - (3, 0), - (3, 3), - (3, 6), - (6, 0), - (6, 3), - (6, 6), - ], - ), - ] - - for gi, (_label, gx, positions) in enumerate(grids): - if progress < gi * 0.3: - break - gy = 180 - grid_size = 7 - # Draw background grid - for r in range(grid_size): - for c in range(grid_size): - x = gx + c * cell - y = gy + r * cell - frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55) - - # Highlight filter positions - for r, c in positions: + for gi, (_label, gx, positions) in enumerate(grids): + if progress < gi * 0.3: + break + gy = 180 + grid_size = 7 + for r in range(grid_size): + for c in range(grid_size): x = gx + c * cell y = gy + r * cell - frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200) - frame[y : y + 2, x : x + cell - 2] = (120, 180, 255) - frame[y + cell - 4 : y + cell - 2, x : x + cell - 2] = (120, 180, 255) + frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55) + for r, c in positions: + x = gx + c * cell + y = gy + r * cell + frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200) + frame[y : y + 2, x : x + cell - 2] = (120, 180, 255) + frame[y + cell - 4 : y + cell - 2, x : x + cell - 2] = (120, 180, 255) - return frame + return frame - dil_clip = VideoClip(make_dilated_frame, duration=STEP_DUR + 1).with_fps(FPS) + +def _make_aspp_frame(t: float) -> np.ndarray: + """Render a single ASPP module animation frame.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + progress = min(t / (STEP_DUR * 0.7), 1.0) + + frame[250:330, 50:130] = (70, 130, 200) + frame[250:252, 50:130] = (120, 180, 255) + frame[328:330, 50:130] = (120, 180, 255) + + branches = [ + ("1x1 conv", 250, (200, 170), (100, 40), (80, 200, 120)), + ("rate=6", 310, (200, 250), (100, 40), (200, 160, 80)), + ("rate=12", 370, (200, 330), (100, 40), (200, 120, 60)), + ("rate=18", 430, (200, 410), (100, 40), (180, 100, 80)), + ("GAP", 490, (200, 490), (100, 40), (160, 80, 160)), + ] + n_branches = min(int(progress * 5) + 1, 5) + for i, (_lbl, _h, (bx, by), (bw, bh), color) in enumerate(branches): + if i < n_branches: + frame[by : by + bh, bx : bx + bw] = color + frame[by : by + 2, bx : bx + bw] = tuple( + min(c + 50, 255) for c in color + ) + ay = by + bh // 2 + frame[ay - 1 : ay + 2, 133:197] = (150, 150, 170) + + concat_phase = 0.6 + if progress > concat_phase: + frame[250:530, 380:420] = (50, 60, 80) + frame[250:252, 380:420] = (200, 200, 100) + frame[528:530, 380:420] = (200, 200, 100) + for i, (_lbl, _h, (bx, by), (bw, bh), _c) in enumerate(branches): + if i < n_branches: + ay = by + bh // 2 + frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = (150, 150, 170) + + final_conv_phase = 0.8 + if progress > final_conv_phase: + frame[350:420, 450:550] = (100, 200, 100) + frame[350:352, 450:550] = (150, 230, 150) + frame[418:420, 450:550] = (150, 230, 150) + frame[388:391, 423:448] = (150, 150, 170) + + return frame + + +def _deeplab_demo() -> list[CompositeVideoClip]: + """Animate DeepLab: dilated convolution + ASPP step by step.""" dur = STEP_DUR + 1 + + # Slide 1: Regular vs Dilated convolution + dil_clip = VideoClip(_make_dilated_frame, duration=dur).with_fps(FPS) labels = [ ("DeepLab: Atrous (Dilated) Convolution", 26, "#FFE082", FONT_B, (80, 20)), ( @@ -1032,7 +1089,8 @@ def _deeplab_demo() -> list[CompositeVideoClip]: (80, 510), ), ( - "TE SAME 9 wag → WIĘKSZE pole widzenia → lepszy kontekst BEZ dodatkowych parametrów!", + "TE SAME 9 wag → WIĘKSZE pole widzenia " + "→ lepszy kontekst BEZ dodatkowych parametrów!", 16, "white", FONT_R, @@ -1046,72 +1104,10 @@ def _deeplab_demo() -> list[CompositeVideoClip]: (80, 600), ), ] - text_clips: list[VideoClip] = [dil_clip] - for text, fs, color, font, pos in labels: - tc = ( - _tc(text=text, font_size=fs, color=color, font=font) - .with_duration(dur) - .with_position(pos) - ) - text_clips.append(tc) - slides.append( - CompositeVideoClip(text_clips, size=(W, H)).with_effects( - [FadeIn(0.3), FadeOut(0.3)] - ) - ) + slides = [_compose_slide(dil_clip, labels, dur)] # Slide 2: ASPP module step by step - def make_aspp_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - progress = min(t / (STEP_DUR * 0.7), 1.0) - - # Input feature map on left - frame[250:330, 50:130] = (70, 130, 200) - frame[250:252, 50:130] = (120, 180, 255) - frame[328:330, 50:130] = (120, 180, 255) - - # ASPP parallel branches - branches = [ - ("1x1 conv", 250, (200, 170), (100, 40), (80, 200, 120)), - ("rate=6", 310, (200, 250), (100, 40), (200, 160, 80)), - ("rate=12", 370, (200, 330), (100, 40), (200, 120, 60)), - ("rate=18", 430, (200, 410), (100, 40), (180, 100, 80)), - ("GAP", 490, (200, 490), (100, 40), (160, 80, 160)), - ] - n_branches = min(int(progress * 5) + 1, 5) - for i, (_lbl, _h, (bx, by), (bw, bh), color) in enumerate(branches): - if i < n_branches: - frame[by : by + bh, bx : bx + bw] = color - frame[by : by + 2, bx : bx + bw] = tuple( - min(c + 50, 255) for c in color - ) - # Arrow from input - ay = by + bh // 2 - frame[ay - 1 : ay + 2, 133:197] = (150, 150, 170) - - # Concatenation box - if progress > 0.6: - frame[250:530, 380:420] = (50, 60, 80) - frame[250:252, 380:420] = (200, 200, 100) - frame[528:530, 380:420] = (200, 200, 100) - # Arrows from branches to concat - for i, (_lbl, _h, (bx, by), (bw, bh), _c) in enumerate(branches): - if i < n_branches: - ay = by + bh // 2 - frame[ay - 1 : ay + 2, bx + bw + 3 : 378] = (150, 150, 170) - - # Final conv after concat - if progress > 0.8: - frame[350:420, 450:550] = (100, 200, 100) - frame[350:352, 450:550] = (150, 230, 150) - frame[418:420, 450:550] = (150, 230, 150) - # Arrow from concat - frame[388:391, 423:448] = (150, 150, 170) - - return frame - - aspp_clip = VideoClip(make_aspp_frame, duration=STEP_DUR + 1).with_fps(FPS) + aspp_clip = VideoClip(_make_aspp_frame, duration=dur).with_fps(FPS) labels2 = [ ( "DeepLab: ASPP (Atrous Spatial Pyramid Pooling)", @@ -1163,112 +1159,122 @@ def _deeplab_demo() -> list[CompositeVideoClip]: (80, 645), ), ] - text_clips2: list[VideoClip] = [aspp_clip] - for text, fs, color, font, pos in labels2: - tc = ( - _tc(text=text, font_size=fs, color=color, font=font) - .with_duration(dur) - .with_position(pos) - ) - text_clips2.append(tc) - slides.append( - CompositeVideoClip(text_clips2, size=(W, H)).with_effects( - [FadeIn(0.3), FadeOut(0.3)] - ) - ) + slides.append(_compose_slide(aspp_clip, labels2, dur)) return slides # ── Transformer Segmentation ──────────────────────────────────── +def _draw_base_grid( + frame: np.ndarray, gx: int, gy: int, grid_n: int, cell: int, +) -> None: + """Draw an empty grid of cells.""" + for r in range(grid_n): + for c in range(grid_n): + x = gx + c * cell + y = gy + r * cell + frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55) + + +def _draw_cnn_kernel( + frame: np.ndarray, lx: int, ly: int, cell: int, progress: float, +) -> None: + """Highlight a 3x3 CNN kernel on the grid.""" + cnn_phase = 0.2 + if progress <= cnn_phase: + return + cx, cy = 2, 2 + for dr in range(-1, 2): + for dc in range(-1, 2): + r, c = cy + dr, cx + dc + x = lx + c * cell + y = ly + r * cell + frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200) + x = lx + cx * cell + y = ly + cy * cell + frame[y : y + cell - 2, x : x + cell - 2] = (120, 180, 255) + + +def _draw_conn_line( + frame: np.ndarray, x0: int, y0: int, x1: int, y1: int, +) -> None: + """Draw a dashed connection line between two points.""" + steps = max(abs(x1 - x0), abs(y1 - y0)) + if steps <= 0: + return + for s in range(0, steps, 3): + px = x0 + int((x1 - x0) * s / steps) + py = y0 + int((y1 - y0) * s / steps) + if 0 <= px < W - 1 and 0 <= py < H - 1: + frame[py : py + 1, px : px + 1] = (200, 180, 50) + + +def _draw_attention_connections( + frame: np.ndarray, + origin: tuple[int, int], + grid_n: int, + cell: int, + progress: float, +) -> None: + """Draw transformer self-attention connections on the grid.""" + rx, ry = origin + transformer_phase = 0.4 + if progress <= transformer_phase: + return + cx_t, cy_t = 2, 2 + x0 = rx + cx_t * cell + cell // 2 + y0 = ry + cy_t * cell + cell // 2 + n_connections = int(progress * 36) + conn_idx = 0 + for r in range(grid_n): + for c in range(grid_n): + conn_idx += 1 + if conn_idx > n_connections: + break + x = rx + c * cell + y = ry + r * cell + dist = abs(r - cy_t) + abs(c - cx_t) + strength = max(30, 200 - dist * 30) + frame[y : y + cell - 2, x : x + cell - 2] = ( + strength // 3, + strength // 2, + strength, + ) + _draw_conn_line(frame, x0, y0, x + cell // 2, y + cell // 2) + else: + continue + break + x = rx + cx_t * cell + y = ry + cy_t * cell + frame[y : y + cell - 2, x : x + cell - 2] = (255, 200, 50) + + +def _make_attention_frame(t: float) -> np.ndarray: + """Render a CNN-vs-Transformer attention comparison frame.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + progress = min(t / (STEP_DUR * 0.7), 1.0) + + cell = 40 + grid_n = 6 + + lx, ly = 60, 200 + _draw_base_grid(frame, lx, ly, grid_n, cell) + _draw_cnn_kernel(frame, lx, ly, cell, progress) + + rx, ry = 680, 200 + _draw_base_grid(frame, rx, ry, grid_n, cell) + _draw_attention_connections(frame, (rx, ry), grid_n, cell, progress) + + return frame + + def _transformer_seg_demo() -> list[CompositeVideoClip]: """Animate transformer-based segmentation: self-attention concept.""" - slides = [] + dur = STEP_DUR + 1 # Slide 1: CNN local vs Transformer global - def make_attention_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - progress = min(t / (STEP_DUR * 0.7), 1.0) - - cell = 40 - grid_n = 6 - - # LEFT: CNN — local receptive field - lx, ly = 60, 200 - for r in range(grid_n): - for c in range(grid_n): - x = lx + c * cell - y = ly + r * cell - frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55) - - # Highlight 3x3 kernel in CNN - if progress > 0.2: - cx, cy = 2, 2 # center cell - for dr in range(-1, 2): - for dc in range(-1, 2): - r, c = cy + dr, cx + dc - x = lx + c * cell - y = ly + r * cell - frame[y : y + cell - 2, x : x + cell - 2] = (70, 130, 200) - # Center highlighted more - x = lx + cx * cell - y = ly + cy * cell - frame[y : y + cell - 2, x : x + cell - 2] = (120, 180, 255) - - # RIGHT: Transformer — global attention - rx, ry = 680, 200 - for r in range(grid_n): - for c in range(grid_n): - x = rx + c * cell - y = ry + r * cell - frame[y : y + cell - 2, x : x + cell - 2] = (35, 40, 55) - - # All cells connected to center - if progress > 0.4: - cx_t, cy_t = 2, 2 - # Center cell - x0 = rx + cx_t * cell + cell // 2 - y0 = ry + cy_t * cell + cell // 2 - n_connections = int(progress * 36) - conn_idx = 0 - for r in range(grid_n): - for c in range(grid_n): - conn_idx += 1 - if conn_idx > n_connections: - break - x = rx + c * cell - y = ry + r * cell - # Color by "attention strength" — closer = stronger - dist = abs(r - cy_t) + abs(c - cx_t) - strength = max(30, 200 - dist * 30) - frame[y : y + cell - 2, x : x + cell - 2] = ( - strength // 3, - strength // 2, - strength, - ) - # Draw connection line - x1 = x + cell // 2 - y1 = y + cell // 2 - steps = max(abs(x1 - x0), abs(y1 - y0)) - if steps > 0: - for s in range(0, steps, 3): - px = x0 + int((x1 - x0) * s / steps) - py = y0 + int((y1 - y0) * s / steps) - if 0 <= px < W - 1 and 0 <= py < H - 1: - frame[py : py + 1, px : px + 1] = (200, 180, 50) - else: - continue - break - # Center highlighted strongly - x = rx + cx_t * cell - y = ry + cy_t * cell - frame[y : y + cell - 2, x : x + cell - 2] = (255, 200, 50) - - return frame - - att_clip = VideoClip(make_attention_frame, duration=STEP_DUR + 1).with_fps(FPS) - dur = STEP_DUR + 1 + att_clip = VideoClip(_make_attention_frame, duration=dur).with_fps(FPS) labels = [ ("Transformer: Self-Attention w segmentacji", 26, "#FFE082", FONT_B, (80, 20)), ("CNN = LOKALNY kontekst", 18, "#64B5F6", FONT_B, (60, 160)), @@ -1279,19 +1285,7 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]: ("piksel widzi WSZYSTKIE!", 14, "#FFE082", FONT_R, (680, 485)), ("vs", 28, "#B0BEC5", FONT_B, (450, 300)), ] - text_clips: list[VideoClip] = [att_clip] - for text, fs, color, font, pos in labels: - tc = ( - _tc(text=text, font_size=fs, color=color, font=font) - .with_duration(dur) - .with_position(pos) - ) - text_clips.append(tc) - slides.append( - CompositeVideoClip(text_clips, size=(W, H)).with_effects( - [FadeIn(0.3), FadeOut(0.3)] - ) - ) + slides = [_compose_slide(att_clip, labels, dur)] # Slide 2: Self-attention Q/K/V step by step qkv_lines = [ @@ -1376,7 +1370,8 @@ def _transformer_seg_demo() -> list[CompositeVideoClip]: (100, 610), ), ( - "Mask2Former (2022): masked attention + unified (semantic+instance+panoptic)", + "Mask2Former (2022): masked attention + " + "unified (semantic+instance+panoptic)", 16, "#CE93D8", FONT_R, @@ -1520,12 +1515,16 @@ def _methods_comparison() -> CompositeVideoClip: ] clips: list[VideoClip] = [bg, title] + mnemonic_col = 3 for i, row in enumerate(rows): y_pos = 75 + i * 72 col_x = [40, 210, 340, 660] for j, cell in enumerate(row): fs = 16 if i > 0 else 18 - color = "#64B5F6" if i == 0 else ("#E0E0E0" if j < 3 else "#FFE082") + color = ( + "#64B5F6" if i == 0 + else ("#E0E0E0" if j < mnemonic_col else "#FFE082") + ) tc = ( _tc( text=cell, @@ -1620,7 +1619,7 @@ def main() -> None: final.write_videofile( OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4 ) - print(f"Video saved to: {OUTPUT}") + _logger.info("Video saved to: %s", OUTPUT) if __name__ == "__main__": diff --git a/python_pkg/praca_magisterska_video/visualize_q24.py b/python_pkg/praca_magisterska_video/visualize_q24.py index d5c380a..5e3f1b5 100644 --- a/python_pkg/praca_magisterska_video/visualize_q24.py +++ b/python_pkg/praca_magisterska_video/visualize_q24.py @@ -11,6 +11,7 @@ Creates animated video demonstrating: from __future__ import annotations +import logging import os from pathlib import Path @@ -40,6 +41,8 @@ OUTPUT = str(OUTPUT_DIR / "q24_object_detection.mp4") BG_COLOR = (15, 20, 35) +_logger = logging.getLogger(__name__) + def _tc(**kwargs: object) -> TextClip: """TextClip wrapper that adds enough bottom margin to prevent clipping.""" @@ -203,7 +206,8 @@ def _hog_svm_demo() -> list[CompositeVideoClip]: frame[ay - 1 : ay + 2, ax : ax + 20] = (150, 150, 170) # Show gradient computation example at bottom - if progress > 0.2: + gradient_phase = 0.2 + if progress > gradient_phase: # Mini pixel grid showing gradient computation gx, gy = 100, 430 pixels = [50, 50, 200] @@ -366,7 +370,8 @@ def _viola_jones_demo() -> list[CompositeVideoClip]: (80, 620), ), ( - "Haar: kontrast jasna/ciemna | Integral Image: suma prostokąta O(1) = 4 odczyty", + "Haar: kontrast jasna/ciemna | Integral Image: " + "suma prostokąta O(1) = 4 odczyty", 14, "#78909C", FONT_R, @@ -474,7 +479,8 @@ def _rcnn_evolution() -> list[CompositeVideoClip]: ("Faster R-CNN (2015)", 20, "#A5D6A7", FONT_B, (50, 580)), ("0.2 sec → 5 fps (RPN w sieci!)", 14, "#A5D6A7", FONT_R, (720, 600)), ( - "Kluczowe innowacje: ROI Pooling → stały rozmiar | RPN → propozycje w sieci", + "Kluczowe innowacje: ROI Pooling → stały rozmiar " + "| RPN → propozycje w sieci", 14, "#78909C", FONT_R, @@ -527,13 +533,15 @@ def _rcnn_detailed() -> list[CompositeVideoClip]: min(c + 50, 255) for c in color ) # Arrow down - if i < 4: + arrow_limit = 4 + if i < arrow_limit: ax = bx + bw // 2 ay = by + bh + 5 frame[ay : ay + 20, ax - 1 : ax + 2] = (150, 150, 170) # Illustration: many overlapping regions from Selective Search - if progress > 0.2: + overlay_phase = 0.2 + if progress > overlay_phase: rng_local = np.random.default_rng(42) n_boxes = min(int((progress - 0.2) * 15), 8) for i in range(n_boxes): @@ -599,94 +607,108 @@ def _rcnn_detailed() -> list[CompositeVideoClip]: # ── ROI Pooling ────────────────────────────────────────────────── + + +def _draw_roi_pool_grid(frame: np.ndarray) -> None: + """Draw the 3x3 ROI pool grid with max-pooled feature values.""" + out_x, out_y = 400, 220 + out_cell = 50 + out_n = 3 + roi_r1, roi_c1 = 2, 1 + roi_r2, roi_c2 = 6, 5 + roi_h = roi_r2 - roi_r1 + roi_w = roi_c2 - roi_c1 + for r in range(out_n): + for c in range(out_n): + x = out_x + c * out_cell + y = out_y + r * out_cell + + # Compute the max from corresponding region + src_r1 = roi_r1 + r * roi_h // out_n + src_r2 = roi_r1 + (r + 1) * roi_h // out_n + src_c1 = roi_c1 + c * roi_w // out_n + src_c2 = roi_c1 + (c + 1) * roi_w // out_n + max_val = 0 + for sr in range(src_r1, src_r2): + for sc in range(src_c1, src_c2): + v = 30 + ((sr * 7 + sc * 13 + 42) % 40) + max_val = max(max_val, v) + + frame[y : y + out_cell - 2, x : x + out_cell - 2] = ( + max_val, + max_val + 20, + max_val + 40, + ) + frame[y : y + 2, x : x + out_cell - 2] = (80, 200, 120) + frame[y + out_cell - 4 : y + out_cell - 2, x : x + out_cell - 2] = ( + 80, + 200, + 120, + ) + + +def _make_roi_frame(t: float) -> np.ndarray: + """Render a single frame for the ROI pooling animation.""" + frame = np.zeros((H, W, 3), dtype=np.uint8) + frame[:] = BG_COLOR + progress = min(t / (STEP_DUR * 0.7), 1.0) + + # Left: feature map with ROI highlighted + fm_x, fm_y = 60, 180 + fm_cell = 30 + fm_grid = 8 + for r in range(fm_grid): + for c in range(fm_grid): + x = fm_x + c * fm_cell + y = fm_y + r * fm_cell + # Random-looking feature values + val = 30 + ((r * 7 + c * 13 + 42) % 40) + frame[y : y + fm_cell - 1, x : x + fm_cell - 1] = ( + val, + val + 10, + val + 20, + ) + + # ROI region highlighted + roi_r1, roi_c1 = 2, 1 + roi_r2, roi_c2 = 6, 5 + for tt in range(3): + ry1 = fm_y + roi_r1 * fm_cell - tt + ry2 = fm_y + roi_r2 * fm_cell + tt + rx1 = fm_x + roi_c1 * fm_cell - tt + rx2 = fm_x + roi_c2 * fm_cell + tt + frame[ry1:ry2, rx1 : rx1 + 2] = (255, 200, 50) + frame[ry1:ry2, rx2 - 2 : rx2] = (255, 200, 50) + frame[ry1 : ry1 + 2, rx1:rx2] = (255, 200, 50) + frame[ry2 - 2 : ry2, rx1:rx2] = (255, 200, 50) + + # Arrow + arrow_phase = 0.3 + if progress > arrow_phase: + frame[300:303, 310:380] = (150, 150, 170) + + # Middle: ROI divided into 3x3 grid (output_size) + grid_phase = 0.3 + if progress > grid_phase: + _draw_roi_pool_grid(frame) + + # Arrow to FC + fc_phase = 0.6 + if progress > fc_phase: + frame[300:303, 560:630] = (150, 150, 170) + # FC box + frame[270:340, 650:730] = (200, 100, 80) + frame[270:272, 650:730] = (240, 140, 120) + frame[338:340, 650:730] = (240, 140, 120) + + return frame + + def _roi_pooling_demo() -> list[CompositeVideoClip]: """Animate ROI Pooling: key Fast R-CNN innovation.""" slides = [] - def make_roi_frame(t: float) -> np.ndarray: - frame = np.zeros((H, W, 3), dtype=np.uint8) - frame[:] = BG_COLOR - progress = min(t / (STEP_DUR * 0.7), 1.0) - - # Left: feature map with ROI highlighted - fm_x, fm_y = 60, 180 - fm_cell = 30 - fm_grid = 8 - for r in range(fm_grid): - for c in range(fm_grid): - x = fm_x + c * fm_cell - y = fm_y + r * fm_cell - # Random-looking feature values - val = 30 + ((r * 7 + c * 13 + 42) % 40) - frame[y : y + fm_cell - 1, x : x + fm_cell - 1] = ( - val, - val + 10, - val + 20, - ) - - # ROI region highlighted - roi_r1, roi_c1 = 2, 1 - roi_r2, roi_c2 = 6, 5 - for tt in range(3): - ry1 = fm_y + roi_r1 * fm_cell - tt - ry2 = fm_y + roi_r2 * fm_cell + tt - rx1 = fm_x + roi_c1 * fm_cell - tt - rx2 = fm_x + roi_c2 * fm_cell + tt - frame[ry1:ry2, rx1 : rx1 + 2] = (255, 200, 50) - frame[ry1:ry2, rx2 - 2 : rx2] = (255, 200, 50) - frame[ry1 : ry1 + 2, rx1:rx2] = (255, 200, 50) - frame[ry2 - 2 : ry2, rx1:rx2] = (255, 200, 50) - - # Arrow - if progress > 0.3: - frame[300:303, 310:380] = (150, 150, 170) - - # Middle: ROI divided into 3x3 grid (output_size) - if progress > 0.3: - out_x, out_y = 400, 220 - out_cell = 50 - out_n = 3 - roi_h = roi_r2 - roi_r1 - roi_w = roi_c2 - roi_c1 - for r in range(out_n): - for c in range(out_n): - x = out_x + c * out_cell - y = out_y + r * out_cell - - # Compute the max from corresponding region - src_r1 = roi_r1 + r * roi_h // out_n - src_r2 = roi_r1 + (r + 1) * roi_h // out_n - src_c1 = roi_c1 + c * roi_w // out_n - src_c2 = roi_c1 + (c + 1) * roi_w // out_n - max_val = 0 - for sr in range(src_r1, src_r2): - for sc in range(src_c1, src_c2): - v = 30 + ((sr * 7 + sc * 13 + 42) % 40) - max_val = max(max_val, v) - - frame[y : y + out_cell - 2, x : x + out_cell - 2] = ( - max_val, - max_val + 20, - max_val + 40, - ) - frame[y : y + 2, x : x + out_cell - 2] = (80, 200, 120) - frame[y + out_cell - 4 : y + out_cell - 2, x : x + out_cell - 2] = ( - 80, - 200, - 120, - ) - - # Arrow to FC - if progress > 0.6: - frame[300:303, 560:630] = (150, 150, 170) - # FC box - frame[270:340, 650:730] = (200, 100, 80) - frame[270:272, 650:730] = (240, 140, 120) - frame[338:340, 650:730] = (240, 140, 120) - - return frame - - roi_clip = VideoClip(make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS) + roi_clip = VideoClip(_make_roi_frame, duration=STEP_DUR + 1).with_fps(FPS) dur = STEP_DUR + 1 labels = [ ("ROI Pooling: kluczowa innowacja Fast R-CNN", 26, "#FFE082", FONT_B, (80, 20)), @@ -731,7 +753,8 @@ def _roi_pooling_demo() -> list[CompositeVideoClip]: (80, 535), ), ( - "Fast R-CNN: CNN raz → 1 feature mapa → ROI Pool 2000 regionów → 25x szybciej!", + "Fast R-CNN: CNN raz → 1 feature mapa → " + "ROI Pool 2000 regionów → 25x szybciej!", 16, "#A5D6A7", FONT_R, @@ -788,7 +811,6 @@ def _rpn_anchors_demo() -> list[CompositeVideoClip]: # Draw anchors around center: 3 sizes x 3 ratios = 9 anchor_specs = [ - # (half_w, half_h, color) (30, 30, (200, 80, 80)), # small 1:1 (20, 40, (200, 60, 60)), # small 1:2 (40, 20, (180, 60, 60)), # small 2:1 @@ -1014,7 +1036,8 @@ def _yolo_demo() -> list[CompositeVideoClip]: frame[y : y + 1, img_x : img_x + img_size] = (100, 100, 120) # Highlight cells containing object centers - if progress > 0.3: + car_phase = 0.3 + if progress > car_phase: # Car center ~ cell (1, 1) cx, cy = 1, 2 hx = img_x + cx * cell @@ -1023,7 +1046,8 @@ def _yolo_demo() -> list[CompositeVideoClip]: frame[hy : hy + cell, hx : hx + cell].astype(int) + 40, 0, 255 ).astype(np.uint8) - if progress > 0.5: + person_phase = 0.5 + if progress > person_phase: # Person center ~ cell (4, 4) cx, cy = 4, 4 hx = img_x + cx * cell @@ -1033,7 +1057,8 @@ def _yolo_demo() -> list[CompositeVideoClip]: ).astype(np.uint8) # Bounding boxes predictions from cells - if progress > 0.6: + bbox_phase = 0.6 + if progress > bbox_phase: # Car bbox for tt in range(2): frame[ @@ -1100,7 +1125,8 @@ def _yolo_demo() -> list[CompositeVideoClip]: (80, 620), ), ( - "Two-stage (R-CNN): propozycje+klasyfikacja | One-stage (YOLO): bez propozycji!", + "Two-stage (R-CNN): propozycje+klasyfikacja " + "| One-stage (YOLO): bez propozycji!", 14, "#90CAF9", FONT_R, @@ -1152,13 +1178,15 @@ def _yolo_architecture() -> list[CompositeVideoClip]: frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( min(c + 50, 255) for c in color ) - if i < 4: + arrow_limit = 4 + if i < arrow_limit: ax = bx + bw + 5 ay = by + bh // 2 frame[ay - 1 : ay + 2, ax : ax + 25] = (150, 150, 170) # Output tensor breakdown (right side) - if progress > 0.6: + tensor_phase = 0.6 + if progress > tensor_phase: # Show SxS grid gx, gy = 850, 180 gs = 120 @@ -1282,18 +1310,21 @@ def _detr_demo() -> list[CompositeVideoClip]: frame[by + bh - 2 : by + bh, bx : bx + bw] = tuple( min(c + 50, 255) for c in color ) - if i < 4: + arrow_limit = 4 + if i < arrow_limit: ax = bx + bw + 5 ay = by + bh // 2 frame[ay - 1 : ay + 2, ax : ax + 25] = (150, 150, 170) # Object queries illustration (right side) - if progress > 0.5: + query_phase = 0.5 + if progress > query_phase: qx, qy = 800, 140 for i in range(6): y = qy + i * 50 w = 130 - active = i < 3 + active_limit = 3 + active = i < active_limit color = (80, 180, 120) if active else (60, 50, 50) frame[y : y + 35, qx : qx + w] = color frame[y : y + 1, qx : qx + w] = tuple(min(c + 40, 255) for c in color) @@ -1528,7 +1559,8 @@ def _detr_demo() -> list[CompositeVideoClip]: (80, 540), ), ( - " R-CNN (SS+CNN+SVM+NMS) → YOLO (backbone+head+NMS) → DETR (backbone+transformer)", + " R-CNN (SS+CNN+SVM+NMS) → YOLO " + "(backbone+head+NMS) → DETR (backbone+transformer)", 14, "#90CAF9", FONT_R, @@ -1572,15 +1604,18 @@ def _nms_iou_demo() -> list[CompositeVideoClip]: boxes.append((ox + 350, oy + 50, 100, 100, 0.40, (80, 180, 255))) for i, (bx, by, bw, bh, _conf, color) in enumerate(boxes): - if progress > 0.4 and i > 0 and i < 3: + dc = color + nms_phase = 0.4 + nms_limit = 3 + if progress > nms_phase and i > 0 and i < nms_limit: # After NMS, these get removed (shown as faded/crossed) - color = (60, 40, 40) + dc = (60, 40, 40) for tt in range(2): - frame[by - tt : by + bh + tt, bx - tt : bx - tt + 2] = color - frame[by - tt : by + bh + tt, bx + bw + tt - 2 : bx + bw + tt] = color - frame[by - tt : by - tt + 2, bx - tt : bx + bw + tt] = color - frame[by + bh + tt - 2 : by + bh + tt, bx - tt : bx + bw + tt] = color + frame[by - tt : by + bh + tt, bx - tt : bx - tt + 2] = dc + frame[by - tt : by + bh + tt, bx + bw + tt - 2 : bx + bw + tt] = dc + frame[by - tt : by - tt + 2, bx - tt : bx + bw + tt] = dc + frame[by + bh + tt - 2 : by + bh + tt, bx - tt : bx + bw + tt] = dc # IoU visualization on right side iou_x, iou_y = 700, 200 @@ -1884,7 +1919,7 @@ def main() -> None: final.write_videofile( OUTPUT, fps=FPS, codec="libx264", audio=False, preset="medium", threads=4 ) - print(f"Video saved to: {OUTPUT}") + _logger.info("Video saved to: %s", OUTPUT) if __name__ == "__main__":