testsAndMisc-archive/python_pkg/word_frequency/_deck_builder.py
Krzysztof kuhy Rudnicki 8f2fbd2311 refactor: enforce 500-line limit on all Python source files
Split 18+ Python files that exceeded 500 lines into smaller modules
with helper files (prefixed with _). All functions are re-exported
from the original modules to maintain backward compatibility with
test patches and external imports.

Files split:
- moviepy_showcase.py (1212 -> 302 + 3 helpers)
- anki_generator.py (1174 -> 473 + 4 helpers)
- test_analyze_chess_game.py (1152 -> 361 + 2 parts)
- poker_modifier_app.py (1024 -> 263 + 2 helpers)
- transcribe_fw.py (1007 -> 342 + 3 helpers)
- music_generator.py (1002 -> 319 + 2 helpers)
- translator.py (951 -> 442 + 2 helpers)
- cinema_planner.py (893 -> 369 + 2 helpers)
- lichess_bot/main.py (757 -> 495 + _game_logic.py)
- test_translator.py (725 -> 289 + part2 + conftest)
- test_lichess_api.py (680 -> 475 + part2)
- learning_pipe.py (668 -> 375 + 2 helpers)
- cache.py (655 -> 360 + _cache_decks.py)
- analyze_chess_game.py (632 -> 463 + _move_analysis.py)
- visualize_q02.py (609 -> 371 + helper)
- repo_explorer.py (602 -> 347 + 2 helpers)
- keyboard_coop/main.py (515 -> 416 + _dictionary.py)
- scanning.py (501 -> 314 + _enforce_loop.py)

All tests pass: 144 lichess_bot (100% branch coverage), 243 others.
No new lint errors introduced.
2026-03-17 22:47:42 +01:00

192 lines
5.9 KiB
Python

"""Anki deck building and card formatting."""
from __future__ import annotations
import re
from python_pkg.word_frequency._types import DeckInput
from python_pkg.word_frequency.translator import translate_words_batch
def find_word_contexts(
    text: str,
    words: list[str],
    context_words: int = 5,
) -> dict[str, str]:
    """Find an example context for each word in the text.

    The text is tokenised into runs of word characters and matching is
    case-insensitive. Only the first occurrence of each word is used, and
    words that never occur in the text are omitted from the result.

    Args:
        text: The source text.
        words: List of words to find contexts for.
        context_words: Number of words of context on each side.

    Returns:
        Dict mapping each lowercased word to its context, formatted as
        ``...<tokens joined by spaces>...``. Keys follow the order in which
        the words were requested (duplicates collapsed), so the result is
        the same on every run.
    """
    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)

    # dict.fromkeys de-duplicates while keeping the caller's order; a plain
    # set would make the key order depend on string hash randomisation.
    targets = dict.fromkeys(w.lower() for w in words)
    if not tokens or not targets:
        return {}

    # One pass over the text records where each requested word first
    # appears, instead of rescanning the whole token list per word.
    first_seen: dict[str, int] = {}
    for position, token in enumerate(tokens):
        lowered = token.lower()
        if lowered in targets and lowered not in first_seen:
            first_seen[lowered] = position
            if len(first_seen) == len(targets):
                break  # every requested word has been located

    contexts: dict[str, str] = {}
    for target in targets:
        position = first_seen.get(target)
        if position is None:
            continue
        start = max(0, position - context_words)
        end = min(len(tokens), position + context_words + 1)
        snippet = " ".join(tokens[start:end])
        contexts[target] = f"...{snippet}..."
    return contexts
def _format_excerpt_card(
excerpt: str,
excerpt_words: list[tuple[str, int]] | None,
) -> str:
"""Format the excerpt as the first Anki card.
Args:
excerpt: The target excerpt text.
excerpt_words: Words in the excerpt with ranks.
Returns:
Formatted excerpt card line.
"""
excerpt_escaped = excerpt.replace(";", ",")
if excerpt_words:
most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
rarest = max(excerpt_words, key=lambda x: x[1])[0]
if most_frequent != rarest:
pattern_rare = re.compile(
rf"\b({re.escape(rarest)})\b", re.IGNORECASE
)
excerpt_escaped = pattern_rare.sub(
r"<b>\1</b>", excerpt_escaped
)
pattern_freq = re.compile(
rf"\b({re.escape(most_frequent)})\b",
re.IGNORECASE,
)
excerpt_escaped = pattern_freq.sub(
r"<i>\1</i>", excerpt_escaped
)
else:
pattern = re.compile(
rf"\b({re.escape(most_frequent)})\b",
re.IGNORECASE,
)
excerpt_escaped = pattern.sub(
r"<b><i>\1</i></b>", excerpt_escaped
)
return f"\U0001f4d6 TARGET EXCERPT;{excerpt_escaped};#0"
def _build_translation_lookup(
words_with_ranks: list[tuple[str, int]],
source_lang: str,
target_lang: str,
*,
no_translate: bool = False,
) -> dict[str, str]:
"""Build word-to-translation lookup dict.
Args:
words_with_ranks: List of (word, rank) tuples.
source_lang: Source language code.
target_lang: Target language code.
no_translate: If True, use placeholder translations.
Returns:
Dict mapping lowercase word to translation.
"""
words = [w for w, _ in words_with_ranks]
if no_translate:
return {w.lower(): "[TODO]" for w in words}
translations = translate_words_batch(words, source_lang, target_lang)
trans_lookup: dict[str, str] = {}
for result in translations:
if result.success:
trans_lookup[result.source_word.lower()] = (
result.translated_word
)
else:
trans_lookup[result.source_word.lower()] = (
f"[{result.source_word}]"
)
return trans_lookup
def generate_anki_deck(
    deck_input: DeckInput,
    *,
    include_context: bool = False,
    no_translate: bool = False,
    excerpt: str = "",
    excerpt_words: list[tuple[str, int]] | None = None,
) -> str:
    """Generate Anki-compatible deck content.

    The output starts with ``#``-prefixed header lines (separator, HTML
    flag, deck name, tags, column names) and an empty line, followed by
    one semicolon-separated line per card. Semicolons inside field values
    are replaced with commas so they cannot split a field.

    Args:
        deck_input: Core deck data (words, langs, contexts, name).
        include_context: Whether to include context in cards. A context
            field is only written when ``deck_input.contexts`` is
            non-empty; otherwise rows keep three fields.
        no_translate: If True, skip translation (use placeholder).
        excerpt: The target excerpt text to include in cards. When
            non-empty it becomes the first card.
        excerpt_words: Words in the excerpt with ranks.

    Returns:
        Semicolon-separated content ready for Anki import.
    """
    columns = "Front;Back;Rank;Context" if include_context else "Front;Back;Rank"
    lines: list[str] = [
        "#separator:semicolon",
        "#html:true",
        f"#deck:{deck_input.deck_name}",
        f"#tags:vocabulary {deck_input.source_lang}",
        f"#columns:{columns}",
        "",  # Empty line before data
    ]

    if excerpt:
        lines.append(_format_excerpt_card(excerpt, excerpt_words))

    trans_lookup = _build_translation_lookup(
        deck_input.words_with_ranks,
        deck_input.source_lang,
        deck_input.target_lang,
        no_translate=no_translate,
    )

    # Contexts are only consulted when the caller asked for them.
    contexts = deck_input.contexts if include_context else None

    for word, rank in deck_input.words_with_ranks:
        translation = trans_lookup.get(word.lower(), f"[{word}]")
        # Escape semicolons in fields
        word_escaped = word.replace(";", ",")
        translation_escaped = translation.replace(";", ",")

        if not contexts:
            lines.append(f"{word_escaped};{translation_escaped};#{rank}")
            continue

        context = contexts.get(word.lower(), "")
        context_escaped = ""
        if context:
            context_escaped = context.replace(";", ",")
            # Highlight whole-word matches only and re-emit the matched
            # text via \1: this keeps the context's own casing, avoids
            # bolding fragments of longer words, and stops the word from
            # being interpreted as a regex replacement template.
            highlight = re.compile(rf"\b({re.escape(word)})\b", re.IGNORECASE)
            context_escaped = highlight.sub(r"<b>\1</b>", context_escaped)

        lines.append(
            f"{word_escaped};{translation_escaped}"
            f";#{rank};{context_escaped}"
        )

    return "\n".join(lines)