mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 18:03:07 +02:00
- Add comprehensive tests for all packages (3572 tests, 100% branch coverage) - Split oversized test files to stay under 500-line limit - Add per-file ruff ignores for test-appropriate suppressions - Fix _cache_decks.py to properly convert JSON lists to tuples - Add session-scoped conftest fixture for logging handler cleanup (Python 3.14) - Update ruff pre-commit hook to v0.15.2 - Add codespell ignore words for test data - Add generated output files to .gitignore
180 lines
5.7 KiB
Python
180 lines
5.7 KiB
Python
"""Anki deck building and card formatting."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import TYPE_CHECKING
|
|
|
|
from python_pkg.word_frequency.translator import translate_words_batch
|
|
|
|
if TYPE_CHECKING:
|
|
from python_pkg.word_frequency._types import DeckInput
|
|
|
|
|
|
def find_word_contexts(
    text: str,
    words: list[str],
    context_words: int = 5,
) -> dict[str, str]:
    """Find an example context for each requested word in the text.

    The text is split into runs of word characters and scanned once. For
    the first occurrence of every requested word (compared
    case-insensitively), a window of up to ``context_words`` tokens on each
    side is joined with single spaces and wrapped in ``...``.

    Args:
        text: The source text.
        words: List of words to find contexts for.
        context_words: Number of words of context on each side.

    Returns:
        Dict mapping each lowercased word that occurs in the text to its
        example context. Words that never occur are omitted. Entries are
        inserted in order of first occurrence in the text.
    """
    contexts: dict[str, str] = {}

    # Targets still waiting for their first occurrence.
    pending = {w.lower() for w in words}
    if not pending:
        return contexts

    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)
    total = len(tokens)

    # Single pass over the text instead of one full scan per target word.
    for index, token in enumerate(tokens):
        key = token.lower()
        if key not in pending:
            continue
        # Only the first occurrence counts, so stop tracking this word.
        pending.remove(key)
        start = max(0, index - context_words)
        end = min(total, index + context_words + 1)
        snippet = " ".join(tokens[start:end])
        contexts[key] = f"...{snippet}..."
        if not pending:
            # Every requested word has been located; skip the rest.
            break

    return contexts
|
|
|
|
|
|
def _format_excerpt_card(
|
|
excerpt: str,
|
|
excerpt_words: list[tuple[str, int]] | None,
|
|
) -> str:
|
|
"""Format the excerpt as the first Anki card.
|
|
|
|
Args:
|
|
excerpt: The target excerpt text.
|
|
excerpt_words: Words in the excerpt with ranks.
|
|
|
|
Returns:
|
|
Formatted excerpt card line.
|
|
"""
|
|
excerpt_escaped = excerpt.replace(";", ",")
|
|
if excerpt_words:
|
|
most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
|
|
rarest = max(excerpt_words, key=lambda x: x[1])[0]
|
|
if most_frequent != rarest:
|
|
pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
|
|
excerpt_escaped = pattern_rare.sub(r"<b>\1</b>", excerpt_escaped)
|
|
pattern_freq = re.compile(
|
|
rf"\b({re.escape(most_frequent)})\b",
|
|
re.IGNORECASE,
|
|
)
|
|
excerpt_escaped = pattern_freq.sub(r"<i>\1</i>", excerpt_escaped)
|
|
else:
|
|
pattern = re.compile(
|
|
rf"\b({re.escape(most_frequent)})\b",
|
|
re.IGNORECASE,
|
|
)
|
|
excerpt_escaped = pattern.sub(r"<b><i>\1</i></b>", excerpt_escaped)
|
|
return f"\U0001f4d6 TARGET EXCERPT;{excerpt_escaped};#0"
|
|
|
|
|
|
def _build_translation_lookup(
|
|
words_with_ranks: list[tuple[str, int]],
|
|
source_lang: str,
|
|
target_lang: str,
|
|
*,
|
|
no_translate: bool = False,
|
|
) -> dict[str, str]:
|
|
"""Build word-to-translation lookup dict.
|
|
|
|
Args:
|
|
words_with_ranks: List of (word, rank) tuples.
|
|
source_lang: Source language code.
|
|
target_lang: Target language code.
|
|
no_translate: If True, use placeholder translations.
|
|
|
|
Returns:
|
|
Dict mapping lowercase word to translation.
|
|
"""
|
|
words = [w for w, _ in words_with_ranks]
|
|
if no_translate:
|
|
return {w.lower(): "[TODO]" for w in words}
|
|
translations = translate_words_batch(words, source_lang, target_lang)
|
|
trans_lookup: dict[str, str] = {}
|
|
for result in translations:
|
|
if result.success:
|
|
trans_lookup[result.source_word.lower()] = result.translated_word
|
|
else:
|
|
trans_lookup[result.source_word.lower()] = f"[{result.source_word}]"
|
|
return trans_lookup
|
|
|
|
|
|
def generate_anki_deck(
    deck_input: DeckInput,
    *,
    include_context: bool = False,
    no_translate: bool = False,
    excerpt: str = "",
    excerpt_words: list[tuple[str, int]] | None = None,
) -> str:
    """Generate Anki-compatible deck content.

    The output starts with ``#``-prefixed header lines (separator, HTML
    flag, deck name, tags, column names) and a blank line, followed by an
    optional excerpt card and one line per word. Semicolons inside field
    values are replaced with commas because ``;`` is the field separator.

    Args:
        deck_input: Core deck data (words, langs, contexts, name).
        include_context: Whether to include context in cards.
        no_translate: If True, skip translation (use placeholder).
        excerpt: The target excerpt text to include in cards.
        excerpt_words: Words in the excerpt with ranks.

    Returns:
        Semicolon-separated content ready for Anki import.
    """
    columns = "Front;Back;Rank;Context" if include_context else "Front;Back;Rank"
    lines: list[str] = [
        "#separator:semicolon",
        "#html:true",
        f"#deck:{deck_input.deck_name}",
        f"#tags:vocabulary {deck_input.source_lang}",
        f"#columns:{columns}",
        "",  # Empty line before data
    ]

    if excerpt:
        lines.append(_format_excerpt_card(excerpt, excerpt_words))

    trans_lookup = _build_translation_lookup(
        deck_input.words_with_ranks,
        deck_input.source_lang,
        deck_input.target_lang,
        no_translate=no_translate,
    )

    # NOTE: when include_context is set but deck_input.contexts is empty,
    # rows carry three fields even though the header names four columns.
    with_context = bool(include_context and deck_input.contexts)

    for word, rank in deck_input.words_with_ranks:
        key = word.lower()
        # Words missing from the lookup fall back to the bracketed word.
        translation = trans_lookup.get(key, f"[{word}]")

        # Escape semicolons in fields
        word_escaped = word.replace(";", ",")
        translation_escaped = translation.replace(";", ",")

        if not with_context:
            lines.append(f"{word_escaped};{translation_escaped};#{rank}")
            continue

        context = deck_input.contexts.get(key, "")
        if context:
            context_escaped = context.replace(";", ",")
            # Whole-word match only, so "the" is not bolded inside "other".
            # The \1 back-reference keeps the casing found in the context
            # and avoids treating the word itself as a replacement template.
            highlight = re.compile(rf"\b({re.escape(word)})\b", re.IGNORECASE)
            context_escaped = highlight.sub(r"<b>\1</b>", context_escaped)
        else:
            context_escaped = ""
        lines.append(
            f"{word_escaped};{translation_escaped};#{rank};{context_escaped}"
        )

    return "\n".join(lines)
|