mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 20:43:13 +02:00
- Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters
1175 lines
32 KiB
Python
Executable File
1175 lines
32 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Anki flashcard generator from vocabulary curve analysis.
|
|
|
|
Generates Anki-compatible flashcard decks from the vocabulary needed to
|
|
understand excerpts of a given length.
|
|
|
|
Usage::
|
|
|
|
# Generate flashcards for a 20-word excerpt
|
|
python -m python_pkg.word_frequency.anki_generator \
|
|
--file text.txt --length 20
|
|
|
|
# Specify source language (auto-detected by default)
|
|
python -m python_pkg.word_frequency.anki_generator \
|
|
--file text.txt --length 20 --from pl
|
|
|
|
# Custom output file
|
|
python -m python_pkg.word_frequency.anki_generator \
|
|
--file text.txt --length 20 --output polish_vocab.txt
|
|
|
|
# Include example sentences/context
|
|
python -m python_pkg.word_frequency.anki_generator \
|
|
--file text.txt --length 20 --include-context
|
|
|
|
Output:
|
|
Creates a semicolon-separated text file importable into Anki.
|
|
Format: ``word;translation;frequency_rank;example_context``
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import contextlib
|
|
from dataclasses import dataclass
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from typing import TYPE_CHECKING, NamedTuple
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Sequence
|
|
|
|
try:
|
|
from python_pkg.word_frequency.analyzer import read_file
|
|
from python_pkg.word_frequency.cache import (
|
|
AnkiDeckKey,
|
|
clear_all_caches,
|
|
get_all_cache_stats,
|
|
get_anki_deck_cache,
|
|
get_vocab_curve_cache,
|
|
)
|
|
from python_pkg.word_frequency.translator import (
|
|
detect_language,
|
|
translate_words_batch,
|
|
)
|
|
except ImportError:
|
|
from analyzer import read_file
|
|
from cache import (
|
|
AnkiDeckKey,
|
|
clear_all_caches,
|
|
get_all_cache_stats,
|
|
get_anki_deck_cache,
|
|
get_vocab_curve_cache,
|
|
)
|
|
from translator import detect_language, translate_words_batch
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
_MIN_VOCAB_DUMP_PARTS = 2
|
|
_MIN_EXCERPT_PARTS = 3
|
|
_ONE_KB = 1024
|
|
_ONE_MB = 1024 * 1024
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class FlashcardOptions:
|
|
"""Options for flashcard generation."""
|
|
|
|
source_lang: str | None = None
|
|
target_lang: str = "en"
|
|
deck_name: str | None = None
|
|
include_context: bool = False
|
|
no_translate: bool = False
|
|
force: bool = False
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class DeckInput:
|
|
"""Input data for Anki deck generation."""
|
|
|
|
words_with_ranks: list[tuple[str, int]]
|
|
source_lang: str
|
|
target_lang: str = "en"
|
|
contexts: dict[str, str] | None = None
|
|
deck_name: str = "Vocabulary"
|
|
|
|
|
|
# Path to C vocabulary_curve executable
|
|
C_EXECUTABLE = (
|
|
Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
|
)
|
|
|
|
|
|
class VocabWord(NamedTuple):
|
|
"""A vocabulary word with its metadata."""
|
|
|
|
word: str
|
|
rank: int
|
|
translation: str
|
|
context: str
|
|
|
|
|
|
def run_vocabulary_curve(
|
|
filepath: Path, max_length: int, *, dump_vocab: bool = False
|
|
) -> str:
|
|
"""Run the C vocabulary_curve executable.
|
|
|
|
Args:
|
|
filepath: Path to the text file.
|
|
max_length: Maximum excerpt length.
|
|
dump_vocab: If True, also dump all vocabulary up to max rank needed.
|
|
|
|
Returns:
|
|
Output from the executable.
|
|
|
|
Raises:
|
|
FileNotFoundError: If executable not found.
|
|
subprocess.CalledProcessError: If execution fails.
|
|
"""
|
|
if not C_EXECUTABLE.exists():
|
|
msg = (
|
|
f"C executable not found at {C_EXECUTABLE}. "
|
|
"Please compile it first: cd C/vocabulary_curve && make"
|
|
)
|
|
raise FileNotFoundError(msg)
|
|
|
|
cmd = [str(C_EXECUTABLE), str(filepath), str(max_length)]
|
|
if dump_vocab:
|
|
cmd.append("--dump-vocab")
|
|
|
|
result = subprocess.run(
|
|
cmd,
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=120,
|
|
check=True,
|
|
)
|
|
return result.stdout
|
|
|
|
|
|
def run_vocabulary_curve_inverse(
|
|
filepath: Path, max_vocab: int, *, dump_vocab: bool = False
|
|
) -> str:
|
|
"""Run the C vocabulary_curve executable in inverse mode.
|
|
|
|
Args:
|
|
filepath: Path to the text file.
|
|
max_vocab: Maximum vocabulary size (top N words).
|
|
dump_vocab: If True, also dump all vocabulary up to max_vocab.
|
|
|
|
Returns:
|
|
Output from the executable.
|
|
|
|
Raises:
|
|
FileNotFoundError: If executable not found.
|
|
subprocess.CalledProcessError: If execution fails.
|
|
"""
|
|
if not C_EXECUTABLE.exists():
|
|
msg = (
|
|
f"C executable not found at {C_EXECUTABLE}. "
|
|
"Please compile it first: cd C/vocabulary_curve && make"
|
|
)
|
|
raise FileNotFoundError(msg)
|
|
|
|
cmd = [str(C_EXECUTABLE), str(filepath), "--max-vocab", str(max_vocab)]
|
|
if dump_vocab:
|
|
cmd.append("--dump-vocab")
|
|
|
|
result = subprocess.run(
|
|
cmd,
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=120,
|
|
check=True,
|
|
)
|
|
return result.stdout
|
|
|
|
|
|
def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
|
|
"""Parse VOCAB_DUMP section from output lines.
|
|
|
|
Args:
|
|
lines: Output lines from vocabulary_curve.
|
|
|
|
Returns:
|
|
List of (word, rank) tuples.
|
|
"""
|
|
all_vocab: list[tuple[str, int]] = []
|
|
in_vocab_dump = False
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
if stripped == "VOCAB_DUMP_START":
|
|
in_vocab_dump = True
|
|
continue
|
|
if stripped == "VOCAB_DUMP_END":
|
|
break
|
|
if in_vocab_dump and ";" in stripped:
|
|
parts = stripped.split(";")
|
|
if len(parts) == _MIN_VOCAB_DUMP_PARTS:
|
|
word, rank_str = parts
|
|
with contextlib.suppress(ValueError):
|
|
all_vocab.append((word, int(rank_str)))
|
|
return all_vocab
|
|
|
|
|
|
def _parse_excerpt_lines(lines: list[str], start: int) -> str:
|
|
"""Parse excerpt text from output lines starting after 'Excerpt:'.
|
|
|
|
Args:
|
|
lines: Output lines.
|
|
start: Index of the line after 'Excerpt:'.
|
|
|
|
Returns:
|
|
Joined excerpt text.
|
|
"""
|
|
excerpt_parts: list[str] = []
|
|
idx = start
|
|
while idx < len(lines):
|
|
next_line = lines[idx].strip()
|
|
next_line = next_line.removeprefix('"')
|
|
if next_line.endswith('"'):
|
|
next_line = next_line[:-1]
|
|
excerpt_parts.append(next_line)
|
|
break
|
|
excerpt_parts.append(next_line)
|
|
idx += 1
|
|
return " ".join(excerpt_parts)
|
|
|
|
|
|
def parse_inverse_mode_output(
|
|
output: str,
|
|
) -> tuple[str, int, int, list[tuple[str, int]]]:
|
|
"""Parse output from vocabulary_curve inverse mode.
|
|
|
|
Args:
|
|
output: Raw output from vocabulary_curve --max-vocab.
|
|
|
|
Returns:
|
|
Tuple of (excerpt_text, excerpt_length, max_rank_used, all_vocab_words).
|
|
"""
|
|
lines = output.split("\n")
|
|
excerpt = ""
|
|
excerpt_length = 0
|
|
max_rank_used = 0
|
|
|
|
for i, raw_line in enumerate(lines):
|
|
line = raw_line.strip()
|
|
|
|
if line.startswith("LONGEST EXCERPT:"):
|
|
parts = line.split()
|
|
if len(parts) >= _MIN_EXCERPT_PARTS:
|
|
excerpt_length = int(parts[2])
|
|
|
|
elif line.startswith("Excerpt:"):
|
|
excerpt = _parse_excerpt_lines(lines, i + 1)
|
|
|
|
elif line.startswith("Rarest word used:"):
|
|
match = re.search(r"\(#(\d+)\)", line)
|
|
if match:
|
|
max_rank_used = int(match.group(1))
|
|
|
|
all_vocab = _parse_vocab_dump(lines)
|
|
return excerpt, excerpt_length, max_rank_used, all_vocab
|
|
|
|
|
|
def _parse_target_length_block(
|
|
lines: list[str],
|
|
target_length: int,
|
|
) -> tuple[str, list[tuple[str, int]]]:
|
|
"""Parse the [Length N] block from vocabulary curve output.
|
|
|
|
Args:
|
|
lines: Output lines.
|
|
target_length: Target excerpt length to find.
|
|
|
|
Returns:
|
|
Tuple of (excerpt, excerpt_words).
|
|
"""
|
|
excerpt = ""
|
|
excerpt_words: list[tuple[str, int]] = []
|
|
i = 0
|
|
while i < len(lines):
|
|
if lines[i].strip().startswith(f"[Length {target_length}]"):
|
|
i += 1
|
|
# Find excerpt line
|
|
while i < len(lines) and not lines[i].strip().startswith(
|
|
"Excerpt:"
|
|
):
|
|
i += 1
|
|
if i < len(lines):
|
|
excerpt_line = lines[i].strip()
|
|
if '"' in excerpt_line:
|
|
start = excerpt_line.index('"') + 1
|
|
end = excerpt_line.rindex('"')
|
|
excerpt = excerpt_line[start:end]
|
|
# Find words line
|
|
i += 1
|
|
while i < len(lines) and not lines[i].strip().startswith(
|
|
"Words:"
|
|
):
|
|
i += 1
|
|
if i < len(lines):
|
|
words_line = lines[i].strip()
|
|
if words_line.startswith("Words:"):
|
|
words_part = words_line[6:].strip()
|
|
pattern = r"(\S+)\(#(\d+)\)"
|
|
matches = re.findall(pattern, words_part)
|
|
excerpt_words = [
|
|
(w, int(r)) for w, r in matches
|
|
]
|
|
break
|
|
i += 1
|
|
return excerpt, excerpt_words
|
|
|
|
|
|
def parse_vocabulary_curve_output(
|
|
output: str, target_length: int
|
|
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
|
"""Parse output from vocabulary_curve to get words needed.
|
|
|
|
Args:
|
|
output: Raw output from vocabulary_curve.
|
|
target_length: The target excerpt length.
|
|
|
|
Returns:
|
|
Tuple of (excerpt_text, excerpt_words, all_vocab_words).
|
|
excerpt_words: words in the excerpt with their ranks.
|
|
all_vocab_words: all words up to max rank
|
|
(from VOCAB_DUMP if present).
|
|
"""
|
|
lines = output.split("\n")
|
|
|
|
excerpt, excerpt_words = _parse_target_length_block(
|
|
lines, target_length
|
|
)
|
|
all_vocab = _parse_vocab_dump(lines)
|
|
|
|
return excerpt, excerpt_words, all_vocab
|
|
|
|
|
|
def find_word_contexts(
|
|
text: str,
|
|
words: list[str],
|
|
context_words: int = 5,
|
|
) -> dict[str, str]:
|
|
"""Find example contexts for each word in the text.
|
|
|
|
Args:
|
|
text: The source text.
|
|
words: List of words to find contexts for.
|
|
context_words: Number of words of context on each side.
|
|
|
|
Returns:
|
|
Dict mapping word to example context.
|
|
"""
|
|
# Extract all words preserving positions
|
|
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
|
all_words_lower = [w.lower() for w in all_words]
|
|
|
|
contexts: dict[str, str] = {}
|
|
words_lower = {w.lower() for w in words}
|
|
|
|
for target in words_lower:
|
|
# Find first occurrence
|
|
for i, word in enumerate(all_words_lower):
|
|
if word == target:
|
|
start = max(0, i - context_words)
|
|
end = min(len(all_words), i + context_words + 1)
|
|
context = " ".join(all_words[start:end])
|
|
contexts[target] = f"...{context}..."
|
|
break
|
|
|
|
return contexts
|
|
|
|
|
|
def _format_excerpt_card(
|
|
excerpt: str,
|
|
excerpt_words: list[tuple[str, int]] | None,
|
|
) -> str:
|
|
"""Format the excerpt as the first Anki card.
|
|
|
|
Args:
|
|
excerpt: The target excerpt text.
|
|
excerpt_words: Words in the excerpt with ranks.
|
|
|
|
Returns:
|
|
Formatted excerpt card line.
|
|
"""
|
|
excerpt_escaped = excerpt.replace(";", ",")
|
|
if excerpt_words:
|
|
most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
|
|
rarest = max(excerpt_words, key=lambda x: x[1])[0]
|
|
if most_frequent != rarest:
|
|
pattern_rare = re.compile(
|
|
rf"\b({re.escape(rarest)})\b", re.IGNORECASE
|
|
)
|
|
excerpt_escaped = pattern_rare.sub(
|
|
r"<b>\1</b>", excerpt_escaped
|
|
)
|
|
pattern_freq = re.compile(
|
|
rf"\b({re.escape(most_frequent)})\b",
|
|
re.IGNORECASE,
|
|
)
|
|
excerpt_escaped = pattern_freq.sub(
|
|
r"<i>\1</i>", excerpt_escaped
|
|
)
|
|
else:
|
|
pattern = re.compile(
|
|
rf"\b({re.escape(most_frequent)})\b",
|
|
re.IGNORECASE,
|
|
)
|
|
excerpt_escaped = pattern.sub(
|
|
r"<b><i>\1</i></b>", excerpt_escaped
|
|
)
|
|
return f"\U0001f4d6 TARGET EXCERPT;{excerpt_escaped};#0"
|
|
|
|
|
|
def _build_translation_lookup(
|
|
words_with_ranks: list[tuple[str, int]],
|
|
source_lang: str,
|
|
target_lang: str,
|
|
*,
|
|
no_translate: bool = False,
|
|
) -> dict[str, str]:
|
|
"""Build word-to-translation lookup dict.
|
|
|
|
Args:
|
|
words_with_ranks: List of (word, rank) tuples.
|
|
source_lang: Source language code.
|
|
target_lang: Target language code.
|
|
no_translate: If True, use placeholder translations.
|
|
|
|
Returns:
|
|
Dict mapping lowercase word to translation.
|
|
"""
|
|
words = [w for w, _ in words_with_ranks]
|
|
if no_translate:
|
|
return {w.lower(): "[TODO]" for w in words}
|
|
translations = translate_words_batch(words, source_lang, target_lang)
|
|
trans_lookup: dict[str, str] = {}
|
|
for result in translations:
|
|
if result.success:
|
|
trans_lookup[result.source_word.lower()] = (
|
|
result.translated_word
|
|
)
|
|
else:
|
|
trans_lookup[result.source_word.lower()] = (
|
|
f"[{result.source_word}]"
|
|
)
|
|
return trans_lookup
|
|
|
|
|
|
def generate_anki_deck(
|
|
deck_input: DeckInput,
|
|
*,
|
|
include_context: bool = False,
|
|
no_translate: bool = False,
|
|
excerpt: str = "",
|
|
excerpt_words: list[tuple[str, int]] | None = None,
|
|
) -> str:
|
|
"""Generate Anki-compatible deck content.
|
|
|
|
Args:
|
|
deck_input: Core deck data (words, langs, contexts, name).
|
|
include_context: Whether to include context in cards.
|
|
no_translate: If True, skip translation (use placeholder).
|
|
excerpt: The target excerpt text to include in cards.
|
|
excerpt_words: Words in the excerpt with ranks.
|
|
|
|
Returns:
|
|
Semicolon-separated content ready for Anki import.
|
|
"""
|
|
lines: list[str] = []
|
|
|
|
# Add Anki headers
|
|
lines.append("#separator:semicolon")
|
|
lines.append("#html:true")
|
|
lines.append(f"#deck:{deck_input.deck_name}")
|
|
lines.append(f"#tags:vocabulary {deck_input.source_lang}")
|
|
if include_context:
|
|
lines.append("#columns:Front;Back;Rank;Context")
|
|
else:
|
|
lines.append("#columns:Front;Back;Rank")
|
|
lines.append("") # Empty line before data
|
|
|
|
if excerpt:
|
|
lines.append(_format_excerpt_card(excerpt, excerpt_words))
|
|
|
|
trans_lookup = _build_translation_lookup(
|
|
deck_input.words_with_ranks,
|
|
deck_input.source_lang,
|
|
deck_input.target_lang,
|
|
no_translate=no_translate,
|
|
)
|
|
|
|
# Generate cards
|
|
for word, rank in deck_input.words_with_ranks:
|
|
translation = trans_lookup.get(word.lower(), f"[{word}]")
|
|
|
|
# Escape semicolons in fields
|
|
word_escaped = word.replace(";", ",")
|
|
translation_escaped = translation.replace(";", ",")
|
|
|
|
if include_context and deck_input.contexts:
|
|
context = deck_input.contexts.get(word.lower(), "")
|
|
if context:
|
|
context_escaped = context.replace(";", ",")
|
|
pattern = re.compile(re.escape(word), re.IGNORECASE)
|
|
context_escaped = pattern.sub(
|
|
f"<b>{word}</b>", context_escaped
|
|
)
|
|
else:
|
|
context_escaped = ""
|
|
lines.append(
|
|
f"{word_escaped};{translation_escaped}"
|
|
f";#{rank};{context_escaped}"
|
|
)
|
|
else:
|
|
lines.append(f"{word_escaped};{translation_escaped};#{rank}")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def get_cached_excerpt(
|
|
filepath: Path, length: int, *, force: bool = False
|
|
) -> tuple[str, list[tuple[str, int]]] | None:
|
|
"""Get cached excerpt if available.
|
|
|
|
Args:
|
|
filepath: Path to source file.
|
|
length: Excerpt length.
|
|
force: If True, ignore cache.
|
|
|
|
Returns:
|
|
Tuple of (excerpt, words) or None if not cached.
|
|
"""
|
|
if force:
|
|
return None
|
|
return get_vocab_curve_cache().get(filepath, length)
|
|
|
|
|
|
def cache_excerpt(
|
|
filepath: Path, length: int, excerpt: str, words: list[tuple[str, int]]
|
|
) -> None:
|
|
"""Store excerpt in cache.
|
|
|
|
Args:
|
|
filepath: Path to source file.
|
|
length: Excerpt length.
|
|
excerpt: The excerpt text.
|
|
words: List of (word, rank) tuples.
|
|
"""
|
|
get_vocab_curve_cache().set(filepath, length, excerpt, words)
|
|
|
|
|
|
def get_cached_deck(
|
|
key: AnkiDeckKey,
|
|
*,
|
|
force: bool = False,
|
|
) -> tuple[str, str, int, int] | None:
|
|
"""Get cached Anki deck if available.
|
|
|
|
Args:
|
|
key: Cache key parameters.
|
|
force: If True, ignore cache.
|
|
|
|
Returns:
|
|
Tuple of (content, excerpt, num_words, max_rank) or None.
|
|
"""
|
|
if force:
|
|
return None
|
|
return get_anki_deck_cache().get(key)
|
|
|
|
|
|
def cache_deck(
|
|
key: AnkiDeckKey,
|
|
anki_content: str,
|
|
excerpt: str,
|
|
num_words: int,
|
|
max_rank: int,
|
|
) -> None:
|
|
"""Store Anki deck in cache.
|
|
|
|
Args:
|
|
key: Cache key parameters.
|
|
anki_content: The deck content.
|
|
excerpt: The excerpt text.
|
|
num_words: Number of words.
|
|
max_rank: Maximum rank.
|
|
"""
|
|
get_anki_deck_cache().set(
|
|
key,
|
|
anki_content,
|
|
excerpt,
|
|
num_words,
|
|
max_rank,
|
|
)
|
|
|
|
|
|
def _detect_source_language(
|
|
filepath: Path,
|
|
text: str,
|
|
) -> str:
|
|
"""Auto-detect source language from file content.
|
|
|
|
Args:
|
|
filepath: Path to source file.
|
|
text: Already-read text (may be empty).
|
|
|
|
Returns:
|
|
Detected language code.
|
|
|
|
Raises:
|
|
ValueError: If language cannot be detected.
|
|
"""
|
|
sample_text = read_file(filepath)[:1000] if not text else text[:1000]
|
|
detected = detect_language(sample_text)
|
|
if detected is None:
|
|
msg = (
|
|
"Could not auto-detect source language. "
|
|
"Please specify with --from (e.g., --from pl for Polish). "
|
|
"Install langdetect for auto-detection: "
|
|
"pip install langdetect"
|
|
)
|
|
raise ValueError(msg)
|
|
return detected
|
|
|
|
|
|
def generate_flashcards(
|
|
filepath: str | Path,
|
|
excerpt_length: int,
|
|
options: FlashcardOptions | None = None,
|
|
*,
|
|
all_vocab: bool = True,
|
|
) -> tuple[str, str, int, int]:
|
|
"""Generate Anki flashcards for vocabulary needed for an excerpt.
|
|
|
|
Args:
|
|
filepath: Path to the source text file.
|
|
excerpt_length: Target excerpt length.
|
|
options: Flashcard generation options.
|
|
all_vocab: If True, include ALL words rank 1 to max rank.
|
|
|
|
Returns:
|
|
Tuple of (anki_content, excerpt, num_words, max_rank).
|
|
"""
|
|
if options is None:
|
|
options = FlashcardOptions()
|
|
filepath = Path(filepath)
|
|
deck_key = AnkiDeckKey(
|
|
filepath=filepath,
|
|
length=excerpt_length,
|
|
target_lang=options.target_lang,
|
|
include_context=options.include_context,
|
|
all_vocab=all_vocab,
|
|
)
|
|
|
|
# Check for cached full deck (if not using no_translate)
|
|
if not options.no_translate and not options.force:
|
|
cached = get_cached_deck(deck_key)
|
|
if cached is not None:
|
|
return cached
|
|
|
|
# Read the text (only needed for context finding)
|
|
text = read_file(filepath) if options.include_context else ""
|
|
|
|
# Auto-detect language if not provided
|
|
source_lang = options.source_lang
|
|
if source_lang is None:
|
|
source_lang = _detect_source_language(filepath, text)
|
|
|
|
# Run vocabulary curve analysis with vocab dump for all words
|
|
output = run_vocabulary_curve(
|
|
filepath, excerpt_length, dump_vocab=all_vocab
|
|
)
|
|
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(
|
|
output, excerpt_length
|
|
)
|
|
|
|
if not excerpt_words:
|
|
msg = f"No words found for excerpt length {excerpt_length}"
|
|
raise ValueError(msg)
|
|
|
|
max_rank = max(rank for _, rank in excerpt_words)
|
|
words_with_ranks = (
|
|
all_vocab_words if all_vocab and all_vocab_words else excerpt_words
|
|
)
|
|
|
|
contexts = None
|
|
if options.include_context:
|
|
if not text:
|
|
text = read_file(filepath)
|
|
words = [w for w, _ in words_with_ranks]
|
|
contexts = find_word_contexts(text, words)
|
|
|
|
deck_name = options.deck_name or f"{filepath.stem}_vocab_{excerpt_length}"
|
|
|
|
anki_content = generate_anki_deck(
|
|
DeckInput(
|
|
words_with_ranks=words_with_ranks,
|
|
source_lang=source_lang,
|
|
target_lang=options.target_lang,
|
|
contexts=contexts,
|
|
deck_name=deck_name,
|
|
),
|
|
include_context=options.include_context,
|
|
no_translate=options.no_translate,
|
|
excerpt=excerpt,
|
|
excerpt_words=excerpt_words,
|
|
)
|
|
|
|
if not options.no_translate:
|
|
cache_deck(
|
|
deck_key,
|
|
anki_content,
|
|
excerpt,
|
|
len(words_with_ranks),
|
|
max_rank,
|
|
)
|
|
|
|
return anki_content, excerpt, len(words_with_ranks), max_rank
|
|
|
|
|
|
def generate_flashcards_inverse(
|
|
filepath: str | Path,
|
|
max_vocab: int,
|
|
options: FlashcardOptions | None = None,
|
|
) -> tuple[str, str, int, int, int]:
|
|
"""Generate Anki flashcards for the longest excerpt using top N words.
|
|
|
|
This is the inverse mode: given a vocabulary size, find the longest
|
|
excerpt that can be understood with only those words.
|
|
|
|
Args:
|
|
filepath: Path to the source text file.
|
|
max_vocab: Maximum vocabulary size (top N words to learn).
|
|
options: Flashcard generation options.
|
|
|
|
Returns:
|
|
Tuple of (anki_content, excerpt, excerpt_length,
|
|
num_words, max_rank_used).
|
|
"""
|
|
if options is None:
|
|
options = FlashcardOptions()
|
|
filepath = Path(filepath)
|
|
|
|
text = read_file(filepath) if options.include_context else ""
|
|
|
|
source_lang = options.source_lang
|
|
if source_lang is None:
|
|
source_lang = _detect_source_language(filepath, text)
|
|
|
|
output = run_vocabulary_curve_inverse(
|
|
filepath, max_vocab, dump_vocab=True
|
|
)
|
|
excerpt, excerpt_length, max_rank_used, all_vocab_words = (
|
|
parse_inverse_mode_output(output)
|
|
)
|
|
|
|
if excerpt_length == 0:
|
|
msg = (
|
|
f"No valid excerpt found using only top {max_vocab} "
|
|
"words. Try increasing the vocabulary limit."
|
|
)
|
|
raise ValueError(msg)
|
|
|
|
if not all_vocab_words:
|
|
msg = f"No vocabulary returned for max_vocab={max_vocab}"
|
|
raise ValueError(msg)
|
|
|
|
words_with_ranks = all_vocab_words
|
|
|
|
excerpt_word_set = set(excerpt.lower().split())
|
|
excerpt_words = [
|
|
(w, r)
|
|
for w, r in all_vocab_words
|
|
if w.lower() in excerpt_word_set
|
|
]
|
|
|
|
contexts = None
|
|
if options.include_context:
|
|
if not text:
|
|
text = read_file(filepath)
|
|
words = [w for w, _ in words_with_ranks]
|
|
contexts = find_word_contexts(text, words)
|
|
|
|
deck_name = options.deck_name or f"{filepath.stem}_top{max_vocab}"
|
|
|
|
anki_content = generate_anki_deck(
|
|
DeckInput(
|
|
words_with_ranks=words_with_ranks,
|
|
source_lang=source_lang,
|
|
target_lang=options.target_lang,
|
|
contexts=contexts,
|
|
deck_name=deck_name,
|
|
),
|
|
include_context=options.include_context,
|
|
no_translate=options.no_translate,
|
|
excerpt=excerpt,
|
|
excerpt_words=excerpt_words or None,
|
|
)
|
|
|
|
return (
|
|
anki_content,
|
|
excerpt,
|
|
excerpt_length,
|
|
len(words_with_ranks),
|
|
max_rank_used,
|
|
)
|
|
|
|
|
|
def _format_cache_size(value: int) -> str:
|
|
"""Format a byte size as human-readable string."""
|
|
if value < _ONE_KB:
|
|
return f"{value} B"
|
|
if value < _ONE_MB:
|
|
return f"{value / _ONE_KB:.1f} KB"
|
|
return f"{value / _ONE_MB:.1f} MB"
|
|
|
|
|
|
def _print_cache_stats() -> int:
|
|
"""Print cache statistics and return exit code."""
|
|
stats = get_all_cache_stats()
|
|
logger.info("Cache Statistics")
|
|
logger.info("=" * 50)
|
|
for cache_name, cache_stats in stats.items():
|
|
logger.info("\n%s:", cache_name.upper())
|
|
for key, value in cache_stats.items():
|
|
if key == "cache_size_bytes":
|
|
logger.info(" %s: %s", key, _format_cache_size(value))
|
|
else:
|
|
logger.info(" %s: %s", key, value)
|
|
return 0
|
|
|
|
|
|
def _clear_caches() -> int:
|
|
"""Clear all caches and return exit code."""
|
|
clear_all_caches()
|
|
logger.info("All caches cleared.")
|
|
return 0
|
|
|
|
|
|
def _log_anki_import_instructions(output_path: Path) -> None:
|
|
"""Log Anki import instructions."""
|
|
logger.info("")
|
|
logger.info("To import into Anki:")
|
|
logger.info(" 1. Open Anki")
|
|
logger.info(" 2. File -> Import")
|
|
logger.info(" 3. Select: %s", output_path)
|
|
logger.info(" 4. Click Import")
|
|
|
|
|
|
def _handle_inverse_mode(
|
|
args: argparse.Namespace,
|
|
filepath: Path,
|
|
) -> int:
|
|
"""Handle inverse mode (--max-vocab) flashcard generation.
|
|
|
|
Args:
|
|
args: Parsed command line arguments.
|
|
filepath: Path to source file.
|
|
|
|
Returns:
|
|
Exit code.
|
|
"""
|
|
if not args.quiet:
|
|
logger.info("Analyzing %s...", filepath.name)
|
|
logger.info(
|
|
"Finding longest excerpt using top %d words...",
|
|
args.max_vocab,
|
|
)
|
|
|
|
anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
|
|
generate_flashcards_inverse(
|
|
filepath,
|
|
args.max_vocab,
|
|
FlashcardOptions(
|
|
source_lang=args.source_lang,
|
|
target_lang=args.target_lang,
|
|
deck_name=args.deck_name,
|
|
include_context=args.include_context,
|
|
no_translate=args.no_translate,
|
|
force=args.force,
|
|
),
|
|
)
|
|
)
|
|
|
|
output_path = (
|
|
Path(args.output)
|
|
if args.output
|
|
else filepath.parent
|
|
/ f"{filepath.stem}_anki_top{args.max_vocab}.txt"
|
|
)
|
|
output_path.write_text(anki_content, encoding="utf-8")
|
|
|
|
if not args.quiet:
|
|
logger.info("")
|
|
logger.info("=" * 60)
|
|
logger.info("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
|
|
logger.info("=" * 60)
|
|
logger.info("Learning: top %d words", args.max_vocab)
|
|
logger.info(
|
|
"Longest excerpt you can understand: %d words",
|
|
excerpt_length,
|
|
)
|
|
logger.info(' "%s"', excerpt)
|
|
logger.info("")
|
|
logger.info("Rarest word in excerpt: #%d", max_rank_used)
|
|
logger.info("Flashcards: %d", num_words)
|
|
logger.info("Output file: %s", output_path)
|
|
_log_anki_import_instructions(output_path)
|
|
else:
|
|
logger.info("%s", output_path)
|
|
|
|
return 0
|
|
|
|
|
|
def _handle_normal_mode(
|
|
args: argparse.Namespace,
|
|
filepath: Path,
|
|
) -> int:
|
|
"""Handle normal mode (--length) flashcard generation.
|
|
|
|
Args:
|
|
args: Parsed command line arguments.
|
|
filepath: Path to source file.
|
|
|
|
Returns:
|
|
Exit code.
|
|
"""
|
|
if not args.quiet:
|
|
logger.info("Analyzing %s...", filepath.name)
|
|
logger.info(
|
|
"Finding vocabulary for %d-word excerpt...", args.length
|
|
)
|
|
|
|
anki_content, excerpt, num_words, max_rank = generate_flashcards(
|
|
filepath,
|
|
args.length,
|
|
FlashcardOptions(
|
|
source_lang=args.source_lang,
|
|
target_lang=args.target_lang,
|
|
deck_name=args.deck_name,
|
|
include_context=args.include_context,
|
|
no_translate=args.no_translate,
|
|
force=args.force,
|
|
),
|
|
all_vocab=not args.excerpt_words_only,
|
|
)
|
|
|
|
output_path = (
|
|
Path(args.output)
|
|
if args.output
|
|
else filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
|
|
)
|
|
output_path.write_text(anki_content, encoding="utf-8")
|
|
|
|
if not args.quiet:
|
|
logger.info("")
|
|
logger.info("=" * 60)
|
|
logger.info("FLASHCARD GENERATION COMPLETE")
|
|
logger.info("=" * 60)
|
|
logger.info(
|
|
"Excerpt to understand (%d words):", args.length
|
|
)
|
|
logger.info(' "%s"', excerpt)
|
|
logger.info("")
|
|
logger.info("Max word rank needed: #%d", max_rank)
|
|
if args.excerpt_words_only:
|
|
logger.info(
|
|
"Flashcards: %d (excerpt words only)", num_words
|
|
)
|
|
else:
|
|
logger.info(
|
|
"Flashcards: %d (ALL words rank #1 to #%d)",
|
|
num_words,
|
|
max_rank,
|
|
)
|
|
logger.info("Output file: %s", output_path)
|
|
_log_anki_import_instructions(output_path)
|
|
else:
|
|
logger.info("%s", output_path)
|
|
|
|
return 0
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
|
|
"""Build the argument parser for the CLI.
|
|
|
|
Returns:
|
|
Configured argument parser.
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Generate Anki flashcards from vocabulary analysis.",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog=__doc__,
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--file",
|
|
"-f",
|
|
type=str,
|
|
default=None,
|
|
help="Path to the text file to analyze",
|
|
)
|
|
parser.add_argument(
|
|
"--length",
|
|
"-l",
|
|
type=int,
|
|
default=None,
|
|
help=(
|
|
"Target excerpt length "
|
|
"(how many words you want to understand)"
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--max-vocab",
|
|
"-v",
|
|
type=int,
|
|
default=None,
|
|
help=(
|
|
"INVERSE MODE: Learn top N words, "
|
|
"find longest excerpt you can understand"
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--from",
|
|
dest="source_lang",
|
|
type=str,
|
|
default=None,
|
|
help=(
|
|
"Source language code (e.g., 'pl', 'la', 'de'). "
|
|
"Auto-detected if not specified."
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--to",
|
|
"-T",
|
|
dest="target_lang",
|
|
type=str,
|
|
default="en",
|
|
help="Target language code for translations (default: 'en')",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
"-o",
|
|
type=str,
|
|
default=None,
|
|
help="Output file path (default: <filename>_anki_<length>.txt)",
|
|
)
|
|
parser.add_argument(
|
|
"--include-context",
|
|
"-c",
|
|
action="store_true",
|
|
help="Include example context sentences in flashcards",
|
|
)
|
|
parser.add_argument(
|
|
"--deck-name",
|
|
"-d",
|
|
type=str,
|
|
default=None,
|
|
help="Name for the Anki deck (default: auto-generated)",
|
|
)
|
|
parser.add_argument(
|
|
"--quiet",
|
|
"-q",
|
|
action="store_true",
|
|
help="Only output the file path, no status messages",
|
|
)
|
|
parser.add_argument(
|
|
"--excerpt-words-only",
|
|
"-e",
|
|
action="store_true",
|
|
help=(
|
|
"Only include words that appear in the excerpt "
|
|
"(default: include ALL words up to max rank)"
|
|
),
|
|
)
|
|
parser.add_argument(
|
|
"--no-translate",
|
|
"-n",
|
|
action="store_true",
|
|
help="Skip translation (output words without translations)",
|
|
)
|
|
parser.add_argument(
|
|
"--force",
|
|
"-F",
|
|
action="store_true",
|
|
help="Force regeneration, ignoring all caches",
|
|
)
|
|
parser.add_argument(
|
|
"--cache-stats",
|
|
action="store_true",
|
|
help="Show cache statistics and exit",
|
|
)
|
|
parser.add_argument(
|
|
"--clear-cache",
|
|
action="store_true",
|
|
help="Clear all caches and exit",
|
|
)
|
|
return parser
|
|
|
|
|
|
def _run_generation(args: argparse.Namespace) -> int:
|
|
"""Validate args and run flashcard generation.
|
|
|
|
Args:
|
|
args: Parsed command line arguments.
|
|
|
|
Returns:
|
|
Exit code.
|
|
"""
|
|
filepath = Path(args.file)
|
|
if not filepath.exists():
|
|
logger.error("Error: File not found: %s", args.file)
|
|
return 1
|
|
|
|
if args.max_vocab is not None:
|
|
return _handle_inverse_mode(args, filepath)
|
|
return _handle_normal_mode(args, filepath)
|
|
|
|
|
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
"""Main entry point.
|
|
|
|
Args:
|
|
argv: Command line arguments.
|
|
|
|
Returns:
|
|
Exit code.
|
|
"""
|
|
parser = _build_parser()
|
|
args = parser.parse_args(argv)
|
|
|
|
if args.cache_stats:
|
|
return _print_cache_stats()
|
|
|
|
if args.clear_cache:
|
|
return _clear_caches()
|
|
|
|
if args.file is None:
|
|
parser.error("--file/-f is required")
|
|
if args.length is None and args.max_vocab is None:
|
|
parser.error("Either --length/-l or --max-vocab/-v is required")
|
|
if args.length is not None and args.max_vocab is not None:
|
|
parser.error(
|
|
"Cannot use both --length and --max-vocab. Choose one mode."
|
|
)
|
|
|
|
try:
|
|
return _run_generation(args)
|
|
except FileNotFoundError:
|
|
logger.exception("File not found")
|
|
except subprocess.CalledProcessError:
|
|
logger.exception("Error running vocabulary_curve")
|
|
except ValueError:
|
|
logger.exception("Value error")
|
|
return 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|