testsAndMisc-archive/python_pkg/word_frequency/anki_generator.py
Krzysztof kuhy Rudnicki 2bb930db6f refactor(word_frequency): fix all ruff violations and remove noqa comments
- Replace print() with logging module throughout
- Add type annotations and Google docstrings to all functions
- Introduce DeckInput and LessonConfig dataclasses to reduce function parameters
- Use specific exception types instead of bare except (BLE001)
- Remove all noqa suppression comments
- Fix test fixtures: remove unused _capsys/_tmp_path parameters
2026-03-13 20:41:31 +01:00

1175 lines
32 KiB
Python
Executable File

#!/usr/bin/env python3
"""Anki flashcard generator from vocabulary curve analysis.
Generates Anki-compatible flashcard decks from the vocabulary needed to
understand excerpts of a given length.
Usage::
# Generate flashcards for a 20-word excerpt
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20
# Specify source language (auto-detected by default)
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --from pl
# Custom output file
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --output polish_vocab.txt
# Include example sentences/context
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --include-context
Output:
Creates a semicolon-separated text file importable into Anki.
Format: ``word;translation;frequency_rank;example_context``
"""
from __future__ import annotations
import argparse
import contextlib
from dataclasses import dataclass
import logging
from pathlib import Path
import re
import subprocess
import sys
from typing import TYPE_CHECKING, NamedTuple
if TYPE_CHECKING:
from collections.abc import Sequence
try:
from python_pkg.word_frequency.analyzer import read_file
from python_pkg.word_frequency.cache import (
AnkiDeckKey,
clear_all_caches,
get_all_cache_stats,
get_anki_deck_cache,
get_vocab_curve_cache,
)
from python_pkg.word_frequency.translator import (
detect_language,
translate_words_batch,
)
except ImportError:
from analyzer import read_file
from cache import (
AnkiDeckKey,
clear_all_caches,
get_all_cache_stats,
get_anki_deck_cache,
get_vocab_curve_cache,
)
from translator import detect_language, translate_words_batch
# Module-level logger; every status and error message in this file goes here.
logger = logging.getLogger(__name__)
# A VOCAB_DUMP line is accepted only when it splits into exactly this many
# ";"-separated fields (word and rank).
_MIN_VOCAB_DUMP_PARTS = 2
# A "LONGEST EXCERPT:" line needs at least this many whitespace-separated
# tokens before its third token is read as the excerpt length.
_MIN_EXCERPT_PARTS = 3
# Byte thresholds used when formatting cache sizes for display.
_ONE_KB = 1024
_ONE_MB = 1024 * 1024
@dataclass(frozen=True)
class FlashcardOptions:
    """Immutable bundle of user-facing flashcard generation settings.

    Attributes:
        source_lang: Source language code; ``None`` requests auto-detection.
        target_lang: Language code the translations are produced in.
        deck_name: Anki deck name; ``None`` lets the generator derive one.
        include_context: Whether cards carry an example-context column.
        no_translate: Whether to emit placeholders instead of translating.
        force: Whether to skip the cached-deck lookup.
    """

    source_lang: str | None = None
    target_lang: str = "en"
    deck_name: str | None = None
    include_context: bool = False
    no_translate: bool = False
    force: bool = False
@dataclass(frozen=True)
class DeckInput:
    """Immutable description of the data a single Anki deck is built from.

    Attributes:
        words_with_ranks: ``(word, rank)`` pairs, one card per pair.
        source_lang: Language code of the words being learned.
        target_lang: Language code of the translations.
        contexts: Optional mapping of lowercase word to an example context.
        deck_name: Name written into the deck header.
    """

    words_with_ranks: list[tuple[str, int]]
    source_lang: str
    target_lang: str = "en"
    contexts: dict[str, str] | None = None
    deck_name: str = "Vocabulary"
# Location of the compiled C ``vocabulary_curve`` binary: three directory
# levels above this file, then ``C/vocabulary_curve/vocabulary_curve``.
C_EXECUTABLE = Path(__file__).parent.parent.parent.joinpath(
    "C", "vocabulary_curve", "vocabulary_curve"
)
class VocabWord(NamedTuple):
    """One vocabulary entry together with its card metadata.

    Attributes:
        word: The source-language word.
        rank: Frequency rank of the word.
        translation: Translation of the word.
        context: Example context for the word.
    """

    word: str
    rank: int
    translation: str
    context: str
def run_vocabulary_curve(
    filepath: Path, max_length: int, *, dump_vocab: bool = False
) -> str:
    """Invoke the compiled ``vocabulary_curve`` tool in normal mode.

    Args:
        filepath: Text file handed to the tool.
        max_length: Maximum excerpt length passed on the command line.
        dump_vocab: When True, ``--dump-vocab`` is appended to the command.

    Returns:
        The standard output captured from the tool.

    Raises:
        FileNotFoundError: If the executable does not exist.
        subprocess.CalledProcessError: If the tool exits with a non-zero code.
        subprocess.TimeoutExpired: If the tool runs longer than 120 seconds.
    """
    if not C_EXECUTABLE.exists():
        msg = (
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )
        raise FileNotFoundError(msg)

    optional_flags = ["--dump-vocab"] if dump_vocab else []
    command = [
        str(C_EXECUTABLE),
        str(filepath),
        str(max_length),
        *optional_flags,
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return completed.stdout
def run_vocabulary_curve_inverse(
    filepath: Path, max_vocab: int, *, dump_vocab: bool = False
) -> str:
    """Invoke the compiled ``vocabulary_curve`` tool in inverse mode.

    Args:
        filepath: Text file handed to the tool.
        max_vocab: Vocabulary size passed after ``--max-vocab``.
        dump_vocab: When True, ``--dump-vocab`` is appended to the command.

    Returns:
        The standard output captured from the tool.

    Raises:
        FileNotFoundError: If the executable does not exist.
        subprocess.CalledProcessError: If the tool exits with a non-zero code.
        subprocess.TimeoutExpired: If the tool runs longer than 120 seconds.
    """
    if not C_EXECUTABLE.exists():
        msg = (
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )
        raise FileNotFoundError(msg)

    optional_flags = ["--dump-vocab"] if dump_vocab else []
    command = [
        str(C_EXECUTABLE),
        str(filepath),
        "--max-vocab",
        str(max_vocab),
        *optional_flags,
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return completed.stdout
def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
    """Collect the ``word;rank`` entries of the VOCAB_DUMP section.

    Scanning stops at the first ``VOCAB_DUMP_END`` line. Entries are only
    collected after a ``VOCAB_DUMP_START`` line has been seen; lines with
    the wrong field count or a non-integer rank are skipped.

    Args:
        lines: Output lines from vocabulary_curve.

    Returns:
        List of (word, rank) tuples in output order.
    """
    entries: list[tuple[str, int]] = []
    inside_dump = False
    for raw in lines:
        text = raw.strip()
        if text == "VOCAB_DUMP_START":
            inside_dump = True
            continue
        if text == "VOCAB_DUMP_END":
            break
        if not inside_dump or ";" not in text:
            continue
        fields = text.split(";")
        if len(fields) != _MIN_VOCAB_DUMP_PARTS:
            continue
        word, rank_text = fields
        # A rank that is not an integer silently drops the entry.
        with contextlib.suppress(ValueError):
            entries.append((word, int(rank_text)))
    return entries
def _parse_excerpt_lines(lines: list[str], start: int) -> str:
    """Parse the quoted excerpt text that follows an 'Excerpt:' line.

    The excerpt may span several lines. It begins with an opening double
    quote on the first line and ends at the first line that finishes with a
    double quote. If no closing quote is found, every remaining line is
    consumed.

    Args:
        lines: Output lines.
        start: Index of the line after 'Excerpt:'.

    Returns:
        The excerpt lines joined with single spaces, without the surrounding
        quotes. Lines that are empty after stripping are skipped.
    """
    pieces: list[str] = []
    for offset, raw in enumerate(lines[start:]):
        piece = raw.strip()
        if offset == 0:
            # Only the first line carries the opening quote. Stripping a
            # leading quote from later lines would erase a closing quote that
            # sits alone on its line (so the end is never detected) and
            # would drop quotes that belong to the text itself.
            piece = piece.removeprefix('"')
        closes_excerpt = piece.endswith('"')
        if closes_excerpt:
            piece = piece[:-1]
        if piece:
            pieces.append(piece)
        if closes_excerpt:
            break
    return " ".join(pieces)
def parse_inverse_mode_output(
    output: str,
) -> tuple[str, int, int, list[tuple[str, int]]]:
    """Extract the excerpt, its length, rarest rank and vocabulary dump.

    Args:
        output: Raw output from vocabulary_curve --max-vocab.

    Returns:
        Tuple of (excerpt_text, excerpt_length, max_rank_used,
        all_vocab_words). Fields whose marker line is absent keep their
        defaults (empty string / 0).

    Raises:
        ValueError: If the third token of a 'LONGEST EXCERPT:' line is not
            an integer.
    """
    lines = output.split("\n")
    excerpt, excerpt_length, max_rank_used = "", 0, 0

    for index, raw in enumerate(lines):
        text = raw.strip()
        if text.startswith("LONGEST EXCERPT:"):
            tokens = text.split()
            if len(tokens) >= _MIN_EXCERPT_PARTS:
                excerpt_length = int(tokens[2])
            continue
        if text.startswith("Excerpt:"):
            # The quoted excerpt starts on the line after the marker.
            excerpt = _parse_excerpt_lines(lines, index + 1)
            continue
        if text.startswith("Rarest word used:"):
            rank_match = re.search(r"\(#(\d+)\)", text)
            if rank_match:
                max_rank_used = int(rank_match.group(1))

    return excerpt, excerpt_length, max_rank_used, _parse_vocab_dump(lines)
def _parse_target_length_block(
    lines: list[str],
    target_length: int,
) -> tuple[str, list[tuple[str, int]]]:
    """Read the excerpt and word ranks of the ``[Length N]`` block.

    Only the first line starting with ``[Length <target_length>]`` is
    considered. From there the next 'Excerpt:' line and then the next
    'Words:' line are used.

    Args:
        lines: Output lines.
        target_length: Target excerpt length to find.

    Returns:
        Tuple of (excerpt, excerpt_words); empty values for anything that
        could not be located.
    """
    total = len(lines)

    def _seek(prefix: str, position: int) -> int:
        """Return the first index >= position whose line starts with prefix."""
        while position < total and not lines[position].strip().startswith(
            prefix
        ):
            position += 1
        return position

    header_at = _seek(f"[Length {target_length}]", 0)
    if header_at >= total:
        return "", []

    excerpt = ""
    excerpt_words: list[tuple[str, int]] = []

    excerpt_at = _seek("Excerpt:", header_at + 1)
    if excerpt_at < total:
        excerpt_line = lines[excerpt_at].strip()
        if '"' in excerpt_line:
            # Text between the first and the last double quote on the line.
            opening = excerpt_line.index('"') + 1
            closing = excerpt_line.rindex('"')
            excerpt = excerpt_line[opening:closing]

    words_at = _seek("Words:", excerpt_at + 1)
    if words_at < total:
        payload = lines[words_at].strip().removeprefix("Words:").strip()
        # Each entry looks like ``word(#rank)``.
        excerpt_words = [
            (word, int(rank))
            for word, rank in re.findall(r"(\S+)\(#(\d+)\)", payload)
        ]

    return excerpt, excerpt_words
def parse_vocabulary_curve_output(
    output: str, target_length: int
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
    """Split vocabulary_curve output into excerpt data and vocabulary dump.

    Args:
        output: Raw output from vocabulary_curve.
        target_length: The target excerpt length.

    Returns:
        Tuple of (excerpt_text, excerpt_words, all_vocab_words).
            excerpt_words: words in the excerpt with their ranks.
            all_vocab_words: entries of the VOCAB_DUMP section (empty when
                the section is absent).
    """
    output_lines = output.split("\n")
    excerpt_text, words_in_excerpt = _parse_target_length_block(
        output_lines, target_length
    )
    dumped_vocab = _parse_vocab_dump(output_lines)
    return excerpt_text, words_in_excerpt, dumped_vocab
def find_word_contexts(
    text: str,
    words: list[str],
    context_words: int = 5,
) -> dict[str, str]:
    """Find an example context for each word in the text.

    The context of a word is its first occurrence (case-insensitive)
    together with up to ``context_words`` neighbouring tokens on each side,
    joined by single spaces and wrapped in ``...``.

    The text is scanned once, so the cost is linear in the number of tokens
    instead of one full scan per requested word.

    Args:
        text: The source text.
        words: List of words to find contexts for.
        context_words: Number of words of context on each side.

    Returns:
        Dict mapping lowercase word to example context. Words that do not
        occur in the text are absent from the dict.
    """
    contexts: dict[str, str] = {}
    wanted = {w.lower() for w in words}
    if not wanted:
        return contexts

    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)
    token_count = len(tokens)
    for position, token in enumerate(tokens):
        key = token.lower()
        if key not in wanted or key in contexts:
            continue
        start = max(0, position - context_words)
        end = min(token_count, position + context_words + 1)
        snippet = " ".join(tokens[start:end])
        contexts[key] = f"...{snippet}..."
        if len(contexts) == len(wanted):
            # Every requested word has its first occurrence; stop early.
            break
    return contexts
def _format_excerpt_card(
    excerpt: str,
    excerpt_words: list[tuple[str, int]] | None,
) -> str:
    """Format the excerpt as the first Anki card.

    Semicolons in the excerpt are replaced by commas because the deck is
    semicolon-separated. When word ranks are given, the rarest word
    (highest rank) is wrapped in ``<b>`` and the most frequent word (lowest
    rank) in ``<i>``; if they are the same word it gets both tags.

    Both words are highlighted in a single substitution pass. Running two
    passes would let the second one match inside the ``<b>`` tags inserted
    by the first (e.g. a most-frequent word "b").

    Args:
        excerpt: The target excerpt text.
        excerpt_words: Words in the excerpt with ranks.

    Returns:
        Formatted excerpt card line.
    """
    card_text = excerpt.replace(";", ",")

    if excerpt_words:
        most_frequent = min(excerpt_words, key=lambda pair: pair[1])[0]
        rarest = max(excerpt_words, key=lambda pair: pair[1])[0]

        if most_frequent == rarest:
            both = re.compile(
                rf"\b({re.escape(most_frequent)})\b",
                re.IGNORECASE,
            )
            card_text = both.sub(r"<b><i>\1</i></b>", card_text)
        else:
            either = re.compile(
                rf"\b(?:(?P<rare>{re.escape(rarest)})"
                rf"|(?P<freq>{re.escape(most_frequent)}))\b",
                re.IGNORECASE,
            )

            def _wrap(match: re.Match[str]) -> str:
                """Wrap the matched word in the tag for its named group."""
                tag = "b" if match.lastgroup == "rare" else "i"
                return f"<{tag}>{match.group()}</{tag}>"

            card_text = either.sub(_wrap, card_text)

    return f"\U0001f4d6 TARGET EXCERPT;{card_text};#0"
def _build_translation_lookup(
    words_with_ranks: list[tuple[str, int]],
    source_lang: str,
    target_lang: str,
    *,
    no_translate: bool = False,
) -> dict[str, str]:
    """Map each lowercase word to the text shown on the back of its card.

    Args:
        words_with_ranks: List of (word, rank) tuples.
        source_lang: Source language code.
        target_lang: Target language code.
        no_translate: If True, no translation is requested and every word
            maps to the ``[TODO]`` placeholder.

    Returns:
        Dict mapping lowercase word to translation. A failed translation is
        represented by the source word in square brackets.
    """
    source_words = [word for word, _rank in words_with_ranks]

    if no_translate:
        return dict.fromkeys((word.lower() for word in source_words), "[TODO]")

    return {
        outcome.source_word.lower(): (
            outcome.translated_word
            if outcome.success
            else f"[{outcome.source_word}]"
        )
        for outcome in translate_words_batch(
            source_words, source_lang, target_lang
        )
    }
def generate_anki_deck(
    deck_input: DeckInput,
    *,
    include_context: bool = False,
    no_translate: bool = False,
    excerpt: str = "",
    excerpt_words: list[tuple[str, int]] | None = None,
) -> str:
    """Generate Anki-compatible deck content.

    The output starts with Anki file headers, followed by an empty line, an
    optional excerpt card, and one card per word. Semicolons inside fields
    are replaced by commas because the semicolon is the field separator.

    Args:
        deck_input: Core deck data (words, langs, contexts, name).
        include_context: Whether to include context in cards.
        no_translate: If True, skip translation (use placeholder).
        excerpt: The target excerpt text to include in cards.
        excerpt_words: Words in the excerpt with ranks.

    Returns:
        Semicolon-separated content ready for Anki import.
    """
    lines: list[str] = [
        "#separator:semicolon",
        "#html:true",
        f"#deck:{deck_input.deck_name}",
        f"#tags:vocabulary {deck_input.source_lang}",
    ]
    if include_context:
        lines.append("#columns:Front;Back;Rank;Context")
    else:
        lines.append("#columns:Front;Back;Rank")
    lines.append("")  # Empty line before data

    if excerpt:
        lines.append(_format_excerpt_card(excerpt, excerpt_words))

    trans_lookup = _build_translation_lookup(
        deck_input.words_with_ranks,
        deck_input.source_lang,
        deck_input.target_lang,
        no_translate=no_translate,
    )

    for word, rank in deck_input.words_with_ranks:
        translation = trans_lookup.get(word.lower(), f"[{word}]")
        word_escaped = word.replace(";", ",")
        translation_escaped = translation.replace(";", ",")

        if include_context and deck_input.contexts:
            context_escaped = deck_input.contexts.get(word.lower(), "").replace(
                ";", ","
            )
            if context_escaped:
                # Highlight whole-word occurrences only and keep the casing
                # found in the text. Without the word boundaries a short
                # word such as "a" would be bolded inside every other word,
                # and using the word itself as the replacement template
                # would overwrite the original casing.
                highlight = re.compile(
                    rf"\b({re.escape(word)})\b", re.IGNORECASE
                )
                context_escaped = highlight.sub(
                    r"<b>\1</b>", context_escaped
                )
            lines.append(
                f"{word_escaped};{translation_escaped}"
                f";#{rank};{context_escaped}"
            )
        else:
            lines.append(f"{word_escaped};{translation_escaped};#{rank}")

    return "\n".join(lines)
def get_cached_excerpt(
    filepath: Path, length: int, *, force: bool = False
) -> tuple[str, list[tuple[str, int]]] | None:
    """Look up a previously stored excerpt in the vocabulary-curve cache.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        force: If True, the cache is not consulted at all.

    Returns:
        Whatever the cache returns for (filepath, length), or None when
        ``force`` is set.
    """
    return None if force else get_vocab_curve_cache().get(filepath, length)
def cache_excerpt(
    filepath: Path, length: int, excerpt: str, words: list[tuple[str, int]]
) -> None:
    """Save an excerpt and its words in the vocabulary-curve cache.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        excerpt: The excerpt text.
        words: List of (word, rank) tuples.
    """
    curve_cache = get_vocab_curve_cache()
    curve_cache.set(filepath, length, excerpt, words)
def get_cached_deck(
    key: AnkiDeckKey,
    *,
    force: bool = False,
) -> tuple[str, str, int, int] | None:
    """Look up a previously stored deck in the Anki deck cache.

    Args:
        key: Cache key parameters.
        force: If True, the cache is not consulted at all.

    Returns:
        Whatever the cache returns for ``key`` (content, excerpt, num_words,
        max_rank), or None when ``force`` is set.
    """
    return None if force else get_anki_deck_cache().get(key)
def cache_deck(
    key: AnkiDeckKey,
    anki_content: str,
    excerpt: str,
    num_words: int,
    max_rank: int,
) -> None:
    """Save a generated deck in the Anki deck cache.

    Args:
        key: Cache key parameters.
        anki_content: The deck content.
        excerpt: The excerpt text.
        num_words: Number of words.
        max_rank: Maximum rank.
    """
    deck_cache = get_anki_deck_cache()
    deck_cache.set(key, anki_content, excerpt, num_words, max_rank)
def _detect_source_language(
    filepath: Path,
    text: str,
) -> str:
    """Detect the source language from the first 1000 characters of text.

    Args:
        filepath: Path to source file; read only when ``text`` is empty.
        text: Already-read text (may be empty).

    Returns:
        Detected language code.

    Raises:
        ValueError: If language cannot be detected.
    """
    sample_text = (text or read_file(filepath))[:1000]
    language = detect_language(sample_text)
    if language is not None:
        return language

    msg = (
        "Could not auto-detect source language. "
        "Please specify with --from (e.g., --from pl for Polish). "
        "Install langdetect for auto-detection: "
        "pip install langdetect"
    )
    raise ValueError(msg)
def generate_flashcards(
    filepath: str | Path,
    excerpt_length: int,
    options: FlashcardOptions | None = None,
    *,
    all_vocab: bool = True,
) -> tuple[str, str, int, int]:
    """Build an Anki deck covering the vocabulary behind one excerpt.

    Args:
        filepath: Path to the source text file.
        excerpt_length: Target excerpt length.
        options: Flashcard generation options; defaults are used when None.
        all_vocab: If True, request the vocabulary dump and use it (when
            non-empty) instead of only the excerpt's own words.

    Returns:
        Tuple of (anki_content, excerpt, num_words, max_rank).

    Raises:
        ValueError: If no words are parsed for the requested excerpt length.
    """
    opts = FlashcardOptions() if options is None else options
    source_path = Path(filepath)

    # NOTE(review): the key is built without source_lang or deck_name, so a
    # cached deck may presumably be returned for a different deck name -
    # confirm against AnkiDeckKey.
    deck_key = AnkiDeckKey(
        filepath=source_path,
        length=excerpt_length,
        target_lang=opts.target_lang,
        include_context=opts.include_context,
        all_vocab=all_vocab,
    )

    # The deck cache is only consulted for translated, non-forced runs.
    if not (opts.no_translate or opts.force):
        cached_deck = get_cached_deck(deck_key)
        if cached_deck is not None:
            return cached_deck

    # The full text is only needed up front when contexts are requested.
    text = read_file(source_path) if opts.include_context else ""

    source_lang = (
        opts.source_lang
        if opts.source_lang is not None
        else _detect_source_language(source_path, text)
    )

    raw_output = run_vocabulary_curve(
        source_path, excerpt_length, dump_vocab=all_vocab
    )
    excerpt, excerpt_words, dumped_vocab = parse_vocabulary_curve_output(
        raw_output, excerpt_length
    )
    if not excerpt_words:
        msg = f"No words found for excerpt length {excerpt_length}"
        raise ValueError(msg)

    max_rank = max(rank for _, rank in excerpt_words)
    if all_vocab and dumped_vocab:
        words_with_ranks = dumped_vocab
    else:
        words_with_ranks = excerpt_words

    contexts = None
    if opts.include_context:
        if not text:
            text = read_file(source_path)
        contexts = find_word_contexts(
            text, [word for word, _ in words_with_ranks]
        )

    deck_name = opts.deck_name or f"{source_path.stem}_vocab_{excerpt_length}"
    deck = DeckInput(
        words_with_ranks=words_with_ranks,
        source_lang=source_lang,
        target_lang=opts.target_lang,
        contexts=contexts,
        deck_name=deck_name,
    )
    anki_content = generate_anki_deck(
        deck,
        include_context=opts.include_context,
        no_translate=opts.no_translate,
        excerpt=excerpt,
        excerpt_words=excerpt_words,
    )

    card_count = len(words_with_ranks)
    # Placeholder decks are never cached.
    if not opts.no_translate:
        cache_deck(deck_key, anki_content, excerpt, card_count, max_rank)

    return anki_content, excerpt, card_count, max_rank
def generate_flashcards_inverse(
    filepath: str | Path,
    max_vocab: int,
    options: FlashcardOptions | None = None,
) -> tuple[str, str, int, int, int]:
    """Generate Anki flashcards for the longest excerpt using top N words.

    This is the inverse mode: given a vocabulary size, find the longest
    excerpt that can be understood with only those words.

    Args:
        filepath: Path to the source text file.
        max_vocab: Maximum vocabulary size (top N words to learn).
        options: Flashcard generation options; defaults are used when None.

    Returns:
        Tuple of (anki_content, excerpt, excerpt_length,
        num_words, max_rank_used).

    Raises:
        ValueError: If no excerpt is found or no vocabulary is returned.
    """
    if options is None:
        options = FlashcardOptions()
    filepath = Path(filepath)

    text = read_file(filepath) if options.include_context else ""

    source_lang = options.source_lang
    if source_lang is None:
        source_lang = _detect_source_language(filepath, text)

    output = run_vocabulary_curve_inverse(
        filepath, max_vocab, dump_vocab=True
    )
    excerpt, excerpt_length, max_rank_used, all_vocab_words = (
        parse_inverse_mode_output(output)
    )

    if excerpt_length == 0:
        msg = (
            f"No valid excerpt found using only top {max_vocab} "
            "words. Try increasing the vocabulary limit."
        )
        raise ValueError(msg)
    if not all_vocab_words:
        msg = f"No vocabulary returned for max_vocab={max_vocab}"
        raise ValueError(msg)

    words_with_ranks = all_vocab_words

    # Decide which vocabulary words occur in the excerpt. A plain whitespace
    # split leaves punctuation attached ("dog." never equals "dog"), so the
    # word-character tokens are added as well. The whitespace tokens are
    # kept so every word that matched before still matches.
    lowered_excerpt = excerpt.lower()
    excerpt_word_set = set(lowered_excerpt.split())
    excerpt_word_set.update(re.findall(r"\w+", lowered_excerpt))
    excerpt_words = [
        (w, r)
        for w, r in all_vocab_words
        if w.lower() in excerpt_word_set
    ]

    contexts = None
    if options.include_context:
        if not text:
            text = read_file(filepath)
        words = [w for w, _ in words_with_ranks]
        contexts = find_word_contexts(text, words)

    deck_name = options.deck_name or f"{filepath.stem}_top{max_vocab}"

    anki_content = generate_anki_deck(
        DeckInput(
            words_with_ranks=words_with_ranks,
            source_lang=source_lang,
            target_lang=options.target_lang,
            contexts=contexts,
            deck_name=deck_name,
        ),
        include_context=options.include_context,
        no_translate=options.no_translate,
        excerpt=excerpt,
        excerpt_words=excerpt_words or None,
    )

    return (
        anki_content,
        excerpt,
        excerpt_length,
        len(words_with_ranks),
        max_rank_used,
    )
def _format_cache_size(value: int) -> str:
    """Render a byte count as ``B``, ``KB`` or ``MB`` text.

    Args:
        value: Size in bytes.

    Returns:
        The size with its unit; KB and MB use one decimal place.
    """
    if value >= _ONE_MB:
        return f"{value / _ONE_MB:.1f} MB"
    if value >= _ONE_KB:
        return f"{value / _ONE_KB:.1f} KB"
    return f"{value} B"
def _print_cache_stats() -> int:
    """Log the statistics of every cache.

    Returns:
        Exit code, always 0.
    """
    all_stats = get_all_cache_stats()
    logger.info("Cache Statistics")
    logger.info("=" * 50)
    for cache_name, cache_stats in all_stats.items():
        logger.info("\n%s:", cache_name.upper())
        for key, value in cache_stats.items():
            # Only the byte-size entry is shown in human-readable form.
            shown = (
                _format_cache_size(value)
                if key == "cache_size_bytes"
                else value
            )
            logger.info(" %s: %s", key, shown)
    return 0
def _clear_caches() -> int:
    """Empty every cache and log a confirmation.

    Returns:
        Exit code, always 0.
    """
    clear_all_caches()
    logger.info("All caches cleared.")
    return 0
def _log_anki_import_instructions(output_path: Path) -> None:
    """Log the steps for importing the generated file into Anki.

    Args:
        output_path: Path of the generated deck file shown in step 3.
    """
    # Each entry is a log message followed by its formatting arguments.
    steps: tuple[tuple[object, ...], ...] = (
        ("",),
        ("To import into Anki:",),
        (" 1. Open Anki",),
        (" 2. File -> Import",),
        (" 3. Select: %s", output_path),
        (" 4. Click Import",),
    )
    for message, *params in steps:
        logger.info(message, *params)
def _handle_inverse_mode(
    args: argparse.Namespace,
    filepath: Path,
) -> int:
    """Run inverse mode (--max-vocab): generate, write and report the deck.

    Args:
        args: Parsed command line arguments.
        filepath: Path to source file.

    Returns:
        Exit code, always 0 (failures propagate as exceptions).
    """
    verbose = not args.quiet
    if verbose:
        logger.info("Analyzing %s...", filepath.name)
        logger.info(
            "Finding longest excerpt using top %d words...",
            args.max_vocab,
        )

    options = FlashcardOptions(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        deck_name=args.deck_name,
        include_context=args.include_context,
        no_translate=args.no_translate,
        force=args.force,
    )
    anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
        generate_flashcards_inverse(filepath, args.max_vocab, options)
    )

    # Default output sits next to the source file.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = (
            filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
        )
    output_path.write_text(anki_content, encoding="utf-8")

    if not verbose:
        # Quiet mode reports nothing but the output path.
        logger.info("%s", output_path)
        return 0

    banner = "=" * 60
    logger.info("")
    logger.info(banner)
    logger.info("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
    logger.info(banner)
    logger.info("Learning: top %d words", args.max_vocab)
    logger.info(
        "Longest excerpt you can understand: %d words",
        excerpt_length,
    )
    logger.info(' "%s"', excerpt)
    logger.info("")
    logger.info("Rarest word in excerpt: #%d", max_rank_used)
    logger.info("Flashcards: %d", num_words)
    logger.info("Output file: %s", output_path)
    _log_anki_import_instructions(output_path)
    return 0
def _handle_normal_mode(
    args: argparse.Namespace,
    filepath: Path,
) -> int:
    """Run normal mode (--length): generate, write and report the deck.

    Args:
        args: Parsed command line arguments.
        filepath: Path to source file.

    Returns:
        Exit code, always 0 (failures propagate as exceptions).
    """
    verbose = not args.quiet
    if verbose:
        logger.info("Analyzing %s...", filepath.name)
        logger.info(
            "Finding vocabulary for %d-word excerpt...", args.length
        )

    options = FlashcardOptions(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        deck_name=args.deck_name,
        include_context=args.include_context,
        no_translate=args.no_translate,
        force=args.force,
    )
    anki_content, excerpt, num_words, max_rank = generate_flashcards(
        filepath,
        args.length,
        options,
        all_vocab=not args.excerpt_words_only,
    )

    # Default output sits next to the source file.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = (
            filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
        )
    output_path.write_text(anki_content, encoding="utf-8")

    if not verbose:
        # Quiet mode reports nothing but the output path.
        logger.info("%s", output_path)
        return 0

    banner = "=" * 60
    logger.info("")
    logger.info(banner)
    logger.info("FLASHCARD GENERATION COMPLETE")
    logger.info(banner)
    logger.info("Excerpt to understand (%d words):", args.length)
    logger.info(' "%s"', excerpt)
    logger.info("")
    logger.info("Max word rank needed: #%d", max_rank)
    if args.excerpt_words_only:
        logger.info("Flashcards: %d (excerpt words only)", num_words)
    else:
        logger.info(
            "Flashcards: %d (ALL words rank #1 to #%d)",
            num_words,
            max_rank,
        )
    logger.info("Output file: %s", output_path)
    _log_anki_import_instructions(output_path)
    return 0
def _build_parser() -> argparse.ArgumentParser:
    """Create the command line parser with every supported option.

    The options are declared as a table of (flags, settings) pairs and
    registered in that order, which is also the order shown in ``--help``.

    Returns:
        Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Generate Anki flashcards from vocabulary analysis.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    option_table: tuple[tuple[tuple[str, ...], dict[str, object]], ...] = (
        (
            ("--file", "-f"),
            {
                "type": str,
                "default": None,
                "help": "Path to the text file to analyze",
            },
        ),
        (
            ("--length", "-l"),
            {
                "type": int,
                "default": None,
                "help": (
                    "Target excerpt length "
                    "(how many words you want to understand)"
                ),
            },
        ),
        (
            ("--max-vocab", "-v"),
            {
                "type": int,
                "default": None,
                "help": (
                    "INVERSE MODE: Learn top N words, "
                    "find longest excerpt you can understand"
                ),
            },
        ),
        (
            ("--from",),
            {
                "dest": "source_lang",
                "type": str,
                "default": None,
                "help": (
                    "Source language code (e.g., 'pl', 'la', 'de'). "
                    "Auto-detected if not specified."
                ),
            },
        ),
        (
            ("--to", "-T"),
            {
                "dest": "target_lang",
                "type": str,
                "default": "en",
                "help": "Target language code for translations (default: 'en')",
            },
        ),
        (
            ("--output", "-o"),
            {
                "type": str,
                "default": None,
                "help": "Output file path (default: <filename>_anki_<length>.txt)",
            },
        ),
        (
            ("--include-context", "-c"),
            {
                "action": "store_true",
                "help": "Include example context sentences in flashcards",
            },
        ),
        (
            ("--deck-name", "-d"),
            {
                "type": str,
                "default": None,
                "help": "Name for the Anki deck (default: auto-generated)",
            },
        ),
        (
            ("--quiet", "-q"),
            {
                "action": "store_true",
                "help": "Only output the file path, no status messages",
            },
        ),
        (
            ("--excerpt-words-only", "-e"),
            {
                "action": "store_true",
                "help": (
                    "Only include words that appear in the excerpt "
                    "(default: include ALL words up to max rank)"
                ),
            },
        ),
        (
            ("--no-translate", "-n"),
            {
                "action": "store_true",
                "help": "Skip translation (output words without translations)",
            },
        ),
        (
            ("--force", "-F"),
            {
                "action": "store_true",
                "help": "Force regeneration, ignoring all caches",
            },
        ),
        (
            ("--cache-stats",),
            {
                "action": "store_true",
                "help": "Show cache statistics and exit",
            },
        ),
        (
            ("--clear-cache",),
            {
                "action": "store_true",
                "help": "Clear all caches and exit",
            },
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    return parser
def _run_generation(args: argparse.Namespace) -> int:
    """Check that the input file exists and dispatch to the chosen mode.

    Args:
        args: Parsed command line arguments.

    Returns:
        Exit code: 1 when the input file does not exist, otherwise the
        exit code of the selected mode handler.
    """
    filepath = Path(args.file)
    if not filepath.exists():
        logger.error("Error: File not found: %s", args.file)
        return 1

    # --max-vocab selects inverse mode; otherwise --length drives normal mode.
    handler = (
        _handle_normal_mode if args.max_vocab is None else _handle_inverse_mode
    )
    return handler(args, filepath)
def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point.

    Parses the command line, handles the cache maintenance flags, validates
    the mode selection and runs flashcard generation. Expected failures are
    logged with their traceback and reported through the exit code.

    Args:
        argv: Command line arguments.

    Returns:
        Exit code: 0 on success, 1 when generation fails. Invalid argument
        combinations exit through ``parser.error`` instead of returning.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Cache maintenance flags take precedence and need no input file.
    if args.cache_stats:
        return _print_cache_stats()
    if args.clear_cache:
        return _clear_caches()

    if args.file is None:
        parser.error("--file/-f is required")
    if args.length is None and args.max_vocab is None:
        parser.error("Either --length/-l or --max-vocab/-v is required")
    if args.length is not None and args.max_vocab is not None:
        parser.error(
            "Cannot use both --length and --max-vocab. Choose one mode."
        )

    try:
        return _run_generation(args)
    except FileNotFoundError:
        logger.exception("File not found")
    except subprocess.TimeoutExpired:
        # The C tool is run with a timeout; TimeoutExpired is not a
        # CalledProcessError, so it needs its own handler to avoid escaping
        # as an unhandled traceback.
        logger.exception("vocabulary_curve timed out")
    except subprocess.CalledProcessError:
        logger.exception("Error running vocabulary_curve")
    except ValueError:
        logger.exception("Value error")
    return 1
if __name__ == "__main__":
    # All CLI output goes through ``logger.info``. With no handler
    # configured, logging falls back to its last-resort handler, which only
    # emits WARNING and above, so the tool would print nothing - not even
    # the output path in --quiet mode. basicConfig is a no-op when the root
    # logger already has handlers, so existing configuration wins.
    # stdout is used so the reported path can be captured by a caller.
    logging.basicConfig(
        level=logging.INFO, format="%(message)s", stream=sys.stdout
    )
    sys.exit(main())