testsAndMisc/python_pkg/word_frequency/anki_generator.py

435 lines
12 KiB
Python
Raw Normal View History

2025-12-28 16:48:34 +01:00
#!/usr/bin/env python3
"""Anki flashcard generator from vocabulary curve analysis.
Generates Anki-compatible flashcard decks from the vocabulary needed to
understand excerpts of a given length.
Usage::
2025-12-28 16:48:34 +01:00
# Generate flashcards for a 20-word excerpt
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20
2025-12-28 16:48:34 +01:00
# Specify source language (auto-detected by default)
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --from pl
2025-12-28 16:48:34 +01:00
# Custom output file
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --output polish_vocab.txt
2025-12-28 16:48:34 +01:00
# Include example sentences/context
python -m python_pkg.word_frequency.anki_generator \
--file text.txt --length 20 --include-context
2025-12-28 16:48:34 +01:00
Output:
Creates a semicolon-separated text file importable into Anki.
Format: ``word;translation;frequency_rank;example_context``
2025-12-28 16:48:34 +01:00
"""
from __future__ import annotations
import argparse
import logging
from pathlib import Path
2025-12-28 16:48:34 +01:00
import subprocess
import sys
from typing import TYPE_CHECKING
2025-12-28 16:48:34 +01:00
if TYPE_CHECKING:
from collections.abc import Sequence
from python_pkg.word_frequency._deck_builder import (
find_word_contexts,
generate_anki_deck,
)
from python_pkg.word_frequency._generation import (
cache_deck,
cache_excerpt,
generate_flashcards,
generate_flashcards_inverse,
get_cached_deck,
get_cached_excerpt,
run_vocabulary_curve,
run_vocabulary_curve_inverse,
)
from python_pkg.word_frequency._parsing import (
parse_inverse_mode_output,
parse_vocabulary_curve_output,
)
from python_pkg.word_frequency._types import (
_ONE_KB,
_ONE_MB,
C_EXECUTABLE,
DeckInput,
FlashcardOptions,
VocabWord,
)
from python_pkg.word_frequency.cache import (
clear_all_caches,
get_all_cache_stats,
)
2025-12-28 16:48:34 +01:00
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Names exported by ``from ... import *``: ``main`` plus the helpers,
# types and constants imported from the sibling modules above.
__all__ = [
    "C_EXECUTABLE",
    "DeckInput",
    "FlashcardOptions",
    "VocabWord",
    "cache_deck",
    "cache_excerpt",
    "find_word_contexts",
    "generate_anki_deck",
    "generate_flashcards",
    "generate_flashcards_inverse",
    "get_cached_deck",
    "get_cached_excerpt",
    "main",
    "parse_inverse_mode_output",
    "parse_vocabulary_curve_output",
    "run_vocabulary_curve",
    "run_vocabulary_curve_inverse",
]
2025-12-29 16:10:26 +01:00
def _format_cache_size(value: int) -> str:
    """Render a byte count as a short human-readable size.

    Args:
        value: Size in bytes.

    Returns:
        ``"<value> B"`` when the value is below ``_ONE_KB``; otherwise the
        value divided by ``_ONE_KB`` or ``_ONE_MB`` with one decimal place
        and a ``KB`` / ``MB`` suffix.
    """
    if value < _ONE_KB:
        text = f"{value} B"
    elif value < _ONE_MB:
        text = f"{value / _ONE_KB:.1f} KB"
    else:
        text = f"{value / _ONE_MB:.1f} MB"
    return text
def _print_cache_stats() -> int:
    """Log the statistics of every cache.

    Each cache gets an upper-cased heading followed by one line per
    statistic; the ``cache_size_bytes`` entry is shown through
    ``_format_cache_size`` instead of as a raw number.

    Returns:
        Always ``0`` (success exit code).
    """
    all_stats = get_all_cache_stats()
    logger.info("Cache Statistics")
    logger.info("=" * 50)
    for name, entries in all_stats.items():
        logger.info("\n%s:", name.upper())
        for stat_name, stat_value in entries.items():
            # Only the byte-size entry is converted to a readable unit.
            shown = (
                _format_cache_size(stat_value)
                if stat_name == "cache_size_bytes"
                else stat_value
            )
            logger.info(" %s: %s", stat_name, shown)
    return 0
def _clear_caches() -> int:
    """Remove every cache via ``clear_all_caches`` and log a confirmation.

    Returns:
        Always ``0`` (success exit code).
    """
    clear_all_caches()
    logger.info("All caches cleared.")
    return 0
def _log_anki_import_instructions(output_path: Path) -> None:
    """Log the step-by-step instructions for importing a deck into Anki.

    Args:
        output_path: Path of the generated deck file, shown in the
            "Select" step.
    """
    # (message, lazy %-format arguments) pairs, logged in order.
    lines = (
        ("", ()),
        ("To import into Anki:", ()),
        (" 1. Open Anki", ()),
        (" 2. File -> Import", ()),
        (" 3. Select: %s", (output_path,)),
        (" 4. Click Import", ()),
    )
    for message, params in lines:
        logger.info(message, *params)
2025-12-29 16:10:26 +01:00
def _handle_inverse_mode(
    args: argparse.Namespace,
    filepath: Path,
) -> int:
    """Generate flashcards in inverse mode (``--max-vocab``).

    Builds the deck with ``generate_flashcards_inverse``, writes it to the
    output file and logs either a full summary or, with ``--quiet``, only
    the output path.

    Args:
        args: Parsed command line arguments.
        filepath: Path to the source text file.

    Returns:
        Always ``0`` (success exit code); failures surface as exceptions.
    """
    if not args.quiet:
        logger.info("Analyzing %s...", filepath.name)
        logger.info(
            "Finding longest excerpt using top %d words...",
            args.max_vocab,
        )

    options = FlashcardOptions(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        deck_name=args.deck_name,
        include_context=args.include_context,
        no_translate=args.no_translate,
        force=args.force,
    )
    result = generate_flashcards_inverse(filepath, args.max_vocab, options)
    anki_content, excerpt, excerpt_length, num_words, max_rank_used = result

    # An explicit --output wins; otherwise the deck is written next to the
    # source file with a name derived from the vocabulary size.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = (
            filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
        )
    output_path.write_text(anki_content, encoding="utf-8")

    if args.quiet:
        # Quiet mode reports nothing but the resulting path.
        logger.info("%s", output_path)
        return 0

    banner = "=" * 60
    logger.info("")
    logger.info(banner)
    logger.info("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
    logger.info(banner)
    logger.info("Learning: top %d words", args.max_vocab)
    logger.info(
        "Longest excerpt you can understand: %d words",
        excerpt_length,
    )
    logger.info(' "%s"', excerpt)
    logger.info("")
    logger.info("Rarest word in excerpt: #%d", max_rank_used)
    logger.info("Flashcards: %d", num_words)
    logger.info("Output file: %s", output_path)
    _log_anki_import_instructions(output_path)
    return 0
def _handle_normal_mode(
    args: argparse.Namespace,
    filepath: Path,
) -> int:
    """Generate flashcards in normal mode (``--length``).

    Builds the deck with ``generate_flashcards``, writes it to the output
    file and logs either a full summary or, with ``--quiet``, only the
    output path.

    Args:
        args: Parsed command line arguments.
        filepath: Path to the source text file.

    Returns:
        Always ``0`` (success exit code); failures surface as exceptions.
    """
    if not args.quiet:
        logger.info("Analyzing %s...", filepath.name)
        logger.info("Finding vocabulary for %d-word excerpt...", args.length)

    options = FlashcardOptions(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        deck_name=args.deck_name,
        include_context=args.include_context,
        no_translate=args.no_translate,
        force=args.force,
    )
    result = generate_flashcards(
        filepath,
        args.length,
        options,
        all_vocab=not args.excerpt_words_only,
    )
    anki_content, excerpt, num_words, max_rank = result

    # An explicit --output wins; otherwise the deck is written next to the
    # source file with a name derived from the excerpt length.
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
    output_path.write_text(anki_content, encoding="utf-8")

    if args.quiet:
        # Quiet mode reports nothing but the resulting path.
        logger.info("%s", output_path)
        return 0

    banner = "=" * 60
    logger.info("")
    logger.info(banner)
    logger.info("FLASHCARD GENERATION COMPLETE")
    logger.info(banner)
    logger.info("Excerpt to understand (%d words):", args.length)
    logger.info(' "%s"', excerpt)
    logger.info("")
    logger.info("Max word rank needed: #%d", max_rank)
    if args.excerpt_words_only:
        logger.info("Flashcards: %d (excerpt words only)", num_words)
    else:
        logger.info(
            "Flashcards: %d (ALL words rank #1 to #%d)",
            num_words,
            max_rank,
        )
    logger.info("Output file: %s", output_path)
    _log_anki_import_instructions(output_path)
    return 0
def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the CLI.

    The module docstring is reused verbatim as the help epilog, which is
    why the raw-description formatter is selected.

    Required-argument and mode-exclusivity checks are not encoded here;
    every option defaults to ``None``/``False`` so the caller can validate
    combinations itself.

    Returns:
        Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Generate Anki flashcards from vocabulary analysis.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # --- Input and mode selection -------------------------------------
    parser.add_argument(
        "--file",
        "-f",
        type=str,
        default=None,
        help="Path to the text file to analyze",
    )
    parser.add_argument(
        "--length",
        "-l",
        type=int,
        default=None,
        help=("Target excerpt length (how many words you want to understand)"),
    )
    parser.add_argument(
        "--max-vocab",
        "-v",
        type=int,
        default=None,
        help=(
            "INVERSE MODE: Learn top N words, find longest excerpt you can understand"
        ),
    )

    # --- Languages ----------------------------------------------------
    parser.add_argument(
        "--from",
        dest="source_lang",  # "from" is a Python keyword, so rename the dest
        type=str,
        default=None,
        help=(
            "Source language code (e.g., 'pl', 'la', 'de'). "
            "Auto-detected if not specified."
        ),
    )
    parser.add_argument(
        "--to",
        "-T",
        dest="target_lang",
        type=str,
        default="en",
        help="Target language code for translations (default: 'en')",
    )

    # --- Output and deck options --------------------------------------
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        # Both modes pick a different default file name; document both.
        help=(
            "Output file path (default: <filename>_anki_<length>.txt, "
            "or <filename>_anki_top<N>.txt with --max-vocab)"
        ),
    )
    parser.add_argument(
        "--include-context",
        "-c",
        action="store_true",
        help="Include example context sentences in flashcards",
    )
    parser.add_argument(
        "--deck-name",
        "-d",
        type=str,
        default=None,
        help="Name for the Anki deck (default: auto-generated)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Only output the file path, no status messages",
    )
    parser.add_argument(
        "--excerpt-words-only",
        "-e",
        action="store_true",
        help=(
            "Only include words that appear in the excerpt "
            "(default: include ALL words up to max rank)"
        ),
    )
    parser.add_argument(
        "--no-translate",
        "-n",
        action="store_true",
        help="Skip translation (output words without translations)",
    )

    # --- Cache control ------------------------------------------------
    parser.add_argument(
        "--force",
        "-F",
        action="store_true",
        help="Force regeneration, ignoring all caches",
    )
    parser.add_argument(
        "--cache-stats",
        action="store_true",
        help="Show cache statistics and exit",
    )
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear all caches and exit",
    )
    return parser
def _run_generation(args: argparse.Namespace) -> int:
    """Validate the input path and run flashcard generation.

    Dispatches to inverse mode when ``--max-vocab`` was given and to normal
    mode otherwise.

    Args:
        args: Parsed command line arguments; ``args.file`` must be set.

    Returns:
        Exit code: ``1`` when the input path is missing or is a directory,
        otherwise the exit code of the selected mode handler.
    """
    filepath = Path(args.file)
    if not filepath.exists():
        logger.error("Error: File not found: %s", args.file)
        return 1
    # exists() is also true for directories; reject them here with a clear
    # message instead of letting generation fail later on an unreadable path.
    if filepath.is_dir():
        logger.error("Error: Path is a directory, not a file: %s", args.file)
        return 1

    if args.max_vocab is not None:
        return _handle_inverse_mode(args, filepath)
    return _handle_normal_mode(args, filepath)
def main(argv: Sequence[str] | None = None) -> int:
    """Run the command line interface.

    Args:
        argv: Command line arguments; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        Exit code: the result of the selected action, or ``1`` when
        generation raised one of the handled exceptions. Invalid argument
        combinations are reported through ``parser.error``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Cache maintenance flags need no input file and take precedence.
    if args.cache_stats:
        return _print_cache_stats()
    if args.clear_cache:
        return _clear_caches()

    if args.file is None:
        parser.error("--file/-f is required")

    # Exactly one of the two generation modes must be selected.
    has_length = args.length is not None
    has_max_vocab = args.max_vocab is not None
    if not (has_length or has_max_vocab):
        parser.error("Either --length/-l or --max-vocab/-v is required")
    if has_length and has_max_vocab:
        parser.error("Cannot use both --length and --max-vocab. Choose one mode.")

    try:
        exit_code = _run_generation(args)
    except FileNotFoundError:
        logger.exception("File not found")
        exit_code = 1
    except subprocess.CalledProcessError:
        logger.exception("Error running vocabulary_curve")
        exit_code = 1
    except ValueError:
        logger.exception("Value error")
        exit_code = 1
    return exit_code
2025-12-28 16:48:34 +01:00
if __name__ == "__main__":
sys.exit(main())