#!/usr/bin/env python3
"""Anki flashcard generator from vocabulary curve analysis.

Generates Anki-compatible flashcard decks from the vocabulary needed
to understand excerpts of a given length.

Usage:
    # Generate flashcards for a 20-word excerpt
    python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20

    # Specify source language (auto-detected by default)
    python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --from pl

    # Custom output file
    python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --output polish_vocab.txt

    # Include example sentences/context
    python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --include-context

Output:
    Creates a semicolon-separated text file that can be imported into Anki.
    Format: word;translation;frequency_rank;example_context (optional)
"""

from __future__ import annotations

import argparse
import re
import subprocess
import sys
from collections import Counter
from pathlib import Path
from typing import TYPE_CHECKING, NamedTuple

if TYPE_CHECKING:
    from collections.abc import Sequence

# Support both package execution (python -m ...) and running the file
# directly from inside its own directory.
try:
    from python_pkg.word_frequency.translator import (
        detect_language,
        translate_words_batch,
    )
    from python_pkg.word_frequency.analyzer import read_file, analyze_text
except ImportError:
    from translator import detect_language, translate_words_batch
    from analyzer import read_file, analyze_text

# Path to C vocabulary_curve executable
C_EXECUTABLE = (
    Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
)

# Matches one "word(#rank)" entry of the vocabulary_curve "Words:" line.
_WORD_RANK_RE = re.compile(r"(\S+)\(#(\d+)\)")


class VocabWord(NamedTuple):
    """A vocabulary word with its metadata."""

    word: str
    rank: int
    translation: str
    context: str


def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
    """Run the C vocabulary_curve executable.

    Args:
        filepath: Path to the text file.
        max_length: Maximum excerpt length.

    Returns:
        Output (stdout) from the executable.

    Raises:
        FileNotFoundError: If executable not found.
        subprocess.CalledProcessError: If execution fails.
        subprocess.TimeoutExpired: If the executable runs longer than 120 s.
    """
    if not C_EXECUTABLE.exists():
        raise FileNotFoundError(
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )

    result = subprocess.run(
        [str(C_EXECUTABLE), str(filepath), str(max_length)],
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return result.stdout


def parse_vocabulary_curve_output(
    output: str, target_length: int
) -> tuple[str, list[tuple[str, int]]]:
    """Parse output from vocabulary_curve to get words needed.

    Looks for the first line starting with ``[Length <target_length>]``,
    then the next ``Excerpt:`` line after it, then the next ``Words:`` line
    after that.

    Args:
        output: Raw output from vocabulary_curve.
        target_length: The target excerpt length.

    Returns:
        Tuple of (excerpt_text, list of (word, rank) tuples). Both parts are
        empty when the corresponding line is not present in ``output``.
    """
    lines = [line.strip() for line in output.split("\n")]
    total = len(lines)
    header = f"[Length {target_length}]"

    excerpt = ""
    words: list[tuple[str, int]] = []

    # Locate the section header for the requested length.
    i = 0
    while i < total and not lines[i].startswith(header):
        i += 1
    if i >= total:
        return excerpt, words

    # Excerpt line: the text sits between the first and the last double quote.
    i += 1
    while i < total and not lines[i].startswith("Excerpt:"):
        i += 1
    if i < total:
        excerpt_line = lines[i]
        if '"' in excerpt_line:
            start = excerpt_line.index('"') + 1
            end = excerpt_line.rindex('"')
            excerpt = excerpt_line[start:end]

    # Words line: "Words: word(#rank), word2(#rank2), ..."
    i += 1
    while i < total and not lines[i].startswith("Words:"):
        i += 1
    if i < total:
        words_part = lines[i][len("Words:"):].strip()
        words = [(w, int(r)) for w, r in _WORD_RANK_RE.findall(words_part)]

    return excerpt, words


def get_top_n_words(text: str, n: int) -> list[tuple[str, int]]:
    """Get the top N most frequent words from text.

    Args:
        text: The source text.
        n: Number of top words to return.

    Returns:
        List of (word, rank) tuples, ranked 1 to n.
    """
    word_counts = analyze_text(text)
    # Highest count first; ties are broken alphabetically so ranks are stable.
    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))
    return [(word, rank) for rank, (word, _) in enumerate(sorted_words[:n], start=1)]


def find_word_contexts(
    text: str,
    words: list[str],
    context_words: int = 5,
) -> dict[str, str]:
    """Find example contexts for each word in the text.

    The context of a word is taken around its first occurrence in ``text``.
    Matching is case-insensitive and the keys of the result are lower-cased.

    Args:
        text: The source text.
        words: List of words to find contexts for.
        context_words: Number of words of context on each side.

    Returns:
        Dict mapping word to example context. Words that never occur in
        ``text`` are absent from the dict.
    """
    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)
    pending = {w.lower() for w in words}
    contexts: dict[str, str] = {}

    # Single pass over the text: each target is resolved at its first
    # occurrence instead of rescanning the whole token list per target.
    for i, token in enumerate(tokens):
        if not pending:
            break
        key = token.lower()
        if key not in pending:
            continue
        pending.discard(key)
        start = max(0, i - context_words)
        end = min(len(tokens), i + context_words + 1)
        context = " ".join(tokens[start:end])
        contexts[key] = f"...{context}..."

    return contexts


def _escape_field(value: str) -> str:
    """Replace semicolons, which would otherwise split the Anki field."""
    return value.replace(";", ",")


def _highlight_word(word: str, context: str) -> str:
    """Wrap whole-word, case-insensitive occurrences of ``word`` in <b> tags."""
    # Lookarounds instead of \b so words with leading/trailing punctuation
    # still match, while occurrences inside longer words are left alone.
    pattern = re.compile(rf"(?<!\w){re.escape(word)}(?!\w)", re.IGNORECASE)
    # A callable replacement keeps backslashes in ``word`` literal.
    return pattern.sub(lambda _match: f"<b>{word}</b>", context)


def generate_anki_deck(
    words_with_ranks: list[tuple[str, int]],
    source_lang: str,
    target_lang: str = "en",
    contexts: dict[str, str] | None = None,
    deck_name: str = "Vocabulary",
    include_context: bool = False,
    no_translate: bool = False,
) -> str:
    """Generate Anki-compatible deck content.

    Args:
        words_with_ranks: List of (word, rank) tuples.
        source_lang: Source language code.
        target_lang: Target language code (default: en).
        contexts: Optional dict of lower-cased word -> context.
        deck_name: Name for the deck.
        include_context: Whether to include context in cards.
        no_translate: If True, skip translation (use placeholder).

    Returns:
        Semicolon-separated content ready for Anki import. When
        ``include_context`` is True every row has four fields (the context
        may be empty) so the rows always match the declared columns.
    """
    columns = "Front;Back;Rank;Context" if include_context else "Front;Back;Rank"
    lines: list[str] = [
        "#separator:semicolon",
        "#html:true",
        f"#deck:{deck_name}",
        f"#tags:vocabulary {source_lang}",
        f"#columns:{columns}",
        "",  # Empty line before data
    ]

    # Get translations (or skip if no_translate)
    words = [w for w, _ in words_with_ranks]
    trans_lookup: dict[str, str]
    if no_translate:
        trans_lookup = {w.lower(): "[TODO]" for w in words}
    else:
        translations = translate_words_batch(words, source_lang, target_lang)
        # Failed translations fall back to the bracketed source word.
        trans_lookup = {
            result.source_word.lower(): (
                result.translated_word
                if result.success
                else f"[{result.source_word}]"
            )
            for result in translations
        }

    context_lookup = contexts or {}

    # Generate cards
    for word, rank in words_with_ranks:
        translation = trans_lookup.get(word.lower(), f"[{word}]")
        row = f"{_escape_field(word)};{_escape_field(translation)};#{rank}"

        if include_context:
            context = context_lookup.get(word.lower(), "")
            if context:
                # Escape first, then add markup, so the tags stay intact.
                context = _highlight_word(word, _escape_field(context))
            row = f"{row};{context}"

        lines.append(row)

    return "\n".join(lines)


def generate_flashcards(
    filepath: str | Path,
    excerpt_length: int,
    source_lang: str | None = None,
    target_lang: str = "en",
    include_context: bool = False,
    deck_name: str | None = None,
    all_vocab: bool = True,
    no_translate: bool = False,
) -> tuple[str, str, int, int]:
    """Generate Anki flashcards for vocabulary needed for an excerpt length.

    Args:
        filepath: Path to the source text file.
        excerpt_length: Target excerpt length.
        source_lang: Source language (auto-detected if None).
        target_lang: Target language for translations.
        include_context: Whether to include example contexts.
        deck_name: Optional deck name.
        all_vocab: If True, include ALL words from rank 1 to max rank needed.
            If False, only include words that appear in the excerpt.
        no_translate: If True, skip translation.

    Returns:
        Tuple of (anki_content, excerpt, num_words, max_rank).

    Raises:
        ValueError: If no words are found for ``excerpt_length`` in the
            vocabulary_curve output.
    """
    filepath = Path(filepath)

    # Read the text
    text = read_file(filepath)

    # Auto-detect language if not provided; fall back to "auto" when
    # detection returns nothing.
    if source_lang is None:
        source_lang = detect_language(text)
        if source_lang is None:
            source_lang = "auto"

    # Run vocabulary curve analysis and parse its output
    output = run_vocabulary_curve(filepath, excerpt_length)
    excerpt, excerpt_words = parse_vocabulary_curve_output(output, excerpt_length)

    if not excerpt_words:
        raise ValueError(f"No words found for excerpt length {excerpt_length}")

    # Find max rank needed
    max_rank = max(rank for _, rank in excerpt_words)

    # Get ALL words up to max_rank if requested
    if all_vocab:
        words_with_ranks = get_top_n_words(text, max_rank)
    else:
        words_with_ranks = excerpt_words

    # Get contexts if requested
    contexts = None
    if include_context:
        words = [w for w, _ in words_with_ranks]
        contexts = find_word_contexts(text, words)

    # Generate deck name
    if deck_name is None:
        deck_name = f"{filepath.stem}_vocab_{excerpt_length}"

    # Generate Anki content
    anki_content = generate_anki_deck(
        words_with_ranks,
        source_lang,
        target_lang,
        contexts,
        deck_name,
        include_context,
        no_translate,
    )

    return anki_content, excerpt, len(words_with_ranks), max_rank


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point.

    Args:
        argv: Command line arguments.

    Returns:
        Exit code: 0 on success, 1 on a reported error.
    """
    parser = argparse.ArgumentParser(
        description="Generate Anki flashcards from vocabulary analysis.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--file",
        "-f",
        type=str,
        required=True,
        help="Path to the text file to analyze",
    )
    parser.add_argument(
        "--length",
        "-l",
        type=int,
        required=True,
        help="Target excerpt length (how many words you want to understand)",
    )
    parser.add_argument(
        "--from",
        "-F",
        dest="source_lang",
        type=str,
        default=None,
        help=(
            "Source language code (e.g., 'pl', 'la', 'de'). "
            "Auto-detected if not specified."
        ),
    )
    parser.add_argument(
        "--to",
        "-T",
        dest="target_lang",
        type=str,
        default="en",
        help="Target language code for translations (default: 'en')",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        default=None,
        help="Output file path (default: <input>_anki_<length>.txt)",
    )
    parser.add_argument(
        "--include-context",
        "-c",
        action="store_true",
        help="Include example context sentences in flashcards",
    )
    parser.add_argument(
        "--deck-name",
        "-d",
        type=str,
        default=None,
        help="Name for the Anki deck (default: auto-generated)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        action="store_true",
        help="Only output the file path, no status messages",
    )
    parser.add_argument(
        "--excerpt-words-only",
        "-e",
        action="store_true",
        help=(
            "Only include words that appear in the excerpt "
            "(default: include ALL words up to max rank)"
        ),
    )
    parser.add_argument(
        "--no-translate",
        "-n",
        action="store_true",
        help="Skip translation (output words without translations)",
    )

    args = parser.parse_args(argv)

    try:
        filepath = Path(args.file)
        if not filepath.exists():
            print(f"Error: File not found: {args.file}", file=sys.stderr)  # noqa: T201
            return 1

        if not args.quiet:
            print(f"Analyzing {filepath.name}...")  # noqa: T201
            print(f"Finding vocabulary for {args.length}-word excerpt...")  # noqa: T201

        # Generate flashcards
        anki_content, excerpt, num_words, max_rank = generate_flashcards(
            filepath,
            args.length,
            source_lang=args.source_lang,
            target_lang=args.target_lang,
            include_context=args.include_context,
            deck_name=args.deck_name,
            all_vocab=not args.excerpt_words_only,
            no_translate=args.no_translate,
        )

        # Determine output path
        if args.output:
            output_path = Path(args.output)
        else:
            output_path = filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"

        # Write output
        output_path.write_text(anki_content, encoding="utf-8")

        if not args.quiet:
            print("")  # noqa: T201
            print("=" * 60)  # noqa: T201
            print("FLASHCARD GENERATION COMPLETE")  # noqa: T201
            print("=" * 60)  # noqa: T201
            print(f"Excerpt to understand ({args.length} words):")  # noqa: T201
            print(f'  "{excerpt}"')  # noqa: T201
            print("")  # noqa: T201
            print(f"Max word rank needed: #{max_rank}")  # noqa: T201
            if args.excerpt_words_only:
                print(f"Flashcards: {num_words} (excerpt words only)")  # noqa: T201
            else:
                print(  # noqa: T201
                    f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})"
                )
            print(f"Output file: {output_path}")  # noqa: T201
            print("")  # noqa: T201
            print("To import into Anki:")  # noqa: T201
            print("  1. Open Anki")  # noqa: T201
            print("  2. File -> Import")  # noqa: T201
            print(f"  3. Select: {output_path}")  # noqa: T201
            print("  4. Click Import")  # noqa: T201
        else:
            print(output_path)  # noqa: T201

        return 0

    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)  # noqa: T201
        return 1
    except subprocess.TimeoutExpired as e:
        # run_vocabulary_curve enforces a timeout; report it instead of
        # letting the traceback escape.
        print(f"Error: vocabulary_curve timed out: {e}", file=sys.stderr)  # noqa: T201
        return 1
    except subprocess.CalledProcessError as e:
        print(f"Error running vocabulary_curve: {e}", file=sys.stderr)  # noqa: T201
        return 1
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)  # noqa: T201
        return 1


if __name__ == "__main__":
    sys.exit(main())