mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 14:23:04 +02:00
500 lines
16 KiB
Python
500 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""Anki flashcard generator from vocabulary curve analysis.
|
|
|
|
Generates Anki-compatible flashcard decks from the vocabulary needed to
|
|
understand excerpts of a given length.
|
|
|
|
Usage:
|
|
# Generate flashcards for a 20-word excerpt
|
|
python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20
|
|
|
|
# Specify source language (auto-detected by default)
|
|
python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --from pl
|
|
|
|
# Custom output file
|
|
python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --output polish_vocab.txt
|
|
|
|
# Include example sentences/context
|
|
python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --include-context
|
|
|
|
Output:
|
|
Creates a semicolon-separated text file that can be imported into Anki.
|
|
Format: word;translation;frequency_rank;example_context (optional)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, NamedTuple
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Sequence
|
|
|
|
try:
|
|
from python_pkg.word_frequency.translator import (
|
|
detect_language,
|
|
translate_words_batch,
|
|
)
|
|
from python_pkg.word_frequency.analyzer import read_file, analyze_text
|
|
except ImportError:
|
|
from translator import detect_language, translate_words_batch
|
|
from analyzer import read_file, analyze_text
|
|
|
|
|
|
# Path to C vocabulary_curve executable
|
|
C_EXECUTABLE = Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
|
|
|
|
|
class VocabWord(NamedTuple):
|
|
"""A vocabulary word with its metadata."""
|
|
|
|
word: str
|
|
rank: int
|
|
translation: str
|
|
context: str
|
|
|
|
|
|
def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
|
|
"""Run the C vocabulary_curve executable.
|
|
|
|
Args:
|
|
filepath: Path to the text file.
|
|
max_length: Maximum excerpt length.
|
|
|
|
Returns:
|
|
Output from the executable.
|
|
|
|
Raises:
|
|
FileNotFoundError: If executable not found.
|
|
subprocess.CalledProcessError: If execution fails.
|
|
"""
|
|
if not C_EXECUTABLE.exists():
|
|
raise FileNotFoundError(
|
|
f"C executable not found at {C_EXECUTABLE}. "
|
|
"Please compile it first: cd C/vocabulary_curve && make"
|
|
)
|
|
|
|
result = subprocess.run(
|
|
[str(C_EXECUTABLE), str(filepath), str(max_length)],
|
|
capture_output=True,
|
|
text=True,
|
|
timeout=120,
|
|
check=True,
|
|
)
|
|
return result.stdout
|
|
|
|
|
|
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]]]:
|
|
"""Parse output from vocabulary_curve to get words needed.
|
|
|
|
Args:
|
|
output: Raw output from vocabulary_curve.
|
|
target_length: The target excerpt length.
|
|
|
|
Returns:
|
|
Tuple of (excerpt_text, list of (word, rank) tuples).
|
|
"""
|
|
lines = output.split("\n")
|
|
excerpt = ""
|
|
words: list[tuple[str, int]] = []
|
|
|
|
# Find the line for the target length
|
|
i = 0
|
|
while i < len(lines):
|
|
line = lines[i]
|
|
if line.strip().startswith(f"[Length {target_length}]"):
|
|
# Found our target length, now get excerpt and words
|
|
i += 1
|
|
# Find excerpt line
|
|
while i < len(lines) and not lines[i].strip().startswith("Excerpt:"):
|
|
i += 1
|
|
if i < len(lines):
|
|
excerpt_line = lines[i].strip()
|
|
if '"' in excerpt_line:
|
|
start = excerpt_line.index('"') + 1
|
|
end = excerpt_line.rindex('"')
|
|
excerpt = excerpt_line[start:end]
|
|
|
|
# Find words line
|
|
i += 1
|
|
while i < len(lines) and not lines[i].strip().startswith("Words:"):
|
|
i += 1
|
|
if i < len(lines):
|
|
words_line = lines[i].strip()
|
|
if words_line.startswith("Words:"):
|
|
words_part = words_line[6:].strip()
|
|
# Parse "word(#rank), word2(#rank2), ..."
|
|
pattern = r"(\S+)\(#(\d+)\)"
|
|
matches = re.findall(pattern, words_part)
|
|
words = [(w, int(r)) for w, r in matches]
|
|
break
|
|
i += 1
|
|
|
|
return excerpt, words
|
|
|
|
|
|
def get_top_n_words(text: str, n: int) -> list[tuple[str, int]]:
|
|
"""Get the top N most frequent words from text.
|
|
|
|
Args:
|
|
text: The source text.
|
|
n: Number of top words to return.
|
|
|
|
Returns:
|
|
List of (word, rank) tuples, ranked 1 to n.
|
|
"""
|
|
word_counts = analyze_text(text)
|
|
sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))
|
|
return [(word, rank + 1) for rank, (word, _) in enumerate(sorted_words[:n])]
|
|
|
|
|
|
def find_word_contexts(
|
|
text: str,
|
|
words: list[str],
|
|
context_words: int = 5,
|
|
) -> dict[str, str]:
|
|
"""Find example contexts for each word in the text.
|
|
|
|
Args:
|
|
text: The source text.
|
|
words: List of words to find contexts for.
|
|
context_words: Number of words of context on each side.
|
|
|
|
Returns:
|
|
Dict mapping word to example context.
|
|
"""
|
|
# Extract all words preserving positions
|
|
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
|
all_words_lower = [w.lower() for w in all_words]
|
|
|
|
contexts: dict[str, str] = {}
|
|
words_lower = {w.lower() for w in words}
|
|
|
|
for target in words_lower:
|
|
# Find first occurrence
|
|
for i, word in enumerate(all_words_lower):
|
|
if word == target:
|
|
start = max(0, i - context_words)
|
|
end = min(len(all_words), i + context_words + 1)
|
|
context = " ".join(all_words[start:end])
|
|
contexts[target] = f"...{context}..."
|
|
break
|
|
|
|
return contexts
|
|
|
|
|
|
def generate_anki_deck(
|
|
words_with_ranks: list[tuple[str, int]],
|
|
source_lang: str,
|
|
target_lang: str = "en",
|
|
contexts: dict[str, str] | None = None,
|
|
deck_name: str = "Vocabulary",
|
|
include_context: bool = False,
|
|
no_translate: bool = False,
|
|
) -> str:
|
|
"""Generate Anki-compatible deck content.
|
|
|
|
Args:
|
|
words_with_ranks: List of (word, rank) tuples.
|
|
source_lang: Source language code.
|
|
target_lang: Target language code (default: en).
|
|
contexts: Optional dict of word -> context.
|
|
deck_name: Name for the deck.
|
|
include_context: Whether to include context in cards.
|
|
no_translate: If True, skip translation (use placeholder).
|
|
|
|
Returns:
|
|
Semicolon-separated content ready for Anki import.
|
|
"""
|
|
lines: list[str] = []
|
|
|
|
# Add Anki headers
|
|
lines.append(f"#separator:semicolon")
|
|
lines.append(f"#html:true")
|
|
lines.append(f"#deck:{deck_name}")
|
|
lines.append(f"#tags:vocabulary {source_lang}")
|
|
if include_context:
|
|
lines.append("#columns:Front;Back;Rank;Context")
|
|
else:
|
|
lines.append("#columns:Front;Back;Rank")
|
|
lines.append("") # Empty line before data
|
|
|
|
# Get translations (or skip if no_translate)
|
|
words = [w for w, _ in words_with_ranks]
|
|
if no_translate:
|
|
trans_lookup = {w.lower(): "[TODO]" for w in words}
|
|
else:
|
|
translations = translate_words_batch(words, source_lang, target_lang)
|
|
# Build translation lookup
|
|
trans_lookup = {}
|
|
for result in translations:
|
|
if result.success:
|
|
trans_lookup[result.source_word.lower()] = result.translated_word
|
|
else:
|
|
trans_lookup[result.source_word.lower()] = f"[{result.source_word}]"
|
|
|
|
# Generate cards
|
|
for word, rank in words_with_ranks:
|
|
translation = trans_lookup.get(word.lower(), f"[{word}]")
|
|
|
|
# Escape semicolons in fields
|
|
word_escaped = word.replace(";", ",")
|
|
translation_escaped = translation.replace(";", ",")
|
|
|
|
if include_context and contexts:
|
|
context = contexts.get(word.lower(), "")
|
|
# Highlight the word in context
|
|
if context:
|
|
context_escaped = context.replace(";", ",")
|
|
# Make target word bold in context
|
|
pattern = re.compile(re.escape(word), re.IGNORECASE)
|
|
context_escaped = pattern.sub(f"<b>{word}</b>", context_escaped)
|
|
else:
|
|
context_escaped = ""
|
|
lines.append(f"{word_escaped};{translation_escaped};#{rank};{context_escaped}")
|
|
else:
|
|
lines.append(f"{word_escaped};{translation_escaped};#{rank}")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def generate_flashcards(
|
|
filepath: str | Path,
|
|
excerpt_length: int,
|
|
source_lang: str | None = None,
|
|
target_lang: str = "en",
|
|
include_context: bool = False,
|
|
deck_name: str | None = None,
|
|
all_vocab: bool = True,
|
|
no_translate: bool = False,
|
|
) -> tuple[str, str, int, int]:
|
|
"""Generate Anki flashcards for vocabulary needed for an excerpt length.
|
|
|
|
Args:
|
|
filepath: Path to the source text file.
|
|
excerpt_length: Target excerpt length.
|
|
source_lang: Source language (auto-detected if None).
|
|
target_lang: Target language for translations.
|
|
include_context: Whether to include example contexts.
|
|
deck_name: Optional deck name.
|
|
all_vocab: If True, include ALL words from rank 1 to max rank needed.
|
|
If False, only include words that appear in the excerpt.
|
|
no_translate: If True, skip translation.
|
|
|
|
Returns:
|
|
Tuple of (anki_content, excerpt, num_words, max_rank).
|
|
"""
|
|
filepath = Path(filepath)
|
|
|
|
# Read the text
|
|
text = read_file(filepath)
|
|
|
|
# Auto-detect language if not provided
|
|
if source_lang is None:
|
|
source_lang = detect_language(text)
|
|
if source_lang is None:
|
|
source_lang = "auto"
|
|
|
|
# Run vocabulary curve analysis
|
|
output = run_vocabulary_curve(filepath, excerpt_length)
|
|
|
|
# Parse the output
|
|
excerpt, excerpt_words = parse_vocabulary_curve_output(output, excerpt_length)
|
|
|
|
if not excerpt_words:
|
|
raise ValueError(f"No words found for excerpt length {excerpt_length}")
|
|
|
|
# Find max rank needed
|
|
max_rank = max(rank for _, rank in excerpt_words)
|
|
|
|
# Get ALL words up to max_rank if requested
|
|
if all_vocab:
|
|
words_with_ranks = get_top_n_words(text, max_rank)
|
|
else:
|
|
words_with_ranks = excerpt_words
|
|
|
|
# Get contexts if requested
|
|
contexts = None
|
|
if include_context:
|
|
words = [w for w, _ in words_with_ranks]
|
|
contexts = find_word_contexts(text, words)
|
|
|
|
# Generate deck name
|
|
if deck_name is None:
|
|
deck_name = f"{filepath.stem}_vocab_{excerpt_length}"
|
|
|
|
# Generate Anki content
|
|
anki_content = generate_anki_deck(
|
|
words_with_ranks,
|
|
source_lang,
|
|
target_lang,
|
|
contexts,
|
|
deck_name,
|
|
include_context,
|
|
no_translate,
|
|
)
|
|
|
|
return anki_content, excerpt, len(words_with_ranks), max_rank
|
|
|
|
|
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
"""Main entry point.
|
|
|
|
Args:
|
|
argv: Command line arguments.
|
|
|
|
Returns:
|
|
Exit code.
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Generate Anki flashcards from vocabulary analysis.",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog=__doc__,
|
|
)
|
|
|
|
parser.add_argument(
|
|
"--file",
|
|
"-f",
|
|
type=str,
|
|
required=True,
|
|
help="Path to the text file to analyze",
|
|
)
|
|
parser.add_argument(
|
|
"--length",
|
|
"-l",
|
|
type=int,
|
|
required=True,
|
|
help="Target excerpt length (how many words you want to understand)",
|
|
)
|
|
parser.add_argument(
|
|
"--from",
|
|
"-F",
|
|
dest="source_lang",
|
|
type=str,
|
|
default=None,
|
|
help="Source language code (e.g., 'pl', 'la', 'de'). Auto-detected if not specified.",
|
|
)
|
|
parser.add_argument(
|
|
"--to",
|
|
"-T",
|
|
dest="target_lang",
|
|
type=str,
|
|
default="en",
|
|
help="Target language code for translations (default: 'en')",
|
|
)
|
|
parser.add_argument(
|
|
"--output",
|
|
"-o",
|
|
type=str,
|
|
default=None,
|
|
help="Output file path (default: <filename>_anki_<length>.txt)",
|
|
)
|
|
parser.add_argument(
|
|
"--include-context",
|
|
"-c",
|
|
action="store_true",
|
|
help="Include example context sentences in flashcards",
|
|
)
|
|
parser.add_argument(
|
|
"--deck-name",
|
|
"-d",
|
|
type=str,
|
|
default=None,
|
|
help="Name for the Anki deck (default: auto-generated)",
|
|
)
|
|
parser.add_argument(
|
|
"--quiet",
|
|
"-q",
|
|
action="store_true",
|
|
help="Only output the file path, no status messages",
|
|
)
|
|
parser.add_argument(
|
|
"--excerpt-words-only",
|
|
"-e",
|
|
action="store_true",
|
|
help="Only include words that appear in the excerpt (default: include ALL words up to max rank)",
|
|
)
|
|
parser.add_argument(
|
|
"--no-translate",
|
|
"-n",
|
|
action="store_true",
|
|
help="Skip translation (output words without translations)",
|
|
)
|
|
|
|
args = parser.parse_args(argv)
|
|
|
|
try:
|
|
filepath = Path(args.file)
|
|
if not filepath.exists():
|
|
print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201
|
|
return 1
|
|
|
|
if not args.quiet:
|
|
print(f"Analyzing {filepath.name}...") # noqa: T201
|
|
print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201
|
|
|
|
# Generate flashcards
|
|
anki_content, excerpt, num_words, max_rank = generate_flashcards(
|
|
filepath,
|
|
args.length,
|
|
source_lang=args.source_lang,
|
|
target_lang=args.target_lang,
|
|
include_context=args.include_context,
|
|
deck_name=args.deck_name,
|
|
all_vocab=not args.excerpt_words_only,
|
|
no_translate=args.no_translate,
|
|
)
|
|
|
|
# Determine output path
|
|
if args.output:
|
|
output_path = Path(args.output)
|
|
else:
|
|
output_path = filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
|
|
|
|
# Write output
|
|
output_path.write_text(anki_content, encoding="utf-8")
|
|
|
|
if not args.quiet:
|
|
print("") # noqa: T201
|
|
print("=" * 60) # noqa: T201
|
|
print("FLASHCARD GENERATION COMPLETE") # noqa: T201
|
|
print("=" * 60) # noqa: T201
|
|
print(f"Excerpt to understand ({args.length} words):") # noqa: T201
|
|
print(f' "{excerpt}"') # noqa: T201
|
|
print("") # noqa: T201
|
|
print(f"Max word rank needed: #{max_rank}") # noqa: T201
|
|
if args.excerpt_words_only:
|
|
print(f"Flashcards: {num_words} (excerpt words only)") # noqa: T201
|
|
else:
|
|
print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})") # noqa: T201
|
|
print(f"Output file: {output_path}") # noqa: T201
|
|
print("") # noqa: T201
|
|
print("To import into Anki:") # noqa: T201
|
|
print(" 1. Open Anki") # noqa: T201
|
|
print(" 2. File -> Import") # noqa: T201
|
|
print(f" 3. Select: {output_path}") # noqa: T201
|
|
print(" 4. Click Import") # noqa: T201
|
|
else:
|
|
print(output_path) # noqa: T201
|
|
|
|
return 0
|
|
|
|
except FileNotFoundError as e:
|
|
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
|
return 1
|
|
except subprocess.CalledProcessError as e:
|
|
print(f"Error running vocabulary_curve: {e}", file=sys.stderr) # noqa: T201
|
|
return 1
|
|
except ValueError as e:
|
|
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
|
return 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|