diff --git a/python_pkg/word_frequency/analyzer.py b/python_pkg/word_frequency/analyzer.py
index 5cdf807..b20aa06 100755
--- a/python_pkg/word_frequency/analyzer.py
+++ b/python_pkg/word_frequency/analyzer.py
@@ -22,11 +22,14 @@ from __future__ import annotations
import argparse
from collections import Counter
+import logging
from pathlib import Path
import re
import sys
from typing import TYPE_CHECKING
+logger = logging.getLogger(__name__)
+
if TYPE_CHECKING:
from collections.abc import Sequence
@@ -90,9 +93,7 @@ def read_files(filepaths: Sequence[str | Path]) -> str:
Returns:
Combined text content of all files.
"""
- texts = []
- for filepath in filepaths:
- texts.append(read_file(filepath))
+ texts = [read_file(filepath) for filepath in filepaths]
return "\n".join(texts)
@@ -244,15 +245,15 @@ def main(argv: Sequence[str] | None = None) -> int:
if args.output:
Path(args.output).write_text(result, encoding="utf-8")
- print(f"Output written to {args.output}")
+ logger.info("Output written to %s", args.output)
else:
- print(result)
+ sys.stdout.write(result + "\n")
- except FileNotFoundError as e:
- print(f"Error: File not found - {e}", file=sys.stderr)
+ except FileNotFoundError:
+ logger.exception("File not found")
return 1
- except UnicodeDecodeError as e:
- print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
+ except UnicodeDecodeError:
+ logger.exception("Could not decode file as UTF-8")
return 1
return 0
diff --git a/python_pkg/word_frequency/anki_generator.py b/python_pkg/word_frequency/anki_generator.py
index dced133..7251c47 100755
--- a/python_pkg/word_frequency/anki_generator.py
+++ b/python_pkg/word_frequency/anki_generator.py
@@ -4,27 +4,35 @@
Generates Anki-compatible flashcard decks from the vocabulary needed to
understand excerpts of a given length.
-Usage:
+Usage::
+
# Generate flashcards for a 20-word excerpt
- python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20
+ python -m python_pkg.word_frequency.anki_generator \
+ --file text.txt --length 20
# Specify source language (auto-detected by default)
- python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --from pl
+ python -m python_pkg.word_frequency.anki_generator \
+ --file text.txt --length 20 --from pl
# Custom output file
- python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --output polish_vocab.txt
+ python -m python_pkg.word_frequency.anki_generator \
+ --file text.txt --length 20 --output polish_vocab.txt
# Include example sentences/context
- python -m python_pkg.word_frequency.anki_generator --file text.txt --length 20 --include-context
+ python -m python_pkg.word_frequency.anki_generator \
+ --file text.txt --length 20 --include-context
Output:
- Creates a semicolon-separated text file that can be imported into Anki.
- Format: word;translation;frequency_rank;example_context (optional)
+ Creates a semicolon-separated text file importable into Anki.
+ Format: ``word;translation;frequency_rank;example_context``
"""
from __future__ import annotations
import argparse
+import contextlib
+from dataclasses import dataclass
+import logging
from pathlib import Path
import re
import subprocess
@@ -36,14 +44,58 @@ if TYPE_CHECKING:
try:
from python_pkg.word_frequency.analyzer import read_file
+ from python_pkg.word_frequency.cache import (
+ AnkiDeckKey,
+ clear_all_caches,
+ get_all_cache_stats,
+ get_anki_deck_cache,
+ get_vocab_curve_cache,
+ )
from python_pkg.word_frequency.translator import (
detect_language,
translate_words_batch,
)
except ImportError:
from analyzer import read_file
+ from cache import (
+ AnkiDeckKey,
+ clear_all_caches,
+ get_all_cache_stats,
+ get_anki_deck_cache,
+ get_vocab_curve_cache,
+ )
from translator import detect_language, translate_words_batch
+logger = logging.getLogger(__name__)
+
+_MIN_VOCAB_DUMP_PARTS = 2
+_MIN_EXCERPT_PARTS = 3
+_ONE_KB = 1024
+_ONE_MB = 1024 * 1024
+
+
+@dataclass(frozen=True)
+class FlashcardOptions:
+    """Immutable bundle of optional flashcard-generation settings.
+
+    Attributes:
+        source_lang: Source language code; ``None`` means the language
+            is auto-detected from the file content.
+        target_lang: Language code the words are translated into.
+        deck_name: Name of the deck; when unset, a name is derived from
+            the source file's stem.
+        include_context: Whether example contexts are looked up and
+            added to the cards.
+        no_translate: Skip translation and use placeholder text.
+        force: Ignore a cached deck and regenerate it.
+    """
+
+    source_lang: str | None = None
+    target_lang: str = "en"
+    deck_name: str | None = None
+    include_context: bool = False
+    no_translate: bool = False
+    force: bool = False
+
+
+@dataclass(frozen=True)
+class DeckInput:
+    """Core data from which an Anki deck is rendered.
+
+    Attributes:
+        words_with_ranks: ``(word, rank)`` pairs, one card per pair.
+        source_lang: Language code of the words.
+        target_lang: Language code of the translations.
+        contexts: Optional mapping from lowercase word to an example
+            context for that word.
+        deck_name: Name written into the deck header.
+    """
+
+    words_with_ranks: list[tuple[str, int]]
+    source_lang: str
+    target_lang: str = "en"
+    contexts: dict[str, str] | None = None
+    deck_name: str = "Vocabulary"
+
# Path to C vocabulary_curve executable
C_EXECUTABLE = (
@@ -78,10 +130,11 @@ def run_vocabulary_curve(
subprocess.CalledProcessError: If execution fails.
"""
if not C_EXECUTABLE.exists():
- raise FileNotFoundError(
+ msg = (
f"C executable not found at {C_EXECUTABLE}. "
"Please compile it first: cd C/vocabulary_curve && make"
)
+ raise FileNotFoundError(msg)
cmd = [str(C_EXECUTABLE), str(filepath), str(max_length)]
if dump_vocab:
@@ -115,10 +168,11 @@ def run_vocabulary_curve_inverse(
subprocess.CalledProcessError: If execution fails.
"""
if not C_EXECUTABLE.exists():
- raise FileNotFoundError(
+ msg = (
f"C executable not found at {C_EXECUTABLE}. "
"Please compile it first: cd C/vocabulary_curve && make"
)
+ raise FileNotFoundError(msg)
cmd = [str(C_EXECUTABLE), str(filepath), "--max-vocab", str(max_vocab)]
if dump_vocab:
@@ -134,6 +188,57 @@ def run_vocabulary_curve_inverse(
return result.stdout
+def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
+    """Extract the ``word;rank`` pairs listed between the dump markers.
+
+    Only lines after ``VOCAB_DUMP_START`` are considered, and scanning
+    stops at ``VOCAB_DUMP_END``.  Lines that do not split into exactly
+    two ``;``-separated fields, or whose rank is not an integer, are
+    ignored.
+
+    Args:
+        lines: Output lines from vocabulary_curve.
+
+    Returns:
+        List of (word, rank) tuples in the order they appear.
+    """
+    entries: list[tuple[str, int]] = []
+    inside_dump = False
+    for raw in lines:
+        text = raw.strip()
+        if text == "VOCAB_DUMP_END":
+            break
+        if text == "VOCAB_DUMP_START":
+            inside_dump = True
+            continue
+        if not inside_dump:
+            continue
+        fields = text.split(";")
+        if len(fields) != _MIN_VOCAB_DUMP_PARTS:
+            continue
+        # A non-numeric rank makes int() raise; such entries are dropped.
+        with contextlib.suppress(ValueError):
+            entries.append((fields[0], int(fields[1])))
+    return entries
+
+
+def _parse_excerpt_lines(lines: list[str], start: int) -> str:
+    """Collect the quoted excerpt that follows an 'Excerpt:' line.
+
+    Lines are consumed from ``start`` until one ends with the closing
+    double quote (or the input runs out).  Each line is stripped and the
+    pieces are joined with single spaces.
+
+    Args:
+        lines: Output lines.
+        start: Index of the line after 'Excerpt:'.
+
+    Returns:
+        Excerpt text without the surrounding quotes; an empty string
+        when there is no line at ``start``.
+    """
+    fragments: list[str] = []
+    for offset, raw in enumerate(lines[start:]):
+        fragment = raw.strip()
+        if offset == 0:
+            # Only the first line can carry the opening quote.  Removing
+            # a leading quote from later lines would swallow a closing
+            # quote that sits alone on its own line, and the scan would
+            # then run on into unrelated output.
+            fragment = fragment.removeprefix('"')
+        if fragment.endswith('"'):
+            fragments.append(fragment[:-1])
+            break
+        fragments.append(fragment)
+    # Quotes on lines of their own leave empty fragments at the ends.
+    return " ".join(fragments).strip()
+
+
def parse_inverse_mode_output(
output: str,
) -> tuple[str, int, int, list[tuple[str, int]]]:
@@ -149,58 +254,77 @@ def parse_inverse_mode_output(
excerpt = ""
excerpt_length = 0
max_rank_used = 0
- all_vocab: list[tuple[str, int]] = []
- for i, line in enumerate(lines):
- line = line.strip()
+ for i, raw_line in enumerate(lines):
+ line = raw_line.strip()
if line.startswith("LONGEST EXCERPT:"):
parts = line.split()
- if len(parts) >= 3:
+ if len(parts) >= _MIN_EXCERPT_PARTS:
excerpt_length = int(parts[2])
elif line.startswith("Excerpt:"):
- # Next line(s) contain the excerpt
- i += 1
- excerpt_parts = []
- while i < len(lines):
- next_line = lines[i].strip()
- if next_line.startswith('"'):
- next_line = next_line[1:]
- if next_line.endswith('"'):
- next_line = next_line[:-1]
- excerpt_parts.append(next_line)
- break
- excerpt_parts.append(next_line)
- i += 1
- excerpt = " ".join(excerpt_parts)
+ excerpt = _parse_excerpt_lines(lines, i + 1)
elif line.startswith("Rarest word used:"):
- # Parse "word (#rank)"
match = re.search(r"\(#(\d+)\)", line)
if match:
max_rank_used = int(match.group(1))
- # Parse VOCAB_DUMP section if present
- in_vocab_dump = False
- for line in lines:
- if line.strip() == "VOCAB_DUMP_START":
- in_vocab_dump = True
- continue
- if line.strip() == "VOCAB_DUMP_END":
- break
- if in_vocab_dump and ";" in line:
- parts = line.strip().split(";")
- if len(parts) == 2:
- word, rank_str = parts
- try:
- all_vocab.append((word, int(rank_str)))
- except ValueError:
- pass
-
+ all_vocab = _parse_vocab_dump(lines)
return excerpt, excerpt_length, max_rank_used, all_vocab
+def _parse_target_length_block(
+    lines: list[str],
+    target_length: int,
+) -> tuple[str, list[tuple[str, int]]]:
+    """Parse the [Length N] block from vocabulary curve output.
+
+    The block starts after the first line beginning with
+    ``[Length <target_length>]`` and ends before the next line that
+    begins with ``[Length ``, so data belonging to another length is
+    never returned.
+
+    Args:
+        lines: Output lines.
+        target_length: Target excerpt length to find.
+
+    Returns:
+        Tuple of (excerpt, excerpt_words).  ``excerpt`` is the text
+        between the first and last double quote of the block's
+        'Excerpt:' line; ``excerpt_words`` holds the ``word(#rank)``
+        entries of the following 'Words:' line.  Parts that are missing
+        come back as ``""`` and ``[]``.
+    """
+    header = f"[Length {target_length}]"
+    stripped = [line.strip() for line in lines]
+
+    block_start = next(
+        (i + 1 for i, text in enumerate(stripped) if text.startswith(header)),
+        None,
+    )
+    if block_start is None:
+        return "", []
+    block_end = next(
+        (
+            i
+            for i in range(block_start, len(stripped))
+            if stripped[i].startswith("[Length ")
+        ),
+        len(stripped),
+    )
+    block = stripped[block_start:block_end]
+
+    excerpt = ""
+    words_from = 0
+    for offset, text in enumerate(block):
+        if text.startswith("Excerpt:"):
+            if '"' in text:
+                excerpt = text[text.index('"') + 1 : text.rindex('"')]
+            # The words line is expected after the excerpt line.
+            words_from = offset + 1
+            break
+
+    excerpt_words: list[tuple[str, int]] = []
+    for text in block[words_from:]:
+        if text.startswith("Words:"):
+            matches = re.findall(
+                r"(\S+)\(#(\d+)\)", text.removeprefix("Words:")
+            )
+            excerpt_words = [(word, int(rank)) for word, rank in matches]
+            break
+    return excerpt, excerpt_words
+
+
def parse_vocabulary_curve_output(
output: str, target_length: int
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
@@ -213,61 +337,15 @@ def parse_vocabulary_curve_output(
Returns:
Tuple of (excerpt_text, excerpt_words, all_vocab_words).
excerpt_words: words in the excerpt with their ranks.
- all_vocab_words: all words up to max rank (from VOCAB_DUMP if present).
+ all_vocab_words: all words up to max rank
+ (from VOCAB_DUMP if present).
"""
lines = output.split("\n")
- excerpt = ""
- excerpt_words: list[tuple[str, int]] = []
- all_vocab: list[tuple[str, int]] = []
- # Find the line for the target length
- i = 0
- while i < len(lines):
- line = lines[i]
- if line.strip().startswith(f"[Length {target_length}]"):
- # Found our target length, now get excerpt and words
- i += 1
- # Find excerpt line
- while i < len(lines) and not lines[i].strip().startswith("Excerpt:"):
- i += 1
- if i < len(lines):
- excerpt_line = lines[i].strip()
- if '"' in excerpt_line:
- start = excerpt_line.index('"') + 1
- end = excerpt_line.rindex('"')
- excerpt = excerpt_line[start:end]
-
- # Find words line
- i += 1
- while i < len(lines) and not lines[i].strip().startswith("Words:"):
- i += 1
- if i < len(lines):
- words_line = lines[i].strip()
- if words_line.startswith("Words:"):
- words_part = words_line[6:].strip()
- # Parse "word(#rank), word2(#rank2), ..."
- pattern = r"(\S+)\(#(\d+)\)"
- matches = re.findall(pattern, words_part)
- excerpt_words = [(w, int(r)) for w, r in matches]
- break
- i += 1
-
- # Parse VOCAB_DUMP section if present
- in_vocab_dump = False
- for line in lines:
- if line.strip() == "VOCAB_DUMP_START":
- in_vocab_dump = True
- continue
- if line.strip() == "VOCAB_DUMP_END":
- break
- if in_vocab_dump and ";" in line:
- parts = line.strip().split(";")
- if len(parts) == 2:
- word, rank_str = parts
- try:
- all_vocab.append((word, int(rank_str)))
- except ValueError:
- pass
+ excerpt, excerpt_words = _parse_target_length_block(
+ lines, target_length
+ )
+ all_vocab = _parse_vocab_dump(lines)
return excerpt, excerpt_words, all_vocab
@@ -307,12 +385,86 @@ def find_word_contexts(
return contexts
-def generate_anki_deck(
+def _format_excerpt_card(
+    excerpt: str,
+    excerpt_words: list[tuple[str, int]] | None,
+) -> str:
+    """Format the excerpt as the first Anki card.
+
+    The rarest excerpt word (highest rank) is rendered bold and the most
+    frequent one (lowest rank) in italics; a word that is both gets bold
+    italics.  Matching is case-insensitive and limited to whole words.
+
+    Args:
+        excerpt: The target excerpt text.
+        excerpt_words: Words in the excerpt with ranks.
+
+    Returns:
+        Formatted excerpt card line.
+    """
+    # ';' separates the card fields, so it must not appear in the text.
+    card_text = excerpt.replace(";", ",")
+    if excerpt_words:
+        most_frequent = min(excerpt_words, key=lambda item: item[1])[0]
+        rarest = max(excerpt_words, key=lambda item: item[1])[0]
+        if most_frequent == rarest:
+            alternatives = rf"(?P<both>{re.escape(rarest)})"
+        else:
+            alternatives = (
+                rf"(?P<rare>{re.escape(rarest)})"
+                rf"|(?P<freq>{re.escape(most_frequent)})"
+            )
+        markup = {
+            "both": "<b><i>{}</i></b>",
+            "rare": "<b>{}</b>",
+            "freq": "<i>{}</i>",
+        }
+        # One pass over the text: a second substitution could otherwise
+        # match inside the tags inserted by the first (e.g. a word "b"
+        # or "i").
+        pattern = re.compile(rf"\b(?:{alternatives})\b", re.IGNORECASE)
+        card_text = pattern.sub(
+            lambda match: markup[match.lastgroup].format(match.group()),
+            card_text,
+        )
+    return f"\U0001f4d6 TARGET EXCERPT;{card_text};#0"
+
+
+def _build_translation_lookup(
words_with_ranks: list[tuple[str, int]],
source_lang: str,
- target_lang: str = "en",
- contexts: dict[str, str] | None = None,
- deck_name: str = "Vocabulary",
+ target_lang: str,
+ *,
+ no_translate: bool = False,
+) -> dict[str, str]:
+ """Build word-to-translation lookup dict.
+
+ Args:
+ words_with_ranks: List of (word, rank) tuples.
+ source_lang: Source language code.
+ target_lang: Target language code.
+ no_translate: If True, use placeholder translations.
+
+ Returns:
+ Dict mapping lowercase word to translation.
+ """
+ words = [w for w, _ in words_with_ranks]
+ if no_translate:
+ return {w.lower(): "[TODO]" for w in words}
+ translations = translate_words_batch(words, source_lang, target_lang)
+ trans_lookup: dict[str, str] = {}
+ for result in translations:
+ if result.success:
+ trans_lookup[result.source_word.lower()] = (
+ result.translated_word
+ )
+ else:
+ trans_lookup[result.source_word.lower()] = (
+ f"[{result.source_word}]"
+ )
+ return trans_lookup
+
+
+def generate_anki_deck(
+ deck_input: DeckInput,
+ *,
include_context: bool = False,
no_translate: bool = False,
excerpt: str = "",
@@ -321,15 +473,11 @@ def generate_anki_deck(
"""Generate Anki-compatible deck content.
Args:
- words_with_ranks: List of (word, rank) tuples.
- source_lang: Source language code.
- target_lang: Target language code (default: en).
- contexts: Optional dict of word -> context.
- deck_name: Name for the deck.
+ deck_input: Core deck data (words, langs, contexts, name).
include_context: Whether to include context in cards.
no_translate: If True, skip translation (use placeholder).
excerpt: The target excerpt text to include in cards.
- excerpt_words: List of (word, rank) tuples for words in the excerpt.
+ excerpt_words: Words in the excerpt with ranks.
Returns:
Semicolon-separated content ready for Anki import.
@@ -339,73 +487,45 @@ def generate_anki_deck(
# Add Anki headers
lines.append("#separator:semicolon")
lines.append("#html:true")
- lines.append(f"#deck:{deck_name}")
- lines.append(f"#tags:vocabulary {source_lang}")
+ lines.append(f"#deck:{deck_input.deck_name}")
+ lines.append(f"#tags:vocabulary {deck_input.source_lang}")
if include_context:
lines.append("#columns:Front;Back;Rank;Context")
else:
lines.append("#columns:Front;Back;Rank")
lines.append("") # Empty line before data
- # Add excerpt as first card (goal/context card)
if excerpt:
- excerpt_escaped = excerpt.replace(";", ",")
- # Use excerpt_words from C output (has correct ranks)
- if excerpt_words:
- # Most frequent = lowest rank (italics), rarest = highest rank (bold)
- most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
- rarest = max(excerpt_words, key=lambda x: x[1])[0]
- # Apply formatting - rarest first (bold), then most frequent (italics)
- # to avoid nested tag issues if they're the same word
- if most_frequent != rarest:
- pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
- excerpt_escaped = pattern_rare.sub(r"\1", excerpt_escaped)
- pattern_freq = re.compile(
- rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
- )
- excerpt_escaped = pattern_freq.sub(r"\1", excerpt_escaped)
- else:
- # Same word is both most and least frequent - use bold+italic
- pattern = re.compile(
- rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
- )
- excerpt_escaped = pattern.sub(r"\1", excerpt_escaped)
- lines.append(f"📖 TARGET EXCERPT;{excerpt_escaped};#0")
+ lines.append(_format_excerpt_card(excerpt, excerpt_words))
- # Get translations (or skip if no_translate)
- words = [w for w, _ in words_with_ranks]
- if no_translate:
- trans_lookup = {w.lower(): "[TODO]" for w in words}
- else:
- translations = translate_words_batch(words, source_lang, target_lang)
- # Build translation lookup
- trans_lookup = {}
- for result in translations:
- if result.success:
- trans_lookup[result.source_word.lower()] = result.translated_word
- else:
- trans_lookup[result.source_word.lower()] = f"[{result.source_word}]"
+ trans_lookup = _build_translation_lookup(
+ deck_input.words_with_ranks,
+ deck_input.source_lang,
+ deck_input.target_lang,
+ no_translate=no_translate,
+ )
# Generate cards
- for word, rank in words_with_ranks:
+ for word, rank in deck_input.words_with_ranks:
translation = trans_lookup.get(word.lower(), f"[{word}]")
# Escape semicolons in fields
word_escaped = word.replace(";", ",")
translation_escaped = translation.replace(";", ",")
- if include_context and contexts:
- context = contexts.get(word.lower(), "")
- # Highlight the word in context
+ if include_context and deck_input.contexts:
+ context = deck_input.contexts.get(word.lower(), "")
if context:
context_escaped = context.replace(";", ",")
- # Make target word bold in context
pattern = re.compile(re.escape(word), re.IGNORECASE)
- context_escaped = pattern.sub(f"{word}", context_escaped)
+ context_escaped = pattern.sub(
+ f"{word}", context_escaped
+ )
else:
context_escaped = ""
lines.append(
- f"{word_escaped};{translation_escaped};#{rank};{context_escaped}"
+ f"{word_escaped};{translation_escaped}"
+ f";#{rank};{context_escaped}"
)
else:
lines.append(f"{word_escaped};{translation_escaped};#{rank}")
@@ -428,12 +548,7 @@ def get_cached_excerpt(
"""
if force:
return None
- try:
- from python_pkg.word_frequency.cache import get_vocab_curve_cache
-
- return get_vocab_curve_cache().get(filepath, length)
- except ImportError:
- return None
+ return get_vocab_curve_cache().get(filepath, length)
def cache_excerpt(
@@ -447,31 +562,18 @@ def cache_excerpt(
excerpt: The excerpt text.
words: List of (word, rank) tuples.
"""
- try:
- from python_pkg.word_frequency.cache import get_vocab_curve_cache
-
- get_vocab_curve_cache().set(filepath, length, excerpt, words)
- except ImportError:
- pass
+ get_vocab_curve_cache().set(filepath, length, excerpt, words)
def get_cached_deck(
- filepath: Path,
- length: int,
- target_lang: str,
- include_context: bool,
- all_vocab: bool,
+ key: AnkiDeckKey,
*,
force: bool = False,
) -> tuple[str, str, int, int] | None:
"""Get cached Anki deck if available.
Args:
- filepath: Path to source file.
- length: Excerpt length.
- target_lang: Target language.
- include_context: Whether context is included.
- all_vocab: Whether all vocab is included.
+ key: Cache key parameters.
force: If True, ignore cache.
Returns:
@@ -479,22 +581,11 @@ def get_cached_deck(
"""
if force:
return None
- try:
- from python_pkg.word_frequency.cache import get_anki_deck_cache
-
- return get_anki_deck_cache().get(
- filepath, length, target_lang, include_context, all_vocab
- )
- except ImportError:
- return None
+ return get_anki_deck_cache().get(key)
def cache_deck(
- filepath: Path,
- length: int,
- target_lang: str,
- include_context: bool,
- all_vocab: bool,
+ key: AnkiDeckKey,
anki_content: str,
excerpt: str,
num_words: int,
@@ -503,139 +594,136 @@ def cache_deck(
"""Store Anki deck in cache.
Args:
- filepath: Path to source file.
- length: Excerpt length.
- target_lang: Target language.
- include_context: Whether context is included.
- all_vocab: Whether all vocab is included.
+ key: Cache key parameters.
anki_content: The deck content.
excerpt: The excerpt text.
num_words: Number of words.
max_rank: Maximum rank.
"""
- try:
- from python_pkg.word_frequency.cache import get_anki_deck_cache
+ get_anki_deck_cache().set(
+ key,
+ anki_content,
+ excerpt,
+ num_words,
+ max_rank,
+ )
- get_anki_deck_cache().set(
- filepath,
- length,
- target_lang,
- include_context,
- all_vocab,
- anki_content,
- excerpt,
- num_words,
- max_rank,
+
+def _detect_source_language(
+ filepath: Path,
+ text: str,
+) -> str:
+ """Auto-detect source language from file content.
+
+ Args:
+ filepath: Path to source file.
+ text: Already-read text (may be empty).
+
+ Returns:
+ Detected language code.
+
+ Raises:
+ ValueError: If language cannot be detected.
+ """
+ sample_text = read_file(filepath)[:1000] if not text else text[:1000]
+ detected = detect_language(sample_text)
+ if detected is None:
+ msg = (
+ "Could not auto-detect source language. "
+ "Please specify with --from (e.g., --from pl for Polish). "
+ "Install langdetect for auto-detection: "
+ "pip install langdetect"
)
- except ImportError:
- pass
+ raise ValueError(msg)
+ return detected
def generate_flashcards(
filepath: str | Path,
excerpt_length: int,
- source_lang: str | None = None,
- target_lang: str = "en",
- include_context: bool = False,
- deck_name: str | None = None,
- all_vocab: bool = True,
- no_translate: bool = False,
+ options: FlashcardOptions | None = None,
*,
- force: bool = False,
+ all_vocab: bool = True,
) -> tuple[str, str, int, int]:
- """Generate Anki flashcards for vocabulary needed for an excerpt length.
+ """Generate Anki flashcards for vocabulary needed for an excerpt.
Args:
filepath: Path to the source text file.
excerpt_length: Target excerpt length.
- source_lang: Source language (auto-detected if None).
- target_lang: Target language for translations.
- include_context: Whether to include example contexts.
- deck_name: Optional deck name.
- all_vocab: If True, include ALL words from rank 1 to max rank needed.
- If False, only include words that appear in the excerpt.
- no_translate: If True, skip translation.
- force: If True, ignore all caches and regenerate.
+ options: Flashcard generation options.
+ all_vocab: If True, include ALL words rank 1 to max rank.
Returns:
Tuple of (anki_content, excerpt, num_words, max_rank).
"""
+ if options is None:
+ options = FlashcardOptions()
filepath = Path(filepath)
+ deck_key = AnkiDeckKey(
+ filepath=filepath,
+ length=excerpt_length,
+ target_lang=options.target_lang,
+ include_context=options.include_context,
+ all_vocab=all_vocab,
+ )
# Check for cached full deck (if not using no_translate)
- if not no_translate and not force:
- cached = get_cached_deck(
- filepath, excerpt_length, target_lang, include_context, all_vocab
- )
+ if not options.no_translate and not options.force:
+ cached = get_cached_deck(deck_key)
if cached is not None:
return cached
# Read the text (only needed for context finding)
- text = read_file(filepath) if include_context else ""
+ text = read_file(filepath) if options.include_context else ""
# Auto-detect language if not provided
+ source_lang = options.source_lang
if source_lang is None:
- sample_text = read_file(filepath)[:1000] if not text else text[:1000]
- source_lang = detect_language(sample_text)
- if source_lang is None:
- raise ValueError(
- "Could not auto-detect source language. "
- "Please specify with --from (e.g., --from pl for Polish). "
- "Install langdetect for auto-detection: pip install langdetect"
- )
+ source_lang = _detect_source_language(filepath, text)
# Run vocabulary curve analysis with vocab dump for all words
- output = run_vocabulary_curve(filepath, excerpt_length, dump_vocab=all_vocab)
- # Parse the output (now includes all vocabulary from C)
+ output = run_vocabulary_curve(
+ filepath, excerpt_length, dump_vocab=all_vocab
+ )
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(
output, excerpt_length
)
if not excerpt_words:
- raise ValueError(f"No words found for excerpt length {excerpt_length}")
+ msg = f"No words found for excerpt length {excerpt_length}"
+ raise ValueError(msg)
- # Find max rank needed
max_rank = max(rank for _, rank in excerpt_words)
+ words_with_ranks = (
+ all_vocab_words if all_vocab and all_vocab_words else excerpt_words
+ )
- # Use vocabulary from C output
- if all_vocab and all_vocab_words:
- words_with_ranks = all_vocab_words
- else:
- words_with_ranks = excerpt_words
-
- # Get contexts if requested
contexts = None
- if include_context:
+ if options.include_context:
if not text:
text = read_file(filepath)
words = [w for w, _ in words_with_ranks]
contexts = find_word_contexts(text, words)
- # Generate deck name
- if deck_name is None:
- deck_name = f"{filepath.stem}_vocab_{excerpt_length}"
+ deck_name = options.deck_name or f"{filepath.stem}_vocab_{excerpt_length}"
- # Generate Anki content
anki_content = generate_anki_deck(
- words_with_ranks,
- source_lang,
- target_lang,
- contexts,
- deck_name,
- include_context,
- no_translate,
- excerpt,
- excerpt_words,
+ DeckInput(
+ words_with_ranks=words_with_ranks,
+ source_lang=source_lang,
+ target_lang=options.target_lang,
+ contexts=contexts,
+ deck_name=deck_name,
+ ),
+ include_context=options.include_context,
+ no_translate=options.no_translate,
+ excerpt=excerpt,
+ excerpt_words=excerpt_words,
)
- # Cache the full deck (if translated)
- if not no_translate:
+ if not options.no_translate:
cache_deck(
- filepath,
- excerpt_length,
- target_lang,
- include_context,
- all_vocab,
+ deck_key,
anki_content,
excerpt,
len(words_with_ranks),
@@ -648,13 +736,7 @@ def generate_flashcards(
def generate_flashcards_inverse(
filepath: str | Path,
max_vocab: int,
- source_lang: str | None = None,
- target_lang: str = "en",
- include_context: bool = False,
- deck_name: str | None = None,
- no_translate: bool = False,
- *,
- force: bool = False,
+ options: FlashcardOptions | None = None,
) -> tuple[str, str, int, int, int]:
"""Generate Anki flashcards for the longest excerpt using top N words.
@@ -664,95 +746,262 @@ def generate_flashcards_inverse(
Args:
filepath: Path to the source text file.
max_vocab: Maximum vocabulary size (top N words to learn).
- source_lang: Source language (auto-detected if None).
- target_lang: Target language for translations.
- include_context: Whether to include example contexts.
- deck_name: Optional deck name.
- no_translate: If True, skip translation.
- force: If True, ignore all caches and regenerate.
+ options: Flashcard generation options.
Returns:
- Tuple of (anki_content, excerpt, excerpt_length, num_words, max_rank_used).
+ Tuple of (anki_content, excerpt, excerpt_length,
+ num_words, max_rank_used).
"""
+ if options is None:
+ options = FlashcardOptions()
filepath = Path(filepath)
- # Read the text (only needed for context finding)
- text = read_file(filepath) if include_context else ""
+ text = read_file(filepath) if options.include_context else ""
- # Auto-detect language if not provided
+ source_lang = options.source_lang
if source_lang is None:
- sample_text = read_file(filepath)[:1000] if not text else text[:1000]
- source_lang = detect_language(sample_text)
- if source_lang is None:
- raise ValueError(
- "Could not auto-detect source language. "
- "Please specify with --from (e.g., --from pl for Polish). "
- "Install langdetect for auto-detection: pip install langdetect"
- )
+ source_lang = _detect_source_language(filepath, text)
- # Run vocabulary curve in inverse mode
- output = run_vocabulary_curve_inverse(filepath, max_vocab, dump_vocab=True)
-
- # Parse the output
- excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(
- output
+ output = run_vocabulary_curve_inverse(
+ filepath, max_vocab, dump_vocab=True
+ )
+ excerpt, excerpt_length, max_rank_used, all_vocab_words = (
+ parse_inverse_mode_output(output)
)
if excerpt_length == 0:
- raise ValueError(
- f"No valid excerpt found using only top {max_vocab} words. "
- "Try increasing the vocabulary limit."
+ msg = (
+ f"No valid excerpt found using only top {max_vocab} "
+ "words. Try increasing the vocabulary limit."
)
+ raise ValueError(msg)
if not all_vocab_words:
- raise ValueError(f"No vocabulary returned for max_vocab={max_vocab}")
+ msg = f"No vocabulary returned for max_vocab={max_vocab}"
+ raise ValueError(msg)
- # Use all vocabulary up to max_vocab
words_with_ranks = all_vocab_words
- # Find words that appear in the excerpt (for highlighting)
excerpt_word_set = set(excerpt.lower().split())
excerpt_words = [
- (w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set
+ (w, r)
+ for w, r in all_vocab_words
+ if w.lower() in excerpt_word_set
]
- # Get contexts if requested
contexts = None
- if include_context:
+ if options.include_context:
if not text:
text = read_file(filepath)
words = [w for w, _ in words_with_ranks]
contexts = find_word_contexts(text, words)
- # Generate deck name
- if deck_name is None:
- deck_name = f"{filepath.stem}_top{max_vocab}"
+ deck_name = options.deck_name or f"{filepath.stem}_top{max_vocab}"
- # Generate Anki content
anki_content = generate_anki_deck(
- words_with_ranks,
- source_lang,
- target_lang,
- contexts,
- deck_name,
- include_context,
- no_translate,
- excerpt,
- excerpt_words if excerpt_words else None,
+ DeckInput(
+ words_with_ranks=words_with_ranks,
+ source_lang=source_lang,
+ target_lang=options.target_lang,
+ contexts=contexts,
+ deck_name=deck_name,
+ ),
+ include_context=options.include_context,
+ no_translate=options.no_translate,
+ excerpt=excerpt,
+ excerpt_words=excerpt_words or None,
)
- return anki_content, excerpt, excerpt_length, len(words_with_ranks), max_rank_used
+ return (
+ anki_content,
+ excerpt,
+ excerpt_length,
+ len(words_with_ranks),
+ max_rank_used,
+ )
-def main(argv: Sequence[str] | None = None) -> int:
- """Main entry point.
+def _format_cache_size(value: int) -> str:
+    """Render a byte count as ``B``, ``KB`` or ``MB`` text.
+
+    Values of one kilobyte or more are shown with one decimal place.
+    """
+    if value >= _ONE_MB:
+        return f"{value / _ONE_MB:.1f} MB"
+    if value >= _ONE_KB:
+        return f"{value / _ONE_KB:.1f} KB"
+    return f"{value} B"
+
+
+def _print_cache_stats() -> int:
+    """Log the statistics of every cache.
+
+    The ``cache_size_bytes`` entry is shown in human-readable form; all
+    other entries are logged as they are.
+
+    Returns:
+        Exit code (always 0).
+    """
+    all_stats = get_all_cache_stats()
+    logger.info("Cache Statistics")
+    logger.info("=" * 50)
+    for cache_name, cache_stats in all_stats.items():
+        logger.info("\n%s:", cache_name.upper())
+        for key, value in cache_stats.items():
+            shown = (
+                _format_cache_size(value)
+                if key == "cache_size_bytes"
+                else value
+            )
+            logger.info(" %s: %s", key, shown)
+    return 0
+
+
+def _clear_caches() -> int:
+    """Empty every cache and log a confirmation.
+
+    Returns:
+        Exit code (always 0).
+    """
+    clear_all_caches()
+    logger.info("All caches cleared.")
+    return 0
+
+
+def _log_anki_import_instructions(output_path: Path) -> None:
+    """Log the steps for importing the generated file into Anki.
+
+    Args:
+        output_path: Path of the deck file named in the instructions.
+    """
+    for message in (
+        "",
+        "To import into Anki:",
+        " 1. Open Anki",
+        " 2. File -> Import",
+    ):
+        logger.info(message)
+    logger.info(" 3. Select: %s", output_path)
+    logger.info(" 4. Click Import")
+
+
+def _handle_inverse_mode(
+ args: argparse.Namespace,
+ filepath: Path,
+) -> int:
+ """Handle inverse mode (--max-vocab) flashcard generation.
Args:
- argv: Command line arguments.
+ args: Parsed command line arguments.
+ filepath: Path to source file.
Returns:
Exit code.
"""
+ if not args.quiet:
+ logger.info("Analyzing %s...", filepath.name)
+ logger.info(
+ "Finding longest excerpt using top %d words...",
+ args.max_vocab,
+ )
+
+ anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
+ generate_flashcards_inverse(
+ filepath,
+ args.max_vocab,
+ FlashcardOptions(
+ source_lang=args.source_lang,
+ target_lang=args.target_lang,
+ deck_name=args.deck_name,
+ include_context=args.include_context,
+ no_translate=args.no_translate,
+ force=args.force,
+ ),
+ )
+ )
+
+ output_path = (
+ Path(args.output)
+ if args.output
+ else filepath.parent
+ / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
+ )
+ output_path.write_text(anki_content, encoding="utf-8")
+
+ if not args.quiet:
+ logger.info("")
+ logger.info("=" * 60)
+ logger.info("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
+ logger.info("=" * 60)
+ logger.info("Learning: top %d words", args.max_vocab)
+ logger.info(
+ "Longest excerpt you can understand: %d words",
+ excerpt_length,
+ )
+ logger.info(' "%s"', excerpt)
+ logger.info("")
+ logger.info("Rarest word in excerpt: #%d", max_rank_used)
+ logger.info("Flashcards: %d", num_words)
+ logger.info("Output file: %s", output_path)
+ _log_anki_import_instructions(output_path)
+ else:
+ logger.info("%s", output_path)
+
+ return 0
+
+
+def _handle_normal_mode(
+    args: argparse.Namespace,
+    filepath: Path,
+) -> int:
+    """Generate flashcards for a fixed excerpt length (--length).
+
+    The deck is written to ``args.output`` when given, otherwise next to
+    the source file as ``<stem>_anki_<length>.txt``.  In quiet mode only
+    the output path is logged; otherwise a summary and the Anki import
+    instructions are logged.
+
+    Args:
+        args: Parsed command line arguments.
+        filepath: Path to source file.
+
+    Returns:
+        Exit code.
+    """
+    verbose = not args.quiet
+    if verbose:
+        logger.info("Analyzing %s...", filepath.name)
+        logger.info("Finding vocabulary for %d-word excerpt...", args.length)
+
+    options = FlashcardOptions(
+        source_lang=args.source_lang,
+        target_lang=args.target_lang,
+        deck_name=args.deck_name,
+        include_context=args.include_context,
+        no_translate=args.no_translate,
+        force=args.force,
+    )
+    anki_content, excerpt, num_words, max_rank = generate_flashcards(
+        filepath,
+        args.length,
+        options,
+        all_vocab=not args.excerpt_words_only,
+    )
+
+    if args.output:
+        output_path = Path(args.output)
+    else:
+        output_path = filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
+    output_path.write_text(anki_content, encoding="utf-8")
+
+    if not verbose:
+        logger.info("%s", output_path)
+        return 0
+
+    banner = "=" * 60
+    logger.info("")
+    logger.info(banner)
+    logger.info("FLASHCARD GENERATION COMPLETE")
+    logger.info(banner)
+    logger.info("Excerpt to understand (%d words):", args.length)
+    logger.info(' "%s"', excerpt)
+    logger.info("")
+    logger.info("Max word rank needed: #%d", max_rank)
+    if args.excerpt_words_only:
+        logger.info("Flashcards: %d (excerpt words only)", num_words)
+    else:
+        logger.info(
+            "Flashcards: %d (ALL words rank #1 to #%d)",
+            num_words,
+            max_rank,
+        )
+    logger.info("Output file: %s", output_path)
+    _log_anki_import_instructions(output_path)
+    return 0
+
+
+def _build_parser() -> argparse.ArgumentParser:
+ """Build the argument parser for the CLI.
+
+ Returns:
+ Configured argument parser.
+ """
parser = argparse.ArgumentParser(
description="Generate Anki flashcards from vocabulary analysis.",
formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -771,21 +1020,30 @@ def main(argv: Sequence[str] | None = None) -> int:
"-l",
type=int,
default=None,
- help="Target excerpt length (how many words you want to understand)",
+ help=(
+ "Target excerpt length "
+ "(how many words you want to understand)"
+ ),
)
parser.add_argument(
"--max-vocab",
"-v",
type=int,
default=None,
- help="INVERSE MODE: Learn top N words, find longest excerpt you can understand",
+ help=(
+ "INVERSE MODE: Learn top N words, "
+ "find longest excerpt you can understand"
+ ),
)
parser.add_argument(
"--from",
dest="source_lang",
type=str,
default=None,
- help="Source language code (e.g., 'pl', 'la', 'de'). Auto-detected if not specified.",
+ help=(
+ "Source language code (e.g., 'pl', 'la', 'de'). "
+ "Auto-detected if not specified."
+ ),
)
parser.add_argument(
"--to",
@@ -825,7 +1083,10 @@ def main(argv: Sequence[str] | None = None) -> int:
"--excerpt-words-only",
"-e",
action="store_true",
- help="Only include words that appear in the excerpt (default: include ALL words up to max rank)",
+ help=(
+ "Only include words that appear in the excerpt "
+ "(default: include ALL words up to max rank)"
+ ),
)
parser.add_argument(
"--no-translate",
@@ -849,179 +1110,64 @@ def main(argv: Sequence[str] | None = None) -> int:
action="store_true",
help="Clear all caches and exit",
)
+ return parser
+
+def _run_generation(args: argparse.Namespace) -> int:
+    """Validate the input path and dispatch to the selected mode.
+
+    Args:
+        args: Parsed command line arguments. ``max_vocab`` selects
+            inverse mode; otherwise normal (``length``) mode runs.
+
+    Returns:
+        Exit code: 1 if the input path is missing or is a directory,
+        otherwise the exit code of the mode handler.
+    """
+    filepath = Path(args.file)
+    if not filepath.exists():
+        logger.error("Error: File not found: %s", args.file)
+        return 1
+    if filepath.is_dir():
+        # A directory passes exists() but cannot be read as a text
+        # file; reject it here with a clear message instead of letting
+        # a later read fail with an error main() does not handle.
+        logger.error("Error: Not a file (is a directory): %s", args.file)
+        return 1
+
+    if args.max_vocab is not None:
+        return _handle_inverse_mode(args, filepath)
+    return _handle_normal_mode(args, filepath)
+
+
+def main(argv: Sequence[str] | None = None) -> int:
+ """Main entry point.
+
+ Args:
+ argv: Command line arguments.
+
+ Returns:
+ Exit code.
+ """
+ parser = _build_parser()
args = parser.parse_args(argv)
- # Handle cache management commands
if args.cache_stats:
- try:
- from python_pkg.word_frequency.cache import get_all_cache_stats
- except ImportError:
- try:
- from cache import get_all_cache_stats
- except ImportError:
- print("Cache module not available", file=sys.stderr)
- return 1
- stats = get_all_cache_stats()
- print("Cache Statistics")
- print("=" * 50)
- for cache_name, cache_stats in stats.items():
- print(f"\n{cache_name.upper()}:")
- for key, value in cache_stats.items():
- if key == "cache_size_bytes":
- if value < 1024:
- size_str = f"{value} B"
- elif value < 1024 * 1024:
- size_str = f"{value / 1024:.1f} KB"
- else:
- size_str = f"{value / (1024 * 1024):.1f} MB"
- print(f" {key}: {size_str}")
- else:
- print(f" {key}: {value}")
- return 0
+ return _print_cache_stats()
if args.clear_cache:
- try:
- from python_pkg.word_frequency.cache import clear_all_caches
- except ImportError:
- try:
- from cache import clear_all_caches
- except ImportError:
- print("Cache module not available", file=sys.stderr)
- return 1
- clear_all_caches()
- print("All caches cleared.")
- return 0
+ return _clear_caches()
- # Validate required arguments for main functionality
if args.file is None:
parser.error("--file/-f is required")
if args.length is None and args.max_vocab is None:
parser.error("Either --length/-l or --max-vocab/-v is required")
if args.length is not None and args.max_vocab is not None:
- parser.error("Cannot use both --length and --max-vocab. Choose one mode.")
-
- try:
- filepath = Path(args.file)
- if not filepath.exists():
- print(f"Error: File not found: {args.file}", file=sys.stderr)
- return 1
-
- # INVERSE MODE: --max-vocab
- if args.max_vocab is not None:
- if not args.quiet:
- print(f"Analyzing {filepath.name}...")
- print(f"Finding longest excerpt using top {args.max_vocab} words...")
-
- # Generate flashcards in inverse mode
- anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
- generate_flashcards_inverse(
- filepath,
- args.max_vocab,
- source_lang=args.source_lang,
- target_lang=args.target_lang,
- include_context=args.include_context,
- deck_name=args.deck_name,
- no_translate=args.no_translate,
- force=args.force,
- )
- )
-
- # Determine output path
- if args.output:
- output_path = Path(args.output)
- else:
- output_path = (
- filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
- )
-
- # Write output
- output_path.write_text(anki_content, encoding="utf-8")
-
- if not args.quiet:
- print()
- print("=" * 60)
- print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
- print("=" * 60)
- print(f"Learning: top {args.max_vocab} words")
- print(f"Longest excerpt you can understand: {excerpt_length} words")
- print(f' "{excerpt}"')
- print()
- print(f"Rarest word in excerpt: #{max_rank_used}")
- print(f"Flashcards: {num_words}")
- print(f"Output file: {output_path}")
- print()
- print("To import into Anki:")
- print(" 1. Open Anki")
- print(" 2. File -> Import")
- print(f" 3. Select: {output_path}")
- print(" 4. Click Import")
- else:
- print(output_path)
-
- return 0
-
- # NORMAL MODE: --length
- if not args.quiet:
- print(f"Analyzing {filepath.name}...")
- print(f"Finding vocabulary for {args.length}-word excerpt...")
-
- # Generate flashcards
- anki_content, excerpt, num_words, max_rank = generate_flashcards(
- filepath,
- args.length,
- source_lang=args.source_lang,
- target_lang=args.target_lang,
- include_context=args.include_context,
- deck_name=args.deck_name,
- all_vocab=not args.excerpt_words_only,
- no_translate=args.no_translate,
- force=args.force,
+ parser.error(
+ "Cannot use both --length and --max-vocab. Choose one mode."
)
- # Determine output path
- if args.output:
- output_path = Path(args.output)
- else:
- output_path = filepath.parent / f"{filepath.stem}_anki_{args.length}.txt"
-
- # Write output
- output_path.write_text(anki_content, encoding="utf-8")
-
- if not args.quiet:
- print()
- print("=" * 60)
- print("FLASHCARD GENERATION COMPLETE")
- print("=" * 60)
- print(f"Excerpt to understand ({args.length} words):")
- print(f' "{excerpt}"')
- print()
- print(f"Max word rank needed: #{max_rank}")
- if args.excerpt_words_only:
- print(f"Flashcards: {num_words} (excerpt words only)")
- else:
- print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})")
- print(f"Output file: {output_path}")
- print()
- print("To import into Anki:")
- print(" 1. Open Anki")
- print(" 2. File -> Import")
- print(f" 3. Select: {output_path}")
- print(" 4. Click Import")
- else:
- print(output_path)
-
- return 0
-
- except FileNotFoundError as e:
- print(f"Error: {e}", file=sys.stderr)
- return 1
- except subprocess.CalledProcessError as e:
- print(f"Error running vocabulary_curve: {e}", file=sys.stderr)
- return 1
- except ValueError as e:
- print(f"Error: {e}", file=sys.stderr)
- return 1
+ try:
+ return _run_generation(args)
+ except FileNotFoundError:
+ logger.exception("File not found")
+ except subprocess.CalledProcessError:
+ logger.exception("Error running vocabulary_curve")
+ except ValueError:
+ logger.exception("Value error")
+ return 1
if __name__ == "__main__":
diff --git a/python_pkg/word_frequency/cache.py b/python_pkg/word_frequency/cache.py
index 75f4002..67e03fc 100755
--- a/python_pkg/word_frequency/cache.py
+++ b/python_pkg/word_frequency/cache.py
@@ -11,15 +11,23 @@ Cache location: ~/.cache/word_frequency/
from __future__ import annotations
+import argparse
+from dataclasses import dataclass
import hashlib
import json
+import logging
import os
from pathlib import Path
from typing import Any
+logger = logging.getLogger(__name__)
+
# Default cache directory
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
+_ONE_KB = 1024
+_ONE_MB = 1024 * 1024
+
def get_cache_dir() -> Path:
"""Get the cache directory, creating it if needed.
@@ -42,7 +50,7 @@ def get_file_hash(filepath: Path) -> str:
Hex digest of file hash.
"""
hasher = hashlib.sha256()
- with open(filepath, "rb") as f:
+ with filepath.open("rb") as f:
# Read in chunks for large files
for chunk in iter(lambda: f.read(65536), b""):
hasher.update(chunk)
@@ -274,14 +282,15 @@ class VocabCurveCache:
try:
data = json.loads(cache_path.read_text(encoding="utf-8"))
+ except (json.JSONDecodeError, KeyError, OSError):
+ return None
+ else:
# Verify hash matches
if data.get("file_hash") != file_hash:
return None
excerpt = data["excerpt"]
words = [(w, r) for w, r in data["words"]]
return excerpt, words
- except (json.JSONDecodeError, KeyError, OSError):
- return None
def set(
self,
@@ -339,6 +348,17 @@ class VocabCurveCache:
# =============================================================================
+@dataclass(frozen=True)
+class AnkiDeckKey:
+    """Key parameters for Anki deck cache lookups.
+
+    Bundles the values ``AnkiDeckCache.get`` and ``AnkiDeckCache.set``
+    use to locate a cached deck. Declared frozen, so fields cannot be
+    reassigned after construction.
+
+    Attributes:
+        filepath: Path to source file; the cache hashes its contents
+            when building the lookup key.
+        length: Excerpt length.
+        target_lang: Target language.
+        include_context: Whether context is included.
+        all_vocab: Whether all vocab is included.
+    """
+
+    filepath: Path
+    length: int
+    target_lang: str
+    include_context: bool
+    all_vocab: bool
+
class AnkiDeckCache:
"""Cache for generated Anki decks."""
@@ -380,6 +400,7 @@ class AnkiDeckCache:
file_hash: str,
length: int,
target_lang: str,
+ *,
include_context: bool,
all_vocab: bool,
) -> str:
@@ -400,36 +421,35 @@ class AnkiDeckCache:
def get(
self,
- filepath: Path,
- length: int,
- target_lang: str,
- include_context: bool,
- all_vocab: bool,
+ key: AnkiDeckKey,
) -> tuple[str, str, int, int] | None:
"""Get cached Anki deck.
Args:
- filepath: Path to source file.
- length: Excerpt length.
- target_lang: Target language.
- include_context: Whether context is included.
- all_vocab: Whether all vocab is included.
+ key: Cache key parameters.
Returns:
- Tuple of (anki_content, excerpt, num_words, max_rank) or None.
+ Tuple of (anki_content, excerpt, num_words, max_rank)
+ or None.
"""
- file_hash = get_file_hash(filepath)
- key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+ file_hash = get_file_hash(key.filepath)
+ cache_key = self._make_key(
+ file_hash,
+ key.length,
+ key.target_lang,
+ include_context=key.include_context,
+ all_vocab=key.all_vocab,
+ )
metadata = self._load_metadata()
- if key not in metadata:
+ if cache_key not in metadata:
return None
- entry = metadata[key]
+ entry = metadata[cache_key]
if entry.get("file_hash") != file_hash:
return None
- deck_file = self.cache_dir / f"{key}.txt"
+ deck_file = self.cache_dir / f"{cache_key}.txt"
if not deck_file.exists():
return None
@@ -446,11 +466,7 @@ class AnkiDeckCache:
def set(
self,
- filepath: Path,
- length: int,
- target_lang: str,
- include_context: bool,
- all_vocab: bool,
+ key: AnkiDeckKey,
anki_content: str,
excerpt: str,
num_words: int,
@@ -459,32 +475,34 @@ class AnkiDeckCache:
"""Store Anki deck in cache.
Args:
- filepath: Path to source file.
- length: Excerpt length.
- target_lang: Target language.
- include_context: Whether context is included.
- all_vocab: Whether all vocab is included.
+ key: Cache key parameters.
anki_content: The Anki deck content.
excerpt: The excerpt text.
num_words: Number of words in deck.
max_rank: Maximum word rank.
"""
- file_hash = get_file_hash(filepath)
- key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+ file_hash = get_file_hash(key.filepath)
+ cache_key = self._make_key(
+ file_hash,
+ key.length,
+ key.target_lang,
+ include_context=key.include_context,
+ all_vocab=key.all_vocab,
+ )
# Save deck content
- deck_file = self.cache_dir / f"{key}.txt"
+ deck_file = self.cache_dir / f"{cache_key}.txt"
deck_file.write_text(anki_content, encoding="utf-8")
# Update metadata
metadata = self._load_metadata()
- metadata[key] = {
+ metadata[cache_key] = {
"file_hash": file_hash,
- "filepath": str(filepath),
- "length": length,
- "target_lang": target_lang,
- "include_context": include_context,
- "all_vocab": all_vocab,
+ "filepath": str(key.filepath),
+ "length": key.length,
+ "target_lang": key.target_lang,
+ "include_context": key.include_context,
+ "all_vocab": key.all_vocab,
"excerpt": excerpt,
"num_words": num_words,
"max_rank": max_rank,
@@ -519,34 +537,33 @@ class AnkiDeckCache:
# Global Cache Instances
# =============================================================================
-# Singleton instances
-_translation_cache: TranslationCache | None = None
-_vocab_curve_cache: VocabCurveCache | None = None
-_anki_deck_cache: AnkiDeckCache | None = None
+class _CacheHolder:
+    """Holds singleton cache instances.
+
+    Every slot starts as ``None`` and is filled in on first use by the
+    matching ``get_*_cache()`` accessor, which returns the stored
+    instance on later calls. Keeping the slots as class attributes lets
+    the accessors assign to them without ``global`` statements.
+    """
+
+    translation: TranslationCache | None = None
+    vocab_curve: VocabCurveCache | None = None
+    anki_deck: AnkiDeckCache | None = None
def get_translation_cache() -> TranslationCache:
"""Get the global translation cache instance."""
- global _translation_cache
- if _translation_cache is None:
- _translation_cache = TranslationCache()
- return _translation_cache
+ if _CacheHolder.translation is None:
+ _CacheHolder.translation = TranslationCache()
+ return _CacheHolder.translation
def get_vocab_curve_cache() -> VocabCurveCache:
"""Get the global vocabulary curve cache instance."""
- global _vocab_curve_cache
- if _vocab_curve_cache is None:
- _vocab_curve_cache = VocabCurveCache()
- return _vocab_curve_cache
+ if _CacheHolder.vocab_curve is None:
+ _CacheHolder.vocab_curve = VocabCurveCache()
+ return _CacheHolder.vocab_curve
def get_anki_deck_cache() -> AnkiDeckCache:
"""Get the global Anki deck cache instance."""
- global _anki_deck_cache
- if _anki_deck_cache is None:
- _anki_deck_cache = AnkiDeckCache()
- return _anki_deck_cache
+ if _CacheHolder.anki_deck is None:
+ _CacheHolder.anki_deck = AnkiDeckCache()
+ return _CacheHolder.anki_deck
def clear_all_caches() -> None:
@@ -575,8 +592,6 @@ def main() -> int:
Returns:
Exit code.
"""
- import argparse
-
parser = argparse.ArgumentParser(description="Manage word frequency caches")
parser.add_argument("--stats", action="store_true", help="Show cache statistics")
parser.add_argument("--clear", action="store_true", help="Clear all caches")
@@ -594,42 +609,42 @@ def main() -> int:
if args.clear:
clear_all_caches()
- print("All caches cleared.")
+ logger.info("All caches cleared.")
return 0
if args.clear_translations:
get_translation_cache().clear()
- print("Translation cache cleared.")
+ logger.info("Translation cache cleared.")
return 0
if args.clear_excerpts:
get_vocab_curve_cache().clear()
- print("Excerpt cache cleared.")
+ logger.info("Excerpt cache cleared.")
return 0
if args.clear_anki:
get_anki_deck_cache().clear()
- print("Anki deck cache cleared.")
+ logger.info("Anki deck cache cleared.")
return 0
# Default: show stats
stats = get_all_cache_stats()
- print("Cache Statistics")
- print("=" * 50)
+ logger.info("Cache Statistics")
+ logger.info("=" * 50)
for cache_name, cache_stats in stats.items():
- print(f"\n{cache_name.upper()}:")
+ logger.info("\n%s:", cache_name.upper())
for key, value in cache_stats.items():
if key == "cache_size_bytes":
# Format as human-readable
- if value < 1024:
+ if value < _ONE_KB:
size_str = f"{value} B"
- elif value < 1024 * 1024:
- size_str = f"{value / 1024:.1f} KB"
+ elif value < _ONE_MB:
+ size_str = f"{value / _ONE_KB:.1f} KB"
else:
- size_str = f"{value / (1024 * 1024):.1f} MB"
- print(f" {key}: {size_str}")
+ size_str = f"{value / _ONE_MB:.1f} MB"
+ logger.info(" %s: %s", key, size_str)
else:
- print(f" {key}: {value}")
+ logger.info(" %s: %s", key, value)
return 0
diff --git a/python_pkg/word_frequency/cache.py.bak b/python_pkg/word_frequency/cache.py.bak
new file mode 100755
index 0000000..75f4002
--- /dev/null
+++ b/python_pkg/word_frequency/cache.py.bak
@@ -0,0 +1,640 @@
+#!/usr/bin/env python3
+"""Caching utilities for word frequency analysis.
+
+Provides disk-based caching for:
+- Translations (word -> translation mappings)
+- Vocabulary curve excerpts (file + length -> excerpt + words)
+- Generated Anki decks
+
+Cache location: ~/.cache/word_frequency/
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+from pathlib import Path
+from typing import Any
+
+# Default cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
+
+
+def get_cache_dir() -> Path:
+ """Get the cache directory, creating it if needed.
+
+ Returns:
+ Path to cache directory.
+ """
+ cache_dir = Path(os.environ.get("WORD_FREQ_CACHE_DIR", str(DEFAULT_CACHE_DIR)))
+ cache_dir.mkdir(parents=True, exist_ok=True)
+ return cache_dir
+
+
+def get_file_hash(filepath: Path) -> str:
+ """Compute SHA256 hash of a file's contents.
+
+ Args:
+ filepath: Path to file.
+
+ Returns:
+ Hex digest of file hash.
+ """
+ hasher = hashlib.sha256()
+ with open(filepath, "rb") as f:
+ # Read in chunks for large files
+ for chunk in iter(lambda: f.read(65536), b""):
+ hasher.update(chunk)
+ return hasher.hexdigest()
+
+
+def get_text_hash(text: str) -> str:
+ """Compute SHA256 hash of text content.
+
+ Args:
+ text: Text to hash.
+
+ Returns:
+ Hex digest of text hash.
+ """
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+# =============================================================================
+# Translation Cache
+# =============================================================================
+
+
+class TranslationCache:
+ """Cache for word translations."""
+
+ def __init__(self, cache_dir: Path | None = None) -> None:
+ """Initialize translation cache.
+
+ Args:
+ cache_dir: Optional custom cache directory.
+ """
+ self.cache_dir = cache_dir or get_cache_dir()
+ self.cache_file = self.cache_dir / "translations.json"
+ self._cache: dict[str, str] | None = None
+ self._dirty = False # Track if cache needs saving
+
+ def _load_cache(self) -> dict[str, str]:
+ """Load cache from disk."""
+ if self._cache is None:
+ if self.cache_file.exists():
+ try:
+ self._cache = json.loads(
+ self.cache_file.read_text(encoding="utf-8")
+ )
+ except (json.JSONDecodeError, OSError):
+ self._cache = {}
+ else:
+ self._cache = {}
+ return self._cache
+
+ def _save_cache(self) -> None:
+ """Save cache to disk if dirty."""
+ if self._cache is not None and self._dirty:
+ self.cache_file.write_text(
+ json.dumps(self._cache, ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
+ self._dirty = False
+
+ def flush(self) -> None:
+ """Force save cache to disk."""
+ self._save_cache()
+
+ @staticmethod
+ def _make_key(word: str, source_lang: str, target_lang: str) -> str:
+ """Create cache key for a translation.
+
+ Args:
+ word: Word to translate.
+ source_lang: Source language code.
+ target_lang: Target language code.
+
+ Returns:
+ Cache key string.
+ """
+ return f"{source_lang}:{target_lang}:{word.lower()}"
+
+ def get(self, word: str, source_lang: str, target_lang: str) -> str | None:
+ """Get cached translation.
+
+ Args:
+ word: Word to look up.
+ source_lang: Source language code.
+ target_lang: Target language code.
+
+ Returns:
+ Cached translation or None if not found.
+ """
+ cache = self._load_cache()
+ key = self._make_key(word, source_lang, target_lang)
+ return cache.get(key)
+
+ def set(
+ self,
+ word: str,
+ source_lang: str,
+ target_lang: str,
+ translation: str,
+ *,
+ auto_save: bool = False,
+ ) -> None:
+ """Store translation in cache.
+
+ Args:
+ word: Original word.
+ source_lang: Source language code.
+ target_lang: Target language code.
+ translation: Translated word.
+ auto_save: If True, save to disk immediately.
+ """
+ cache = self._load_cache()
+ key = self._make_key(word, source_lang, target_lang)
+ cache[key] = translation
+ self._dirty = True
+ if auto_save:
+ self._save_cache()
+
+ def get_many(
+ self, words: list[str], source_lang: str, target_lang: str
+ ) -> dict[str, str]:
+ """Get multiple cached translations.
+
+ Args:
+ words: Words to look up.
+ source_lang: Source language code.
+ target_lang: Target language code.
+
+ Returns:
+ Dict mapping words to their cached translations.
+ """
+ cache = self._load_cache()
+ result: dict[str, str] = {}
+ for word in words:
+ key = self._make_key(word, source_lang, target_lang)
+ if key in cache:
+ result[word.lower()] = cache[key]
+ return result
+
+ def set_many(
+ self,
+ translations: dict[str, str],
+ source_lang: str,
+ target_lang: str,
+ ) -> None:
+ """Store multiple translations in cache and save to disk.
+
+ Args:
+ translations: Dict mapping words to translations.
+ source_lang: Source language code.
+ target_lang: Target language code.
+ """
+ cache = self._load_cache()
+ for word, translation in translations.items():
+ key = self._make_key(word, source_lang, target_lang)
+ cache[key] = translation
+ self._dirty = True
+ self._save_cache() # Save once after all additions
+
+ def clear(self) -> None:
+ """Clear all cached translations."""
+ self._cache = {}
+ self._dirty = False
+ if self.cache_file.exists():
+ self.cache_file.unlink()
+
+ def stats(self) -> dict[str, Any]:
+ """Get cache statistics.
+
+ Returns:
+ Dict with cache stats.
+ """
+ cache = self._load_cache()
+ return {
+ "total_entries": len(cache),
+ "cache_file": str(self.cache_file),
+ "cache_size_bytes": (
+ self.cache_file.stat().st_size if self.cache_file.exists() else 0
+ ),
+ }
+
+
+# =============================================================================
+# Vocabulary Curve Cache
+# =============================================================================
+
+
+class VocabCurveCache:
+ """Cache for vocabulary curve analysis results."""
+
+ def __init__(self, cache_dir: Path | None = None) -> None:
+ """Initialize vocabulary curve cache.
+
+ Args:
+ cache_dir: Optional custom cache directory.
+ """
+ self.cache_dir = (cache_dir or get_cache_dir()) / "excerpts"
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+ def _get_cache_path(self, file_hash: str, length: int) -> Path:
+ """Get path to cache file for given hash and length.
+
+ Args:
+ file_hash: Hash of source file.
+ length: Excerpt length.
+
+ Returns:
+ Path to cache file.
+ """
+ return self.cache_dir / f"{file_hash[:16]}_{length}.json"
+
+ def get(
+ self, filepath: Path, length: int
+ ) -> tuple[str, list[tuple[str, int]]] | None:
+ """Get cached excerpt and words for a file and length.
+
+ Args:
+ filepath: Path to source file.
+ length: Excerpt length.
+
+ Returns:
+ Tuple of (excerpt, words_with_ranks) or None if not cached.
+ """
+ file_hash = get_file_hash(filepath)
+ cache_path = self._get_cache_path(file_hash, length)
+
+ if not cache_path.exists():
+ return None
+
+ try:
+ data = json.loads(cache_path.read_text(encoding="utf-8"))
+ # Verify hash matches
+ if data.get("file_hash") != file_hash:
+ return None
+ excerpt = data["excerpt"]
+ words = [(w, r) for w, r in data["words"]]
+ return excerpt, words
+ except (json.JSONDecodeError, KeyError, OSError):
+ return None
+
+ def set(
+ self,
+ filepath: Path,
+ length: int,
+ excerpt: str,
+ words: list[tuple[str, int]],
+ ) -> None:
+ """Store excerpt and words in cache.
+
+ Args:
+ filepath: Path to source file.
+ length: Excerpt length.
+ excerpt: The excerpt text.
+ words: List of (word, rank) tuples.
+ """
+ file_hash = get_file_hash(filepath)
+ cache_path = self._get_cache_path(file_hash, length)
+
+ data = {
+ "file_hash": file_hash,
+ "filepath": str(filepath),
+ "length": length,
+ "excerpt": excerpt,
+ "words": [[w, r] for w, r in words],
+ }
+
+ cache_path.write_text(
+ json.dumps(data, ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
+
+ def clear(self) -> None:
+ """Clear all cached excerpts."""
+ for cache_file in self.cache_dir.glob("*.json"):
+ cache_file.unlink()
+
+ def stats(self) -> dict[str, Any]:
+ """Get cache statistics.
+
+ Returns:
+ Dict with cache stats.
+ """
+ cache_files = list(self.cache_dir.glob("*.json"))
+ total_size = sum(f.stat().st_size for f in cache_files)
+ return {
+ "total_entries": len(cache_files),
+ "cache_dir": str(self.cache_dir),
+ "cache_size_bytes": total_size,
+ }
+
+
+# =============================================================================
+# Anki Deck Cache
+# =============================================================================
+
+
+class AnkiDeckCache:
+ """Cache for generated Anki decks."""
+
+ def __init__(self, cache_dir: Path | None = None) -> None:
+ """Initialize Anki deck cache.
+
+ Args:
+ cache_dir: Optional custom cache directory.
+ """
+ self.cache_dir = (cache_dir or get_cache_dir()) / "anki_decks"
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
+ self.metadata_file = self.cache_dir / "metadata.json"
+ self._metadata: dict[str, Any] | None = None
+
+ def _load_metadata(self) -> dict[str, Any]:
+ """Load metadata from disk."""
+ if self._metadata is None:
+ if self.metadata_file.exists():
+ try:
+ self._metadata = json.loads(
+ self.metadata_file.read_text(encoding="utf-8")
+ )
+ except (json.JSONDecodeError, OSError):
+ self._metadata = {}
+ else:
+ self._metadata = {}
+ return self._metadata
+
+ def _save_metadata(self) -> None:
+ """Save metadata to disk."""
+ if self._metadata is not None:
+ self.metadata_file.write_text(
+ json.dumps(self._metadata, ensure_ascii=False, indent=2),
+ encoding="utf-8",
+ )
+
+ @staticmethod
+ def _make_key(
+ file_hash: str,
+ length: int,
+ target_lang: str,
+ include_context: bool,
+ all_vocab: bool,
+ ) -> str:
+ """Create cache key for an Anki deck.
+
+ Args:
+ file_hash: Hash of source file.
+ length: Excerpt length.
+ target_lang: Target language.
+ include_context: Whether context is included.
+ all_vocab: Whether all vocab is included.
+
+ Returns:
+ Cache key string.
+ """
+ flags = f"ctx{int(include_context)}_all{int(all_vocab)}"
+ return f"{file_hash[:16]}_{length}_{target_lang}_{flags}"
+
+ def get(
+ self,
+ filepath: Path,
+ length: int,
+ target_lang: str,
+ include_context: bool,
+ all_vocab: bool,
+ ) -> tuple[str, str, int, int] | None:
+ """Get cached Anki deck.
+
+ Args:
+ filepath: Path to source file.
+ length: Excerpt length.
+ target_lang: Target language.
+ include_context: Whether context is included.
+ all_vocab: Whether all vocab is included.
+
+ Returns:
+ Tuple of (anki_content, excerpt, num_words, max_rank) or None.
+ """
+ file_hash = get_file_hash(filepath)
+ key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+ metadata = self._load_metadata()
+
+ if key not in metadata:
+ return None
+
+ entry = metadata[key]
+ if entry.get("file_hash") != file_hash:
+ return None
+
+ deck_file = self.cache_dir / f"{key}.txt"
+ if not deck_file.exists():
+ return None
+
+ try:
+ content = deck_file.read_text(encoding="utf-8")
+ return (
+ content,
+ entry["excerpt"],
+ entry["num_words"],
+ entry["max_rank"],
+ )
+ except OSError:
+ return None
+
+ def set(
+ self,
+ filepath: Path,
+ length: int,
+ target_lang: str,
+ include_context: bool,
+ all_vocab: bool,
+ anki_content: str,
+ excerpt: str,
+ num_words: int,
+ max_rank: int,
+ ) -> None:
+ """Store Anki deck in cache.
+
+ Args:
+ filepath: Path to source file.
+ length: Excerpt length.
+ target_lang: Target language.
+ include_context: Whether context is included.
+ all_vocab: Whether all vocab is included.
+ anki_content: The Anki deck content.
+ excerpt: The excerpt text.
+ num_words: Number of words in deck.
+ max_rank: Maximum word rank.
+ """
+ file_hash = get_file_hash(filepath)
+ key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+
+ # Save deck content
+ deck_file = self.cache_dir / f"{key}.txt"
+ deck_file.write_text(anki_content, encoding="utf-8")
+
+ # Update metadata
+ metadata = self._load_metadata()
+ metadata[key] = {
+ "file_hash": file_hash,
+ "filepath": str(filepath),
+ "length": length,
+ "target_lang": target_lang,
+ "include_context": include_context,
+ "all_vocab": all_vocab,
+ "excerpt": excerpt,
+ "num_words": num_words,
+ "max_rank": max_rank,
+ }
+ self._save_metadata()
+
+ def clear(self) -> None:
+ """Clear all cached decks."""
+ self._metadata = {}
+ for cache_file in self.cache_dir.glob("*.txt"):
+ cache_file.unlink()
+ if self.metadata_file.exists():
+ self.metadata_file.unlink()
+
+ def stats(self) -> dict[str, Any]:
+ """Get cache statistics.
+
+ Returns:
+ Dict with cache stats.
+ """
+ metadata = self._load_metadata()
+ cache_files = list(self.cache_dir.glob("*.txt"))
+ total_size = sum(f.stat().st_size for f in cache_files)
+ return {
+ "total_entries": len(metadata),
+ "cache_dir": str(self.cache_dir),
+ "cache_size_bytes": total_size,
+ }
+
+
+# =============================================================================
+# Global Cache Instances
+# =============================================================================
+
+# Singleton instances
+_translation_cache: TranslationCache | None = None
+_vocab_curve_cache: VocabCurveCache | None = None
+_anki_deck_cache: AnkiDeckCache | None = None
+
+
+def get_translation_cache() -> TranslationCache:
+ """Get the global translation cache instance."""
+ global _translation_cache
+ if _translation_cache is None:
+ _translation_cache = TranslationCache()
+ return _translation_cache
+
+
+def get_vocab_curve_cache() -> VocabCurveCache:
+ """Get the global vocabulary curve cache instance."""
+ global _vocab_curve_cache
+ if _vocab_curve_cache is None:
+ _vocab_curve_cache = VocabCurveCache()
+ return _vocab_curve_cache
+
+
+def get_anki_deck_cache() -> AnkiDeckCache:
+ """Get the global Anki deck cache instance."""
+ global _anki_deck_cache
+ if _anki_deck_cache is None:
+ _anki_deck_cache = AnkiDeckCache()
+ return _anki_deck_cache
+
+
+def clear_all_caches() -> None:
+ """Clear all caches."""
+ get_translation_cache().clear()
+ get_vocab_curve_cache().clear()
+ get_anki_deck_cache().clear()
+
+
+def get_all_cache_stats() -> dict[str, dict[str, Any]]:
+ """Get statistics for all caches.
+
+ Returns:
+ Dict with stats for each cache type.
+ """
+ return {
+ "translations": get_translation_cache().stats(),
+ "vocab_curves": get_vocab_curve_cache().stats(),
+ "anki_decks": get_anki_deck_cache().stats(),
+ }
+
+
+def main() -> int:
+ """CLI for cache management.
+
+ Returns:
+ Exit code.
+ """
+ import argparse
+
+ parser = argparse.ArgumentParser(description="Manage word frequency caches")
+ parser.add_argument("--stats", action="store_true", help="Show cache statistics")
+ parser.add_argument("--clear", action="store_true", help="Clear all caches")
+ parser.add_argument(
+ "--clear-translations", action="store_true", help="Clear translation cache"
+ )
+ parser.add_argument(
+ "--clear-excerpts", action="store_true", help="Clear excerpt cache"
+ )
+ parser.add_argument(
+ "--clear-anki", action="store_true", help="Clear Anki deck cache"
+ )
+
+ args = parser.parse_args()
+
+ if args.clear:
+ clear_all_caches()
+ print("All caches cleared.")
+ return 0
+
+ if args.clear_translations:
+ get_translation_cache().clear()
+ print("Translation cache cleared.")
+ return 0
+
+ if args.clear_excerpts:
+ get_vocab_curve_cache().clear()
+ print("Excerpt cache cleared.")
+ return 0
+
+ if args.clear_anki:
+ get_anki_deck_cache().clear()
+ print("Anki deck cache cleared.")
+ return 0
+
+ # Default: show stats
+ stats = get_all_cache_stats()
+ print("Cache Statistics")
+ print("=" * 50)
+ for cache_name, cache_stats in stats.items():
+ print(f"\n{cache_name.upper()}:")
+ for key, value in cache_stats.items():
+ if key == "cache_size_bytes":
+ # Format as human-readable
+ if value < 1024:
+ size_str = f"{value} B"
+ elif value < 1024 * 1024:
+ size_str = f"{value / 1024:.1f} KB"
+ else:
+ size_str = f"{value / (1024 * 1024):.1f} MB"
+ print(f" {key}: {size_str}")
+ else:
+ print(f" {key}: {value}")
+
+ return 0
+
+
+if __name__ == "__main__":
+ import sys
+
+ sys.exit(main())
diff --git a/python_pkg/word_frequency/excerpt_finder.py b/python_pkg/word_frequency/excerpt_finder.py
index 7f92e75..fcbd765 100755
--- a/python_pkg/word_frequency/excerpt_finder.py
+++ b/python_pkg/word_frequency/excerpt_finder.py
@@ -6,21 +6,28 @@ specified length (in words) where the target words appear most frequently.
Usage:
# From raw text with target words
- python -m python_pkg.word_frequency.excerpt_finder --text "they went somewhere he and she and the guy" --words and the --length 3
+ python -m python_pkg.word_frequency.excerpt_finder \
+ --text "they went somewhere he and she and the guy" \
+ --words and the --length 3
# From a file
- python -m python_pkg.word_frequency.excerpt_finder --file path/to/file.txt --words the and of --length 10
+ python -m python_pkg.word_frequency.excerpt_finder \
+ --file path/to/file.txt --words the and of --length 10
# Target words from a file (one word per line)
- python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words-file targets.txt --length 20
+ python -m python_pkg.word_frequency.excerpt_finder \
+ --file text.txt --words-file targets.txt --length 20
# Show top N excerpts instead of just the best one
- python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words the and --length 10 --top 5
+ python -m python_pkg.word_frequency.excerpt_finder \
+ --file text.txt --words the and --length 10 --top 5
"""
from __future__ import annotations
import argparse
+from dataclasses import dataclass
+import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING, NamedTuple
@@ -33,6 +40,17 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
from collections.abc import Sequence
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class ExcerptSearchOptions:
+ """Options for excerpt search and display.
+
+ Immutable bundle of the optional settings accepted by
+ find_best_excerpt_with_context.
+ """
+
+ # Forwarded as the case_sensitive flag of find_best_excerpt and of the
+ # context-expansion step.
+ case_sensitive: bool = False
+ # Forwarded as top_n to find_best_excerpt (number of top excerpts).
+ top_n: int = 1
+ # Words of context added before/after each excerpt; values <= 0 add none.
+ context_words: int = 0
+
class ExcerptResult(NamedTuple):
"""Result of an excerpt search."""
@@ -141,45 +159,28 @@ def find_best_excerpt(
return output
-def find_best_excerpt_with_context(
+def _expand_results_with_context(
text: str,
- target_words: Sequence[str],
- excerpt_length: int,
+ base_results: list[ExcerptResult],
+ context_words: int,
*,
case_sensitive: bool = False,
- top_n: int = 1,
- context_words: int = 0,
) -> list[ExcerptResult]:
- """Find the excerpt(s) with optional surrounding context.
+ """Expand excerpt results with surrounding context words.
Args:
- text: The input text to search.
- target_words: Words to search for in the excerpt.
- excerpt_length: Length of the excerpt in words.
- case_sensitive: If False, match words case-insensitively.
- top_n: Number of top excerpts to return.
- context_words: Number of words to include before/after the excerpt.
+ text: The full source text.
+ base_results: Results from find_best_excerpt.
+ context_words: Number of words to include before/after.
+ case_sensitive: If False, words are lowercased.
Returns:
- List of ExcerptResult with context included in the excerpt.
+ Expanded ExcerptResult list with context.
"""
- base_results = find_best_excerpt(
- text,
- target_words,
- excerpt_length,
- case_sensitive=case_sensitive,
- top_n=top_n,
- )
-
- if context_words <= 0:
- return base_results
-
- # Re-extract all words to get context
all_words = extract_words(text, case_sensitive=case_sensitive)
expanded_results: list[ExcerptResult] = []
for result in base_results:
- # Expand the excerpt with context
ctx_start = max(0, result.start_index - context_words)
ctx_end = min(len(all_words), result.end_index + context_words)
context_excerpt_words = all_words[ctx_start:ctx_end]
@@ -198,6 +199,40 @@ def find_best_excerpt_with_context(
return expanded_results
+def find_best_excerpt_with_context(
+    text: str,
+    target_words: Sequence[str],
+    excerpt_length: int,
+    options: ExcerptSearchOptions | None = None,
+) -> list[ExcerptResult]:
+    """Search for the best excerpt(s), optionally padded with context.
+
+    Args:
+        text: The input text to search.
+        target_words: Words to search for in the excerpt.
+        excerpt_length: Length of the excerpt in words.
+        options: Search options (case_sensitive, top_n, context_words).
+            Default options are used when omitted.
+
+    Returns:
+        The results of find_best_excerpt, expanded with surrounding words
+        when a positive context_words is requested.
+    """
+    settings = options if options else ExcerptSearchOptions()
+
+    matches = find_best_excerpt(
+        text,
+        target_words,
+        excerpt_length,
+        case_sensitive=settings.case_sensitive,
+        top_n=settings.top_n,
+    )
+
+    # Only pay for the context expansion when context was actually requested.
+    if settings.context_words > 0:
+        return _expand_results_with_context(
+            text,
+            matches,
+            settings.context_words,
+            case_sensitive=settings.case_sensitive,
+        )
+    return matches
+
+
def format_excerpt_results(
results: list[ExcerptResult],
target_words: Sequence[str],
@@ -224,7 +259,8 @@ def format_excerpt_results(
lines.append(f'Excerpt: "{result.excerpt}"')
lines.append(f"Word position: {result.start_index} - {result.end_index - 1}")
lines.append(
- f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)"
+ f"Matches: {result.match_count}/{len(result.words)}"
+ f" ({result.match_percentage:.2f}%)"
)
lines.append("")
@@ -316,10 +352,7 @@ def main(argv: Sequence[str] | None = None) -> int:
try:
# Get input text
- if args.text:
- text = args.text
- else:
- text = read_file(args.file)
+ text = args.text or read_file(args.file)
# Get target words
if args.words:
@@ -329,7 +362,7 @@ def main(argv: Sequence[str] | None = None) -> int:
target_words = [w.strip() for w in words_content.splitlines() if w.strip()]
if not target_words:
- print("Error: No target words provided", file=sys.stderr)
+ logger.error("No target words provided")
return 1
# Find excerpts
@@ -337,9 +370,11 @@ def main(argv: Sequence[str] | None = None) -> int:
text,
target_words,
args.length,
- case_sensitive=args.case_sensitive,
- top_n=args.top,
- context_words=args.context,
+ ExcerptSearchOptions(
+ case_sensitive=args.case_sensitive,
+ top_n=args.top,
+ context_words=args.context,
+ ),
)
# Format and print results
@@ -347,15 +382,15 @@ def main(argv: Sequence[str] | None = None) -> int:
if args.output:
Path(args.output).write_text(output, encoding="utf-8")
- print(f"Output written to {args.output}")
+ logger.info("Output written to %s", args.output)
else:
- print(output)
+ logger.info("%s", output)
- except FileNotFoundError as e:
- print(f"Error: File not found - {e}", file=sys.stderr)
+ except FileNotFoundError:
+ logger.exception("File not found")
return 1
- except UnicodeDecodeError as e:
- print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
+ except UnicodeDecodeError:
+ logger.exception("Could not decode file as UTF-8")
return 1
return 0
diff --git a/python_pkg/word_frequency/learning_pipe.py b/python_pkg/word_frequency/learning_pipe.py
index 0bbd253..2d788a2 100755
--- a/python_pkg/word_frequency/learning_pipe.py
+++ b/python_pkg/word_frequency/learning_pipe.py
@@ -1,7 +1,8 @@
#!/usr/bin/env python3
-"""Learning pipe - combines word frequency analysis with excerpt finding for language learning.
+r"""Learning pipe - combines word frequency analysis with excerpt finding.
+
+Helps language learners by:
-This script helps language learners by:
1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches
@@ -11,26 +12,35 @@ The idea is to:
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts
-Usage:
- # Basic usage - get top 20 words and find excerpts with them
- python -m python_pkg.word_frequency.learning_pipe --file text.txt
+Usage::
+
+ # Basic usage
+ python -m python_pkg.word_frequency.learning_pipe \
+ --file text.txt
# Custom batch size and excerpt length
- python -m python_pkg.word_frequency.learning_pipe --file text.txt --batch-size 30 --excerpt-length 50
+ python -m python_pkg.word_frequency.learning_pipe \
+ --file text.txt --batch-size 30 --excerpt-length 50
# Multiple batches for progressive learning
- python -m python_pkg.word_frequency.learning_pipe --file text.txt --batches 5 --batch-size 20
+ python -m python_pkg.word_frequency.learning_pipe \
+ --file text.txt --batches 5 --batch-size 20
# Output to file
- python -m python_pkg.word_frequency.learning_pipe --file text.txt --output lesson.txt
+ python -m python_pkg.word_frequency.learning_pipe \
+ --file text.txt --output lesson.txt
- # Skip common words (like "the", "a", "is") using a stopwords file
- python -m python_pkg.word_frequency.learning_pipe --file text.txt --stopwords stopwords.txt
+ # Skip common words using a stopwords file
+ python -m python_pkg.word_frequency.learning_pipe \
+ --file text.txt --stopwords stopwords.txt
"""
from __future__ import annotations
import argparse
+from dataclasses import dataclass
+from dataclasses import replace as _replace_dc
+import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING
@@ -53,6 +63,8 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
from collections.abc import Sequence
+logger = logging.getLogger(__name__)
+
# Common stopwords for various languages (can be overridden with --stopwords)
DEFAULT_STOPWORDS_EN = frozenset(
@@ -181,57 +193,210 @@ def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
)
+@dataclass(frozen=True)
+class LessonConfig:
+ """Configuration for learning lesson generation.
+
+ Immutable bundle of the settings read by generate_learning_lesson and
+ its helpers.
+ """
+
+ # Number of vocabulary words per learning batch.
+ batch_size: int = 20
+ # Maximum number of batches to generate.
+ num_batches: int = 1
+ # Length of each practice excerpt, in words.
+ excerpt_length: int = 30
+ # Number of practice excerpts to find per batch.
+ excerpts_per_batch: int = 3
+ # Custom stopwords; combined with the defaults unless those are skipped.
+ stopwords: frozenset[str] | None = None
+ # If True, DEFAULT_STOPWORDS_EN is not applied.
+ skip_default_stopwords: bool = False
+ # If True, words for which str.isdigit() holds are filtered out.
+ skip_numbers: bool = True
+ # Forwarded to analyze_text and find_best_excerpt.
+ case_sensitive: bool = False
+ # Source language code; "auto" requests language detection.
+ translate_from: str | None = None
+ # Target language code; "en" is used when this is unset.
+ translate_to: str | None = None
+
+
+def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
+    """Return the effective stopword set for a lesson configuration.
+
+    Custom stopwords are always included; the default English stopwords
+    are merged in unless the configuration opts out of them.
+    """
+    custom = config.stopwords or frozenset()
+    if not config.skip_default_stopwords:
+        return DEFAULT_STOPWORDS_EN | custom
+    return custom
+
+
+def _detect_translation_language(
+    text: str,
+    config: LessonConfig,
+    lines: list[str],
+) -> tuple[str | None, str | None]:
+    """Work out the (source, target) translation languages.
+
+    The target falls back to "en" when none is configured. The source is
+    auto-detected from ``text`` when it is "auto", or when a target is
+    configured without a source. The detection outcome is reported by
+    appending a message to ``lines``; a failed detection yields a source
+    of None.
+    """
+    source_lang = config.translate_from
+    target_lang = config.translate_to if config.translate_to else "en"
+
+    needs_detection = source_lang == "auto" or (
+        config.translate_to and not source_lang
+    )
+    if not needs_detection:
+        return source_lang, target_lang
+
+    guess = detect_language(text)
+    if not guess:
+        lines.append(
+            "Warning: Could not detect language "
+            "(install langdetect: pip install langdetect)"
+        )
+        return None, target_lang
+
+    lines.append(f"Detected language: {guess}")
+    return guess, target_lang
+
+
+def _format_word_list(
+    batch_words: list[tuple[str, int]],
+    start_idx: int,
+    total_words: int,
+    translations: dict[str, str],
+) -> list[str]:
+    """Render one numbered line per vocabulary word of a batch.
+
+    Numbering continues from ``start_idx`` (the first word is
+    ``start_idx + 1``). Each line shows the word, its occurrence count and
+    its share of ``total_words``. When ``translations`` is non-empty a
+    translation column is added, with "?" for words it does not contain.
+    """
+    formatted: list[str] = []
+    rank = start_idx
+    for word, count in batch_words:
+        rank += 1
+        share = (count / total_words) * 100
+        stats = f" ({count:,} occurrences, {share:.2f}%)"
+        if translations:
+            gloss = translations.get(word, "?")
+            entry = f" {rank:3}. {word:<20} -> {gloss:<20}"
+        else:
+            entry = f" {rank:3}. {word:<20}"
+        formatted.append(entry + stats)
+    return formatted
+
+
+@dataclass(frozen=True)
+class _LessonContext:
+ """Shared context for batch generation.
+
+ Groups the values that stay the same for every batch so they can be
+ passed to _generate_batch_section as a single argument.
+ """
+
+ # Full source text; searched for practice excerpts.
+ text: str
+ # Per-word occurrence counts for the source text.
+ word_counts: dict[str, int]
+ # Lesson settings used while rendering each batch.
+ config: LessonConfig
+
+
+def _generate_batch_section(
+ ctx: _LessonContext,
+ batch_num: int,
+ batch_words: list[tuple[str, int]],
+ cumulative_words: list[str],
+) -> list[str]:
+ """Generate lines for a single batch section.
+
+ The section consists of a header, the vocabulary list (with
+ translations when both translation languages are set), the cumulative
+ text coverage, and practice excerpts.
+
+ Args:
+ ctx: Source text, word counts and lesson configuration.
+ batch_num: Zero-based batch index (displayed as batch_num + 1).
+ batch_words: (word, count) pairs introduced by this batch.
+ cumulative_words: Words used for the coverage figure and as the
+ excerpt search targets.
+
+ Returns:
+ The output lines for this batch.
+ """
+ config = ctx.config
+ total_words = sum(ctx.word_counts.values())
+ start_idx = batch_num * config.batch_size
+ end_idx = start_idx + config.batch_size
+
+ lines: list[str] = []
+ lines.append("-" * 70)
+ # Header shows 1-based word ranks; the upper bound is capped so a short
+ # final batch does not advertise ranks that do not exist.
+ lines.append(
+ f"BATCH {batch_num + 1}: Words "
+ f"{start_idx + 1} - "
+ f"{min(end_idx, start_idx + len(batch_words))}"
+ )
+ lines.append("-" * 70)
+ lines.append("")
+
+ # Get translations if requested
+ translations: dict[str, str] = {}
+ do_translate = (
+ config.translate_from is not None
+ and config.translate_to is not None
+ )
+ if do_translate:
+ words_to_translate = [word for word, _ in batch_words]
+ translation_results = translate_words_batch(
+ words_to_translate,
+ config.translate_from, # type: ignore[arg-type]
+ config.translate_to, # type: ignore[arg-type]
+ )
+ # Keep only successful translations; missing words are rendered
+ # with a placeholder by _format_word_list.
+ translations = {
+ r.source_word: r.translated_word
+ for r in translation_results
+ if r.success
+ }
+
+ lines.append("VOCABULARY TO LEARN:")
+ lines.append("")
+ lines.extend(
+ _format_word_list(
+ batch_words, start_idx, total_words, translations,
+ )
+ )
+ lines.append("")
+
+ # Cumulative coverage
+ # Share of all word occurrences accounted for by cumulative_words.
+ cumulative_count = sum(
+ ctx.word_counts[w]
+ for w in cumulative_words
+ if w in ctx.word_counts
+ )
+ coverage = (cumulative_count / total_words) * 100
+ lines.append(
+ "After learning these words, "
+ f"you'll recognize ~{coverage:.1f}% of the text"
+ )
+ lines.append("")
+
+ # Excerpts
+ lines.append("PRACTICE EXCERPTS:")
+ lines.append(
+ "(Excerpts where your learned vocabulary "
+ "is most concentrated)"
+ )
+ lines.append("")
+
+ # Search with the cumulative vocabulary rather than only this batch.
+ excerpts = find_best_excerpt(
+ ctx.text,
+ cumulative_words,
+ config.excerpt_length,
+ case_sensitive=config.case_sensitive,
+ top_n=config.excerpts_per_batch,
+ )
+
+ for j, excerpt in enumerate(excerpts, 1):
+ lines.append(
+ f" Excerpt {j} "
+ f"({excerpt.match_percentage:.1f}% known words):"
+ )
+ lines.append(f' "{excerpt.excerpt}"')
+ lines.append("")
+
+ return lines
+
+
def generate_learning_lesson(
text: str,
- *,
- batch_size: int = 20,
- num_batches: int = 1,
- excerpt_length: int = 30,
- excerpts_per_batch: int = 3,
- stopwords: frozenset[str] | None = None,
- skip_default_stopwords: bool = False,
- skip_numbers: bool = True,
- case_sensitive: bool = False,
- context_words: int = 5,
- translate_from: str | None = None,
- translate_to: str | None = None,
+ config: LessonConfig | None = None,
) -> str:
"""Generate a learning lesson from text.
Args:
text: The source text to analyze.
- batch_size: Number of words per learning batch.
- num_batches: Number of batches to generate.
- excerpt_length: Length of each excerpt in words.
- excerpts_per_batch: Number of excerpts to find per batch.
- stopwords: Custom stopwords to skip (in addition to defaults).
- skip_default_stopwords: If True, don't filter out default English stopwords.
- skip_numbers: If True, filter out numeric words (default: True).
- case_sensitive: If True, treat words case-sensitively.
- context_words: Words of context to include around excerpts.
- translate_from: Source language code for translation (e.g., 'la', 'pl').
- translate_to: Target language code for translation (e.g., 'en').
+ config: Lesson configuration. Uses defaults if None.
Returns:
Formatted learning lesson as a string.
"""
- # Combine stopwords
- all_stopwords: frozenset[str]
- if skip_default_stopwords:
- all_stopwords = stopwords or frozenset()
- else:
- all_stopwords = DEFAULT_STOPWORDS_EN | (stopwords or frozenset())
+ if config is None:
+ config = LessonConfig()
- # Analyze text for word frequencies
- word_counts = analyze_text(text, case_sensitive=case_sensitive)
+ all_stopwords = _resolve_stopwords(config)
+ word_counts = analyze_text(
+ text, case_sensitive=config.case_sensitive,
+ )
- # Filter out stopwords and get sorted words
filtered_words = [
(word, count)
for word, count in word_counts.most_common()
if word.lower() not in all_stopwords
and len(word) > 1
- and not (skip_numbers and word.isdigit())
+ and not (config.skip_numbers and word.isdigit())
]
total_words = sum(word_counts.values())
@@ -241,125 +406,62 @@ def generate_learning_lesson(
lines.append("LANGUAGE LEARNING LESSON")
lines.append("=" * 70)
lines.append(
- f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
+ f"Source text: {total_words:,} total words, "
+ f"{len(word_counts):,} unique words"
)
if all_stopwords:
lines.append(
- f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
+ f"After filtering {len(all_stopwords)} "
+ f"stopwords: {len(filtered_words):,} "
+ "vocabulary words"
)
else:
- lines.append(f"Vocabulary words: {len(filtered_words):,}")
+ lines.append(
+ f"Vocabulary words: {len(filtered_words):,}",
+ )
- # Handle translation setup
- actual_translate_from = translate_from
- actual_translate_to = translate_to or "en" # Default to English
-
- # Auto-detect language if translation is enabled but source not specified
- if translate_from == "auto" or (translate_to and not translate_from):
- detected = detect_language(text)
- if detected:
- actual_translate_from = detected
- lines.append(f"Detected language: {detected}")
- # Note: langdetect doesn't support Latin (often detected as Italian)
- # If detection seems wrong, use --translate-from to override
- else:
- lines.append(
- "Warning: Could not detect language "
- "(install langdetect: pip install langdetect)"
- )
- actual_translate_from = None
-
- do_translate = actual_translate_from is not None and actual_translate_to is not None
+ actual_from, actual_to = _detect_translation_language(
+ text, config, lines,
+ )
+ do_translate = (
+ actual_from is not None and actual_to is not None
+ )
if do_translate:
- lines.append(f"Translation: {actual_translate_from} -> {actual_translate_to}")
-
+ lines.append(
+ f"Translation: {actual_from} -> {actual_to}",
+ )
lines.append("")
- # Generate batches
+ # Create resolved config with detected translation
+ resolved_config = _replace_dc(
+ config,
+ translate_from=actual_from,
+ translate_to=actual_to,
+ )
+ ctx = _LessonContext(
+ text=text,
+ word_counts=word_counts,
+ config=resolved_config,
+ )
+
cumulative_words: list[str] = []
-
- for batch_num in range(num_batches):
- start_idx = batch_num * batch_size
- end_idx = start_idx + batch_size
-
+ for batch_num in range(config.num_batches):
+ start_idx = batch_num * config.batch_size
+ end_idx = start_idx + config.batch_size
if start_idx >= len(filtered_words):
break
batch_words = filtered_words[start_idx:end_idx]
cumulative_words.extend(word for word, _ in batch_words)
- lines.append("-" * 70)
- lines.append(
- f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
- )
- lines.append("-" * 70)
- lines.append("")
-
- # Get translations if requested
- translations: dict[str, str] = {}
- if do_translate:
- words_to_translate = [word for word, _ in batch_words]
- translation_results = translate_words_batch(
- words_to_translate,
- actual_translate_from, # type: ignore[arg-type]
- actual_translate_to, # type: ignore[arg-type]
+ lines.extend(
+ _generate_batch_section(
+ ctx,
+ batch_num,
+ batch_words,
+ cumulative_words,
)
- translations = {
- r.source_word: r.translated_word
- for r in translation_results
- if r.success
- }
-
- # Word list with frequencies
- lines.append("VOCABULARY TO LEARN:")
- lines.append("")
-
- if do_translate and translations:
- # Include translations in output
- for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
- percentage = (count / total_words) * 100
- trans = translations.get(word, "?")
- lines.append(
- f" {i:3}. {word:<20} -> {trans:<20} ({count:,} occurrences, {percentage:.2f}%)"
- )
- else:
- for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
- percentage = (count / total_words) * 100
- lines.append(
- f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
- )
-
- lines.append("")
-
- # Calculate cumulative coverage
- cumulative_count = sum(
- word_counts[word] for word in cumulative_words if word in word_counts
)
- coverage = (cumulative_count / total_words) * 100
- lines.append(
- f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
- )
- lines.append("")
-
- # Find excerpts using cumulative words
- lines.append("PRACTICE EXCERPTS:")
- lines.append("(Excerpts where your learned vocabulary is most concentrated)")
- lines.append("")
-
- excerpts = find_best_excerpt(
- text,
- cumulative_words,
- excerpt_length,
- case_sensitive=case_sensitive,
- top_n=excerpts_per_batch,
- )
-
- for j, excerpt in enumerate(excerpts, 1):
- lines.append(
- f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
- )
- lines.append(f' "{excerpt.excerpt}"')
- lines.append("")
# Summary
lines.append("=" * 70)
@@ -368,14 +470,25 @@ def generate_learning_lesson(
if cumulative_words:
final_coverage = sum(
- word_counts[word] for word in cumulative_words if word in word_counts
+ word_counts[w]
+ for w in cumulative_words
+ if w in word_counts
)
- final_percentage = (final_coverage / total_words) * 100
- lines.append(f"Total vocabulary words learned: {len(cumulative_words)}")
- lines.append(f"Text coverage: {final_percentage:.1f}%")
+ final_pct = (final_coverage / total_words) * 100
+ lines.append(
+ "Total vocabulary words learned: "
+ f"{len(cumulative_words)}"
+ )
+ lines.append(f"Text coverage: {final_pct:.1f}%")
lines.append("")
- lines.append("TIP: Focus on understanding the excerpts first, then read")
- lines.append("more of the original text as your vocabulary grows!")
+ lines.append(
+ "TIP: Focus on understanding the excerpts "
+ "first, then read"
+ )
+ lines.append(
+ "more of the original text as your "
+ "vocabulary grows!"
+ )
return "\n".join(lines)
@@ -475,7 +588,10 @@ def main(argv: Sequence[str] | None = None) -> int:
"--translate-from",
type=str,
metavar="LANG",
- help="Source language code (e.g., 'la', 'pl', 'de'). If omitted, auto-detected.",
+ help=(
+ "Source language code (e.g., 'la', 'pl'). "
+ "If omitted, auto-detected."
+ ),
)
parser.add_argument(
"--translate-to",
@@ -496,27 +612,22 @@ def main(argv: Sequence[str] | None = None) -> int:
args = parser.parse_args(argv)
try:
- # Get input text
- if args.text:
- text = args.text
- else:
- text = read_file(args.file)
+ text = args.text or read_file(args.file)
# Load custom stopwords if provided
custom_stopwords = load_stopwords(args.stopwords)
# Determine translation settings
- # Translation enabled by default, --no-translate disables it
translate_from: str | None = None
translate_to: str | None = None
if not args.no_translate:
- translate_from = args.translate_from or "auto" # "auto" triggers detection
+ translate_from = (
+ args.translate_from or "auto"
+ )
translate_to = args.translate_to
- # Generate lesson
- lesson = generate_learning_lesson(
- text,
+ config = LessonConfig(
batch_size=args.batch_size,
num_batches=args.batches,
excerpt_length=args.excerpt_length,
@@ -528,19 +639,26 @@ def main(argv: Sequence[str] | None = None) -> int:
translate_from=translate_from,
translate_to=translate_to,
)
+ lesson = generate_learning_lesson(text, config)
# Output
if args.output:
- Path(args.output).write_text(lesson, encoding="utf-8")
- print(f"Lesson written to {args.output}")
+ Path(args.output).write_text(
+ lesson, encoding="utf-8",
+ )
+ logger.info(
+ "Lesson written to %s", args.output,
+ )
else:
- print(lesson)
+ logger.info(lesson)
- except FileNotFoundError as e:
- print(f"Error: File not found - {e}", file=sys.stderr)
+ except FileNotFoundError:
+ logger.exception("Error: File not found")
return 1
- except UnicodeDecodeError as e:
- print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
+ except UnicodeDecodeError:
+ logger.exception(
+ "Error: Could not decode file as UTF-8",
+ )
return 1
return 0
diff --git a/python_pkg/word_frequency/tests/test_analyzer.py b/python_pkg/word_frequency/tests/test_analyzer.py
index 7ed1137..4b01593 100644
--- a/python_pkg/word_frequency/tests/test_analyzer.py
+++ b/python_pkg/word_frequency/tests/test_analyzer.py
@@ -3,8 +3,11 @@
from __future__ import annotations
from collections import Counter
-from pathlib import Path
import time
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from pathlib import Path
import pytest
@@ -251,12 +254,13 @@ class TestMain:
assert exit_code == 0
assert "Unique words: 3" in captured.out
- def test_file_not_found_error(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_file_not_found_error(
+ self, caplog: pytest.LogCaptureFixture
+ ) -> None:
"""Test error handling for missing file."""
exit_code = main(["--file", "/nonexistent/file.txt"])
- captured = capsys.readouterr()
assert exit_code == 1
- assert "Error" in captured.err
+ assert "File not found" in caplog.text
class TestPerformance:
@@ -283,7 +287,7 @@ class TestPerformance:
assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"
assert "word0" in result # Most common word should be present
- def test_bible_sized_text_performance(self, tmp_path: Path) -> None:
+ def test_bible_sized_text_performance(self) -> None:
"""Test with Bible-sized text (~800k words)."""
# Generate text similar in size to the Bible
base_words = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]
diff --git a/python_pkg/word_frequency/tests/test_anki_generator.py b/python_pkg/word_frequency/tests/test_anki_generator.py
index b7c0c69..ff421a9 100755
--- a/python_pkg/word_frequency/tests/test_anki_generator.py
+++ b/python_pkg/word_frequency/tests/test_anki_generator.py
@@ -10,6 +10,7 @@ import pytest
try:
from python_pkg.word_frequency.anki_generator import (
+ DeckInput,
find_word_contexts,
generate_anki_deck,
main,
@@ -20,6 +21,7 @@ except ImportError:
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
from python_pkg.word_frequency.anki_generator import (
+ DeckInput,
find_word_contexts,
generate_anki_deck,
main,
@@ -77,7 +79,7 @@ class TestParseVocabularyCurveOutput:
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
"""Test parsing output for length 1."""
- excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
+ excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
sample_vocabulary_output, 1
)
assert excerpt == "the"
@@ -85,7 +87,7 @@ class TestParseVocabularyCurveOutput:
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
"""Test parsing output for length 2."""
- excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
+ excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
sample_vocabulary_output, 2
)
assert excerpt == "the dog"
@@ -93,7 +95,7 @@ class TestParseVocabularyCurveOutput:
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
"""Test parsing output for length 3."""
- excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
+ excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
sample_vocabulary_output, 3
)
assert excerpt == "the quick fox"
@@ -104,7 +106,7 @@ class TestParseVocabularyCurveOutput:
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
"""Test parsing output for non-existent length."""
- excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
+ excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
sample_vocabulary_output, 100
)
assert excerpt == ""
@@ -121,7 +123,7 @@ hello;1
world;2
VOCAB_DUMP_END
"""
- excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
+ _excerpt, _excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
assert all_vocab == [("hello", 1), ("world", 2)]
@@ -168,10 +170,12 @@ class TestGenerateAnkiDeck:
MagicMock(success=True, source_word="hello", translated_word="hola")
]
result = generate_anki_deck(
- [("hello", 1)],
- source_lang="en",
- target_lang="es",
- deck_name="TestDeck",
+ DeckInput(
+ words_with_ranks=[("hello", 1)],
+ source_lang="en",
+ target_lang="es",
+ deck_name="TestDeck",
+ ),
)
assert "#separator:semicolon" in result
@@ -188,9 +192,11 @@ class TestGenerateAnkiDeck:
MagicMock(success=True, source_word="world", translated_word="mundo"),
]
result = generate_anki_deck(
- [("hello", 1), ("world", 2)],
- source_lang="en",
- target_lang="es",
+ DeckInput(
+ words_with_ranks=[("hello", 1), ("world", 2)],
+ source_lang="en",
+ target_lang="es",
+ ),
)
# Check that words and translations are present
@@ -208,9 +214,11 @@ class TestGenerateAnkiDeck:
MagicMock(success=True, source_word="test", translated_word="prueba")
]
result = generate_anki_deck(
- [("test", 42)],
- source_lang="en",
- target_lang="es",
+ DeckInput(
+ words_with_ranks=[("test", 42)],
+ source_lang="en",
+ target_lang="es",
+ ),
)
assert "#42" in result
@@ -226,9 +234,11 @@ class TestGenerateAnkiDeck:
)
]
result = generate_anki_deck(
- [("test;word", 1)],
- source_lang="en",
- target_lang="es",
+ DeckInput(
+ words_with_ranks=[("test;word", 1)],
+ source_lang="en",
+ target_lang="es",
+ ),
)
# Semicolons should be replaced with commas
@@ -244,10 +254,12 @@ class TestGenerateAnkiDeck:
]
contexts = {"hello": "...say hello to..."}
result = generate_anki_deck(
- [("hello", 1)],
- source_lang="en",
- target_lang="es",
- contexts=contexts,
+ DeckInput(
+ words_with_ranks=[("hello", 1)],
+ source_lang="en",
+ target_lang="es",
+ contexts=contexts,
+ ),
include_context=True,
)
@@ -257,9 +269,11 @@ class TestGenerateAnkiDeck:
def test_no_translate_flag(self) -> None:
"""Test that no_translate skips translation."""
result = generate_anki_deck(
- [("hello", 1), ("world", 2)],
- source_lang="en",
- target_lang="es",
+ DeckInput(
+ words_with_ranks=[("hello", 1), ("world", 2)],
+ source_lang="en",
+ target_lang="es",
+ ),
no_translate=True,
)
@@ -280,7 +294,7 @@ class TestMain:
result = main(["--file", "nonexistent.txt", "--length", "10"])
assert result == 1
- def test_help_flag(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_help_flag(self) -> None:
"""Test that --help works."""
with pytest.raises(SystemExit) as exc_info:
main(["--help"])
@@ -309,7 +323,7 @@ class TestIntegration:
) as mock_translate:
# Mock translation to avoid network calls
def mock_translate_fn(
- words: list[str], from_lang: str, to_lang: str
+ words: list[str], _from_lang: str, _to_lang: str
) -> list[MagicMock]:
return [
MagicMock(success=True, source_word=w, translated_word=f"[{w}]")
@@ -324,6 +338,8 @@ class TestIntegration:
str(sample_text_file),
"--length",
"5",
+ "--from",
+ "en",
"--output",
str(output_file),
"--quiet",
@@ -337,9 +353,11 @@ class TestIntegration:
assert "#separator:semicolon" in content
def test_cli_with_sample_file(
- self, sample_text_file: Path, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+ self, sample_text_file: Path, tmp_path: Path, caplog: pytest.LogCaptureFixture
) -> None:
"""Test CLI with actual file."""
+ import logging
+
from python_pkg.word_frequency.anki_generator import C_EXECUTABLE
if not C_EXECUTABLE.exists():
@@ -347,9 +365,12 @@ class TestIntegration:
output_file = tmp_path / "anki_output.txt"
- with patch(
- "python_pkg.word_frequency.anki_generator.translate_words_batch"
- ) as mock_translate:
+ with (
+ caplog.at_level(logging.INFO),
+ patch(
+ "python_pkg.word_frequency.anki_generator.translate_words_batch"
+ ) as mock_translate,
+ ):
mock_translate.return_value = [
MagicMock(success=True, source_word="the", translated_word="le")
]
@@ -360,14 +381,15 @@ class TestIntegration:
str(sample_text_file),
"--length",
"1",
+ "--from",
+ "en",
"--output",
str(output_file),
]
)
assert result == 0
- captured = capsys.readouterr()
- assert "FLASHCARD GENERATION COMPLETE" in captured.out
+ assert "FLASHCARD GENERATION COMPLETE" in caplog.text
if __name__ == "__main__":
diff --git a/python_pkg/word_frequency/tests/test_excerpt_finder.py b/python_pkg/word_frequency/tests/test_excerpt_finder.py
index 4ec179d..2cdaea3 100644
--- a/python_pkg/word_frequency/tests/test_excerpt_finder.py
+++ b/python_pkg/word_frequency/tests/test_excerpt_finder.py
@@ -2,13 +2,18 @@
from __future__ import annotations
-from pathlib import Path
+import logging
import time
+from typing import TYPE_CHECKING
import pytest
+if TYPE_CHECKING:
+ from pathlib import Path
+
from python_pkg.word_frequency.excerpt_finder import (
ExcerptResult,
+ ExcerptSearchOptions,
find_best_excerpt,
find_best_excerpt_with_context,
format_excerpt_results,
@@ -146,7 +151,8 @@ class TestFindBestExcerptWithContext:
"""Test with zero context (should behave like find_best_excerpt)."""
text = "a b c d e f g"
result = find_best_excerpt_with_context(
- text, ["c"], excerpt_length=1, context_words=0
+ text, ["c"], excerpt_length=1,
+ options=ExcerptSearchOptions(context_words=0),
)
assert result[0].excerpt == "c"
@@ -155,7 +161,8 @@ class TestFindBestExcerptWithContext:
"""Test with context words."""
text = "a b c d e f g"
result = find_best_excerpt_with_context(
- text, ["d"], excerpt_length=1, context_words=2
+ text, ["d"], excerpt_length=1,
+ options=ExcerptSearchOptions(context_words=2),
)
# "d" at index 3, with context should include 2 words before and after
@@ -167,7 +174,8 @@ class TestFindBestExcerptWithContext:
"""Test context doesn't go before start of text."""
text = "a b c d e"
result = find_best_excerpt_with_context(
- text, ["a"], excerpt_length=1, context_words=3
+ text, ["a"], excerpt_length=1,
+ options=ExcerptSearchOptions(context_words=3),
)
# Can't go before "a", so just get words after
@@ -178,7 +186,8 @@ class TestFindBestExcerptWithContext:
"""Test context doesn't go beyond end of text."""
text = "a b c d e"
result = find_best_excerpt_with_context(
- text, ["e"], excerpt_length=1, context_words=3
+ text, ["e"], excerpt_length=1,
+ options=ExcerptSearchOptions(context_words=3),
)
# Can't go beyond "e"
@@ -240,33 +249,33 @@ class TestFormatExcerptResults:
class TestMain:
"""Tests for main CLI function."""
- def test_text_and_words_input(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_text_and_words_input(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test --text and --words options."""
- exit_code = main(
- ["--text", "hello world hello", "--words", "hello", "--length", "2"]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ ["--text", "hello world hello", "--words", "hello", "--length", "2"]
+ )
assert exit_code == 0
- assert "hello" in captured.out
+ assert "hello" in caplog.text
def test_file_input(
- self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+ self, tmp_path: Path, caplog: pytest.LogCaptureFixture
) -> None:
"""Test --file input option."""
test_file = tmp_path / "test.txt"
test_file.write_text("hello world hello world", encoding="utf-8")
- exit_code = main(
- ["--file", str(test_file), "--words", "hello", "--length", "2"]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ ["--file", str(test_file), "--words", "hello", "--length", "2"]
+ )
assert exit_code == 0
- assert "hello" in captured.out
+ assert "hello" in caplog.text
def test_words_file_input(
- self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+ self, tmp_path: Path, caplog: pytest.LogCaptureFixture
) -> None:
"""Test --words-file option."""
text_file = tmp_path / "text.txt"
@@ -274,91 +283,91 @@ class TestMain:
text_file.write_text("hello world hello world", encoding="utf-8")
words_file.write_text("hello\nworld\n", encoding="utf-8")
- exit_code = main(
- [
- "--file",
- str(text_file),
- "--words-file",
- str(words_file),
- "--length",
- "2",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--file",
+ str(text_file),
+ "--words-file",
+ str(words_file),
+ "--length",
+ "2",
+ ]
+ )
assert exit_code == 0
- assert "100.00%" in captured.out # Both words match
+ assert "100.00%" in caplog.text # Both words match
- def test_top_option(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_top_option(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test --top option."""
- exit_code = main(
- [
- "--text",
- "a b c d e f",
- "--words",
- "a",
- "b",
- "--length",
- "2",
- "--top",
- "3",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--text",
+ "a b c d e f",
+ "--words",
+ "a",
+ "b",
+ "--length",
+ "2",
+ "--top",
+ "3",
+ ]
+ )
assert exit_code == 0
# Should show multiple results
- assert "Result #1" in captured.out
+ assert "Result #1" in caplog.text
- def test_context_option(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_context_option(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test --context option."""
- exit_code = main(
- [
- "--text",
- "a b c d e f g",
- "--words",
- "d",
- "--length",
- "1",
- "--context",
- "2",
- ]
- )
- capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--text",
+ "a b c d e f g",
+ "--words",
+ "d",
+ "--length",
+ "1",
+ "--context",
+ "2",
+ ]
+ )
assert exit_code == 0
# Excerpt should include context words
- def test_case_sensitive_option(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_case_sensitive_option(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test --case-sensitive option."""
- exit_code = main(
- [
- "--text",
- "Hello HELLO hello",
- "--words",
- "hello",
- "--length",
- "1",
- "--case-sensitive",
- ]
- )
- capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--text",
+ "Hello HELLO hello",
+ "--words",
+ "hello",
+ "--length",
+ "1",
+ "--case-sensitive",
+ ]
+ )
assert exit_code == 0
# Only lowercase "hello" should match
- def test_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test error handling for missing file."""
- exit_code = main(
- ["--file", "/nonexistent/file.txt", "--words", "hello", "--length", "2"]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.ERROR):
+ exit_code = main(
+ ["--file", "/nonexistent/file.txt", "--words", "hello", "--length", "2"]
+ )
assert exit_code == 1
- assert "Error" in captured.err
+ assert "Error" in caplog.text
def test_empty_words_file(
- self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+ self, tmp_path: Path, caplog: pytest.LogCaptureFixture
) -> None:
"""Test error when words file is empty."""
text_file = tmp_path / "text.txt"
@@ -366,20 +375,20 @@ class TestMain:
text_file.write_text("hello world", encoding="utf-8")
words_file.write_text("", encoding="utf-8")
- exit_code = main(
- [
- "--file",
- str(text_file),
- "--words-file",
- str(words_file),
- "--length",
- "2",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.ERROR):
+ exit_code = main(
+ [
+ "--file",
+ str(text_file),
+ "--words-file",
+ str(words_file),
+ "--length",
+ "2",
+ ]
+ )
assert exit_code == 1
- assert "No target words" in captured.err
+ assert "No target words" in caplog.text
class TestPerformance:
diff --git a/python_pkg/word_frequency/tests/test_learning_pipe.py b/python_pkg/word_frequency/tests/test_learning_pipe.py
index bfbb7a5..1444c32 100644
--- a/python_pkg/word_frequency/tests/test_learning_pipe.py
+++ b/python_pkg/word_frequency/tests/test_learning_pipe.py
@@ -2,16 +2,20 @@
from __future__ import annotations
-from pathlib import Path
+import logging
import time
from typing import TYPE_CHECKING
from unittest.mock import MagicMock, patch
import pytest
+if TYPE_CHECKING:
+ from pathlib import Path
+
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
from python_pkg.word_frequency.learning_pipe import (
DEFAULT_STOPWORDS_EN,
+ LessonConfig,
generate_learning_lesson,
load_stopwords,
main,
@@ -23,7 +27,7 @@ if TYPE_CHECKING:
@pytest.fixture
-def mock_translation() -> Generator[MagicMock, None, None]:
+def _mock_translation() -> Generator[MagicMock, None, None]:
"""Mock translation to avoid requiring argostranslate."""
def fake_batch_translate(
@@ -31,7 +35,7 @@ def mock_translation() -> Generator[MagicMock, None, None]:
from_lang: str,
to_lang: str,
*,
- use_cache: bool = True,
+ _use_cache: bool = True,
) -> list[TranslationResult]:
"""Fake batch translation that returns word with prefix."""
return [
@@ -95,7 +99,7 @@ class TestGenerateLearningLesson:
"""Test basic lesson generation."""
text = "hello world hello hello world test test test test"
result = generate_learning_lesson(
- text, batch_size=3, num_batches=1, skip_default_stopwords=True
+ text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
)
assert "LANGUAGE LEARNING LESSON" in result
@@ -106,7 +110,7 @@ class TestGenerateLearningLesson:
"""Test generation with multiple batches."""
text = " ".join(f"word{i}" * (100 - i) for i in range(20))
result = generate_learning_lesson(
- text, batch_size=5, num_batches=3, skip_default_stopwords=True
+ text, LessonConfig(batch_size=5, num_batches=3, skip_default_stopwords=True)
)
assert "BATCH 1" in result
@@ -116,7 +120,9 @@ class TestGenerateLearningLesson:
def test_stopwords_filtering(self) -> None:
"""Test that default stopwords are filtered."""
text = "the the the hello world"
- result = generate_learning_lesson(text, batch_size=5, num_batches=1)
+ result = generate_learning_lesson(
+ text, LessonConfig(batch_size=5, num_batches=1)
+ )
# "the" should be filtered, "hello" and "world" should appear
lines = result.split("\n")
@@ -139,7 +145,7 @@ class TestGenerateLearningLesson:
"""Test disabling default stopword filtering."""
text = "the the the hello"
result = generate_learning_lesson(
- text, batch_size=5, num_batches=1, skip_default_stopwords=True
+ text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
)
assert "the" in result.lower()
@@ -148,7 +154,7 @@ class TestGenerateLearningLesson:
"""Test that numbers are filtered by default."""
text = "123 123 123 hello world"
result = generate_learning_lesson(
- text, batch_size=5, num_batches=1, skip_default_stopwords=True
+ text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
)
# Check vocabulary section doesn't include "123"
@@ -162,10 +168,12 @@ class TestGenerateLearningLesson:
text = "123 123 123 hello"
result = generate_learning_lesson(
text,
- batch_size=5,
- num_batches=1,
- skip_default_stopwords=True,
- skip_numbers=False,
+ LessonConfig(
+ batch_size=5,
+ num_batches=1,
+ skip_default_stopwords=True,
+ skip_numbers=False,
+ ),
)
assert "123" in result
@@ -174,7 +182,7 @@ class TestGenerateLearningLesson:
"""Test that coverage percentage is calculated."""
text = "hello hello hello world world test"
result = generate_learning_lesson(
- text, batch_size=3, num_batches=1, skip_default_stopwords=True
+ text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
)
assert "recognize" in result.lower()
@@ -185,11 +193,13 @@ class TestGenerateLearningLesson:
text = "hello world hello world hello world test test test"
result = generate_learning_lesson(
text,
- batch_size=2,
- num_batches=1,
- excerpt_length=3,
- excerpts_per_batch=2,
- skip_default_stopwords=True,
+ LessonConfig(
+ batch_size=2,
+ num_batches=1,
+ excerpt_length=3,
+ excerpts_per_batch=2,
+ skip_default_stopwords=True,
+ ),
)
assert "PRACTICE EXCERPTS" in result
@@ -200,45 +210,45 @@ class TestMain:
"""Tests for main CLI function."""
def test_basic_text_input(
- self, capsys: pytest.CaptureFixture[str], mock_translation: None
+ self, caplog: pytest.LogCaptureFixture, _mock_translation: None
) -> None:
"""Test with text input."""
- exit_code = main(
- [
- "--text",
- "hello world hello world test test test",
- "--batch-size",
- "3",
- "--no-default-stopwords",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--text",
+ "hello world hello world test test test",
+ "--batch-size",
+ "3",
+ "--no-default-stopwords",
+ ]
+ )
assert exit_code == 0
- assert "LANGUAGE LEARNING LESSON" in captured.out
+ assert "LANGUAGE LEARNING LESSON" in caplog.text
def test_file_input(
- self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
+ self, tmp_path: Path, caplog: pytest.LogCaptureFixture, _mock_translation: None
) -> None:
"""Test with file input."""
test_file = tmp_path / "test.txt"
test_file.write_text("hello world hello world test", encoding="utf-8")
- exit_code = main(
- [
- "--file",
- str(test_file),
- "--batch-size",
- "3",
- "--no-default-stopwords",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--file",
+ str(test_file),
+ "--batch-size",
+ "3",
+ "--no-default-stopwords",
+ ]
+ )
assert exit_code == 0
- assert "hello" in captured.out.lower()
+ assert "hello" in caplog.text.lower()
- def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
+ def test_output_to_file(self, tmp_path: Path, _mock_translation: None) -> None:
"""Test outputting to file."""
output_file = tmp_path / "lesson.txt"
@@ -258,7 +268,7 @@ class TestMain:
assert "LANGUAGE LEARNING LESSON" in content
def test_custom_stopwords(
- self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
+ self, tmp_path: Path, _mock_translation: None
) -> None:
"""Test with custom stopwords file."""
stopwords_file = tmp_path / "stop.txt"
@@ -275,41 +285,40 @@ class TestMain:
"5",
]
)
- capsys.readouterr()
assert exit_code == 0
# "hello" should be filtered by custom stopwords
def test_multiple_batches_option(
- self, capsys: pytest.CaptureFixture[str], mock_translation: None
+ self, caplog: pytest.LogCaptureFixture, _mock_translation: None
) -> None:
"""Test --batches option."""
text = " ".join(f"word{i}" * (50 - i) for i in range(30))
- exit_code = main(
- [
- "--text",
- text,
- "--batch-size",
- "5",
- "--batches",
- "3",
- "--no-default-stopwords",
- ]
- )
- captured = capsys.readouterr()
+ with caplog.at_level(logging.INFO):
+ exit_code = main(
+ [
+ "--text",
+ text,
+ "--batch-size",
+ "5",
+ "--batches",
+ "3",
+ "--no-default-stopwords",
+ ]
+ )
assert exit_code == 0
- assert "BATCH 1" in captured.out
- assert "BATCH 2" in captured.out
- assert "BATCH 3" in captured.out
+ assert "BATCH 1" in caplog.text
+ assert "BATCH 2" in caplog.text
+ assert "BATCH 3" in caplog.text
- def test_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
"""Test error handling for missing file."""
- exit_code = main(["--file", "/nonexistent/file.txt"])
- captured = capsys.readouterr()
+ with caplog.at_level(logging.ERROR):
+ exit_code = main(["--file", "/nonexistent/file.txt"])
assert exit_code == 1
- assert "Error" in captured.err
+ assert "Error" in caplog.text
class TestPerformance:
@@ -324,10 +333,12 @@ class TestPerformance:
start_time = time.perf_counter()
result = generate_learning_lesson(
large_text,
- batch_size=50,
- num_batches=5,
- excerpt_length=30,
- skip_default_stopwords=True,
+ LessonConfig(
+ batch_size=50,
+ num_batches=5,
+ excerpt_length=30,
+ skip_default_stopwords=True,
+ ),
)
elapsed = time.perf_counter() - start_time
@@ -358,9 +369,11 @@ class TestTranslationIntegration:
text = "hello world hello world hello"
result = generate_learning_lesson(
text,
- batch_size=5,
- num_batches=1,
- skip_default_stopwords=True,
+ LessonConfig(
+ batch_size=5,
+ num_batches=1,
+ skip_default_stopwords=True,
+ ),
)
assert "hello" in result
@@ -368,17 +381,19 @@ class TestTranslationIntegration:
# Should not have translation arrows
assert " -> " not in result or "Translation" not in result
- def test_lesson_with_translation_params(self, mock_translation: None) -> None:
+ def test_lesson_with_translation_params(self, _mock_translation: None) -> None:
"""Test that translation params are accepted."""
text = "hello world hello world hello"
# This should work with mocked translation
result = generate_learning_lesson(
text,
- batch_size=5,
- num_batches=1,
- skip_default_stopwords=True,
- translate_from="en",
- translate_to="es",
+ LessonConfig(
+ batch_size=5,
+ num_batches=1,
+ skip_default_stopwords=True,
+ translate_from="en",
+ translate_to="es",
+ ),
)
# The lesson should still be generated
@@ -386,7 +401,7 @@ class TestTranslationIntegration:
assert "hello" in result
def test_main_with_translate_flags(
- self, tmp_path: Path, mock_translation: None
+ self, tmp_path: Path, _mock_translation: None
) -> None:
"""Test that main accepts translation flags."""
text_file = tmp_path / "test.txt"
@@ -408,36 +423,42 @@ class TestTranslationIntegration:
assert result == 0
def test_translate_to_defaults_to_english(
- self, capsys: pytest.CaptureFixture[str], mock_translation: None
+ self, _mock_translation: None
) -> None:
"""Test that translate_to defaults to 'en' when using auto-detection."""
text = "hello world"
- # When using --translate flag (translate_from="auto"), translate_to defaults to "en"
- result = generate_learning_lesson(
- text,
- batch_size=5,
- num_batches=1,
- skip_default_stopwords=True,
- translate_from="auto", # Auto-detect source language
- translate_to=None, # Should default to English
- )
+ # When using --translate flag (translate_from="auto"),
+ # translate_to defaults to "en"
+ with patch.object(
+ learning_pipe_module, "detect_language", return_value="es"
+ ):
+ result = generate_learning_lesson(
+ text,
+ LessonConfig(
+ batch_size=5,
+ num_batches=1,
+ skip_default_stopwords=True,
+ translate_from="auto", # Auto-detect source language
+ translate_to=None, # Should default to English
+ ),
+ )
# Should have translation output with auto-detected source -> en
assert "Detected language:" in result
assert " -> en" in result
- def test_no_translation_when_both_none(
- self, capsys: pytest.CaptureFixture[str]
- ) -> None:
- """Test no translation happens when both translate_from and translate_to are None."""
+ def test_no_translation_when_both_none(self) -> None:
+ """Test no translation when both translate params are None."""
text = "hello world"
result = generate_learning_lesson(
text,
- batch_size=5,
- num_batches=1,
- skip_default_stopwords=True,
- translate_from=None,
- translate_to=None,
+ LessonConfig(
+ batch_size=5,
+ num_batches=1,
+ skip_default_stopwords=True,
+ translate_from=None,
+ translate_to=None,
+ ),
)
# Should not have translation output
diff --git a/python_pkg/word_frequency/tests/test_translator.py b/python_pkg/word_frequency/tests/test_translator.py
index 620aa4a..d3678f2 100644
--- a/python_pkg/word_frequency/tests/test_translator.py
+++ b/python_pkg/word_frequency/tests/test_translator.py
@@ -61,19 +61,16 @@ class ArgosAvailableMock:
self.mock_translate_module = MagicMock()
self.mock_package_module = MagicMock()
self.mock_parent = MagicMock()
- self.original_available = translator._argos_available
self._sys_modules_patcher: MagicMock | None = None
self._ensure_patcher: MagicMock | None = None
self._lang_patcher: MagicMock | None = None
+ self._check_argos_patcher: MagicMock | None = None
+ self._argos_module_patcher: MagicMock | None = None
def __enter__(self) -> MagicMock:
"""Set up the mocks."""
- translator._argos_available = True
-
# Set up translate return value
- if isinstance(self.translate_returns, Exception) or isinstance(
- self.translate_returns, list
- ):
+ if isinstance(self.translate_returns, (Exception, list)):
self.mock_translate_fn.side_effect = self.translate_returns
elif self.translate_returns is not None:
self.mock_translate_fn.return_value = self.translate_returns
@@ -96,41 +93,52 @@ class ArgosAvailableMock:
},
)
+ # Patch the module-level argostranslate reference in translator
+ self._argos_module_patcher = patch.object(
+ translator, "argostranslate", self.mock_parent, create=True
+ )
+
# Patch _ensure_argos_installed and _ensure_language_pair to no-op
self._ensure_patcher = patch.object(
translator, "_ensure_argos_installed", lambda: None
)
self._lang_patcher = patch.object(
- translator, "_ensure_language_pair", lambda f, t: None
+ translator, "_ensure_language_pair", lambda _f, _t: None
+ )
+ self._check_argos_patcher = patch.object(
+ translator, "_check_argos", return_value=True
)
self._sys_modules_patcher.start() # type: ignore[union-attr]
+ self._argos_module_patcher.start() # type: ignore[union-attr]
self._ensure_patcher.start() # type: ignore[union-attr]
self._lang_patcher.start() # type: ignore[union-attr]
+ self._check_argos_patcher.start() # type: ignore[union-attr]
return self.mock_translate_fn
def __exit__(self, *args: object) -> None:
"""Restore original state."""
+ if self._check_argos_patcher:
+ self._check_argos_patcher.stop()
if self._lang_patcher:
self._lang_patcher.stop()
if self._ensure_patcher:
self._ensure_patcher.stop()
+ if self._argos_module_patcher:
+ self._argos_module_patcher.stop()
if self._sys_modules_patcher:
self._sys_modules_patcher.stop()
- translator._argos_available = self.original_available
# Fixtures
@pytest.fixture
-def mock_argos_unavailable() -> Generator[None, None, None]:
+def _mock_argos_unavailable() -> Generator[None, None, None]:
"""Mock argostranslate being unavailable (for legacy tests)."""
- original_value = translator._argos_available
- translator._argos_available = False
- yield
- translator._argos_available = original_value
+ with patch.object(translator, "_check_argos", return_value=False):
+ yield
@pytest.fixture
@@ -178,7 +186,7 @@ class TestTranslationResult:
def test_result_is_tuple(self) -> None:
"""Test that TranslationResult is a namedtuple."""
- result = TranslationResult("a", "b", "en", "es", True)
+ result = TranslationResult("a", "b", "en", "es", success=True)
assert isinstance(result, tuple)
assert len(result) == 6
@@ -192,13 +200,15 @@ class TestTranslateWord:
def test_translate_word_argos_unavailable_raises(self) -> None:
"""Test that translation raises ImportError when argos is unavailable."""
# Mock _ensure_argos_installed to raise ImportError
- with patch.object(
- translator,
- "_ensure_argos_installed",
- side_effect=ImportError("argostranslate not available"),
+ with (
+ patch.object(
+ translator,
+ "_ensure_argos_installed",
+ side_effect=ImportError("argostranslate not available"),
+ ),
+ pytest.raises(ImportError, match="argostranslate not available"),
):
- with pytest.raises(ImportError, match="argostranslate not available"):
- translate_word("hello", "en", "es", use_cache=False)
+ translate_word("hello", "en", "es", use_cache=False)
def test_translate_word_success(self) -> None:
"""Test successful word translation."""
@@ -243,13 +253,15 @@ class TestTranslateWords:
def test_translate_words_argos_unavailable_raises(self) -> None:
"""Test that translating words raises ImportError when argos unavailable."""
- with patch.object(
- translator,
- "_ensure_argos_installed",
- side_effect=ImportError("argostranslate not available"),
+ with (
+ patch.object(
+ translator,
+ "_ensure_argos_installed",
+ side_effect=ImportError("argostranslate not available"),
+ ),
+ pytest.raises(ImportError, match="argostranslate not available"),
):
- with pytest.raises(ImportError, match="argostranslate not available"):
- translate_words(["hello", "world"], "en", "es", use_cache=False)
+ translate_words(["hello", "world"], "en", "es", use_cache=False)
# translate_words_batch tests
@@ -290,7 +302,7 @@ class TestTranslateWordsBatch:
assert results[4].translated_word == "cinco"
def test_batch_fallback_on_mismatch(self) -> None:
- """Test batch translation falls back to individual when result count mismatches."""
+ """Test batch falls back to individual on result count mismatch."""
words = ["one", "two", "three", "four"]
# First call (batch) returns wrong count, subsequent calls are individual
with ArgosAvailableMock(["wrong", "uno", "dos", "tres", "cuatro"]) as mock:
@@ -313,10 +325,11 @@ class TestTranslateWordsBatch:
mock_parent.translate = mock_translate_module
mock_parent.package = mock_package_module
- original = translator._argos_available
- translator._argos_available = True
-
with (
+ patch.object(translator, "_check_argos", return_value=True),
+ patch.object(
+ translator, "argostranslate", mock_parent, create=True
+ ),
patch.dict(
"sys.modules",
{
@@ -326,22 +339,22 @@ class TestTranslateWordsBatch:
},
),
patch.object(translator, "_ensure_argos_installed", lambda: None),
- patch.object(translator, "_ensure_language_pair", lambda f, t: None),
+ patch.object(translator, "_ensure_language_pair", lambda _f, _t: None),
pytest.raises(RuntimeError, match="Translation failed"),
):
translate_words_batch(words, "en", "es", use_cache=False)
- translator._argos_available = original
-
def test_batch_argos_unavailable_raises(self) -> None:
"""Test that batch translation raises ImportError when argos unavailable."""
- with patch.object(
- translator,
- "_ensure_argos_installed",
- side_effect=ImportError("argostranslate not available"),
+ with (
+ patch.object(
+ translator,
+ "_ensure_argos_installed",
+ side_effect=ImportError("argostranslate not available"),
+ ),
+ pytest.raises(ImportError, match="argostranslate not available"),
):
- with pytest.raises(ImportError, match="argostranslate not available"):
- translate_words_batch(["hello", "world"], "en", "es", use_cache=False)
+ translate_words_batch(["hello", "world"], "en", "es", use_cache=False)
# format_translations tests
@@ -358,7 +371,7 @@ class TestFormatTranslations:
def test_format_single_translation(self) -> None:
"""Test formatting single translation."""
results = [
- TranslationResult("hello", "hola", "en", "es", True),
+ TranslationResult("hello", "hola", "en", "es", success=True),
]
output = format_translations(results)
@@ -369,8 +382,8 @@ class TestFormatTranslations:
def test_format_multiple_translations(self) -> None:
"""Test formatting multiple translations."""
results = [
- TranslationResult("hello", "hola", "en", "es", True),
- TranslationResult("world", "mundo", "en", "es", True),
+ TranslationResult("hello", "hola", "en", "es", success=True),
+ TranslationResult("world", "mundo", "en", "es", success=True),
]
output = format_translations(results)
@@ -382,8 +395,10 @@ class TestFormatTranslations:
def test_format_with_errors(self) -> None:
"""Test formatting with failed translations."""
results = [
- TranslationResult("hello", "hola", "en", "es", True),
- TranslationResult("xyz", "", "en", "es", False, "Unknown word"),
+ TranslationResult("hello", "hola", "en", "es", success=True),
+ TranslationResult(
+ "xyz", "", "en", "es", success=False, error="Unknown word"
+ ),
]
output = format_translations(results, show_errors=True)
@@ -393,8 +408,10 @@ class TestFormatTranslations:
def test_format_hide_errors(self) -> None:
"""Test formatting with errors hidden."""
results = [
- TranslationResult("hello", "hola", "en", "es", True),
- TranslationResult("xyz", "", "en", "es", False, "Unknown word"),
+ TranslationResult("hello", "hola", "en", "es", success=True),
+ TranslationResult(
+ "xyz", "", "en", "es", success=False, error="Unknown word"
+ ),
]
output = format_translations(results, show_errors=False)
@@ -408,7 +425,7 @@ class TestFormatTranslations:
class TestGetInstalledLanguages:
"""Tests for get_installed_languages function."""
- def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
+ def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
"""Test when argos is unavailable."""
result = get_installed_languages()
assert result == []
@@ -433,21 +450,22 @@ class TestGetInstalledLanguages:
mock_parent.translate = mock_translate_module
mock_parent.package = mock_package_module
- original = translator._argos_available
- translator._argos_available = True
-
- with patch.dict(
- "sys.modules",
- {
- "argostranslate": mock_parent,
- "argostranslate.translate": mock_translate_module,
- "argostranslate.package": mock_package_module,
- },
+ with (
+ patch.object(translator, "_check_argos", return_value=True),
+ patch.object(
+ translator, "argostranslate", mock_parent, create=True
+ ),
+ patch.dict(
+ "sys.modules",
+ {
+ "argostranslate": mock_parent,
+ "argostranslate.translate": mock_translate_module,
+ "argostranslate.package": mock_package_module,
+ },
+ ),
):
result = get_installed_languages()
- translator._argos_available = original
-
assert ("en", "English") in result
assert ("es", "Spanish") in result
@@ -458,7 +476,7 @@ class TestGetInstalledLanguages:
class TestGetAvailablePackages:
"""Tests for get_available_packages function."""
- def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
+ def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
"""Test when argos is unavailable."""
result = get_available_packages()
assert result == []
@@ -470,7 +488,7 @@ class TestGetAvailablePackages:
class TestDownloadLanguages:
"""Tests for download_languages function."""
- def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
+ def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
"""Test when argos is unavailable."""
result = download_languages(["en", "es"])
assert result == {}
@@ -503,7 +521,7 @@ class TestReadFile:
class TestMain:
"""Tests for main CLI function."""
- def test_argos_unavailable_error(self, mock_argos_unavailable: None) -> None:
+ def test_argos_unavailable_error(self, _mock_argos_unavailable: None) -> None:
"""Test error when argos not installed."""
result = main(["--text", "hello", "--from", "en", "--to", "es"])
assert result == 1
@@ -517,21 +535,22 @@ class TestMain:
mock_parent.translate = mock_translate_module
mock_parent.package = mock_package_module
- original = translator._argos_available
- translator._argos_available = True
-
- with patch.dict(
- "sys.modules",
- {
- "argostranslate": mock_parent,
- "argostranslate.translate": mock_translate_module,
- "argostranslate.package": mock_package_module,
- },
+ with (
+ patch.object(translator, "_check_argos", return_value=True),
+ patch.object(
+ translator, "argostranslate", mock_parent, create=True
+ ),
+ patch.dict(
+ "sys.modules",
+ {
+ "argostranslate": mock_parent,
+ "argostranslate.translate": mock_translate_module,
+ "argostranslate.package": mock_package_module,
+ },
+ ),
):
result = main(["--list-languages"])
- translator._argos_available = original
-
assert result == 0
captured = capsys.readouterr()
assert "No languages installed" in captured.out
@@ -551,21 +570,22 @@ class TestMain:
mock_parent.translate = mock_translate_module
mock_parent.package = mock_package_module
- original = translator._argos_available
- translator._argos_available = True
-
- with patch.dict(
- "sys.modules",
- {
- "argostranslate": mock_parent,
- "argostranslate.translate": mock_translate_module,
- "argostranslate.package": mock_package_module,
- },
+ with (
+ patch.object(translator, "_check_argos", return_value=True),
+ patch.object(
+ translator, "argostranslate", mock_parent, create=True
+ ),
+ patch.dict(
+ "sys.modules",
+ {
+ "argostranslate": mock_parent,
+ "argostranslate.translate": mock_translate_module,
+ "argostranslate.package": mock_package_module,
+ },
+ ),
):
result = main(["--list-languages"])
- translator._argos_available = original
-
assert result == 0
captured = capsys.readouterr()
assert "en" in captured.out
@@ -622,7 +642,6 @@ class TestMain:
def test_translate_output_to_file(
self,
tmp_path: Path,
- capsys: pytest.CaptureFixture[str],
) -> None:
"""Test outputting translations to file."""
output_file = tmp_path / "output.txt"
@@ -647,7 +666,9 @@ class TestMain:
assert "hello" in content
assert "hola" in content
- def test_no_input_shows_help(self, capsys: pytest.CaptureFixture[str]) -> None:
+ def test_no_input_shows_help(
+ self,
+ ) -> None:
"""Test that no input shows help."""
with ArgosAvailableMock():
result = main([])
diff --git a/python_pkg/word_frequency/tests/test_vocabulary_curve.py b/python_pkg/word_frequency/tests/test_vocabulary_curve.py
index 352093a..df57291 100755
--- a/python_pkg/word_frequency/tests/test_vocabulary_curve.py
+++ b/python_pkg/word_frequency/tests/test_vocabulary_curve.py
@@ -89,7 +89,7 @@ class TestExcerptValidity:
"""Tests that verify excerpts are actually found in the source text."""
def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
- """Test that each excerpt can be found in the source text as contiguous words."""
+ """Test that each excerpt can be found in source text."""
import re
source_text = sample_text_file.read_text(encoding="utf-8").lower()
diff --git a/python_pkg/word_frequency/translator.py b/python_pkg/word_frequency/translator.py
index dc36e90..354571a 100755
--- a/python_pkg/word_frequency/translator.py
+++ b/python_pkg/word_frequency/translator.py
@@ -1,149 +1,163 @@
#!/usr/bin/env python3
-"""Translator - translates words/text between languages.
+r"""Translator - translates words/text between languages.
This module provides translation capabilities using either:
-1. Argos Translate (offline, requires large downloads) - preferred if installed
-2. deep-translator (online, uses Google Translate) - lightweight fallback
-Usage:
+1. Argos Translate (offline, requires large downloads)
+2. deep-translator (online, uses Google Translate)
+
+Usage::
+
# Translate a single word
- python -m python_pkg.word_frequency.translator --text "hello" --from en --to es
+ python -m python_pkg.word_frequency.translator \
+ --text "hello" --from en --to es
# Translate multiple words
- python -m python_pkg.word_frequency.translator --words hello world goodbye --from en --to pl
+ python -m python_pkg.word_frequency.translator \
+ --words hello world goodbye --from en --to pl
# Translate words from a file (one word per line)
- python -m python_pkg.word_frequency.translator --words-file words.txt --from la --to en
+ python -m python_pkg.word_frequency.translator \
+ --words-file words.txt --from la --to en
# List available languages
- python -m python_pkg.word_frequency.translator --list-languages
+ python -m python_pkg.word_frequency.translator \
+ --list-languages
# Output to file
- python -m python_pkg.word_frequency.translator --words-file vocab.txt --from pl --to en --output translations.txt
+ python -m python_pkg.word_frequency.translator \
+ --words-file vocab.txt --from pl --to en \
+ --output translations.txt
-Dependencies (install one):
- pip install deep-translator # Lightweight, uses Google Translate (online)
- pip install argostranslate # Offline translation (requires ~3GB downloads)
+Dependencies (install one)::
+
+ pip install deep-translator
+ pip install argostranslate
"""
from __future__ import annotations
import argparse
+import importlib
+import logging
+import os
from pathlib import Path
+import subprocess
import sys
from typing import TYPE_CHECKING, NamedTuple
if TYPE_CHECKING:
from collections.abc import Sequence
-# Lazy imports for translation backends (may not be installed)
-_argos_available: bool | None = None
-_deep_translator_available: bool | None = None
-_langdetect_available: bool | None = None
-_gpu_initialized: bool = False
-_gpu_available: bool | None = None
+try:
+ import torch
+except ImportError:
+ torch = None # type: ignore[assignment]
+
+try:
+ import argostranslate.package
+ import argostranslate.translate
+except ImportError:
+ argostranslate = None # type: ignore[assignment]
+
+try:
+ from deep_translator import GoogleTranslator
+except ImportError:
+ GoogleTranslator = None
+
+try:
+ import langdetect
+except ImportError:
+ langdetect = None # type: ignore[assignment]
+
+try:
+ from python_pkg.word_frequency.cache import (
+ get_translation_cache,
+ )
+except ImportError:
+ get_translation_cache = None
+
+logger = logging.getLogger(__name__)
+
+_LANG_DETECT_SAMPLE_SIZE = 5000
+_BATCH_SIZE = 100
+
+
+class _TranslatorState:
+ """Holds module-level state for lazy-initialized backends."""
+
+ gpu_initialized: bool = False
def _check_cuda_available() -> bool:
"""Check if CUDA is available for GPU acceleration."""
- global _gpu_available
- if _gpu_available is None:
- try:
- import torch
+ return torch is not None and torch.cuda.is_available()
- _gpu_available = torch.cuda.is_available()
- except ImportError:
- _gpu_available = False
- return _gpu_available
+
+def _validate_gpu_device() -> str:
+ """Validate GPU device availability and return device name.
+
+ Raises:
+ RuntimeError: If no GPU devices are found.
+ """
+ device_count = torch.cuda.device_count()
+ if device_count == 0:
+ msg = "CUDA reports available but no GPU devices found"
+ raise RuntimeError(msg)
+ return torch.cuda.get_device_name(0)
def _init_gpu_if_available() -> None:
"""Initialize GPU for argostranslate if CUDA is available.
Raises:
- RuntimeError: If CUDA is available but GPU initialization fails.
+ RuntimeError: If CUDA is available but GPU init fails.
"""
- global _gpu_initialized
- if _gpu_initialized:
+ if _TranslatorState.gpu_initialized:
return
if not _check_cuda_available():
- _gpu_initialized = True
+ _TranslatorState.gpu_initialized = True
return
- import sys
-
- print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
+ logger.info(
+ "CUDA detected, initializing GPU acceleration..."
+ )
try:
- import torch
-
- # Force CTranslate2 to use CUDA
- device_count = torch.cuda.device_count()
- if device_count == 0:
- raise RuntimeError("CUDA reports available but no GPU devices found")
-
- device_name = torch.cuda.get_device_name(0)
- print(f" Using GPU: {device_name}", file=sys.stderr)
-
- # Set environment variable to force GPU usage in argos
- import os
+ device_name = _validate_gpu_device()
+ logger.info(" Using GPU: %s", device_name)
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
- _gpu_initialized = True
- print(" GPU acceleration enabled.", file=sys.stderr)
+ _TranslatorState.gpu_initialized = True
+ logger.info(" GPU acceleration enabled.")
except Exception as e:
- raise RuntimeError(
- f"CUDA is available but GPU initialization failed: {e}\n"
- f"This may be due to incompatible CUDA version or driver issues.\n"
- f"To disable GPU and use CPU only, set environment variable: CT2_FORCE_CPU=1"
- ) from e
+ msg = (
+ f"CUDA is available but GPU initialization failed: "
+ f"{e}\nThis may be due to incompatible CUDA "
+ "version or driver issues.\n"
+ "To disable GPU and use CPU only, set "
+ "environment variable: CT2_FORCE_CPU=1"
+ )
+ raise RuntimeError(msg) from e
def _check_argos() -> bool:
"""Check if argostranslate is available."""
- global _argos_available
- if _argos_available is None:
- try:
- import argostranslate.package
- import argostranslate.translate
-
- _ = (argostranslate.package, argostranslate.translate)
- _argos_available = True
- except ImportError:
- _argos_available = False
- return _argos_available
+ return argostranslate is not None
def _check_deep_translator() -> bool:
"""Check if deep-translator is available."""
- global _deep_translator_available
- if _deep_translator_available is None:
- try:
- from deep_translator import GoogleTranslator
-
- _ = GoogleTranslator
- _deep_translator_available = True
- except ImportError:
- _deep_translator_available = False
- return _deep_translator_available
+ return GoogleTranslator is not None
def _check_langdetect() -> bool:
"""Check if langdetect is available."""
- global _langdetect_available
- if _langdetect_available is None:
- try:
- import langdetect
-
- _ = langdetect
- _langdetect_available = True
- except ImportError:
- _langdetect_available = False
- return _langdetect_available
+ return langdetect is not None
def detect_language(text: str) -> str | None:
@@ -158,13 +172,14 @@ def detect_language(text: str) -> str | None:
if not _check_langdetect():
return None
- import langdetect
-
try:
- # Use a sample of the text for detection (faster and more reliable)
- sample = text[:5000] if len(text) > 5000 else text
- return langdetect.detect(sample) # type: ignore[no-any-return]
- except langdetect.LangDetectException: # type: ignore[attr-defined]
+ sample = (
+ text[:_LANG_DETECT_SAMPLE_SIZE]
+ if len(text) > _LANG_DETECT_SAMPLE_SIZE
+ else text
+ )
+ return langdetect.detect(sample) # type: ignore[no-any-return,union-attr]
+ except langdetect.LangDetectException: # type: ignore[attr-defined,union-attr]
return None
@@ -188,8 +203,6 @@ def get_installed_languages() -> list[tuple[str, str]]:
if not _check_argos():
return []
- import argostranslate.translate
-
languages = argostranslate.translate.get_installed_languages()
return [(lang.code, lang.name) for lang in languages]
@@ -203,8 +216,6 @@ def get_available_packages() -> list[tuple[str, str, str, str]]:
if not _check_argos():
return []
- import argostranslate.package
-
argostranslate.package.update_package_index()
available = argostranslate.package.get_available_packages()
return [
@@ -227,12 +238,10 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
if not _check_argos():
return {}
- import argostranslate.package
-
results: dict[str, bool] = {}
# Update package index
- print("Updating package index...")
+ logger.info("Updating package index...")
argostranslate.package.update_package_index()
available = argostranslate.package.get_available_packages()
@@ -255,13 +264,26 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
if pkg_key in available_lookup:
pkg = available_lookup[pkg_key]
try:
- print(f"Downloading {from_code} -> {to_code}...")
+ logger.info(
+ "Downloading %s -> %s...",
+ from_code,
+ to_code,
+ )
argostranslate.package.install_from_path(pkg.download())
results[key] = True
- print(f" ✓ Installed {from_code} -> {to_code}")
- except Exception as e: # noqa: BLE001
+ logger.info(
+ " Installed %s -> %s",
+ from_code,
+ to_code,
+ )
+ except (OSError, RuntimeError, ValueError) as e:
results[key] = False
- print(f" ✗ Failed {from_code} -> {to_code}: {e}")
+ logger.info(
+ " Failed %s -> %s: %s",
+ from_code,
+ to_code,
+ e,
+ )
else:
# Package not available
results[key] = False
@@ -278,32 +300,38 @@ def _ensure_argos_installed() -> None:
if _check_argos():
return
- import subprocess
- import sys
-
- print("argostranslate not found. Attempting to install...")
+ logger.info("argostranslate not found. Attempting to install...")
try:
subprocess.run(
[sys.executable, "-m", "pip", "install", "argostranslate"],
check=True,
capture_output=True,
)
- # Reset the check flag and verify
- global _argos_available
- _argos_available = None
- if not _check_argos():
- raise ImportError("argostranslate installation succeeded but import failed")
- print("argostranslate installed successfully.")
+ # Attempt runtime re-import
+ importlib.import_module("argostranslate.package")
+ importlib.import_module("argostranslate.translate")
+ logger.info("argostranslate installed successfully.")
except subprocess.CalledProcessError as e:
error_msg = e.stderr.decode() if e.stderr else str(e)
- raise ImportError(
- f"argostranslate is required for offline translation.\n\n"
- f"Install manually with one of:\n"
- f" pip install argostranslate # In a virtualenv\n"
- f" pipx install argostranslate # System-wide via pipx\n"
- f" pacman -S python-argostranslate # Arch Linux (if available)\n\n"
+ msg = (
+ "argostranslate is required for offline "
+ "translation.\n\n"
+ "Install manually with one of:\n"
+ " pip install argostranslate"
+ " # In a virtualenv\n"
+ " pipx install argostranslate"
+ " # System-wide via pipx\n"
+ " pacman -S python-argostranslate"
+ " # Arch Linux (if available)\n\n"
f"Original error: {error_msg}"
- ) from e
+ )
+ raise ImportError(msg) from e
+ except ImportError:
+ msg = (
+ "argostranslate installation succeeded but "
+ "import failed"
+ )
+ raise ImportError(msg) from None
def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
@@ -316,11 +344,9 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
Raises:
ValueError: If language pair cannot be obtained.
"""
- import argostranslate.package
- import argostranslate.translate
-
- # Check if already installed
- installed_languages = argostranslate.translate.get_installed_languages()
+ installed_languages = (
+ argostranslate.translate.get_installed_languages()
+ )
from_lang_obj = None
to_lang_obj = None
@@ -337,37 +363,44 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
return # Already available
# Need to download
- import sys
-
- print(
- f"Downloading language pack: {from_lang} -> {to_lang}...",
- file=sys.stderr,
+ logger.info(
+ "Downloading language pack: %s -> %s...",
+ from_lang,
+ to_lang,
)
- print(" Fetching package index...", file=sys.stderr)
+ logger.info(" Fetching package index...")
argostranslate.package.update_package_index()
available = argostranslate.package.get_available_packages()
pkg = next(
- (p for p in available if p.from_code == from_lang and p.to_code == to_lang),
+ (
+ p
+ for p in available
+ if p.from_code == from_lang and p.to_code == to_lang
+ ),
None,
)
if pkg is None:
- raise ValueError(
- f"No language pack available for {from_lang} -> {to_lang}. "
- f"Available pairs can be listed with --list-languages."
+ msg = (
+ f"No language pack available for "
+ f"{from_lang} -> {to_lang}. "
+ "Available pairs can be listed with "
+ "--list-languages."
)
+ raise ValueError(msg)
- print(
- " Downloading package (~50-100MB, this may take a minute)...",
- file=sys.stderr,
+ logger.info(
+ " Downloading package (~50-100MB, "
+ "this may take a minute)...",
)
download_path = pkg.download()
- print(" Installing language pack...", file=sys.stderr)
+ logger.info(" Installing language pack...")
argostranslate.package.install_from_path(download_path)
- print(
- f"Language pack {from_lang} -> {to_lang} installed.",
- file=sys.stderr,
+ logger.info(
+ "Language pack %s -> %s installed.",
+ from_lang,
+ to_lang,
)
@@ -393,38 +426,30 @@ def translate_word(
ImportError: If argostranslate is not available and cannot be installed.
"""
# Check cache first
- if use_cache:
- try:
- from python_pkg.word_frequency.cache import get_translation_cache
-
- cache = get_translation_cache()
- cached = cache.get(word, from_lang, to_lang)
- if cached is not None:
- return TranslationResult(
- source_word=word,
- translated_word=cached,
- source_lang=from_lang,
- target_lang=to_lang,
- success=True,
- )
- except ImportError:
- pass # Cache not available
+ if use_cache and get_translation_cache is not None:
+ cache = get_translation_cache()
+ cached = cache.get(word, from_lang, to_lang)
+ if cached is not None:
+ return TranslationResult(
+ source_word=word,
+ translated_word=cached,
+ source_lang=from_lang,
+ target_lang=to_lang,
+ success=True,
+ )
# Ensure argos is installed (will raise if it can't be)
_ensure_argos_installed()
- import argostranslate.translate
-
try:
- translated = argostranslate.translate.translate(word, from_lang, to_lang)
+ translated = argostranslate.translate.translate(
+ word, from_lang, to_lang,
+ )
# Cache the result
- if use_cache:
- try:
- from python_pkg.word_frequency.cache import get_translation_cache
-
- get_translation_cache().set(word, from_lang, to_lang, translated)
- except ImportError:
- pass
+ if use_cache and get_translation_cache is not None:
+ get_translation_cache().set(
+ word, from_lang, to_lang, translated,
+ )
return TranslationResult(
source_word=word,
translated_word=translated,
@@ -432,7 +457,7 @@ def translate_word(
target_lang=to_lang,
success=True,
)
- except Exception as e: # noqa: BLE001
+ except (OSError, RuntimeError, ValueError, TypeError) as e:
return TranslationResult(
source_word=word,
translated_word="",
@@ -483,8 +508,6 @@ def _translate_batch_worker(
Returns:
Tuple of (batch_idx, translations dict).
"""
- import argostranslate.translate
-
translations: dict[str, str] = {}
# Batch translate by joining with newlines
@@ -507,6 +530,78 @@ def _translate_batch_worker(
return batch_idx, translations
+def _run_batch_translation(
+ words_to_translate: list[str],
+ from_lang: str,
+ to_lang: str,
+) -> dict[str, str]:
+ """Translate a list of words in batches with progress logging.
+
+ Args:
+ words_to_translate: Words needing translation.
+ from_lang: Source language code.
+ to_lang: Target language code.
+
+ Returns:
+ Dict mapping lowercased words to translations.
+
+ Raises:
+ RuntimeError: If translation fails.
+ """
+ new_translations: dict[str, str] = {}
+ num_to_translate = len(words_to_translate)
+
+ gpu_status = (
+ " (GPU)" if _check_cuda_available() else " (CPU)"
+ )
+ logger.info(
+ "Translating %d words from %s to %s%s...",
+ num_to_translate,
+ from_lang,
+ to_lang,
+ gpu_status,
+ )
+
+ try:
+ batches = [
+ words_to_translate[i : i + _BATCH_SIZE]
+ for i in range(0, num_to_translate, _BATCH_SIZE)
+ ]
+ total_batches = len(batches)
+
+ for batch_idx, batch_words in enumerate(batches):
+ words_done = min(
+ (batch_idx + 1) * _BATCH_SIZE,
+ num_to_translate,
+ )
+ pct = int(words_done / num_to_translate * 100)
+
+ logger.info(
+ " [%3d%%] Translating batch %d/%d "
+ "(%d/%d words)...",
+ pct,
+ batch_idx + 1,
+ total_batches,
+ words_done,
+ num_to_translate,
+ )
+
+ _, batch_translations = _translate_batch_worker(
+ batch_words, from_lang, to_lang, batch_idx,
+ )
+ new_translations.update(batch_translations)
+
+ logger.info(" Translation complete.")
+ except Exception as e:
+ msg = (
+ f"Translation failed for "
+ f"{from_lang} -> {to_lang}: {e}"
+ )
+ raise RuntimeError(msg) from e
+
+ return new_translations
+
+
def translate_words_batch(
words: Sequence[str],
from_lang: str,
@@ -535,90 +630,36 @@ def translate_words_batch(
if not words:
return []
- # Ensure argos is installed (will raise if it can't be)
_ensure_argos_installed()
-
- # Initialize GPU if available (will raise if CUDA available but fails)
_init_gpu_if_available()
-
- # Ensure language pair is available
_ensure_language_pair(from_lang, to_lang)
# Check cache for already-translated words
cached_results: dict[str, str] = {}
- words_to_translate: list[str] = []
-
- if use_cache:
- try:
- from python_pkg.word_frequency.cache import get_translation_cache
-
- cache = get_translation_cache()
- cached_results = cache.get_many(list(words), from_lang, to_lang)
- except ImportError:
- pass
+ if use_cache and get_translation_cache is not None:
+ cache = get_translation_cache()
+ cached_results = cache.get_many(
+ list(words), from_lang, to_lang,
+ )
# Find words that still need translation
- for word in words:
- if word.lower() not in cached_results:
- words_to_translate.append(word)
+ words_to_translate = [
+ word for word in words
+ if word.lower() not in cached_results
+ ]
# Translate uncached words using argos batch
new_translations: dict[str, str] = {}
if words_to_translate:
- import sys
-
- num_to_translate = len(words_to_translate)
-
- # Check if GPU is being used
- gpu_status = " (GPU)" if _gpu_available else " (CPU)"
- print(
- f"Translating {num_to_translate} words from {from_lang} to {to_lang}{gpu_status}...",
- file=sys.stderr,
- flush=True,
+ new_translations = _run_batch_translation(
+ words_to_translate, from_lang, to_lang,
)
- try:
- # Split into batches - larger batches are faster but show progress less often
- BATCH_SIZE = 100
- batches: list[list[str]] = []
- for i in range(0, num_to_translate, BATCH_SIZE):
- batches.append(words_to_translate[i : i + BATCH_SIZE])
-
- total_batches = len(batches)
-
- # Sequential translation with progress
- # (argostranslate is not thread-safe - uses global model)
- for batch_idx, batch_words in enumerate(batches):
- words_done = (batch_idx + 1) * BATCH_SIZE
- words_done = min(words_done, num_to_translate)
- pct = int(words_done / num_to_translate * 100)
-
- print(
- f" [{pct:3d}%] Translating batch {batch_idx + 1}/{total_batches} "
- f"({words_done}/{num_to_translate} words)...",
- file=sys.stderr,
- flush=True,
- )
-
- _, batch_translations = _translate_batch_worker(
- batch_words, from_lang, to_lang, batch_idx
- )
- new_translations.update(batch_translations)
-
- print(" Translation complete.", file=sys.stderr, flush=True)
- except Exception as e:
- raise RuntimeError(
- f"Translation failed for {from_lang} -> {to_lang}: {e}"
- ) from e
-
# Cache new translations
- if use_cache and new_translations:
- try:
- from python_pkg.word_frequency.cache import get_translation_cache
-
- get_translation_cache().set_many(new_translations, from_lang, to_lang)
- except ImportError:
- pass
+ if use_cache and get_translation_cache is not None:
+ get_translation_cache().set_many(
+ new_translations, from_lang, to_lang,
+ )
# Merge cached and new translations
all_translations = {**cached_results, **new_translations}
@@ -694,22 +735,14 @@ def read_file(filepath: str | Path) -> str:
return Path(filepath).read_text(encoding="utf-8")
-def main(argv: Sequence[str] | None = None) -> int:
- """Main entry point for the translator.
-
- Args:
- argv: Command line arguments.
-
- Returns:
- Exit code.
- """
+def _build_parser() -> argparse.ArgumentParser:
+ """Build the argument parser for the translator CLI."""
parser = argparse.ArgumentParser(
description="Offline translator using Argos Translate.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
- # Actions
action_group = parser.add_mutually_exclusive_group()
action_group.add_argument(
"--list-languages",
@@ -728,10 +761,12 @@ def main(argv: Sequence[str] | None = None) -> int:
"-d",
nargs="+",
metavar="LANG",
- help="Download language packs (e.g., --download en es pl)",
+ help=(
+ "Download language packs "
+ "(e.g., --download en es pl)"
+ ),
)
- # Input
input_group = parser.add_mutually_exclusive_group()
input_group.add_argument(
"--text",
@@ -752,7 +787,6 @@ def main(argv: Sequence[str] | None = None) -> int:
help="File with words to translate (one per line)",
)
- # Language options
parser.add_argument(
"--from",
"-f",
@@ -769,8 +803,6 @@ def main(argv: Sequence[str] | None = None) -> int:
default="en",
help="Target language code (default: en)",
)
-
- # Output
parser.add_argument(
"--output",
"-o",
@@ -778,87 +810,142 @@ def main(argv: Sequence[str] | None = None) -> int:
help="Output file path",
)
- args = parser.parse_args(argv)
+ return parser
- # Check if argostranslate is available
- if not _check_argos():
- print(
- "Error: argostranslate is not installed.\n"
- "Install it with: pip install argostranslate",
- file=sys.stderr,
+
+def _handle_list_languages() -> int:
+ """Handle --list-languages command."""
+ langs = get_installed_languages()
+ if not langs:
+ sys.stdout.write("No languages installed.\n")
+ sys.stdout.write(
+ "Download some with: --download en es pl de fr\n",
)
- return 1
+ else:
+ sys.stdout.write("Installed languages:\n")
+ for code, name in sorted(langs):
+ sys.stdout.write(f" {code}: {name}\n")
+ return 0
- # Handle list-languages
- if args.list_languages:
- langs = get_installed_languages()
- if not langs:
- print("No languages installed.")
- print("Download some with: --download en es pl de fr")
- else:
- print("Installed languages:")
- for code, name in sorted(langs):
- print(f" {code}: {name}")
- return 0
- # Handle list-available
- if args.list_available:
- packages = get_available_packages()
- if not packages:
- print("No packages available (check internet connection).")
- else:
- print("Available language packages:")
- for from_code, from_name, to_code, to_name in sorted(packages):
- print(f" {from_code} ({from_name}) -> {to_code} ({to_name})")
- return 0
+def _handle_list_available() -> int:
+ """Handle --list-available command."""
+ packages = get_available_packages()
+ if not packages:
+ sys.stdout.write(
+ "No packages available "
+ "(check internet connection).\n",
+ )
+ else:
+ sys.stdout.write("Available language packages:\n")
+ for from_code, from_name, to_code, to_name in sorted(
+ packages,
+ ):
+ sys.stdout.write(
+ f" {from_code} ({from_name})"
+ f" -> {to_code} ({to_name})\n",
+ )
+ return 0
- # Handle download
- if args.download:
- download_results = download_languages(args.download)
- success_count = sum(1 for v in download_results.values() if v)
- print(f"\nDownloaded {success_count}/{len(download_results)} language pairs.")
- return 0 if success_count > 0 else 1
- # Handle translation
- words: list[str] = []
+def _handle_download(lang_codes: list[str]) -> int:
+ """Handle --download command."""
+ download_results = download_languages(lang_codes)
+ success_count = sum(
+ 1 for v in download_results.values() if v
+ )
+ sys.stdout.write(
+ f"\nDownloaded {success_count}/"
+ f"{len(download_results)} language pairs.\n",
+ )
+ return 0 if success_count > 0 else 1
+
+
+def _collect_words(
+ args: argparse.Namespace,
+) -> list[str] | None:
+ """Collect words from args. Returns None on error."""
if args.text:
- words = [args.text]
- elif args.words:
- words = args.words
- elif args.words_file:
+ return [args.text]
+ if args.words:
+ return args.words
+ if args.words_file:
try:
content = read_file(args.words_file)
- words = [w.strip() for w in content.splitlines() if w.strip()]
except FileNotFoundError:
- print(f"Error: File not found: {args.words_file}", file=sys.stderr)
- return 1
+ sys.stderr.write(
+ f"Error: File not found: {args.words_file}\n",
+ )
+ return None
+ return [
+ w.strip()
+ for w in content.splitlines()
+ if w.strip()
+ ]
+ return []
- if not words:
- parser.print_help()
- return 1
- # Translate
+def _handle_translation(args: argparse.Namespace) -> int:
+ """Handle the translation action."""
try:
- results = translate_words_batch(words, args.from_lang, args.to_lang)
- except ImportError as e:
- print(f"Error: {e}", file=sys.stderr)
+ results = translate_words_batch(
+ args.words, args.from_lang, args.to_lang,
+ )
+ except ImportError:
+ logger.exception("Translation import error")
return 1
output = format_translations(results)
- # Output
if args.output:
Path(args.output).write_text(output, encoding="utf-8")
- print(f"Translations written to {args.output}")
+ sys.stdout.write(
+ f"Translations written to {args.output}\n",
+ )
else:
- print(output)
+ sys.stdout.write(output + "\n")
- # Return error if any translation failed
if any(not r.success for r in results):
return 1
return 0
+def main(argv: Sequence[str] | None = None) -> int:
+ """Main entry point for the translator.
+
+ Args:
+ argv: Command line arguments.
+
+ Returns:
+ Exit code.
+ """
+ parser = _build_parser()
+ args = parser.parse_args(argv)
+
+ if not _check_argos():
+ sys.stderr.write(
+ "Error: argostranslate is not installed.\n"
+ "Install it with: pip install argostranslate\n",
+ )
+ return 1
+
+ if args.list_languages:
+ return _handle_list_languages()
+ if args.list_available:
+ return _handle_list_available()
+ if args.download:
+ return _handle_download(args.download)
+
+ words = _collect_words(args)
+ if not words:
+ if words is not None:
+ parser.print_help()
+ return 1
+
+ args.words = words
+ return _handle_translation(args)
+
+
if __name__ == "__main__":
sys.exit(main())
diff --git a/python_pkg/word_frequency/vocabulary_curve.py b/python_pkg/word_frequency/vocabulary_curve.py
index 46c0e2d..54ca7e5 100755
--- a/python_pkg/word_frequency/vocabulary_curve.py
+++ b/python_pkg/word_frequency/vocabulary_curve.py
@@ -14,7 +14,9 @@ Usage:
from __future__ import annotations
import argparse
+import logging
from pathlib import Path
+import re
import sys
from typing import TYPE_CHECKING, NamedTuple
@@ -27,6 +29,9 @@ except ImportError:
from analyzer import analyze_text, read_file
+logger = logging.getLogger(__name__)
+
+
class ExcerptAnalysis(NamedTuple):
"""Analysis result for an excerpt length."""
@@ -111,8 +116,6 @@ def find_optimal_excerpts(
ranked_words = [word for word, _ in word_counts.most_common()]
# Extract all words from text (preserving order)
- import re
-
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
if not case_sensitive:
all_words = [w.lower() for w in all_words]
@@ -150,6 +153,9 @@ def find_optimal_excerpts(
return results
+_MAX_EXCERPT_DISPLAY_LEN = 50
+
+
def format_results(
results: list[ExcerptAnalysis],
*,
@@ -198,7 +204,7 @@ def format_results(
if show_excerpts:
# Truncate long excerpts
excerpt = r.best_excerpt
- if len(excerpt) > 50:
+ if len(excerpt) > _MAX_EXCERPT_DISPLAY_LEN:
excerpt = excerpt[:47] + "..."
lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>5} {excerpt}")
else:
@@ -285,10 +291,7 @@ def main(argv: Sequence[str] | None = None) -> int:
args = parser.parse_args(argv)
try:
- if args.text:
- text = args.text
- else:
- text = read_file(args.file)
+ text = args.text or read_file(args.file)
results = find_optimal_excerpts(
text,
@@ -304,15 +307,15 @@ def main(argv: Sequence[str] | None = None) -> int:
if args.output:
Path(args.output).write_text(output, encoding="utf-8")
- print(f"Output written to {args.output}")
+ logger.info("Output written to %s", args.output)
else:
- print(output)
+ logger.info("%s", output)
- except FileNotFoundError as e:
- print(f"Error: File not found - {e}", file=sys.stderr)
+ except FileNotFoundError:
+ logger.exception("File not found")
return 1
- except UnicodeDecodeError as e:
- print(f"Error: Could not decode file - {e}", file=sys.stderr)
+ except UnicodeDecodeError:
+ logger.exception("Could not decode file")
return 1
return 0