feat: automatic language detection translation and anki generator with cache

2026-07-04 14:23:04 +02:00 · 2025-12-29 14:41:56 +01:00 · 2025-12-29 14:41:56 +01:00 · d2b6f00185
commit d2b6f00185
parent 1411e685c2
10 changed files with 3826 additions and 299 deletions
--- a/C/vocabulary_curve/main.c
+++ b/C/vocabulary_curve/main.c
@ -158,9 +158,20 @@ static void assign_ranks(void) {
    /* Sort all_entries by frequency (this doesn't affect word_sequence) */
    qsort(all_entries, num_unique_words, sizeof(WordEntry *), compare_by_count);
    
-    /* Assign 1-indexed ranks */
+    /* Assign 1-indexed ranks using competition ranking:
+     * Words with same frequency get same rank.
+     * Next rank is current_position + 1 (skipping numbers).
+     * Example: counts 5,3,3,2 -> ranks 1,2,2,4 (not 1,2,3,4) */
    for (int i = 0; i < num_unique_words; i++) {
-        all_entries[i]->rank = i + 1;
+        if (i == 0) {
+            all_entries[i]->rank = 1;
+        } else if (all_entries[i]->count == all_entries[i-1]->count) {
+            /* Same frequency as previous word - same rank */
+            all_entries[i]->rank = all_entries[i-1]->rank;
+        } else {
+            /* Different frequency - rank is position + 1 */
+            all_entries[i]->rank = i + 1;
+        }
    }
 }

@ -306,20 +317,42 @@ static void cleanup(void) {
    }
 }

+/*
+ * Dump vocabulary with ranks (for Python integration).
+ *
+ * Prints a VOCAB_DUMP_START line, then one "word;rank" line for every
+ * entry of all_entries whose rank is at most max_rank, then a
+ * VOCAB_DUMP_END line. Entries are visited in all_entries order.
+ */
+static void dump_vocabulary(int max_rank) {
+    printf("VOCAB_DUMP_START\n");
+    for (int idx = 0; idx < num_unique_words; idx++) {
+        /* Skip anything ranked beyond the requested cut-off */
+        if (all_entries[idx]->rank > max_rank) {
+            continue;
+        }
+        printf("%s;%d\n", all_entries[idx]->word, all_entries[idx]->rank);
+    }
+    printf("VOCAB_DUMP_END\n");
+}
+
 int main(int argc, char *argv[]) {
    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <file.txt> [max_length]\n", argv[0]);
+        fprintf(stderr, "Usage: %s <file.txt> [max_length] [--dump-vocab [max_rank]]\n", argv[0]);
        fprintf(stderr, "  max_length: maximum excerpt length to analyze (default: 30)\n");
+        fprintf(stderr, "  --dump-vocab: output all words with ranks up to max_rank\n");
        return 1;
    }
    
    const char *filename = argv[1];
    int max_length = 30;
+    bool dump_vocab = false;
+    int dump_max_rank = 0;
    
-    if (argc >= 3) {
-        max_length = atoi(argv[2]);
-        if (max_length < 1) max_length = 1;
-        if (max_length > 1000) max_length = 1000;
+    /* Parse arguments */
+    for (int i = 2; i < argc; i++) {
+        if (strcmp(argv[i], "--dump-vocab") == 0) {
+            dump_vocab = true;
+            if (i + 1 < argc && argv[i + 1][0] != '-') {
+                dump_max_rank = atoi(argv[++i]);
+            }
+        } else if (argv[i][0] != '-') {
+            max_length = atoi(argv[i]);
+            if (max_length < 1) max_length = 1;
+            if (max_length > 1000) max_length = 1000;
+        }
    }
    
    /* Initialize hash table */
@ -351,6 +384,17 @@ int main(int argc, char *argv[]) {
    /* Print results */
    print_results(results, max_length);
    
+    /* Dump vocabulary if requested */
+    if (dump_vocab) {
+        /* If no max_rank specified, use the max from the excerpt */
+        if (dump_max_rank == 0 && max_length > 0) {
+            dump_max_rank = results[max_length - 1].min_vocab_needed;
+        }
+        if (dump_max_rank > 0) {
+            dump_vocabulary(dump_max_rank);
+        }
+    }
+    
    /* Cleanup */
    free(results);
    cleanup();
--- a/C/vocabulary_curve/vocabulary_curve
+++ b/C/vocabulary_curve/vocabulary_curve
--- a/python_pkg/word_frequency/anki_generator.py
+++ b/python_pkg/word_frequency/anki_generator.py
@ -40,10 +40,10 @@ try:
        detect_language,
        translate_words_batch,
    )
-    from python_pkg.word_frequency.analyzer import read_file, analyze_text
+    from python_pkg.word_frequency.analyzer import read_file
 except ImportError:
    from translator import detect_language, translate_words_batch
-    from analyzer import read_file, analyze_text
+    from analyzer import read_file


 # Path to C vocabulary_curve executable
@ -59,12 +59,13 @@ class VocabWord(NamedTuple):
    context: str


-def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
+def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool = False) -> str:
    """Run the C vocabulary_curve executable.

    Args:
        filepath: Path to the text file.
        max_length: Maximum excerpt length.
+        dump_vocab: If True, also dump all vocabulary up to max rank needed.

    Returns:
        Output from the executable.
@ -79,8 +80,12 @@ def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
            "Please compile it first: cd C/vocabulary_curve && make"
        )

+    cmd = [str(C_EXECUTABLE), str(filepath), str(max_length)]
+    if dump_vocab:
+        cmd.append("--dump-vocab")
+
    result = subprocess.run(
-        [str(C_EXECUTABLE), str(filepath), str(max_length)],
+        cmd,
        capture_output=True,
        text=True,
        timeout=120,
@ -89,7 +94,7 @@ def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
    return result.stdout


-def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]]]:
+def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
    """Parse output from vocabulary_curve to get words needed.

    Args:
@ -97,11 +102,14 @@ def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str,
        target_length: The target excerpt length.

    Returns:
-        Tuple of (excerpt_text, list of (word, rank) tuples).
+        Tuple of (excerpt_text, excerpt_words, all_vocab_words).
+        excerpt_words: words in the excerpt with their ranks.
+        all_vocab_words: all words up to max rank (from VOCAB_DUMP if present).
    """
    lines = output.split("\n")
    excerpt = ""
-    words: list[tuple[str, int]] = []
+    excerpt_words: list[tuple[str, int]] = []
+    all_vocab: list[tuple[str, int]] = []

    # Find the line for the target length
    i = 0
@ -131,26 +139,28 @@ def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str,
                    # Parse "word(#rank), word2(#rank2), ..."
                    pattern = r"(\S+)\(#(\d+)\)"
                    matches = re.findall(pattern, words_part)
-                    words = [(w, int(r)) for w, r in matches]
+                    excerpt_words = [(w, int(r)) for w, r in matches]
            break
        i += 1

-    return excerpt, words
+    # Parse VOCAB_DUMP section if present
+    in_vocab_dump = False
+    for line in lines:
+        if line.strip() == "VOCAB_DUMP_START":
+            in_vocab_dump = True
+            continue
+        if line.strip() == "VOCAB_DUMP_END":
+            break
+        if in_vocab_dump and ";" in line:
+            parts = line.strip().split(";")
+            if len(parts) == 2:
+                word, rank_str = parts
+                try:
+                    all_vocab.append((word, int(rank_str)))
+                except ValueError:
+                    pass

-
-def get_top_n_words(text: str, n: int) -> list[tuple[str, int]]:
-    """Get the top N most frequent words from text.
-
-    Args:
-        text: The source text.
-        n: Number of top words to return.
-
-    Returns:
-        List of (word, rank) tuples, ranked 1 to n.
-    """
-    word_counts = analyze_text(text)
-    sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))
-    return [(word, rank + 1) for rank, (word, _) in enumerate(sorted_words[:n])]
+    return excerpt, excerpt_words, all_vocab


 def find_word_contexts(
@ -196,6 +206,8 @@ def generate_anki_deck(
    deck_name: str = "Vocabulary",
    include_context: bool = False,
    no_translate: bool = False,
+    excerpt: str = "",
+    excerpt_words: list[tuple[str, int]] | None = None,
 ) -> str:
    """Generate Anki-compatible deck content.

@ -207,6 +219,8 @@ def generate_anki_deck(
        deck_name: Name for the deck.
        include_context: Whether to include context in cards.
        no_translate: If True, skip translation (use placeholder).
+        excerpt: The target excerpt text to include in cards.
+        excerpt_words: List of (word, rank) tuples for words in the excerpt.

    Returns:
        Semicolon-separated content ready for Anki import.
@ -224,6 +238,27 @@ def generate_anki_deck(
        lines.append("#columns:Front;Back;Rank")
    lines.append("")  # Empty line before data

+    # Add excerpt as first card (goal/context card)
+    if excerpt:
+        excerpt_escaped = excerpt.replace(";", ",")
+        # Use excerpt_words from C output (has correct ranks)
+        if excerpt_words:
+            # Most frequent = lowest rank (italics), rarest = highest rank (bold)
+            most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
+            rarest = max(excerpt_words, key=lambda x: x[1])[0]
+            # Apply formatting - rarest first (bold), then most frequent (italics)
+            # to avoid nested tag issues if they're the same word
+            if most_frequent != rarest:
+                pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
+                excerpt_escaped = pattern_rare.sub(r"<b>\1</b>", excerpt_escaped)
+                pattern_freq = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
+                excerpt_escaped = pattern_freq.sub(r"<i>\1</i>", excerpt_escaped)
+            else:
+                # Same word is both most and least frequent - use bold+italic
+                pattern = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
+                excerpt_escaped = pattern.sub(r"<b><i>\1</i></b>", excerpt_escaped)
+        lines.append(f"📖 TARGET EXCERPT;{excerpt_escaped};#0")
+
    # Get translations (or skip if no_translate)
    words = [w for w, _ in words_with_ranks]
    if no_translate:
@ -263,6 +298,120 @@ def generate_anki_deck(
    return "\n".join(lines)


+def get_cached_excerpt(
+    filepath: Path, length: int, *, force: bool = False
+) -> tuple[str, list[tuple[str, int]]] | None:
+    """Get cached excerpt if available.
+
+    The cache module is imported lazily so that a missing cache module only
+    disables caching instead of breaking flashcard generation.
+
+    Args:
+        filepath: Path to source file.
+        length: Excerpt length.
+        force: If True, ignore cache.
+
+    Returns:
+        Tuple of (excerpt, words) or None if not cached, if ``force`` is
+        set, or if the cache module cannot be imported.
+    """
+    if force:
+        return None
+    try:
+        from python_pkg.word_frequency.cache import get_vocab_curve_cache
+    except ImportError:
+        try:
+            # Script mode: use the flat module name, mirroring the
+            # module-level translator/analyzer import fallback.
+            from cache import get_vocab_curve_cache
+        except ImportError:
+            return None
+    return get_vocab_curve_cache().get(filepath, length)
+
+
+def cache_excerpt(
+    filepath: Path, length: int, excerpt: str, words: list[tuple[str, int]]
+) -> None:
+    """Store excerpt in cache.
+
+    Does nothing when the cache module cannot be imported, so caching stays
+    a best-effort optimisation.
+
+    Args:
+        filepath: Path to source file.
+        length: Excerpt length.
+        excerpt: The excerpt text.
+        words: List of (word, rank) tuples.
+    """
+    try:
+        from python_pkg.word_frequency.cache import get_vocab_curve_cache
+    except ImportError:
+        try:
+            # Script mode: use the flat module name, mirroring the
+            # module-level translator/analyzer import fallback.
+            from cache import get_vocab_curve_cache
+        except ImportError:
+            return
+    get_vocab_curve_cache().set(filepath, length, excerpt, words)
+
+
+def get_cached_deck(
+    filepath: Path,
+    length: int,
+    target_lang: str,
+    include_context: bool,
+    all_vocab: bool,
+    *,
+    force: bool = False,
+) -> tuple[str, str, int, int] | None:
+    """Get cached Anki deck if available.
+
+    The cache module is imported lazily so that a missing cache module only
+    disables caching instead of breaking flashcard generation.
+
+    Args:
+        filepath: Path to source file.
+        length: Excerpt length.
+        target_lang: Target language.
+        include_context: Whether context is included.
+        all_vocab: Whether all vocab is included.
+        force: If True, ignore cache.
+
+    Returns:
+        Tuple of (content, excerpt, num_words, max_rank), or None when
+        nothing is cached, ``force`` is set, or the cache module cannot be
+        imported.
+    """
+    if force:
+        return None
+    try:
+        from python_pkg.word_frequency.cache import get_anki_deck_cache
+    except ImportError:
+        try:
+            # Script mode: use the flat module name, mirroring the
+            # module-level translator/analyzer import fallback.
+            from cache import get_anki_deck_cache
+        except ImportError:
+            return None
+    return get_anki_deck_cache().get(
+        filepath, length, target_lang, include_context, all_vocab
+    )
+
+
+def cache_deck(
+    filepath: Path,
+    length: int,
+    target_lang: str,
+    include_context: bool,
+    all_vocab: bool,
+    anki_content: str,
+    excerpt: str,
+    num_words: int,
+    max_rank: int,
+) -> None:
+    """Store Anki deck in cache.
+
+    Does nothing when the cache module cannot be imported, so caching stays
+    a best-effort optimisation.
+
+    Args:
+        filepath: Path to source file.
+        length: Excerpt length.
+        target_lang: Target language.
+        include_context: Whether context is included.
+        all_vocab: Whether all vocab is included.
+        anki_content: The deck content.
+        excerpt: The excerpt text.
+        num_words: Number of words.
+        max_rank: Maximum rank.
+    """
+    try:
+        from python_pkg.word_frequency.cache import get_anki_deck_cache
+    except ImportError:
+        try:
+            # Script mode: use the flat module name, mirroring the
+            # module-level translator/analyzer import fallback.
+            from cache import get_anki_deck_cache
+        except ImportError:
+            return
+    get_anki_deck_cache().set(
+        filepath,
+        length,
+        target_lang,
+        include_context,
+        all_vocab,
+        anki_content,
+        excerpt,
+        num_words,
+        max_rank,
+    )
+
+
 def generate_flashcards(
    filepath: str | Path,
    excerpt_length: int,
@ -272,6 +421,8 @@ def generate_flashcards(
    deck_name: str | None = None,
    all_vocab: bool = True,
    no_translate: bool = False,
+    *,
+    force: bool = False,
 ) -> tuple[str, str, int, int]:
    """Generate Anki flashcards for vocabulary needed for an excerpt length.

@ -285,26 +436,39 @@ def generate_flashcards(
        all_vocab: If True, include ALL words from rank 1 to max rank needed.
                   If False, only include words that appear in the excerpt.
        no_translate: If True, skip translation.
+        force: If True, ignore all caches and regenerate.

    Returns:
        Tuple of (anki_content, excerpt, num_words, max_rank).
    """
    filepath = Path(filepath)

-    # Read the text
-    text = read_file(filepath)
+    # Check for cached full deck (if not using no_translate)
+    if not no_translate and not force:
+        cached = get_cached_deck(
+            filepath, excerpt_length, target_lang, include_context, all_vocab
+        )
+        if cached is not None:
+            return cached
+
+    # Read the text (only needed for context finding)
+    text = read_file(filepath) if include_context else ""

    # Auto-detect language if not provided
    if source_lang is None:
-        source_lang = detect_language(text)
+        sample_text = read_file(filepath)[:1000] if not text else text[:1000]
+        source_lang = detect_language(sample_text)
        if source_lang is None:
-            source_lang = "auto"
+            raise ValueError(
+                "Could not auto-detect source language. "
+                "Please specify with --from (e.g., --from pl for Polish). "
+                "Install langdetect for auto-detection: pip install langdetect"
+            )

-    # Run vocabulary curve analysis
-    output = run_vocabulary_curve(filepath, excerpt_length)
-
-    # Parse the output
-    excerpt, excerpt_words = parse_vocabulary_curve_output(output, excerpt_length)
+    # Run vocabulary curve analysis with vocab dump for all words
+    output = run_vocabulary_curve(filepath, excerpt_length, dump_vocab=all_vocab)
+    # Parse the output (now includes all vocabulary from C)
+    excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(output, excerpt_length)

    if not excerpt_words:
        raise ValueError(f"No words found for excerpt length {excerpt_length}")
@ -312,15 +476,17 @@ def generate_flashcards(
    # Find max rank needed
    max_rank = max(rank for _, rank in excerpt_words)

-    # Get ALL words up to max_rank if requested
-    if all_vocab:
-        words_with_ranks = get_top_n_words(text, max_rank)
+    # Use vocabulary from C output
+    if all_vocab and all_vocab_words:
+        words_with_ranks = all_vocab_words
    else:
        words_with_ranks = excerpt_words

    # Get contexts if requested
    contexts = None
    if include_context:
+        if not text:
+            text = read_file(filepath)
        words = [w for w, _ in words_with_ranks]
        contexts = find_word_contexts(text, words)

@ -337,8 +503,24 @@ def generate_flashcards(
        deck_name,
        include_context,
        no_translate,
+        excerpt,
+        excerpt_words,
    )

+    # Cache the full deck (if translated)
+    if not no_translate:
+        cache_deck(
+            filepath,
+            excerpt_length,
+            target_lang,
+            include_context,
+            all_vocab,
+            anki_content,
+            excerpt,
+            len(words_with_ranks),
+            max_rank,
+        )
+
    return anki_content, excerpt, len(words_with_ranks), max_rank


@ -361,19 +543,18 @@ def main(argv: Sequence[str] | None = None) -> int:
        "--file",
        "-f",
        type=str,
-        required=True,
+        default=None,
        help="Path to the text file to analyze",
    )
    parser.add_argument(
        "--length",
        "-l",
        type=int,
-        required=True,
+        default=None,
        help="Target excerpt length (how many words you want to understand)",
    )
    parser.add_argument(
        "--from",
-        "-F",
        dest="source_lang",
        type=str,
        default=None,
@ -425,9 +606,72 @@ def main(argv: Sequence[str] | None = None) -> int:
        action="store_true",
        help="Skip translation (output words without translations)",
    )
+    parser.add_argument(
+        "--force",
+        "-F",
+        action="store_true",
+        help="Force regeneration, ignoring all caches",
+    )
+    parser.add_argument(
+        "--cache-stats",
+        action="store_true",
+        help="Show cache statistics and exit",
+    )
+    parser.add_argument(
+        "--clear-cache",
+        action="store_true",
+        help="Clear all caches and exit",
+    )

    args = parser.parse_args(argv)

+    # Handle cache management commands
+    if args.cache_stats:
+        try:
+            from python_pkg.word_frequency.cache import get_all_cache_stats
+        except ImportError:
+            try:
+                from cache import get_all_cache_stats
+            except ImportError:
+                print("Cache module not available", file=sys.stderr)  # noqa: T201
+                return 1
+        stats = get_all_cache_stats()
+        print("Cache Statistics")  # noqa: T201
+        print("=" * 50)  # noqa: T201
+        for cache_name, cache_stats in stats.items():
+            print(f"\n{cache_name.upper()}:")  # noqa: T201
+            for key, value in cache_stats.items():
+                if key == "cache_size_bytes":
+                    if value < 1024:
+                        size_str = f"{value} B"
+                    elif value < 1024 * 1024:
+                        size_str = f"{value / 1024:.1f} KB"
+                    else:
+                        size_str = f"{value / (1024 * 1024):.1f} MB"
+                    print(f"  {key}: {size_str}")  # noqa: T201
+                else:
+                    print(f"  {key}: {value}")  # noqa: T201
+        return 0
+
+    if args.clear_cache:
+        try:
+            from python_pkg.word_frequency.cache import clear_all_caches
+        except ImportError:
+            try:
+                from cache import clear_all_caches
+            except ImportError:
+                print("Cache module not available", file=sys.stderr)  # noqa: T201
+                return 1
+        clear_all_caches()
+        print("All caches cleared.")  # noqa: T201
+        return 0
+
+    # Validate required arguments for main functionality
+    if args.file is None:
+        parser.error("--file/-f is required")
+    if args.length is None:
+        parser.error("--length/-l is required")
+
    try:
        filepath = Path(args.file)
        if not filepath.exists():
@ -448,6 +692,7 @@ def main(argv: Sequence[str] | None = None) -> int:
            deck_name=args.deck_name,
            all_vocab=not args.excerpt_words_only,
            no_translate=args.no_translate,
+            force=args.force,
        )

        # Determine output path
--- a/python_pkg/word_frequency/cache.py
+++ b/python_pkg/word_frequency/cache.py
@ -0,0 +1,641 @@
+#!/usr/bin/env python3
+"""Caching utilities for word frequency analysis.
+
+Provides disk-based caching for:
+- Translations (word -> translation mappings)
+- Vocabulary curve excerpts (file + length -> excerpt + words)
+- Generated Anki decks
+
+Cache location: ~/.cache/word_frequency/
+"""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    pass
+
+# Default cache directory
+DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
+
+
+def get_cache_dir() -> Path:
+    """Resolve the cache directory and make sure it exists.
+
+    The ``WORD_FREQ_CACHE_DIR`` environment variable, when set, takes
+    precedence over ``DEFAULT_CACHE_DIR``. Missing parent directories are
+    created as well.
+
+    Returns:
+        Path to cache directory.
+    """
+    override = os.environ.get("WORD_FREQ_CACHE_DIR")
+    if override is None:
+        target = Path(str(DEFAULT_CACHE_DIR))
+    else:
+        target = Path(override)
+    target.mkdir(parents=True, exist_ok=True)
+    return target
+
+
+def get_file_hash(filepath: Path) -> str:
+    """Return the SHA256 hex digest of a file's bytes.
+
+    Args:
+        filepath: Path to file.
+
+    Returns:
+        Hex digest of file hash.
+    """
+    digest = hashlib.sha256()
+    with open(filepath, "rb") as stream:
+        # Feed the file in 64 KiB pieces so it is never held in memory whole
+        while True:
+            block = stream.read(65536)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def get_text_hash(text: str) -> str:
+    """Return the SHA256 hex digest of the UTF-8 encoding of ``text``.
+
+    Args:
+        text: Text to hash.
+
+    Returns:
+        Hex digest of text hash.
+    """
+    encoded = text.encode("utf-8")
+    digest = hashlib.sha256(encoded)
+    return digest.hexdigest()
+
+
+# =============================================================================
+# Translation Cache
+# =============================================================================
+
+
+class TranslationCache:
+    """Cache for word translations.
+
+    Entries are kept in one JSON file, ``translations.json``, inside the
+    cache directory. The file is read lazily on first access and rewritten
+    only when there are unsaved changes.
+    """
+
+    def __init__(self, cache_dir: Path | None = None) -> None:
+        """Initialize translation cache.
+
+        Args:
+            cache_dir: Optional custom cache directory; when omitted the
+                result of ``get_cache_dir()`` is used.
+        """
+        self.cache_dir = cache_dir or get_cache_dir()
+        self.cache_file = self.cache_dir / "translations.json"
+        self._cache: dict[str, str] | None = None  # Populated on first use
+        self._dirty = False  # Track if cache needs saving
+
+    def _load_cache(self) -> dict[str, str]:
+        """Return the in-memory cache, reading it from disk on first use.
+
+        A missing, unreadable, or malformed cache file yields an empty
+        cache rather than an error.
+
+        Returns:
+            The mutable mapping of cache keys to translations.
+        """
+        if self._cache is None:
+            loaded: Any = None
+            if self.cache_file.exists():
+                try:
+                    loaded = json.loads(
+                        self.cache_file.read_text(encoding="utf-8")
+                    )
+                except (ValueError, OSError):
+                    # ValueError covers invalid JSON and undecodable bytes
+                    loaded = None
+            # Valid JSON that is not an object (e.g. [] or null) cannot act
+            # as the key/value store, so start from an empty cache instead.
+            self._cache = loaded if isinstance(loaded, dict) else {}
+        return self._cache
+
+    def _save_cache(self) -> None:
+        """Save cache to disk if dirty."""
+        if self._cache is not None and self._dirty:
+            self.cache_file.write_text(
+                json.dumps(self._cache, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+            self._dirty = False
+
+    def flush(self) -> None:
+        """Write unsaved changes to disk; does nothing if none are pending."""
+        self._save_cache()
+
+    @staticmethod
+    def _make_key(word: str, source_lang: str, target_lang: str) -> str:
+        """Create cache key for a translation.
+
+        The word is lower-cased, so lookups are case-insensitive.
+
+        Args:
+            word: Word to translate.
+            source_lang: Source language code.
+            target_lang: Target language code.
+
+        Returns:
+            Cache key string.
+        """
+        return f"{source_lang}:{target_lang}:{word.lower()}"
+
+    def get(
+        self, word: str, source_lang: str, target_lang: str
+    ) -> str | None:
+        """Get cached translation.
+
+        Args:
+            word: Word to look up.
+            source_lang: Source language code.
+            target_lang: Target language code.
+
+        Returns:
+            Cached translation or None if not found.
+        """
+        cache = self._load_cache()
+        key = self._make_key(word, source_lang, target_lang)
+        return cache.get(key)
+
+    def set(
+        self, word: str, source_lang: str, target_lang: str, translation: str,
+        *, auto_save: bool = False,
+    ) -> None:
+        """Store translation in cache.
+
+        Args:
+            word: Original word.
+            source_lang: Source language code.
+            target_lang: Target language code.
+            translation: Translated word.
+            auto_save: If True, save to disk immediately.
+        """
+        cache = self._load_cache()
+        key = self._make_key(word, source_lang, target_lang)
+        cache[key] = translation
+        self._dirty = True
+        if auto_save:
+            self._save_cache()
+
+    def get_many(
+        self, words: list[str], source_lang: str, target_lang: str
+    ) -> dict[str, str]:
+        """Get multiple cached translations.
+
+        Args:
+            words: Words to look up.
+            source_lang: Source language code.
+            target_lang: Target language code.
+
+        Returns:
+            Dict mapping the lower-cased words that were found to their
+            cached translations; words without an entry are omitted.
+        """
+        cache = self._load_cache()
+        result: dict[str, str] = {}
+        for word in words:
+            key = self._make_key(word, source_lang, target_lang)
+            if key in cache:
+                result[word.lower()] = cache[key]
+        return result
+
+    def set_many(
+        self,
+        translations: dict[str, str],
+        source_lang: str,
+        target_lang: str,
+    ) -> None:
+        """Store multiple translations in cache and save to disk.
+
+        Args:
+            translations: Dict mapping words to translations.
+            source_lang: Source language code.
+            target_lang: Target language code.
+        """
+        cache = self._load_cache()
+        for word, translation in translations.items():
+            key = self._make_key(word, source_lang, target_lang)
+            cache[key] = translation
+        self._dirty = True
+        self._save_cache()  # Save once after all additions
+
+    def clear(self) -> None:
+        """Clear all cached translations and delete the cache file."""
+        self._cache = {}
+        self._dirty = False
+        if self.cache_file.exists():
+            self.cache_file.unlink()
+
+    def stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dict with the entry count, the cache file path, and the cache
+            file size in bytes (0 when the file does not exist).
+        """
+        cache = self._load_cache()
+        return {
+            "total_entries": len(cache),
+            "cache_file": str(self.cache_file),
+            "cache_size_bytes": (
+                self.cache_file.stat().st_size if self.cache_file.exists() else 0
+            ),
+        }
+
+
+# =============================================================================
+# Vocabulary Curve Cache
+# =============================================================================
+
+
+class VocabCurveCache:
+    """Cache for vocabulary curve analysis results.
+
+    Each entry is a JSON file in the ``excerpts`` subdirectory of the cache
+    directory, named after a prefix of the source file's hash and the
+    excerpt length.
+    """
+
+    def __init__(self, cache_dir: Path | None = None) -> None:
+        """Initialize vocabulary curve cache.
+
+        Args:
+            cache_dir: Optional custom cache directory; when omitted the
+                result of ``get_cache_dir()`` is used.
+        """
+        self.cache_dir = (cache_dir or get_cache_dir()) / "excerpts"
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_cache_path(self, file_hash: str, length: int) -> Path:
+        """Get path to cache file for given hash and length.
+
+        Only the first 16 characters of the hash go into the file name.
+
+        Args:
+            file_hash: Hash of source file.
+            length: Excerpt length.
+
+        Returns:
+            Path to cache file.
+        """
+        return self.cache_dir / f"{file_hash[:16]}_{length}.json"
+
+    def get(
+        self, filepath: Path, length: int
+    ) -> tuple[str, list[tuple[str, int]]] | None:
+        """Get cached excerpt and words for a file and length.
+
+        A cache file that is unreadable, is not valid JSON, or does not have
+        the expected structure is treated as a cache miss.
+
+        Args:
+            filepath: Path to source file.
+            length: Excerpt length.
+
+        Returns:
+            Tuple of (excerpt, words_with_ranks) or None if not cached.
+        """
+        file_hash = get_file_hash(filepath)
+        cache_path = self._get_cache_path(file_hash, length)
+
+        if not cache_path.exists():
+            return None
+
+        try:
+            data = json.loads(cache_path.read_text(encoding="utf-8"))
+            # The file name holds only a hash prefix; compare the full hash
+            if data.get("file_hash") != file_hash:
+                return None
+            excerpt = data["excerpt"]
+            words = [(w, r) for w, r in data["words"]]
+        except (ValueError, KeyError, TypeError, AttributeError, OSError):
+            # ValueError: bad JSON, undecodable bytes, wrong pair length.
+            # TypeError/AttributeError: JSON of an unexpected shape.
+            return None
+        return excerpt, words
+
+    def set(
+        self,
+        filepath: Path,
+        length: int,
+        excerpt: str,
+        words: list[tuple[str, int]],
+    ) -> None:
+        """Store excerpt and words in cache.
+
+        Args:
+            filepath: Path to source file.
+            length: Excerpt length.
+            excerpt: The excerpt text.
+            words: List of (word, rank) tuples.
+        """
+        file_hash = get_file_hash(filepath)
+        cache_path = self._get_cache_path(file_hash, length)
+
+        data = {
+            "file_hash": file_hash,
+            "filepath": str(filepath),
+            "length": length,
+            "excerpt": excerpt,
+            "words": [[w, r] for w, r in words],
+        }
+
+        cache_path.write_text(
+            json.dumps(data, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+    def clear(self) -> None:
+        """Clear all cached excerpts."""
+        for cache_file in self.cache_dir.glob("*.json"):
+            cache_file.unlink()
+
+    def stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dict with the number of cache files, the cache directory, and
+            the combined size of the cache files in bytes.
+        """
+        cache_files = list(self.cache_dir.glob("*.json"))
+        total_size = sum(f.stat().st_size for f in cache_files)
+        return {
+            "total_entries": len(cache_files),
+            "cache_dir": str(self.cache_dir),
+            "cache_size_bytes": total_size,
+        }
+
+
+# =============================================================================
+# Anki Deck Cache
+# =============================================================================
+
+
+class AnkiDeckCache:
+    """Cache for generated Anki decks."""
+
+    def __init__(self, cache_dir: Path | None = None) -> None:
+        """Set up the deck cache under the ``anki_decks`` subdirectory.
+
+        The subdirectory is created if it does not exist yet. Metadata is
+        not read here; ``_metadata`` starts as None.
+
+        Args:
+            cache_dir: Optional custom cache directory; when omitted the
+                result of ``get_cache_dir()`` is used.
+        """
+        base_dir = cache_dir or get_cache_dir()
+        self.cache_dir = base_dir / "anki_decks"
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.metadata_file = self.cache_dir / "metadata.json"
+        self._metadata: dict[str, Any] | None = None
+
+    def _load_metadata(self) -> dict[str, Any]:
+        """Return the in-memory metadata, reading it from disk on first use.
+
+        A missing, unreadable, or malformed metadata file yields an empty
+        mapping rather than an error.
+
+        Returns:
+            The mutable metadata mapping.
+        """
+        if self._metadata is None:
+            loaded: Any = None
+            if self.metadata_file.exists():
+                try:
+                    loaded = json.loads(
+                        self.metadata_file.read_text(encoding="utf-8")
+                    )
+                except (ValueError, OSError):
+                    # ValueError covers invalid JSON and undecodable bytes
+                    loaded = None
+            # Valid JSON that is not an object (e.g. [] or null) cannot be
+            # used as the metadata mapping, so start from an empty one.
+            self._metadata = loaded if isinstance(loaded, dict) else {}
+        return self._metadata
+
+    def _save_metadata(self) -> None:
+        """Write the in-memory metadata to the metadata file as JSON.
+
+        Does nothing when the metadata has not been loaded yet.
+        """
+        if self._metadata is None:
+            return
+        serialized = json.dumps(self._metadata, ensure_ascii=False, indent=2)
+        self.metadata_file.write_text(serialized, encoding="utf-8")
+
+    @staticmethod
+    def _make_key(
+        file_hash: str,
+        length: int,
+        target_lang: str,
+        include_context: bool,
+        all_vocab: bool,
+    ) -> str:
+        """Create cache key for an Anki deck.
+
+        Args:
+            file_hash: Hash of source file.
+            length: Excerpt length.
+            target_lang: Target language.
+            include_context: Whether context is included.
+            all_vocab: Whether all vocab is included.
+
+        Returns:
+            Cache key string.
+        """
+        flags = f"ctx{int(include_context)}_all{int(all_vocab)}"
+        return f"{file_hash[:16]}_{length}_{target_lang}_{flags}"
+
+    def get(
+        self,
+        filepath: Path,
+        length: int,
+        target_lang: str,
+        include_context: bool,
+        all_vocab: bool,
+    ) -> tuple[str, str, int, int] | None:
+        """Get cached Anki deck.
+
+        Args:
+            filepath: Path to source file.
+            length: Excerpt length.
+            target_lang: Target language.
+            include_context: Whether context is included.
+            all_vocab: Whether all vocab is included.
+
+        Returns:
+            Tuple of (anki_content, excerpt, num_words, max_rank) or None.
+        """
+        file_hash = get_file_hash(filepath)
+        key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+        metadata = self._load_metadata()
+
+        if key not in metadata:
+            return None
+
+        entry = metadata[key]
+        if entry.get("file_hash") != file_hash:
+            return None
+
+        deck_file = self.cache_dir / f"{key}.txt"
+        if not deck_file.exists():
+            return None
+
+        try:
+            content = deck_file.read_text(encoding="utf-8")
+            return (
+                content,
+                entry["excerpt"],
+                entry["num_words"],
+                entry["max_rank"],
+            )
+        except OSError:
+            return None
+
+    def set(
+        self,
+        filepath: Path,
+        length: int,
+        target_lang: str,
+        include_context: bool,
+        all_vocab: bool,
+        anki_content: str,
+        excerpt: str,
+        num_words: int,
+        max_rank: int,
+    ) -> None:
+        """Store Anki deck in cache.
+
+        Args:
+            filepath: Path to source file.
+            length: Excerpt length.
+            target_lang: Target language.
+            include_context: Whether context is included.
+            all_vocab: Whether all vocab is included.
+            anki_content: The Anki deck content.
+            excerpt: The excerpt text.
+            num_words: Number of words in deck.
+            max_rank: Maximum word rank.
+        """
+        file_hash = get_file_hash(filepath)
+        key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
+
+        # Save deck content
+        deck_file = self.cache_dir / f"{key}.txt"
+        deck_file.write_text(anki_content, encoding="utf-8")
+
+        # Update metadata
+        metadata = self._load_metadata()
+        metadata[key] = {
+            "file_hash": file_hash,
+            "filepath": str(filepath),
+            "length": length,
+            "target_lang": target_lang,
+            "include_context": include_context,
+            "all_vocab": all_vocab,
+            "excerpt": excerpt,
+            "num_words": num_words,
+            "max_rank": max_rank,
+        }
+        self._save_metadata()
+
+    def clear(self) -> None:
+        """Clear all cached decks."""
+        self._metadata = {}
+        for cache_file in self.cache_dir.glob("*.txt"):
+            cache_file.unlink()
+        if self.metadata_file.exists():
+            self.metadata_file.unlink()
+
+    def stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dict with cache stats.
+        """
+        metadata = self._load_metadata()
+        cache_files = list(self.cache_dir.glob("*.txt"))
+        total_size = sum(f.stat().st_size for f in cache_files)
+        return {
+            "total_entries": len(metadata),
+            "cache_dir": str(self.cache_dir),
+            "cache_size_bytes": total_size,
+        }
+
+
+# =============================================================================
+# Global Cache Instances
+# =============================================================================
+
+# Singleton instances
+_translation_cache: TranslationCache | None = None
+_vocab_curve_cache: VocabCurveCache | None = None
+_anki_deck_cache: AnkiDeckCache | None = None
+
+
+def get_translation_cache() -> TranslationCache:
+    """Get the global translation cache instance."""
+    global _translation_cache  # noqa: PLW0603
+    if _translation_cache is None:
+        _translation_cache = TranslationCache()
+    return _translation_cache
+
+
+def get_vocab_curve_cache() -> VocabCurveCache:
+    """Get the global vocabulary curve cache instance."""
+    global _vocab_curve_cache  # noqa: PLW0603
+    if _vocab_curve_cache is None:
+        _vocab_curve_cache = VocabCurveCache()
+    return _vocab_curve_cache
+
+
+def get_anki_deck_cache() -> AnkiDeckCache:
+    """Get the global Anki deck cache instance."""
+    global _anki_deck_cache  # noqa: PLW0603
+    if _anki_deck_cache is None:
+        _anki_deck_cache = AnkiDeckCache()
+    return _anki_deck_cache
+
+
+def clear_all_caches() -> None:
+    """Clear all caches."""
+    get_translation_cache().clear()
+    get_vocab_curve_cache().clear()
+    get_anki_deck_cache().clear()
+
+
+def get_all_cache_stats() -> dict[str, dict[str, Any]]:
+    """Get statistics for all caches.
+
+    Returns:
+        Dict with stats for each cache type.
+    """
+    return {
+        "translations": get_translation_cache().stats(),
+        "vocab_curves": get_vocab_curve_cache().stats(),
+        "anki_decks": get_anki_deck_cache().stats(),
+    }
+
+
+def main() -> int:
+    """CLI for cache management.
+
+    Returns:
+        Exit code.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser(description="Manage word frequency caches")
+    parser.add_argument(
+        "--stats", action="store_true", help="Show cache statistics"
+    )
+    parser.add_argument(
+        "--clear", action="store_true", help="Clear all caches"
+    )
+    parser.add_argument(
+        "--clear-translations", action="store_true", help="Clear translation cache"
+    )
+    parser.add_argument(
+        "--clear-excerpts", action="store_true", help="Clear excerpt cache"
+    )
+    parser.add_argument(
+        "--clear-anki", action="store_true", help="Clear Anki deck cache"
+    )
+
+    args = parser.parse_args()
+
+    if args.clear:
+        clear_all_caches()
+        print("All caches cleared.")  # noqa: T201
+        return 0
+
+    if args.clear_translations:
+        get_translation_cache().clear()
+        print("Translation cache cleared.")  # noqa: T201
+        return 0
+
+    if args.clear_excerpts:
+        get_vocab_curve_cache().clear()
+        print("Excerpt cache cleared.")  # noqa: T201
+        return 0
+
+    if args.clear_anki:
+        get_anki_deck_cache().clear()
+        print("Anki deck cache cleared.")  # noqa: T201
+        return 0
+
+    # Default: show stats
+    stats = get_all_cache_stats()
+    print("Cache Statistics")  # noqa: T201
+    print("=" * 50)  # noqa: T201
+    for cache_name, cache_stats in stats.items():
+        print(f"\n{cache_name.upper()}:")  # noqa: T201
+        for key, value in cache_stats.items():
+            if key == "cache_size_bytes":
+                # Format as human-readable
+                if value < 1024:
+                    size_str = f"{value} B"
+                elif value < 1024 * 1024:
+                    size_str = f"{value / 1024:.1f} KB"
+                else:
+                    size_str = f"{value / (1024 * 1024):.1f} MB"
+                print(f"  {key}: {size_str}")  # noqa: T201
+            else:
+                print(f"  {key}: {value}")  # noqa: T201
+
+    return 0
+
+
+if __name__ == "__main__":
+    import sys
+    sys.exit(main())
--- a/python_pkg/word_frequency/run_anki_generator.sh
+++ b/python_pkg/word_frequency/run_anki_generator.sh
@ -0,0 +1,153 @@
+#!/bin/bash
+# Wrapper script for anki_generator that ensures argostranslate is available
+#
+# Usage: ./run_anki_generator.sh [anki_generator args...]
+# Example: ./run_anki_generator.sh --file text.txt --length 20 --from pl --to en
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Use /tmp for venv to avoid home directory quota issues
+VENV_DIR="/tmp/.venv_argos_$(id -u)"
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+log_info() {
+    echo -e "${GREEN}[INFO]${NC} $1"
+}
+
+log_warn() {
+    echo -e "${YELLOW}[WARN]${NC} $1"
+}
+
+log_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Convert relative --file/-f paths to absolute before changing directories.
+# Reads the global ORIGINAL_ARGS array and fills the global RESOLVED_ARGS
+# array. Using an array (instead of printing the arguments and re-splitting
+# the text) keeps arguments with spaces or glob characters intact.
+resolve_file_paths() {
+    RESOLVED_ARGS=()
+    local i=0
+    local count=${#ORIGINAL_ARGS[@]}
+    while (( i < count )); do
+        local arg="${ORIGINAL_ARGS[i]}"
+        RESOLVED_ARGS+=("$arg")
+        i=$((i + 1))
+        if [[ "$arg" == "--file" || "$arg" == "-f" ]] && (( i < count )); then
+            local file="${ORIGINAL_ARGS[i]}"
+            # Only existing regular files are rewritten; anything else is
+            # passed through unchanged.
+            if [[ -f "$file" ]]; then
+                file="$(CDPATH= cd -- "$(dirname -- "$file")" && pwd)/$(basename -- "$file")"
+            fi
+            RESOLVED_ARGS+=("$file")
+            i=$((i + 1))
+        fi
+    done
+}
+
+# Store original args before any directory changes
+ORIGINAL_ARGS=("$@")
+
+# Check if argostranslate is importable by the current `python`
+check_argos() {
+    python -c "import argostranslate" 2>/dev/null
+}
+
+# Try to install argostranslate using pipx.
+# Returns 0 only when pipx exists and the install succeeded.
+# NOTE(review): pipx installs into its own isolated environment, so the
+# import check that main runs afterwards will presumably still fail and the
+# virtualenv fallback is used - confirm whether this step is useful.
+try_pipx_install() {
+    if command -v pipx &>/dev/null; then
+        log_info "Trying pipx install argostranslate..."
+        if pipx install argostranslate 2>/dev/null; then
+            log_info "argostranslate installed via pipx"
+            return 0
+        fi
+    fi
+    return 1
+}
+
+# Create/use a virtualenv for argostranslate and activate it in this shell
+setup_venv() {
+    # Use /tmp for pip cache to avoid home directory quota issues
+    export PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
+    mkdir -p "$PIP_CACHE_DIR"
+
+    # /tmp is world-writable and the venv path is predictable. Its activate
+    # script is sourced below, so never use a path that is a symlink or
+    # that belongs to another user.
+    if [[ -L "$VENV_DIR" || ( -e "$VENV_DIR" && ! -O "$VENV_DIR" ) ]]; then
+        log_error "$VENV_DIR is a symlink or not owned by the current user; refusing to use it"
+        exit 1
+    fi
+
+    if [[ ! -d "$VENV_DIR" ]]; then
+        log_info "Creating virtual environment at $VENV_DIR..."
+        python -m venv "$VENV_DIR"
+    fi
+
+    # Activate venv
+    source "$VENV_DIR/bin/activate"
+
+    # Install argostranslate if not present
+    if ! python -c "import argostranslate" 2>/dev/null; then
+        log_info "Installing argostranslate in virtualenv (this may take a few minutes)..."
+        # Use CPU-only PyTorch to reduce download size significantly (~200MB vs ~900MB)
+        # Use --no-cache-dir to avoid any cache writes to home directory
+        pip install --progress-bar on --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
+        pip install --progress-bar on --no-cache-dir argostranslate
+    fi
+
+    # Install langdetect for auto language detection
+    if ! python -c "import langdetect" 2>/dev/null; then
+        log_info "Installing langdetect for auto language detection..."
+        pip install --progress-bar on --no-cache-dir langdetect
+    fi
+
+    # Also ensure other dependencies are available (best effort: failures
+    # are deliberately ignored)
+    if [[ -f "${SCRIPT_DIR}/../../requirements.txt" ]]; then
+        pip install --progress-bar on --no-cache-dir -r "${SCRIPT_DIR}/../../requirements.txt" 2>/dev/null || true
+    fi
+
+    log_info "Using virtualenv: $VENV_DIR"
+}
+
+# Run the generator module from two directories above this script, passing
+# the resolved arguments as separate words. A single `cd` is used so that
+# `set -e` stops the script if the directory change fails.
+run_generator() {
+    cd "$SCRIPT_DIR/../.."
+    python -m python_pkg.word_frequency.anki_generator "${RESOLVED_ARGS[@]}"
+}
+
+# Main logic
+main() {
+    # Resolve file paths to absolute before changing directories
+    resolve_file_paths
+
+    # If --no-translate is passed, we don't need argostranslate.
+    # Arguments are compared one by one, so an option value that merely
+    # contains the text " -n " is not mistaken for the flag.
+    local arg
+    for arg in "$@"; do
+        if [[ "$arg" == "--no-translate" || "$arg" == "-n" ]]; then
+            log_info "Running without translation (--no-translate)"
+            run_generator
+            return
+        fi
+    done
+
+    # Check if argostranslate is already available
+    if check_argos; then
+        log_info "argostranslate is available"
+        run_generator
+        return
+    fi
+
+    log_warn "argostranslate not found in system Python"
+
+    # Try pipx first
+    if try_pipx_install && check_argos; then
+        run_generator
+        return
+    fi
+
+    # Fall back to virtualenv
+    log_info "Setting up virtualenv with argostranslate..."
+    setup_venv
+
+    # Run in venv context
+    run_generator
+}
+
+main "$@"
--- a/python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_20.txt
+++ b/python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_20.txt
--- a/python_pkg/word_frequency/tests/test_anki_generator.py
+++ b/python_pkg/word_frequency/tests/test_anki_generator.py
@ -13,7 +13,6 @@ try:
        find_word_contexts,
        generate_anki_deck,
        generate_flashcards,
-        get_top_n_words,
        main,
        parse_vocabulary_curve_output,
    )
@ -24,7 +23,6 @@ except ImportError:
        find_word_contexts,
        generate_anki_deck,
        generate_flashcards,
-        get_top_n_words,
        main,
        parse_vocabulary_curve_output,
    )
@ -80,30 +78,44 @@ class TestParseVocabularyCurveOutput:

    def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
        """Test parsing output for length 1."""
-        excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
+        excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
        assert excerpt == "the"
-        assert words == [("the", 1)]
+        assert excerpt_words == [("the", 1)]

    def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
        """Test parsing output for length 2."""
-        excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
+        excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
        assert excerpt == "the dog"
-        assert words == [("the", 1), ("dog", 2)]
+        assert excerpt_words == [("the", 1), ("dog", 2)]

    def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
        """Test parsing output for length 3."""
-        excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
+        excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
        assert excerpt == "the quick fox"
-        assert len(words) == 3
-        assert ("the", 1) in words
-        assert ("quick", 3) in words
-        assert ("fox", 5) in words
+        assert len(excerpt_words) == 3
+        assert ("the", 1) in excerpt_words
+        assert ("quick", 3) in excerpt_words
+        assert ("fox", 5) in excerpt_words

    def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
        """Test parsing output for non-existent length."""
-        excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
+        excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
        assert excerpt == ""
-        assert words == []
+        assert excerpt_words == []
+
+    def test_parse_vocab_dump(self) -> None:
+        """Test parsing VOCAB_DUMP section."""
+        output = """[Length 2] Vocab needed: 2
+  Excerpt: "hello world"
+  Words: hello(#1), world(#2)
+
+VOCAB_DUMP_START
+hello;1
+world;2
+VOCAB_DUMP_END
+"""
+        excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
+        assert all_vocab == [("hello", 1), ("world", 2)]


 # Tests for find_word_contexts
@ -250,31 +262,6 @@ class TestGenerateAnkiDeck:
        assert "world" in result


-# Tests for get_top_n_words
-
-
-class TestGetTopNWords:
-    """Tests for getting top N words."""
-
-    def test_get_top_5_words(self) -> None:
-        """Test getting top 5 words from text."""
-        text = "the cat sat on the mat the cat meowed"
-        words = get_top_n_words(text, 5)
-        assert len(words) == 5
-        # 'the' appears 3x, 'cat' appears 2x
-        assert words[0][0] == "the"
-        assert words[0][1] == 1
-        assert words[1][0] == "cat"
-        assert words[1][1] == 2
-
-    def test_ranks_are_sequential(self) -> None:
-        """Test that ranks are 1-based and sequential."""
-        text = "one two three four five six seven eight"
-        words = get_top_n_words(text, 8)
-        ranks = [r for _, r in words]
-        assert ranks == [1, 2, 3, 4, 5, 6, 7, 8]
-
-
 # Tests for main function


--- a/python_pkg/word_frequency/tests/test_learning_pipe.py
+++ b/python_pkg/word_frequency/tests/test_learning_pipe.py
@ -4,6 +4,8 @@ from __future__ import annotations

 import time
 from pathlib import Path
+from typing import TYPE_CHECKING
+from unittest.mock import MagicMock, patch

 import pytest

@ -13,6 +15,40 @@ from python_pkg.word_frequency.learning_pipe import (
    load_stopwords,
    main,
 )
+import python_pkg.word_frequency.learning_pipe as learning_pipe_module
+from python_pkg.word_frequency.translator import TranslationResult
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@pytest.fixture
+def mock_translation() -> Generator[MagicMock, None, None]:
+    """Mock translation to avoid requiring argostranslate."""
+    def fake_batch_translate(
+        words: list[str],
+        from_lang: str,
+        to_lang: str,
+        *,
+        use_cache: bool = True,  # noqa: ARG001
+    ) -> list[TranslationResult]:
+        """Fake batch translation that returns word with prefix."""
+        return [
+            TranslationResult(
+                source_word=word,
+                translated_word=f"translated_{word}",
+                source_lang=from_lang,
+                target_lang=to_lang,
+                success=True,
+            )
+            for word in words
+        ]
+
+    # Need to patch in learning_pipe module since it imports the function directly
+    with patch.object(
+        learning_pipe_module, "translate_words_batch", side_effect=fake_batch_translate
+    ):
+        yield


 class TestLoadStopwords:
@ -162,7 +198,9 @@ class TestGenerateLearningLesson:
 class TestMain:
    """Tests for main CLI function."""

-    def test_basic_text_input(self, capsys: pytest.CaptureFixture[str]) -> None:
+    def test_basic_text_input(
+        self, capsys: pytest.CaptureFixture[str], mock_translation: None
+    ) -> None:
        """Test with text input."""
        exit_code = main(
            [
@ -179,7 +217,7 @@ class TestMain:
        assert "LANGUAGE LEARNING LESSON" in captured.out

    def test_file_input(
-        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
    ) -> None:
        """Test with file input."""
        test_file = tmp_path / "test.txt"
@ -199,7 +237,7 @@ class TestMain:
        assert exit_code == 0
        assert "hello" in captured.out.lower()

-    def test_output_to_file(self, tmp_path: Path) -> None:
+    def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
        """Test outputting to file."""
        output_file = tmp_path / "lesson.txt"

@ -219,7 +257,7 @@ class TestMain:
        assert "LANGUAGE LEARNING LESSON" in content

    def test_custom_stopwords(
-        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
+        self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
    ) -> None:
        """Test with custom stopwords file."""
        stopwords_file = tmp_path / "stop.txt"
@ -242,7 +280,7 @@ class TestMain:
        # "hello" should be filtered by custom stopwords

    def test_multiple_batches_option(
-        self, capsys: pytest.CaptureFixture[str]
+        self, capsys: pytest.CaptureFixture[str], mock_translation: None
    ) -> None:
        """Test --batches option."""
        text = " ".join(f"word{i}" * (50 - i) for i in range(30))
@ -329,10 +367,10 @@ class TestTranslationIntegration:
        # Should not have translation arrows
        assert " -> " not in result or "Translation" not in result

-    def test_lesson_with_translation_params(self) -> None:
+    def test_lesson_with_translation_params(self, mock_translation: None) -> None:
        """Test that translation params are accepted."""
        text = "hello world hello world hello"
-        # This should not crash even without argostranslate installed
+        # This should work with mocked translation
        result = generate_learning_lesson(
            text,
            batch_size=5,
@ -346,12 +384,14 @@ class TestTranslationIntegration:
        assert "VOCABULARY TO LEARN:" in result
        assert "hello" in result

-    def test_main_with_translate_flags(self, tmp_path: Path) -> None:
+    def test_main_with_translate_flags(
+        self, tmp_path: Path, mock_translation: None
+    ) -> None:
        """Test that main accepts translation flags."""
        text_file = tmp_path / "test.txt"
        text_file.write_text("hello world hello world hello", encoding="utf-8")

-        # Should not crash even if translation fails
+        # Should work with mocked translation
        result = main([
            "--file", str(text_file),
            "--translate-from", "en",
@ -361,7 +401,9 @@ class TestTranslationIntegration:

        assert result == 0

-    def test_translate_to_defaults_to_english(self, capsys: pytest.CaptureFixture[str]) -> None:
+    def test_translate_to_defaults_to_english(
+        self, capsys: pytest.CaptureFixture[str], mock_translation: None
+    ) -> None:
        """Test that translate_to defaults to 'en' when using auto-detection."""
        text = "hello world"
        # When using --translate flag (translate_from="auto"), translate_to defaults to "en"
--- a/python_pkg/word_frequency/tests/test_translator.py
+++ b/python_pkg/word_frequency/tests/test_translator.py
@ -47,15 +47,22 @@ except ImportError:

 # Helper context manager for mocking argostranslate
 class ArgosAvailableMock:
-    """Context manager to mock argostranslate being available."""
+    """Context manager to mock argostranslate being available and control its output.
+
+    Works whether argos is installed or not by patching sys.modules.
+    """

    def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None:
        """Initialize with return values for translate()."""
        self.translate_returns = translate_returns
+        self.mock_translate_fn = MagicMock()
        self.mock_translate_module = MagicMock()
        self.mock_package_module = MagicMock()
        self.mock_parent = MagicMock()
        self.original_available = translator._argos_available
+        self._sys_modules_patcher: MagicMock | None = None
+        self._ensure_patcher: MagicMock | None = None
+        self._lang_patcher: MagicMock | None = None

    def __enter__(self) -> MagicMock:
        """Set up the mocks."""
@ -63,36 +70,52 @@ class ArgosAvailableMock:

        # Set up translate return value
        if isinstance(self.translate_returns, Exception):
-            self.mock_translate_module.translate.side_effect = self.translate_returns
+            self.mock_translate_fn.side_effect = self.translate_returns
        elif isinstance(self.translate_returns, list):
-            self.mock_translate_module.translate.side_effect = self.translate_returns
+            self.mock_translate_fn.side_effect = self.translate_returns
        elif self.translate_returns is not None:
-            self.mock_translate_module.translate.return_value = self.translate_returns
+            self.mock_translate_fn.return_value = self.translate_returns

-        # Link parent module to submodules (critical for Python imports)
+        # Wire up the mock modules
+        self.mock_translate_module.translate = self.mock_translate_fn
+        self.mock_translate_module.get_installed_languages = MagicMock(return_value=[])
+        self.mock_package_module.update_package_index = MagicMock()
+        self.mock_package_module.get_available_packages = MagicMock(return_value=[])
        self.mock_parent.translate = self.mock_translate_module
        self.mock_parent.package = self.mock_package_module

-        # Patch sys.modules
-        self.patchers = [
-            patch.dict(
-                "sys.modules",
-                {
-                    "argostranslate": self.mock_parent,
-                    "argostranslate.translate": self.mock_translate_module,
-                    "argostranslate.package": self.mock_package_module,
-                },
-            ),
-        ]
-        for p in self.patchers:
-            p.start()
+        # Patch sys.modules to inject our mock (works even if argos not installed)
+        self._sys_modules_patcher = patch.dict(
+            "sys.modules",
+            {
+                "argostranslate": self.mock_parent,
+                "argostranslate.translate": self.mock_translate_module,
+                "argostranslate.package": self.mock_package_module,
+            },
+        )

-        return self.mock_translate_module
+        # Patch _ensure_argos_installed and _ensure_language_pair to no-op
+        self._ensure_patcher = patch.object(
+            translator, "_ensure_argos_installed", lambda: None
+        )
+        self._lang_patcher = patch.object(
+            translator, "_ensure_language_pair", lambda f, t: None
+        )
+
+        self._sys_modules_patcher.start()
+        self._ensure_patcher.start()
+        self._lang_patcher.start()
+
+        return self.mock_translate_fn

    def __exit__(self, *args: object) -> None:
        """Restore original state."""
-        for p in self.patchers:
-            p.stop()
+        if self._lang_patcher:
+            self._lang_patcher.stop()
+        if self._ensure_patcher:
+            self._ensure_patcher.stop()
+        if self._sys_modules_patcher:
+            self._sys_modules_patcher.stop()
        translator._argos_available = self.original_available


@ -101,25 +124,13 @@ class ArgosAvailableMock:

@pytest.fixture
 def mock_argos_unavailable() -> Generator[None, None, None]:
-    """Mock argostranslate being unavailable."""
+    """Mock argostranslate being unavailable (for legacy tests)."""
    original_value = translator._argos_available
    translator._argos_available = False
    yield
    translator._argos_available = original_value


-@pytest.fixture
-def mock_all_translators_unavailable() -> Generator[None, None, None]:
-    """Mock both argostranslate and deep-translator being unavailable."""
-    original_argos = translator._argos_available
-    original_deep = translator._deep_translator_available
-    translator._argos_available = False
-    translator._deep_translator_available = False
-    yield
-    translator._argos_available = original_argos
-    translator._deep_translator_available = original_deep
-
-
@pytest.fixture
 def temp_words_file(tmp_path: Path) -> Path:
    """Create a temporary file with words."""
@ -174,43 +185,36 @@ class TestTranslationResult:


 class TestTranslateWord:
-    """Tests for translate_word function."""
+    """Tests for translate_word function - offline-first behavior."""

-    def test_translate_word_all_backends_unavailable(
-        self, mock_all_translators_unavailable: None
-    ) -> None:
-        """Test translation when no backends are available."""
-        result = translate_word("hello", "en", "es")
-        assert result.success is False
-        assert "No translation backend" in str(result.error)
-
-    def test_translate_word_argos_unavailable_uses_deep_translator(
-        self, mock_argos_unavailable: None
-    ) -> None:
-        """Test that deep-translator is used when argos is unavailable."""
-        # deep-translator should work as fallback (it's installed)
-        result = translate_word("hello", "en", "es")
-        # This may succeed if deep-translator is installed
-        # Just verify we get a result without crashing
-        assert isinstance(result, TranslationResult)
+    def test_translate_word_argos_unavailable_raises(self) -> None:
+        """Test that translation raises ImportError when argos is unavailable."""
+        # Mock _ensure_argos_installed to raise ImportError
+        with patch.object(
+            translator,
+            "_ensure_argos_installed",
+            side_effect=ImportError("argostranslate not available"),
+        ):
+            with pytest.raises(ImportError, match="argostranslate not available"):
+                translate_word("hello", "en", "es", use_cache=False)

    def test_translate_word_success(self) -> None:
        """Test successful word translation."""
        with ArgosAvailableMock("hola"):
-            result = translate_word("hello", "en", "es")
+            result = translate_word("hello", "en", "es", use_cache=False)

        assert result.source_word == "hello"
        assert result.translated_word == "hola"
        assert result.success is True

-    def test_translate_word_argos_exception_falls_back(
-        self, mock_argos_unavailable: None
-    ) -> None:
-        """Test that argos exception falls back to deep-translator."""
-        # With argos unavailable, deep-translator should be used
-        result = translate_word("hello", "en", "es")
-        # Just verify it doesn't crash - may succeed or fail depending on network
-        assert isinstance(result, TranslationResult)
+    def test_translate_word_argos_exception_returns_error(self) -> None:
+        """Test that argos exception returns failed result with error."""
+        # Mock argos being available but translate raising an exception
+        with ArgosAvailableMock(RuntimeError("Translation failed")):
+            result = translate_word("hello", "en", "es", use_cache=False)
+
+        assert result.success is False
+        assert "Translation failed" in str(result.error)


 # translate_words tests
@ -221,99 +225,123 @@ class TestTranslateWords:

    def test_translate_empty_list(self) -> None:
        """Test translating empty list."""
+        # Empty list returns empty result without calling translation
        results = translate_words([], "en", "es")
        assert results == []

    def test_translate_multiple_words(self) -> None:
        """Test translating multiple words."""
-        with ArgosAvailableMock(["hola", "mundo"]):
-            results = translate_words(["hello", "world"], "en", "es")
+        with ArgosAvailableMock(["hola", "mundo"]) as mock:
+            mock.side_effect = ["hola", "mundo"]
+            results = translate_words(["hello", "world"], "en", "es", use_cache=False)

        assert len(results) == 2
        assert results[0].translated_word == "hola"
        assert results[1].translated_word == "mundo"

+    def test_translate_words_argos_unavailable_raises(self) -> None:
+        """translate_words lets the ImportError from the install hook propagate."""
+        # Stand in for an unavailable argos by making the install hook raise.
+        install_failure = ImportError("argostranslate not available")
+        with (
+            patch.object(
+                translator,
+                "_ensure_argos_installed",
+                side_effect=install_failure,
+            ),
+            pytest.raises(ImportError, match="argostranslate not available"),
+        ):
+            translate_words(["hello", "world"], "en", "es", use_cache=False)
+

 # translate_words_batch tests


 class TestTranslateWordsBatch:
-    """Tests for translate_words_batch function."""
+    """Tests for translate_words_batch function - offline-first."""

    def test_batch_empty_list(self) -> None:
        """Test batch translation of empty list."""
-        results = translate_words_batch([], "en", "es")
+        # Empty list doesn't require argos
+        with patch.object(translator, "_ensure_argos_installed", lambda: None):
+            results = translate_words_batch([], "en", "es")
        assert results == []

    def test_batch_small_list(self) -> None:
-        """Test batch translation of small list (3 or fewer)."""
-        with ArgosAvailableMock(["uno", "dos", "tres"]) as mock:
-            results = translate_words_batch(["one", "two", "three"], "en", "es")
+        """Test batch translation of small list (uses batch mode anyway)."""
+        with ArgosAvailableMock("uno\ndos\ntres") as mock:
+            results = translate_words_batch(
+                ["one", "two", "three"], "en", "es", use_cache=False
+            )

        assert len(results) == 3
-        # Small lists use individual translation
-        assert mock.translate.call_count == 3
+        # Batch translation
+        assert mock.call_count == 1

    def test_batch_large_list_success(self) -> None:
        """Test batch translation of large list."""
        words = ["one", "two", "three", "four", "five"]

        with ArgosAvailableMock("uno\ndos\ntres\ncuatro\ncinco") as mock:
-            results = translate_words_batch(words, "en", "es")
+            results = translate_words_batch(words, "en", "es", use_cache=False)

        assert len(results) == 5
        # Batch translation called once
-        mock.translate.assert_called_once()
+        mock.assert_called_once()
        assert results[0].translated_word == "uno"
        assert results[4].translated_word == "cinco"

    def test_batch_fallback_on_mismatch(self) -> None:
-        """Test batch translation falls back when result count mismatches."""
+        """Test batch translation falls back to individual when result count mismatches."""
        words = ["one", "two", "three", "four"]
        # First call (batch) returns wrong count, subsequent calls are individual
        with ArgosAvailableMock(
-            ["wrong\ncount", "uno", "dos", "tres", "cuatro"]
+            ["wrong", "uno", "dos", "tres", "cuatro"]
        ) as mock:
-            results = translate_words_batch(words, "en", "es")
+            results = translate_words_batch(words, "en", "es", use_cache=False)

        assert len(results) == 4
-        # Fallback to individual
-        assert mock.translate.call_count == 5
+        # Fallback to individual argos translation
+        assert mock.call_count == 5

    def test_batch_fallback_on_exception(self) -> None:
-        """Test batch translation falls back on exception."""
+        """Test batch translation raises on exception (no fallback to online)."""
        words = ["one", "two", "three", "four"]

-        # Create mock that raises first then succeeds
-        original = translator._argos_available
-        translator._argos_available = True
-
+        # Create mock that raises
+        mock_translate = MagicMock(side_effect=RuntimeError("Batch failed"))
        mock_translate_module = MagicMock()
-        mock_translate_module.translate.side_effect = [
-            RuntimeError("Batch failed"),
-            "uno",
-            "dos",
-            "tres",
-            "cuatro",
-        ]
+        mock_translate_module.translate = mock_translate
        mock_package_module = MagicMock()
        mock_parent = MagicMock()
        mock_parent.translate = mock_translate_module
        mock_parent.package = mock_package_module

-        with patch.dict(
-            "sys.modules",
-            {
-                "argostranslate": mock_parent,
-                "argostranslate.translate": mock_translate_module,
-                "argostranslate.package": mock_package_module,
-            },
+        original = translator._argos_available
+        translator._argos_available = True
+
+        with (
+            patch.dict(
+                "sys.modules",
+                {
+                    "argostranslate": mock_parent,
+                    "argostranslate.translate": mock_translate_module,
+                    "argostranslate.package": mock_package_module,
+                },
+            ),
+            patch.object(translator, "_ensure_argos_installed", lambda: None),
+            patch.object(translator, "_ensure_language_pair", lambda f, t: None),
+            pytest.raises(RuntimeError, match="Translation failed"),
        ):
-            results = translate_words_batch(words, "en", "es")
+            translate_words_batch(words, "en", "es", use_cache=False)

        translator._argos_available = original

-        assert len(results) == 4
+    def test_batch_argos_unavailable_raises(self) -> None:
+        """translate_words_batch lets the install hook's ImportError propagate."""
+        # Stand in for an unavailable argos by making the install hook raise.
+        install_failure = ImportError("argostranslate not available")
+        with (
+            patch.object(
+                translator,
+                "_ensure_argos_installed",
+                side_effect=install_failure,
+            ),
+            pytest.raises(ImportError, match="argostranslate not available"),
+        ):
+            translate_words_batch(["hello", "world"], "en", "es", use_cache=False)


 # format_translations tests
@ -394,10 +422,31 @@ class TestGetInstalledLanguages:
        mock_lang2.code = "es"
        mock_lang2.name = "Spanish"

-        with ArgosAvailableMock() as mock:
-            mock.get_installed_languages.return_value = [mock_lang1, mock_lang2]
+        # We need to mock the translate module's get_installed_languages
+        mock_translate_module = MagicMock()
+        mock_translate_module.get_installed_languages.return_value = [
+            mock_lang1, mock_lang2
+        ]
+        mock_package_module = MagicMock()
+        mock_parent = MagicMock()
+        mock_parent.translate = mock_translate_module
+        mock_parent.package = mock_package_module
+
+        original = translator._argos_available
+        translator._argos_available = True
+
+        with patch.dict(
+            "sys.modules",
+            {
+                "argostranslate": mock_parent,
+                "argostranslate.translate": mock_translate_module,
+                "argostranslate.package": mock_package_module,
+            },
+        ):
            result = get_installed_languages()

+        translator._argos_available = original
+
        assert ("en", "English") in result
        assert ("es", "Spanish") in result

@ -462,10 +511,28 @@ class TestMain:
        self, capsys: pytest.CaptureFixture[str]
    ) -> None:
        """Test listing languages when none installed."""
-        with ArgosAvailableMock() as mock:
-            mock.get_installed_languages.return_value = []
+        mock_translate_module = MagicMock()
+        mock_translate_module.get_installed_languages.return_value = []
+        mock_package_module = MagicMock()
+        mock_parent = MagicMock()
+        mock_parent.translate = mock_translate_module
+        mock_parent.package = mock_package_module
+
+        original = translator._argos_available
+        translator._argos_available = True
+
+        with patch.dict(
+            "sys.modules",
+            {
+                "argostranslate": mock_parent,
+                "argostranslate.translate": mock_translate_module,
+                "argostranslate.package": mock_package_module,
+            },
+        ):
            result = main(["--list-languages"])

+        translator._argos_available = original
+
        assert result == 0
        captured = capsys.readouterr()
        assert "No languages installed" in captured.out
@ -478,10 +545,28 @@ class TestMain:
        mock_lang.code = "en"
        mock_lang.name = "English"

-        with ArgosAvailableMock() as mock:
-            mock.get_installed_languages.return_value = [mock_lang]
+        mock_translate_module = MagicMock()
+        mock_translate_module.get_installed_languages.return_value = [mock_lang]
+        mock_package_module = MagicMock()
+        mock_parent = MagicMock()
+        mock_parent.translate = mock_translate_module
+        mock_parent.package = mock_package_module
+
+        original = translator._argos_available
+        translator._argos_available = True
+
+        with patch.dict(
+            "sys.modules",
+            {
+                "argostranslate": mock_parent,
+                "argostranslate.translate": mock_translate_module,
+                "argostranslate.package": mock_package_module,
+            },
+        ):
            result = main(["--list-languages"])

+        translator._argos_available = original
+
        assert result == 0
        captured = capsys.readouterr()
        assert "en" in captured.out
@ -578,11 +663,14 @@ class TestMain:

        assert result == 1

-    def test_translation_failure_returns_error(
-        self, mock_all_translators_unavailable: None
-    ) -> None:
-        """Test that translation failure returns error code when no backends."""
-        result = main(["--text", "hello", "--from", "en", "--to", "es"])
+    def test_translation_failure_returns_error(self) -> None:
+        """Test that translation failure returns error code when argos unavailable."""
+        with patch.object(
+            translator,
+            "_ensure_argos_installed",
+            side_effect=ImportError("argostranslate not available"),
+        ):
+            result = main(["--text", "hello", "--from", "en", "--to", "es"])
        assert result == 1


@ -594,9 +682,10 @@ class TestIntegration:

    def test_full_translation_flow(self) -> None:
        """Test complete translation flow."""
-        with ArgosAvailableMock(["uno", "dos", "tres"]):
+        with ArgosAvailableMock(["uno", "dos", "tres"]) as mock:
+            mock.side_effect = ["uno", "dos", "tres"]
            words = ["one", "two", "three"]
-            results = translate_words(words, "en", "es")
+            results = translate_words(words, "en", "es", use_cache=False)

        assert all(r.success for r in results)
        assert [r.translated_word for r in results] == ["uno", "dos", "tres"]
@ -606,14 +695,19 @@ class TestIntegration:
        assert "one" in output
        assert "uno" in output

-    def test_mixed_success_failure(
-        self, mock_all_translators_unavailable: None
-    ) -> None:
-        """Test handling when no translation backends are available."""
-        results = translate_words(["hello", "xyz", "world"], "en", "es")
+    def test_mixed_success_failure(self) -> None:
+        """Test handling when argos raises exception for some translations."""
+        # Simulate argos translating first word, then failing, then succeeding
+        with ArgosAvailableMock() as mock:
+            mock.side_effect = ["hola", RuntimeError("Unknown"), "mundo"]
+            results = translate_words(
+                ["hello", "xyz", "world"], "en", "es", use_cache=False
+            )

-        # All should fail when no backends available
-        assert all(not r.success for r in results)
+        # First and third succeed, second fails
+        assert results[0].success is True
+        assert results[1].success is False
+        assert results[2].success is True

        output = format_translations(results)
        assert "Error" in output
--- a/python_pkg/word_frequency/translator.py
+++ b/python_pkg/word_frequency/translator.py
@ -40,6 +40,65 @@ if TYPE_CHECKING:
 _argos_available: bool | None = None
 _deep_translator_available: bool | None = None
 _langdetect_available: bool | None = None
+_gpu_initialized: bool = False
+_gpu_available: bool | None = None
+
+
+def _check_cuda_available() -> bool:
+    """Tell whether CUDA can be used for GPU acceleration.
+
+    The probe runs at most once: its outcome is stored in the module-level
+    ``_gpu_available`` flag and handed back directly on later calls.
+
+    Returns:
+        The value of ``torch.cuda.is_available()``, or False when torch
+        cannot be imported.
+    """
+    global _gpu_available
+    if _gpu_available is not None:
+        return _gpu_available
+
+    try:
+        import torch
+
+        cuda_ok = torch.cuda.is_available()
+    except ImportError:
+        cuda_ok = False
+
+    _gpu_available = cuda_ok
+    return cuda_ok
+
+
+def _init_gpu_if_available() -> None:
+    """Initialize GPU for argostranslate if CUDA is available.
+
+    A completed or deliberately skipped initialization is remembered in the
+    module-level ``_gpu_initialized`` flag, so later calls return at once.
+    A failed attempt is not remembered and is retried on the next call.
+
+    Setting the environment variable ``CT2_FORCE_CPU=1`` skips GPU setup
+    entirely; this is the opt-out advertised by the failure message.
+
+    Raises:
+        RuntimeError: If CUDA is available but GPU initialization fails.
+    """
+    global _gpu_initialized
+    if _gpu_initialized:
+        return
+
+    import os
+    import sys
+
+    # The error raised below tells users to set CT2_FORCE_CPU=1, so the
+    # variable must be honoured here; otherwise a broken CUDA stack makes
+    # every call fail with no way out.
+    # NOTE(review): this only skips the setup done in this function; confirm
+    # how the translation backend itself selects its device.
+    force_cpu = os.environ.get("CT2_FORCE_CPU", "").strip().lower()
+    if force_cpu in {"1", "true", "yes"}:
+        _gpu_initialized = True
+        return
+
+    if not _check_cuda_available():
+        _gpu_initialized = True
+        return
+
+    print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
+
+    try:
+        import torch
+        import ctranslate2  # noqa: F401 - imported only to verify it is installed
+
+        device_count = torch.cuda.device_count()
+        if device_count == 0:
+            # Caught by the handler below and re-raised with the guidance text.
+            raise RuntimeError("CUDA reports available but no GPU devices found")
+
+        device_name = torch.cuda.get_device_name(0)
+        print(f"  Using GPU: {device_name}", file=sys.stderr)
+
+        # Environment flags for CTranslate2.
+        # NOTE(review): these look like tuning switches rather than a device
+        # selector; argostranslate appears to choose its device from
+        # ARGOS_DEVICE_TYPE - confirm that the GPU is really used.
+        os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
+        os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
+
+        _gpu_initialized = True
+        print("  GPU acceleration enabled.", file=sys.stderr)
+
+    except Exception as e:
+        raise RuntimeError(
+            f"CUDA is available but GPU initialization failed: {e}\n"
+            f"This may be due to incompatible CUDA version or driver issues.\n"
+            f"To disable GPU and use CPU only, set environment variable: CT2_FORCE_CPU=1"
+        ) from e


 def _check_argos() -> bool:
@ -205,85 +264,184 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
    return results


+def _ensure_argos_installed() -> None:
+    """Ensure argostranslate is importable, attempting a pip install if not.
+
+    Status messages are written to stderr, like the other helpers in this
+    module, so they never mix with translation output on stdout.
+
+    Raises:
+        ImportError: If the pip install fails, or if the package still
+            cannot be imported after a successful install.
+    """
+    global _argos_available  # noqa: PLW0603
+
+    if _check_argos():
+        return
+
+    import subprocess
+    import sys
+
+    print(  # noqa: T201
+        "argostranslate not found. Attempting to install...",
+        file=sys.stderr,
+    )
+    try:
+        subprocess.run(
+            [sys.executable, "-m", "pip", "install", "argostranslate"],
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        # pip output is not guaranteed to be valid UTF-8; a decode error here
+        # would hide the real installation failure.
+        error_msg = e.stderr.decode(errors="replace") if e.stderr else str(e)
+        raise ImportError(
+            f"argostranslate is required for offline translation.\n\n"
+            f"Install manually with one of:\n"
+            f"  pip install argostranslate          # In a virtualenv\n"
+            f"  pipx install argostranslate         # System-wide via pipx\n"
+            f"  pacman -S python-argostranslate     # Arch Linux (if available)\n\n"
+            f"Original error: {error_msg}"
+        ) from e
+
+    # Reset the cached flag before re-checking (presumably _check_argos
+    # memoises its result there - verify against its definition).
+    _argos_available = None
+    if not _check_argos():
+        raise ImportError("argostranslate installation succeeded but import failed")
+    print("argostranslate installed successfully.", file=sys.stderr)  # noqa: T201
+
+
+def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
+    """Make sure an argostranslate package for from_lang -> to_lang exists.
+
+    Returns immediately when both languages are installed and a translation
+    between them is available. Otherwise the package index is refreshed and
+    the matching package is downloaded and installed, with progress reported
+    on stderr.
+
+    Args:
+        from_lang: Source language code.
+        to_lang: Target language code.
+
+    Raises:
+        ValueError: If the package index offers no package for the pair.
+    """
+    import sys
+
+    import argostranslate.package
+    import argostranslate.translate
+
+    def announce(message: str) -> None:
+        """Write one progress line to stderr."""
+        print(message, file=sys.stderr)
+
+    # Index installed languages by code (a later duplicate code wins).
+    installed_by_code = {
+        lang.code: lang
+        for lang in argostranslate.translate.get_installed_languages()
+    }
+    source = installed_by_code.get(from_lang)
+    target = installed_by_code.get(to_lang)
+
+    # Having both languages installed is not enough: a translation between
+    # them must also be available.
+    if source and target and source.get_translation(target):
+        return
+
+    announce(f"Downloading language pack: {from_lang} -> {to_lang}...")
+    announce("  Fetching package index...")
+    argostranslate.package.update_package_index()
+
+    package = None
+    for candidate in argostranslate.package.get_available_packages():
+        if candidate.from_code == from_lang and candidate.to_code == to_lang:
+            package = candidate
+            break
+
+    if package is None:
+        raise ValueError(
+            f"No language pack available for {from_lang} -> {to_lang}. "
+            f"Available pairs can be listed with --list-languages."
+        )
+
+    announce("  Downloading package (~50-100MB, this may take a minute)...")
+    archive_path = package.download()
+    announce("  Installing language pack...")
+    argostranslate.package.install_from_path(archive_path)
+    announce(f"Language pack {from_lang} -> {to_lang} installed.")
+
+
 def translate_word(
    word: str,
    from_lang: str,
    to_lang: str,
+    *,
+    use_cache: bool = True,
 ) -> TranslationResult:
-    """Translate a single word.
-
-    Uses argostranslate if available (offline), otherwise falls back to
-    deep-translator (Google Translate, online).
+    """Translate a single word using argostranslate (offline).

    Args:
        word: The word to translate.
        from_lang: Source language code (e.g., 'en', 'pl', 'la').
        to_lang: Target language code.
+        use_cache: Whether to use/update translation cache.

    Returns:
        TranslationResult with the translation.
+
+    Raises:
+        ImportError: If argostranslate is not available and cannot be installed.
    """
-    # Try argostranslate first (offline)
-    if _check_argos():
-        import argostranslate.translate
-
+    # Check cache first
+    if use_cache:
        try:
-            translated = argostranslate.translate.translate(word, from_lang, to_lang)
-            return TranslationResult(
-                source_word=word,
-                translated_word=translated,
-                source_lang=from_lang,
-                target_lang=to_lang,
-                success=True,
-            )
-        except Exception as e:  # noqa: BLE001
-            # Fall through to try deep-translator
-            argos_error = str(e)
-    else:
-        argos_error = None
+            from python_pkg.word_frequency.cache import get_translation_cache
+            cache = get_translation_cache()
+            cached = cache.get(word, from_lang, to_lang)
+            if cached is not None:
+                return TranslationResult(
+                    source_word=word,
+                    translated_word=cached,
+                    source_lang=from_lang,
+                    target_lang=to_lang,
+                    success=True,
+                )
+        except ImportError:
+            pass  # Cache not available

-    # Try deep-translator (online via Google Translate)
-    if _check_deep_translator():
-        from deep_translator import GoogleTranslator
+    # Ensure argos is installed (will raise if it can't be)
+    _ensure_argos_installed()

-        try:
-            translator = GoogleTranslator(source=from_lang, target=to_lang)
-            translated = translator.translate(word)
-            return TranslationResult(
-                source_word=word,
-                translated_word=translated or "",
-                source_lang=from_lang,
-                target_lang=to_lang,
-                success=True,
-            )
-        except Exception as e:  # noqa: BLE001
-            return TranslationResult(
-                source_word=word,
-                translated_word="",
-                source_lang=from_lang,
-                target_lang=to_lang,
-                success=False,
-                error=str(e),
-            )
+    import argostranslate.translate

-    # Neither backend available
-    error_msg = "No translation backend available. Install: pip install deep-translator"
-    if argos_error:
-        error_msg = f"argostranslate error: {argos_error}"
-    return TranslationResult(
-        source_word=word,
-        translated_word="",
-        source_lang=from_lang,
-        target_lang=to_lang,
-        success=False,
-        error=error_msg,
-    )
+    try:
+        translated = argostranslate.translate.translate(word, from_lang, to_lang)
+        # Cache the result
+        if use_cache:
+            try:
+                from python_pkg.word_frequency.cache import get_translation_cache
+                get_translation_cache().set(word, from_lang, to_lang, translated)
+            except ImportError:
+                pass
+        return TranslationResult(
+            source_word=word,
+            translated_word=translated,
+            source_lang=from_lang,
+            target_lang=to_lang,
+            success=True,
+        )
+    except Exception as e:  # noqa: BLE001
+        return TranslationResult(
+            source_word=word,
+            translated_word="",
+            source_lang=from_lang,
+            target_lang=to_lang,
+            success=False,
+            error=str(e),
+        )


 def translate_words(
    words: Sequence[str],
    from_lang: str,
    to_lang: str,
+    *,
+    use_cache: bool = True,
 ) -> list[TranslationResult]:
    """Translate multiple words.

@ -291,69 +449,187 @@ def translate_words(
        words: List of words to translate.
        from_lang: Source language code.
        to_lang: Target language code.
+        use_cache: Whether to use translation cache.

    Returns:
        List of TranslationResult for each word.
    """
-    return [translate_word(word, from_lang, to_lang) for word in words]
+    return [translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words]
+
+
+def _translate_batch_worker(
+    batch_words: list[str],
+    from_lang: str,
+    to_lang: str,
+    batch_idx: int,
+) -> tuple[int, dict[str, str]]:
+    """Translate one batch of words with argostranslate.
+
+    The words are sent as a single newline-joined request. If the reply does
+    not split back into exactly one line per word, every word of the batch
+    is translated on its own instead. Both paths strip surrounding
+    whitespace, so a word yields the same stored value whichever path ran.
+
+    Args:
+        batch_words: Words to translate in this batch.
+        from_lang: Source language code.
+        to_lang: Target language code.
+        batch_idx: Index of this batch (for ordering results).
+
+    Returns:
+        Tuple of (batch_idx, translations), where translations maps each
+        lower-cased source word to its stripped translation. Words that
+        differ only in case share one entry; the last one wins.
+    """
+    import argostranslate.translate
+
+    # Nothing to translate: skip the backend call altogether.
+    if not batch_words:
+        return batch_idx, {}
+
+    translate = argostranslate.translate.translate
+
+    # One request for the whole batch, one word per line.
+    translated_lines = translate(
+        "\n".join(batch_words), from_lang, to_lang
+    ).split("\n")
+
+    # A different line count means the reply cannot be matched back to the
+    # words, so fall back to one request per word.
+    if len(translated_lines) != len(batch_words):
+        translated_lines = [
+            translate(word, from_lang, to_lang) for word in batch_words
+        ]
+
+    return batch_idx, {
+        word.lower(): translated.strip()
+        for word, translated in zip(batch_words, translated_lines, strict=True)
+    }


 def translate_words_batch(
    words: Sequence[str],
    from_lang: str,
    to_lang: str,
+    *,
+    use_cache: bool = True,
 ) -> list[TranslationResult]:
-    """Translate multiple words, attempting batch translation for efficiency.
+    """Translate multiple words using argostranslate (offline).

-    For better results with context, this joins words and translates together,
-    then splits. Falls back to word-by-word if batch fails.
+    Uses small batch translation for efficiency with frequent progress updates.
+    Requires argostranslate. Will use GPU if CUDA is available.

    Args:
        words: List of words to translate.
        from_lang: Source language code.
        to_lang: Target language code.
+        use_cache: Whether to use translation cache.

    Returns:
        List of TranslationResult for each word.
+
+    Raises:
+        ImportError: If argostranslate is not available and cannot be installed.
+        RuntimeError: If CUDA is available but GPU initialization fails.
    """
    if not words:
        return []

-    # For single words or small batches, just translate individually
-    if len(words) <= 3:
-        return translate_words(words, from_lang, to_lang)
+    # Ensure argos is installed (will raise if it can't be)
+    _ensure_argos_installed()
+    
+    # Initialize GPU if available (will raise if CUDA available but fails)
+    _init_gpu_if_available()

-    # Try batch translation by joining with newlines
-    if not _check_argos():
-        return translate_words(words, from_lang, to_lang)
+    # Ensure language pair is available
+    _ensure_language_pair(from_lang, to_lang)

-    import argostranslate.translate
+    # Check cache for already-translated words
+    cached_results: dict[str, str] = {}
+    words_to_translate: list[str] = []

-    try:
-        # Join words with newlines for batch translation
-        batch_text = "\n".join(words)
-        translated_batch = argostranslate.translate.translate(
-            batch_text, from_lang, to_lang
+    if use_cache:
+        try:
+            from python_pkg.word_frequency.cache import get_translation_cache
+            cache = get_translation_cache()
+            cached_results = cache.get_many(list(words), from_lang, to_lang)
+        except ImportError:
+            pass
+
+    # Find words that still need translation
+    for word in words:
+        if word.lower() not in cached_results:
+            words_to_translate.append(word)
+
+    # Translate uncached words using argos batch
+    new_translations: dict[str, str] = {}
+    if words_to_translate:
+        import sys
+
+        num_to_translate = len(words_to_translate)
+        
+        # Check if GPU is being used
+        gpu_status = " (GPU)" if _gpu_available else " (CPU)"
+        print(
+            f"Translating {num_to_translate} words from {from_lang} to {to_lang}{gpu_status}...",
+            file=sys.stderr,
+            flush=True,
        )
-        translated_words = translated_batch.split("\n")

-        # If we got the same number of translations, use them
-        if len(translated_words) == len(words):
-            return [
-                TranslationResult(
-                    source_word=word,
-                    translated_word=trans.strip(),
-                    source_lang=from_lang,
-                    target_lang=to_lang,
-                    success=True,
+        try:
+            # Split into batches - larger batches are faster but show progress less often
+            BATCH_SIZE = 100
+            batches: list[list[str]] = []
+            for i in range(0, num_to_translate, BATCH_SIZE):
+                batches.append(words_to_translate[i:i + BATCH_SIZE])
+            
+            total_batches = len(batches)
+            
+            # Sequential translation with progress
+            # (argostranslate is not thread-safe - uses global model)
+            for batch_idx, batch_words in enumerate(batches):
+                words_done = (batch_idx + 1) * BATCH_SIZE
+                words_done = min(words_done, num_to_translate)
+                pct = int(words_done / num_to_translate * 100)
+                
+                print(
+                    f"  [{pct:3d}%] Translating batch {batch_idx + 1}/{total_batches} "
+                    f"({words_done}/{num_to_translate} words)...",
+                    file=sys.stderr,
+                    flush=True,
                )
-                for word, trans in zip(words, translated_words, strict=True)
-            ]
-    except Exception:  # noqa: BLE001, S110
-        pass
+                
+                _, batch_translations = _translate_batch_worker(
+                    batch_words, from_lang, to_lang, batch_idx
+                )
+                new_translations.update(batch_translations)
+            
+            print(f"  Translation complete.", file=sys.stderr, flush=True)
+        except Exception as e:  # noqa: BLE001
+            raise RuntimeError(
+                f"Translation failed for {from_lang} -> {to_lang}: {e}"
+            ) from e

-    # Fall back to individual translation
-    return translate_words(words, from_lang, to_lang)
+        # Cache new translations
+        if use_cache and new_translations:
+            try:
+                from python_pkg.word_frequency.cache import get_translation_cache
+                get_translation_cache().set_many(new_translations, from_lang, to_lang)
+            except ImportError:
+                pass
+
+    # Merge cached and new translations
+    all_translations = {**cached_results, **new_translations}
+
+    # Build results in original order
+    results: list[TranslationResult] = []
+    for word in words:
+        translation = all_translations.get(word.lower(), "")
+        results.append(
+            TranslationResult(
+                source_word=word,
+                translated_word=translation,
+                source_lang=from_lang,
+                target_lang=to_lang,
+                success=bool(translation),
+                error=None if translation else "Translation failed",
+            )
+        )
+
+    return results


 def format_translations(
@ -551,7 +827,12 @@ def main(argv: Sequence[str] | None = None) -> int:
        return 1

    # Translate
-    results = translate_words_batch(words, args.from_lang, args.to_lang)
+    try:
+        results = translate_words_batch(words, args.from_lang, args.to_lang)
+    except ImportError as e:
+        print(f"Error: {e}", file=sys.stderr)  # noqa: T201
+        return 1
+
    output = format_translations(results)

    # Output