#!/usr/bin/env python3
"""Caching utilities for word frequency analysis.

Provides disk-based caching for:
- Translations (word -> translation mappings)
- Vocabulary curve excerpts (file + length -> excerpt + words)
- Generated Anki decks

Cache location: ~/.cache/word_frequency/ (override with the
``WORD_FREQ_CACHE_DIR`` environment variable).
"""

from __future__ import annotations

import argparse
from dataclasses import dataclass
import hashlib
import json
import logging
import os
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# Default cache directory
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"

_ONE_KB = 1024
_ONE_MB = 1024 * 1024


def get_cache_dir() -> Path:
    """Get the cache directory, creating it if needed.

    The ``WORD_FREQ_CACHE_DIR`` environment variable takes precedence over
    ``DEFAULT_CACHE_DIR``.

    Returns:
        Path to cache directory.
    """
    cache_dir = Path(os.environ.get("WORD_FREQ_CACHE_DIR", str(DEFAULT_CACHE_DIR)))
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir


def get_file_hash(filepath: Path) -> str:
    """Compute SHA256 hash of a file's contents.

    Args:
        filepath: Path to file.

    Returns:
        Hex digest of file hash.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    hasher = hashlib.sha256()
    with filepath.open("rb") as f:
        # Read in chunks so large files are never held in memory at once.
        for chunk in iter(lambda: f.read(65536), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


def get_text_hash(text: str) -> str:
    """Compute SHA256 hash of text content.

    Args:
        text: Text to hash (encoded as UTF-8 before hashing).

    Returns:
        Hex digest of text hash.
    """
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def _load_json_object(path: Path) -> dict[str, Any] | None:
    """Read ``path`` as a JSON object, or return None when it is unusable.

    A missing file, an unreadable file, invalid UTF-8, invalid JSON and a
    top-level value that is not an object are all reported as None so that
    callers can treat every one of them as a cache miss.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
    return payload if isinstance(payload, dict) else None


def _format_size(num_bytes: int) -> str:
    """Render a byte count as a short human-readable string (B / KB / MB)."""
    if num_bytes < _ONE_KB:
        return f"{num_bytes} B"
    if num_bytes < _ONE_MB:
        return f"{num_bytes / _ONE_KB:.1f} KB"
    return f"{num_bytes / _ONE_MB:.1f} MB"


# =============================================================================
# Translation Cache
# =============================================================================


class TranslationCache:
    """Cache for word translations.

    Entries live in a single ``translations.json`` file that is loaded lazily
    on first access and written back only when something changed.
    """

    def __init__(self, cache_dir: Path | None = None) -> None:
        """Initialize translation cache.

        Args:
            cache_dir: Optional custom cache directory.
        """
        self.cache_dir = cache_dir or get_cache_dir()
        self.cache_file = self.cache_dir / "translations.json"
        self._cache: dict[str, str] | None = None
        self._dirty = False  # Track if cache needs saving

    def _load_cache(self) -> dict[str, str]:
        """Load cache from disk on first use; later calls reuse the dict.

        A missing, unreadable or malformed cache file yields an empty cache.
        """
        if self._cache is None:
            self._cache = _load_json_object(self.cache_file) or {}
        return self._cache

    def _save_cache(self) -> None:
        """Save cache to disk if dirty."""
        if self._cache is not None and self._dirty:
            self.cache_file.write_text(
                json.dumps(self._cache, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            self._dirty = False

    def flush(self) -> None:
        """Write pending changes to disk (no-op when nothing changed)."""
        self._save_cache()

    @staticmethod
    def _make_key(word: str, source_lang: str, target_lang: str) -> str:
        """Create cache key for a translation.

        Args:
            word: Word to translate (lower-cased inside the key).
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Cache key string.
        """
        return f"{source_lang}:{target_lang}:{word.lower()}"

    def get(self, word: str, source_lang: str, target_lang: str) -> str | None:
        """Get cached translation.

        Args:
            word: Word to look up.
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Cached translation or None if not found.
        """
        cache = self._load_cache()
        return cache.get(self._make_key(word, source_lang, target_lang))

    def set(
        self,
        word: str,
        source_lang: str,
        target_lang: str,
        translation: str,
        *,
        auto_save: bool = False,
    ) -> None:
        """Store translation in cache.

        Args:
            word: Original word.
            source_lang: Source language code.
            target_lang: Target language code.
            translation: Translated word.
            auto_save: If True, save to disk immediately.
        """
        cache = self._load_cache()
        cache[self._make_key(word, source_lang, target_lang)] = translation
        self._dirty = True
        if auto_save:
            self._save_cache()

    def get_many(
        self, words: list[str], source_lang: str, target_lang: str
    ) -> dict[str, str]:
        """Get multiple cached translations.

        Args:
            words: Words to look up.
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Dict mapping the lower-cased form of each cached word to its
            translation. Words without a cached translation are omitted.
        """
        cache = self._load_cache()
        result: dict[str, str] = {}
        for word in words:
            key = self._make_key(word, source_lang, target_lang)
            if key in cache:
                result[word.lower()] = cache[key]
        return result

    def set_many(
        self,
        translations: dict[str, str],
        source_lang: str,
        target_lang: str,
    ) -> None:
        """Store multiple translations in cache and save to disk.

        Args:
            translations: Dict mapping words to translations.
            source_lang: Source language code.
            target_lang: Target language code.
        """
        cache = self._load_cache()
        if translations:
            cache.update(
                {
                    self._make_key(word, source_lang, target_lang): translation
                    for word, translation in translations.items()
                }
            )
            self._dirty = True
        self._save_cache()  # Save once after all additions

    def clear(self) -> None:
        """Clear all cached translations, in memory and on disk."""
        self._cache = {}
        self._dirty = False
        if self.cache_file.exists():
            self.cache_file.unlink()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with ``total_entries``, ``cache_file`` and
            ``cache_size_bytes`` (0 when the file does not exist).
        """
        cache = self._load_cache()
        return {
            "total_entries": len(cache),
            "cache_file": str(self.cache_file),
            "cache_size_bytes": (
                self.cache_file.stat().st_size if self.cache_file.exists() else 0
            ),
        }


# =============================================================================
# Vocabulary Curve Cache
# =============================================================================


class VocabCurveCache:
    """Cache for vocabulary curve analysis results.

    Each (source file hash, excerpt length) pair is stored as its own JSON
    file inside the ``excerpts`` sub-directory of the cache directory.
    """

    def __init__(self, cache_dir: Path | None = None) -> None:
        """Initialize vocabulary curve cache.

        Args:
            cache_dir: Optional custom cache directory.
        """
        self.cache_dir = (cache_dir or get_cache_dir()) / "excerpts"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, file_hash: str, length: int) -> Path:
        """Get path to cache file for given hash and length.

        Args:
            file_hash: Hash of source file (only its first 16 characters are
                used in the file name).
            length: Excerpt length.

        Returns:
            Path to cache file.
        """
        return self.cache_dir / f"{file_hash[:16]}_{length}.json"

    def get(
        self, filepath: Path, length: int
    ) -> tuple[str, list[tuple[str, int]]] | None:
        """Get cached excerpt and words for a file and length.

        Args:
            filepath: Path to source file.
            length: Excerpt length.

        Returns:
            Tuple of (excerpt, words_with_ranks), or None if nothing usable is
            cached (no entry, unreadable or malformed entry, hash mismatch).

        Raises:
            OSError: If ``filepath`` itself cannot be read for hashing.
        """
        file_hash = get_file_hash(filepath)
        data = _load_json_object(self._get_cache_path(file_hash, length))
        # File names only carry a hash prefix, so compare the full digest.
        if data is None or data.get("file_hash") != file_hash:
            return None
        try:
            excerpt = data["excerpt"]
            words = [(word, rank) for word, rank in data["words"]]
        except (KeyError, TypeError, ValueError):
            # Entry is missing fields or has the wrong shape: treat as a miss.
            return None
        return excerpt, words

    def set(
        self,
        filepath: Path,
        length: int,
        excerpt: str,
        words: list[tuple[str, int]],
    ) -> None:
        """Store excerpt and words in cache.

        Args:
            filepath: Path to source file.
            length: Excerpt length.
            excerpt: The excerpt text.
            words: List of (word, rank) tuples.
        """
        file_hash = get_file_hash(filepath)
        data = {
            "file_hash": file_hash,
            "filepath": str(filepath),
            "length": length,
            "excerpt": excerpt,
            "words": [[word, rank] for word, rank in words],
        }
        self._get_cache_path(file_hash, length).write_text(
            json.dumps(data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def clear(self) -> None:
        """Clear all cached excerpts."""
        for cache_file in self.cache_dir.glob("*.json"):
            cache_file.unlink()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with ``total_entries``, ``cache_dir`` and
            ``cache_size_bytes``.
        """
        cache_files = list(self.cache_dir.glob("*.json"))
        return {
            "total_entries": len(cache_files),
            "cache_dir": str(self.cache_dir),
            "cache_size_bytes": sum(f.stat().st_size for f in cache_files),
        }


# =============================================================================
# Anki Deck Cache
# =============================================================================


@dataclass(frozen=True)
class AnkiDeckKey:
    """Key parameters for Anki deck cache lookups."""

    filepath: Path
    length: int
    target_lang: str
    include_context: bool
    all_vocab: bool


class AnkiDeckCache:
    """Cache for generated Anki decks.

    Deck text is stored in one ``<cache_key>.txt`` file per deck; the
    remaining details are kept together in ``metadata.json``.
    """

    def __init__(self, cache_dir: Path | None = None) -> None:
        """Initialize Anki deck cache.

        Args:
            cache_dir: Optional custom cache directory.
        """
        self.cache_dir = (cache_dir or get_cache_dir()) / "anki_decks"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.metadata_file = self.cache_dir / "metadata.json"
        self._metadata: dict[str, Any] | None = None

    def _load_metadata(self) -> dict[str, Any]:
        """Load metadata from disk on first use; later calls reuse the dict.

        A missing, unreadable or malformed metadata file yields an empty dict.
        """
        if self._metadata is None:
            self._metadata = _load_json_object(self.metadata_file) or {}
        return self._metadata

    def _save_metadata(self) -> None:
        """Save metadata to disk."""
        if self._metadata is not None:
            self.metadata_file.write_text(
                json.dumps(self._metadata, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    @staticmethod
    def _make_key(
        file_hash: str,
        length: int,
        target_lang: str,
        *,
        include_context: bool,
        all_vocab: bool,
    ) -> str:
        """Create cache key for an Anki deck.

        Args:
            file_hash: Hash of source file.
            length: Excerpt length.
            target_lang: Target language.
            include_context: Whether context is included.
            all_vocab: Whether all vocab is included.

        Returns:
            Cache key string.
        """
        flags = f"ctx{int(include_context)}_all{int(all_vocab)}"
        return f"{file_hash[:16]}_{length}_{target_lang}_{flags}"

    def _resolve_key(self, key: AnkiDeckKey) -> tuple[str, str]:
        """Hash the key's source file and derive the matching cache key.

        Returns:
            Tuple of (full file hash, cache key string).
        """
        file_hash = get_file_hash(key.filepath)
        cache_key = self._make_key(
            file_hash,
            key.length,
            key.target_lang,
            include_context=key.include_context,
            all_vocab=key.all_vocab,
        )
        return file_hash, cache_key

    def get(
        self,
        key: AnkiDeckKey,
    ) -> tuple[str, str, int, int] | None:
        """Get cached Anki deck.

        Args:
            key: Cache key parameters.

        Returns:
            Tuple of (anki_content, excerpt, num_words, max_rank), or None if
            nothing usable is cached (no entry, hash mismatch, missing deck
            file, or an incomplete metadata entry).

        Raises:
            OSError: If ``key.filepath`` itself cannot be read for hashing.
        """
        file_hash, cache_key = self._resolve_key(key)
        entry = self._load_metadata().get(cache_key)
        # Cache keys only carry a hash prefix, so compare the full digest.
        if not isinstance(entry, dict) or entry.get("file_hash") != file_hash:
            return None
        deck_file = self.cache_dir / f"{cache_key}.txt"
        try:
            excerpt = entry["excerpt"]
            num_words = entry["num_words"]
            max_rank = entry["max_rank"]
            content = deck_file.read_text(encoding="utf-8")
        except (KeyError, OSError, ValueError):
            # Incomplete metadata, missing/unreadable deck file or invalid
            # UTF-8 all count as a cache miss rather than an error.
            return None
        return content, excerpt, num_words, max_rank

    def set(
        self,
        key: AnkiDeckKey,
        anki_content: str,
        excerpt: str,
        num_words: int,
        max_rank: int,
    ) -> None:
        """Store Anki deck in cache.

        Args:
            key: Cache key parameters.
            anki_content: The Anki deck content.
            excerpt: The excerpt text.
            num_words: Number of words in deck.
            max_rank: Maximum word rank.
        """
        file_hash, cache_key = self._resolve_key(key)

        # Save deck content
        deck_file = self.cache_dir / f"{cache_key}.txt"
        deck_file.write_text(anki_content, encoding="utf-8")

        # Update metadata
        metadata = self._load_metadata()
        metadata[cache_key] = {
            "file_hash": file_hash,
            "filepath": str(key.filepath),
            "length": key.length,
            "target_lang": key.target_lang,
            "include_context": key.include_context,
            "all_vocab": key.all_vocab,
            "excerpt": excerpt,
            "num_words": num_words,
            "max_rank": max_rank,
        }
        self._save_metadata()

    def clear(self) -> None:
        """Clear all cached decks and their metadata."""
        self._metadata = {}
        for cache_file in self.cache_dir.glob("*.txt"):
            cache_file.unlink()
        if self.metadata_file.exists():
            self.metadata_file.unlink()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with ``total_entries`` (metadata entries), ``cache_dir`` and
            ``cache_size_bytes`` (combined size of the ``*.txt`` deck files).
        """
        metadata = self._load_metadata()
        cache_files = list(self.cache_dir.glob("*.txt"))
        return {
            "total_entries": len(metadata),
            "cache_dir": str(self.cache_dir),
            "cache_size_bytes": sum(f.stat().st_size for f in cache_files),
        }


# =============================================================================
# Global Cache Instances
# =============================================================================


class _CacheHolder:
    """Holds singleton cache instances."""

    translation: TranslationCache | None = None
    vocab_curve: VocabCurveCache | None = None
    anki_deck: AnkiDeckCache | None = None


def get_translation_cache() -> TranslationCache:
    """Get the global translation cache instance."""
    if _CacheHolder.translation is None:
        _CacheHolder.translation = TranslationCache()
    return _CacheHolder.translation


def get_vocab_curve_cache() -> VocabCurveCache:
    """Get the global vocabulary curve cache instance."""
    if _CacheHolder.vocab_curve is None:
        _CacheHolder.vocab_curve = VocabCurveCache()
    return _CacheHolder.vocab_curve


def get_anki_deck_cache() -> AnkiDeckCache:
    """Get the global Anki deck cache instance."""
    if _CacheHolder.anki_deck is None:
        _CacheHolder.anki_deck = AnkiDeckCache()
    return _CacheHolder.anki_deck


def clear_all_caches() -> None:
    """Clear all caches."""
    get_translation_cache().clear()
    get_vocab_curve_cache().clear()
    get_anki_deck_cache().clear()


def get_all_cache_stats() -> dict[str, dict[str, Any]]:
    """Get statistics for all caches.

    Returns:
        Dict with stats for each cache type.
    """
    return {
        "translations": get_translation_cache().stats(),
        "vocab_curves": get_vocab_curve_cache().stats(),
        "anki_decks": get_anki_deck_cache().stats(),
    }


def main(argv: list[str] | None = None) -> int:
    """CLI for cache management.

    Every clear flag that is given is honoured. Statistics are shown when no
    clear flag is given, or when ``--stats`` is requested explicitly.

    Args:
        argv: Command-line arguments to parse. None (the default) lets
            argparse read them from ``sys.argv``.

    Returns:
        Exit code.
    """
    parser = argparse.ArgumentParser(description="Manage word frequency caches")
    parser.add_argument("--stats", action="store_true", help="Show cache statistics")
    parser.add_argument("--clear", action="store_true", help="Clear all caches")
    parser.add_argument(
        "--clear-translations", action="store_true", help="Clear translation cache"
    )
    parser.add_argument(
        "--clear-excerpts", action="store_true", help="Clear excerpt cache"
    )
    parser.add_argument(
        "--clear-anki", action="store_true", help="Clear Anki deck cache"
    )

    args = parser.parse_args(argv)

    # All output goes through ``logger``; with the default WARNING level and
    # no handler the CLI would print nothing. basicConfig leaves an already
    # configured root logger untouched.
    logging.basicConfig(level=logging.INFO, format="%(message)s")

    cleared = False
    if args.clear:
        clear_all_caches()
        logger.info("All caches cleared.")
        cleared = True
    else:
        if args.clear_translations:
            get_translation_cache().clear()
            logger.info("Translation cache cleared.")
            cleared = True
        if args.clear_excerpts:
            get_vocab_curve_cache().clear()
            logger.info("Excerpt cache cleared.")
            cleared = True
        if args.clear_anki:
            get_anki_deck_cache().clear()
            logger.info("Anki deck cache cleared.")
            cleared = True

    if cleared and not args.stats:
        return 0

    # Default: show stats
    stats = get_all_cache_stats()
    logger.info("Cache Statistics")
    logger.info("=" * 50)

    for cache_name, cache_stats in stats.items():
        logger.info("\n%s:", cache_name.upper())
        for key, value in cache_stats.items():
            if key == "cache_size_bytes":
                # Format as human-readable
                logger.info(" %s: %s", key, _format_size(value))
            else:
                logger.info(" %s: %s", key, value)

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(main())