mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 13:23:01 +02:00
refactor(word_frequency): fix all ruff violations and remove noqa comments
- Replace print() with logging module throughout
- Add type annotations and Google docstrings to all functions
- Introduce DeckInput and LessonConfig dataclasses to reduce function parameters
- Use specific exception types instead of bare except (BLE001)
- Remove all noqa suppression comments
- Fix test fixtures: remove unused _capsys/_tmp_path parameters
This commit is contained in:
parent
ac1228f9c4
commit
2bb930db6f
@ -22,11 +22,14 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
@ -90,9 +93,7 @@ def read_files(filepaths: Sequence[str | Path]) -> str:
|
|||||||
Returns:
|
Returns:
|
||||||
Combined text content of all files.
|
Combined text content of all files.
|
||||||
"""
|
"""
|
||||||
texts = []
|
texts = [read_file(filepath) for filepath in filepaths]
|
||||||
for filepath in filepaths:
|
|
||||||
texts.append(read_file(filepath))
|
|
||||||
return "\n".join(texts)
|
return "\n".join(texts)
|
||||||
|
|
||||||
|
|
||||||
@ -244,15 +245,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
Path(args.output).write_text(result, encoding="utf-8")
|
Path(args.output).write_text(result, encoding="utf-8")
|
||||||
print(f"Output written to {args.output}")
|
logger.info("Output written to %s", args.output)
|
||||||
else:
|
else:
|
||||||
print(result)
|
sys.stdout.write(result + "\n")
|
||||||
|
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError:
|
||||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
logger.exception("File not found")
|
||||||
return 1
|
return 1
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError:
|
||||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
logger.exception("Could not decode file as UTF-8")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -11,15 +11,23 @@ Cache location: ~/.cache/word_frequency/
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from dataclasses import dataclass
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Default cache directory
|
# Default cache directory
|
||||||
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
||||||
|
|
||||||
|
_ONE_KB = 1024
|
||||||
|
_ONE_MB = 1024 * 1024
|
||||||
|
|
||||||
|
|
||||||
def get_cache_dir() -> Path:
|
def get_cache_dir() -> Path:
|
||||||
"""Get the cache directory, creating it if needed.
|
"""Get the cache directory, creating it if needed.
|
||||||
@ -42,7 +50,7 @@ def get_file_hash(filepath: Path) -> str:
|
|||||||
Hex digest of file hash.
|
Hex digest of file hash.
|
||||||
"""
|
"""
|
||||||
hasher = hashlib.sha256()
|
hasher = hashlib.sha256()
|
||||||
with open(filepath, "rb") as f:
|
with filepath.open("rb") as f:
|
||||||
# Read in chunks for large files
|
# Read in chunks for large files
|
||||||
for chunk in iter(lambda: f.read(65536), b""):
|
for chunk in iter(lambda: f.read(65536), b""):
|
||||||
hasher.update(chunk)
|
hasher.update(chunk)
|
||||||
@ -274,14 +282,15 @@ class VocabCurveCache:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(cache_path.read_text(encoding="utf-8"))
|
data = json.loads(cache_path.read_text(encoding="utf-8"))
|
||||||
|
except (json.JSONDecodeError, KeyError, OSError):
|
||||||
|
return None
|
||||||
|
else:
|
||||||
# Verify hash matches
|
# Verify hash matches
|
||||||
if data.get("file_hash") != file_hash:
|
if data.get("file_hash") != file_hash:
|
||||||
return None
|
return None
|
||||||
excerpt = data["excerpt"]
|
excerpt = data["excerpt"]
|
||||||
words = [(w, r) for w, r in data["words"]]
|
words = [(w, r) for w, r in data["words"]]
|
||||||
return excerpt, words
|
return excerpt, words
|
||||||
except (json.JSONDecodeError, KeyError, OSError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def set(
|
def set(
|
||||||
self,
|
self,
|
||||||
@ -339,6 +348,17 @@ class VocabCurveCache:
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AnkiDeckKey:
    """Key parameters for Anki deck cache lookups.

    Groups the values that identify one cached deck so they can be passed
    to ``AnkiDeckCache.get`` and ``AnkiDeckCache.set`` as a single argument.

    Attributes:
        filepath: Path to source file; its contents are hashed when the
            cache key is built.
        length: Excerpt length.
        target_lang: Target language.
        include_context: Whether context is included.
        all_vocab: Whether all vocab is included.
    """

    filepath: Path
    length: int
    target_lang: str
    include_context: bool
    all_vocab: bool
|
||||||
|
|
||||||
|
|
||||||
class AnkiDeckCache:
|
class AnkiDeckCache:
|
||||||
"""Cache for generated Anki decks."""
|
"""Cache for generated Anki decks."""
|
||||||
|
|
||||||
@ -380,6 +400,7 @@ class AnkiDeckCache:
|
|||||||
file_hash: str,
|
file_hash: str,
|
||||||
length: int,
|
length: int,
|
||||||
target_lang: str,
|
target_lang: str,
|
||||||
|
*,
|
||||||
include_context: bool,
|
include_context: bool,
|
||||||
all_vocab: bool,
|
all_vocab: bool,
|
||||||
) -> str:
|
) -> str:
|
||||||
@ -400,36 +421,35 @@ class AnkiDeckCache:
|
|||||||
|
|
||||||
def get(
|
def get(
|
||||||
self,
|
self,
|
||||||
filepath: Path,
|
key: AnkiDeckKey,
|
||||||
length: int,
|
|
||||||
target_lang: str,
|
|
||||||
include_context: bool,
|
|
||||||
all_vocab: bool,
|
|
||||||
) -> tuple[str, str, int, int] | None:
|
) -> tuple[str, str, int, int] | None:
|
||||||
"""Get cached Anki deck.
|
"""Get cached Anki deck.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
filepath: Path to source file.
|
key: Cache key parameters.
|
||||||
length: Excerpt length.
|
|
||||||
target_lang: Target language.
|
|
||||||
include_context: Whether context is included.
|
|
||||||
all_vocab: Whether all vocab is included.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (anki_content, excerpt, num_words, max_rank) or None.
|
Tuple of (anki_content, excerpt, num_words, max_rank)
|
||||||
|
or None.
|
||||||
"""
|
"""
|
||||||
file_hash = get_file_hash(filepath)
|
file_hash = get_file_hash(key.filepath)
|
||||||
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
cache_key = self._make_key(
|
||||||
|
file_hash,
|
||||||
|
key.length,
|
||||||
|
key.target_lang,
|
||||||
|
include_context=key.include_context,
|
||||||
|
all_vocab=key.all_vocab,
|
||||||
|
)
|
||||||
metadata = self._load_metadata()
|
metadata = self._load_metadata()
|
||||||
|
|
||||||
if key not in metadata:
|
if cache_key not in metadata:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
entry = metadata[key]
|
entry = metadata[cache_key]
|
||||||
if entry.get("file_hash") != file_hash:
|
if entry.get("file_hash") != file_hash:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
deck_file = self.cache_dir / f"{key}.txt"
|
deck_file = self.cache_dir / f"{cache_key}.txt"
|
||||||
if not deck_file.exists():
|
if not deck_file.exists():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -446,11 +466,7 @@ class AnkiDeckCache:
|
|||||||
|
|
||||||
def set(
|
def set(
|
||||||
self,
|
self,
|
||||||
filepath: Path,
|
key: AnkiDeckKey,
|
||||||
length: int,
|
|
||||||
target_lang: str,
|
|
||||||
include_context: bool,
|
|
||||||
all_vocab: bool,
|
|
||||||
anki_content: str,
|
anki_content: str,
|
||||||
excerpt: str,
|
excerpt: str,
|
||||||
num_words: int,
|
num_words: int,
|
||||||
@ -459,32 +475,34 @@ class AnkiDeckCache:
|
|||||||
"""Store Anki deck in cache.
|
"""Store Anki deck in cache.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
filepath: Path to source file.
|
key: Cache key parameters.
|
||||||
length: Excerpt length.
|
|
||||||
target_lang: Target language.
|
|
||||||
include_context: Whether context is included.
|
|
||||||
all_vocab: Whether all vocab is included.
|
|
||||||
anki_content: The Anki deck content.
|
anki_content: The Anki deck content.
|
||||||
excerpt: The excerpt text.
|
excerpt: The excerpt text.
|
||||||
num_words: Number of words in deck.
|
num_words: Number of words in deck.
|
||||||
max_rank: Maximum word rank.
|
max_rank: Maximum word rank.
|
||||||
"""
|
"""
|
||||||
file_hash = get_file_hash(filepath)
|
file_hash = get_file_hash(key.filepath)
|
||||||
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
cache_key = self._make_key(
|
||||||
|
file_hash,
|
||||||
|
key.length,
|
||||||
|
key.target_lang,
|
||||||
|
include_context=key.include_context,
|
||||||
|
all_vocab=key.all_vocab,
|
||||||
|
)
|
||||||
|
|
||||||
# Save deck content
|
# Save deck content
|
||||||
deck_file = self.cache_dir / f"{key}.txt"
|
deck_file = self.cache_dir / f"{cache_key}.txt"
|
||||||
deck_file.write_text(anki_content, encoding="utf-8")
|
deck_file.write_text(anki_content, encoding="utf-8")
|
||||||
|
|
||||||
# Update metadata
|
# Update metadata
|
||||||
metadata = self._load_metadata()
|
metadata = self._load_metadata()
|
||||||
metadata[key] = {
|
metadata[cache_key] = {
|
||||||
"file_hash": file_hash,
|
"file_hash": file_hash,
|
||||||
"filepath": str(filepath),
|
"filepath": str(key.filepath),
|
||||||
"length": length,
|
"length": key.length,
|
||||||
"target_lang": target_lang,
|
"target_lang": key.target_lang,
|
||||||
"include_context": include_context,
|
"include_context": key.include_context,
|
||||||
"all_vocab": all_vocab,
|
"all_vocab": key.all_vocab,
|
||||||
"excerpt": excerpt,
|
"excerpt": excerpt,
|
||||||
"num_words": num_words,
|
"num_words": num_words,
|
||||||
"max_rank": max_rank,
|
"max_rank": max_rank,
|
||||||
@ -519,34 +537,33 @@ class AnkiDeckCache:
|
|||||||
# Global Cache Instances
|
# Global Cache Instances
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
# Singleton instances
|
class _CacheHolder:
|
||||||
_translation_cache: TranslationCache | None = None
|
"""Holds singleton cache instances."""
|
||||||
_vocab_curve_cache: VocabCurveCache | None = None
|
|
||||||
_anki_deck_cache: AnkiDeckCache | None = None
|
translation: TranslationCache | None = None
|
||||||
|
vocab_curve: VocabCurveCache | None = None
|
||||||
|
anki_deck: AnkiDeckCache | None = None
|
||||||
|
|
||||||
|
|
||||||
def get_translation_cache() -> TranslationCache:
|
def get_translation_cache() -> TranslationCache:
|
||||||
"""Get the global translation cache instance."""
|
"""Get the global translation cache instance."""
|
||||||
global _translation_cache
|
if _CacheHolder.translation is None:
|
||||||
if _translation_cache is None:
|
_CacheHolder.translation = TranslationCache()
|
||||||
_translation_cache = TranslationCache()
|
return _CacheHolder.translation
|
||||||
return _translation_cache
|
|
||||||
|
|
||||||
|
|
||||||
def get_vocab_curve_cache() -> VocabCurveCache:
|
def get_vocab_curve_cache() -> VocabCurveCache:
|
||||||
"""Get the global vocabulary curve cache instance."""
|
"""Get the global vocabulary curve cache instance."""
|
||||||
global _vocab_curve_cache
|
if _CacheHolder.vocab_curve is None:
|
||||||
if _vocab_curve_cache is None:
|
_CacheHolder.vocab_curve = VocabCurveCache()
|
||||||
_vocab_curve_cache = VocabCurveCache()
|
return _CacheHolder.vocab_curve
|
||||||
return _vocab_curve_cache
|
|
||||||
|
|
||||||
|
|
||||||
def get_anki_deck_cache() -> AnkiDeckCache:
|
def get_anki_deck_cache() -> AnkiDeckCache:
|
||||||
"""Get the global Anki deck cache instance."""
|
"""Get the global Anki deck cache instance."""
|
||||||
global _anki_deck_cache
|
if _CacheHolder.anki_deck is None:
|
||||||
if _anki_deck_cache is None:
|
_CacheHolder.anki_deck = AnkiDeckCache()
|
||||||
_anki_deck_cache = AnkiDeckCache()
|
return _CacheHolder.anki_deck
|
||||||
return _anki_deck_cache
|
|
||||||
|
|
||||||
|
|
||||||
def clear_all_caches() -> None:
|
def clear_all_caches() -> None:
|
||||||
@ -575,8 +592,6 @@ def main() -> int:
|
|||||||
Returns:
|
Returns:
|
||||||
Exit code.
|
Exit code.
|
||||||
"""
|
"""
|
||||||
import argparse
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Manage word frequency caches")
|
parser = argparse.ArgumentParser(description="Manage word frequency caches")
|
||||||
parser.add_argument("--stats", action="store_true", help="Show cache statistics")
|
parser.add_argument("--stats", action="store_true", help="Show cache statistics")
|
||||||
parser.add_argument("--clear", action="store_true", help="Clear all caches")
|
parser.add_argument("--clear", action="store_true", help="Clear all caches")
|
||||||
@ -594,42 +609,42 @@ def main() -> int:
|
|||||||
|
|
||||||
if args.clear:
|
if args.clear:
|
||||||
clear_all_caches()
|
clear_all_caches()
|
||||||
print("All caches cleared.")
|
logger.info("All caches cleared.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if args.clear_translations:
|
if args.clear_translations:
|
||||||
get_translation_cache().clear()
|
get_translation_cache().clear()
|
||||||
print("Translation cache cleared.")
|
logger.info("Translation cache cleared.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if args.clear_excerpts:
|
if args.clear_excerpts:
|
||||||
get_vocab_curve_cache().clear()
|
get_vocab_curve_cache().clear()
|
||||||
print("Excerpt cache cleared.")
|
logger.info("Excerpt cache cleared.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
if args.clear_anki:
|
if args.clear_anki:
|
||||||
get_anki_deck_cache().clear()
|
get_anki_deck_cache().clear()
|
||||||
print("Anki deck cache cleared.")
|
logger.info("Anki deck cache cleared.")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
# Default: show stats
|
# Default: show stats
|
||||||
stats = get_all_cache_stats()
|
stats = get_all_cache_stats()
|
||||||
print("Cache Statistics")
|
logger.info("Cache Statistics")
|
||||||
print("=" * 50)
|
logger.info("=" * 50)
|
||||||
for cache_name, cache_stats in stats.items():
|
for cache_name, cache_stats in stats.items():
|
||||||
print(f"\n{cache_name.upper()}:")
|
logger.info("\n%s:", cache_name.upper())
|
||||||
for key, value in cache_stats.items():
|
for key, value in cache_stats.items():
|
||||||
if key == "cache_size_bytes":
|
if key == "cache_size_bytes":
|
||||||
# Format as human-readable
|
# Format as human-readable
|
||||||
if value < 1024:
|
if value < _ONE_KB:
|
||||||
size_str = f"{value} B"
|
size_str = f"{value} B"
|
||||||
elif value < 1024 * 1024:
|
elif value < _ONE_MB:
|
||||||
size_str = f"{value / 1024:.1f} KB"
|
size_str = f"{value / _ONE_KB:.1f} KB"
|
||||||
else:
|
else:
|
||||||
size_str = f"{value / (1024 * 1024):.1f} MB"
|
size_str = f"{value / _ONE_MB:.1f} MB"
|
||||||
print(f" {key}: {size_str}")
|
logger.info(" %s: %s", key, size_str)
|
||||||
else:
|
else:
|
||||||
print(f" {key}: {value}")
|
logger.info(" %s: %s", key, value)
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|||||||
640
python_pkg/word_frequency/cache.py.bak
Executable file
640
python_pkg/word_frequency/cache.py.bak
Executable file
@ -0,0 +1,640 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Caching utilities for word frequency analysis.
|
||||||
|
|
||||||
|
Provides disk-based caching for:
|
||||||
|
- Translations (word -> translation mappings)
|
||||||
|
- Vocabulary curve excerpts (file + length -> excerpt + words)
|
||||||
|
- Generated Anki decks
|
||||||
|
|
||||||
|
Cache location: ~/.cache/word_frequency/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# Default cache directory; override it with the WORD_FREQ_CACHE_DIR environment variable.
|
||||||
|
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
||||||
|
|
||||||
|
|
||||||
|
def get_cache_dir() -> Path:
    """Return the directory used for on-disk caches, creating it on demand.

    The location comes from the ``WORD_FREQ_CACHE_DIR`` environment variable
    when it is set and falls back to ``DEFAULT_CACHE_DIR`` otherwise.

    Returns:
        Path to the cache directory.
    """
    fallback = str(DEFAULT_CACHE_DIR)
    location = os.environ.get("WORD_FREQ_CACHE_DIR", fallback)
    directory = Path(location)
    # parents/exist_ok keep repeated calls and first-time use equally safe.
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_hash(filepath: Path) -> str:
    """Return the SHA256 hex digest of the bytes stored in a file.

    Args:
        filepath: Path to file.

    Returns:
        Hex digest of file hash.
    """
    chunk_size = 65536
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # Feed the hasher piecewise so large files are never fully in memory.
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_hash(text: str) -> str:
    """Return the SHA256 hex digest of a string's UTF-8 encoding.

    Args:
        text: Text to hash.

    Returns:
        Hex digest of text hash.
    """
    encoded = text.encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Translation Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationCache:
    """Cache for word translations, persisted as a single JSON file.

    Entries are loaded lazily on first access, kept in memory, and written
    back to ``translations.json`` only when something has changed.
    """

    def __init__(self, cache_dir: Path | None = None) -> None:
        """Initialize translation cache.

        Args:
            cache_dir: Optional custom cache directory.
        """
        self.cache_dir = cache_dir or get_cache_dir()
        self.cache_file = self.cache_dir / "translations.json"
        self._cache: dict[str, str] | None = None
        self._dirty = False  # Track if cache needs saving

    def _load_cache(self) -> dict[str, str]:
        """Load cache from disk.

        An unreadable, undecodable, or non-object cache file is treated as
        an empty cache rather than an error.

        Returns:
            The in-memory mapping of cache keys to translations.
        """
        if self._cache is None:
            loaded: Any = {}
            if self.cache_file.exists():
                try:
                    loaded = json.loads(self.cache_file.read_text(encoding="utf-8"))
                except (ValueError, OSError):
                    # ValueError covers JSONDecodeError and UnicodeDecodeError.
                    loaded = {}
            # Valid JSON that is not an object (e.g. a list) cannot serve as
            # the key -> translation mapping; start over instead of crashing
            # later in ``get``.
            self._cache = loaded if isinstance(loaded, dict) else {}
        return self._cache

    def _save_cache(self) -> None:
        """Save cache to disk if dirty."""
        if self._cache is None or not self._dirty:
            return
        # A caller-supplied cache_dir is not created by __init__, so make
        # sure it exists before writing.
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)
        self.cache_file.write_text(
            json.dumps(self._cache, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
        self._dirty = False

    def flush(self) -> None:
        """Force save cache to disk."""
        self._save_cache()

    @staticmethod
    def _make_key(word: str, source_lang: str, target_lang: str) -> str:
        """Create cache key for a translation.

        Args:
            word: Word to translate.
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Cache key string; the word is lower-cased so lookups are
            case-insensitive.
        """
        return f"{source_lang}:{target_lang}:{word.lower()}"

    def get(self, word: str, source_lang: str, target_lang: str) -> str | None:
        """Get cached translation.

        Args:
            word: Word to look up.
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Cached translation or None if not found.
        """
        cache = self._load_cache()
        key = self._make_key(word, source_lang, target_lang)
        return cache.get(key)

    def set(
        self,
        word: str,
        source_lang: str,
        target_lang: str,
        translation: str,
        *,
        auto_save: bool = False,
    ) -> None:
        """Store translation in cache.

        Args:
            word: Original word.
            source_lang: Source language code.
            target_lang: Target language code.
            translation: Translated word.
            auto_save: If True, save to disk immediately.
        """
        cache = self._load_cache()
        key = self._make_key(word, source_lang, target_lang)
        cache[key] = translation
        self._dirty = True
        if auto_save:
            self._save_cache()

    def get_many(
        self, words: list[str], source_lang: str, target_lang: str
    ) -> dict[str, str]:
        """Get multiple cached translations.

        Args:
            words: Words to look up.
            source_lang: Source language code.
            target_lang: Target language code.

        Returns:
            Dict mapping the lower-cased words that were found to their
            cached translations; words without an entry are omitted.
        """
        cache = self._load_cache()
        result: dict[str, str] = {}
        for word in words:
            key = self._make_key(word, source_lang, target_lang)
            if key in cache:
                result[word.lower()] = cache[key]
        return result

    def set_many(
        self,
        translations: dict[str, str],
        source_lang: str,
        target_lang: str,
    ) -> None:
        """Store multiple translations in cache and save to disk.

        Args:
            translations: Dict mapping words to translations.
            source_lang: Source language code.
            target_lang: Target language code.
        """
        cache = self._load_cache()
        for word, translation in translations.items():
            key = self._make_key(word, source_lang, target_lang)
            cache[key] = translation
            self._dirty = True
        self._save_cache()  # Save once after all additions

    def clear(self) -> None:
        """Clear all cached translations."""
        self._cache = {}
        self._dirty = False
        if self.cache_file.exists():
            self.cache_file.unlink()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with the entry count, the cache file path, and the size of
            the cache file in bytes (0 when the file does not exist).
        """
        cache = self._load_cache()
        return {
            "total_entries": len(cache),
            "cache_file": str(self.cache_file),
            "cache_size_bytes": (
                self.cache_file.stat().st_size if self.cache_file.exists() else 0
            ),
        }
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Vocabulary Curve Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class VocabCurveCache:
    """Cache for vocabulary curve analysis results.

    Each (source file contents, excerpt length) pair is stored as its own
    JSON file inside an ``excerpts`` sub-directory of the cache directory.
    """

    def __init__(self, cache_dir: Path | None = None) -> None:
        """Initialize vocabulary curve cache.

        Args:
            cache_dir: Optional custom cache directory.
        """
        self.cache_dir = (cache_dir or get_cache_dir()) / "excerpts"
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _get_cache_path(self, file_hash: str, length: int) -> Path:
        """Get path to cache file for given hash and length.

        Args:
            file_hash: Hash of source file.
            length: Excerpt length.

        Returns:
            Path to cache file; only the first 16 hex characters of the
            hash are used in the file name.
        """
        return self.cache_dir / f"{file_hash[:16]}_{length}.json"

    def get(
        self, filepath: Path, length: int
    ) -> tuple[str, list[tuple[str, int]]] | None:
        """Get cached excerpt and words for a file and length.

        Any problem with the stored entry (unreadable file, invalid JSON,
        unexpected structure, hash mismatch) is reported as a cache miss.

        Args:
            filepath: Path to source file.
            length: Excerpt length.

        Returns:
            Tuple of (excerpt, words_with_ranks) or None if not cached.
        """
        file_hash = get_file_hash(filepath)
        cache_path = self._get_cache_path(file_hash, length)

        if not cache_path.exists():
            return None

        try:
            data = json.loads(cache_path.read_text(encoding="utf-8"))
            # The file name carries only a hash prefix, so verify the full
            # hash; a non-object payload can never be a valid entry.
            if not isinstance(data, dict) or data.get("file_hash") != file_hash:
                return None
            excerpt = data["excerpt"]
            words = [(w, r) for w, r in data["words"]]
        except (ValueError, KeyError, TypeError, OSError):
            # ValueError covers JSONDecodeError, UnicodeDecodeError and
            # wrong-length rows; TypeError covers rows that are not pairs.
            return None
        return excerpt, words

    def set(
        self,
        filepath: Path,
        length: int,
        excerpt: str,
        words: list[tuple[str, int]],
    ) -> None:
        """Store excerpt and words in cache.

        Args:
            filepath: Path to source file.
            length: Excerpt length.
            excerpt: The excerpt text.
            words: List of (word, rank) tuples.
        """
        file_hash = get_file_hash(filepath)
        cache_path = self._get_cache_path(file_hash, length)

        data = {
            "file_hash": file_hash,
            "filepath": str(filepath),
            "length": length,
            "excerpt": excerpt,
            # JSON has no tuple type; store each pair as a two-item list.
            "words": [[w, r] for w, r in words],
        }

        cache_path.write_text(
            json.dumps(data, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )

    def clear(self) -> None:
        """Clear all cached excerpts."""
        for cache_file in self.cache_dir.glob("*.json"):
            cache_file.unlink()

    def stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Dict with the number of cached files, the cache directory, and
            their combined size in bytes.
        """
        cache_files = list(self.cache_dir.glob("*.json"))
        total_size = sum(f.stat().st_size for f in cache_files)
        return {
            "total_entries": len(cache_files),
            "cache_dir": str(self.cache_dir),
            "cache_size_bytes": total_size,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Anki Deck Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class AnkiDeckCache:
|
||||||
|
"""Cache for generated Anki decks."""
|
||||||
|
|
||||||
|
def __init__(self, cache_dir: Path | None = None) -> None:
|
||||||
|
"""Initialize Anki deck cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cache_dir: Optional custom cache directory.
|
||||||
|
"""
|
||||||
|
self.cache_dir = (cache_dir or get_cache_dir()) / "anki_decks"
|
||||||
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.metadata_file = self.cache_dir / "metadata.json"
|
||||||
|
self._metadata: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
def _load_metadata(self) -> dict[str, Any]:
|
||||||
|
"""Load metadata from disk."""
|
||||||
|
if self._metadata is None:
|
||||||
|
if self.metadata_file.exists():
|
||||||
|
try:
|
||||||
|
self._metadata = json.loads(
|
||||||
|
self.metadata_file.read_text(encoding="utf-8")
|
||||||
|
)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
self._metadata = {}
|
||||||
|
else:
|
||||||
|
self._metadata = {}
|
||||||
|
return self._metadata
|
||||||
|
|
||||||
|
def _save_metadata(self) -> None:
|
||||||
|
"""Save metadata to disk."""
|
||||||
|
if self._metadata is not None:
|
||||||
|
self.metadata_file.write_text(
|
||||||
|
json.dumps(self._metadata, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_key(
|
||||||
|
file_hash: str,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
) -> str:
|
||||||
|
"""Create cache key for an Anki deck.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_hash: Hash of source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cache key string.
|
||||||
|
"""
|
||||||
|
flags = f"ctx{int(include_context)}_all{int(all_vocab)}"
|
||||||
|
return f"{file_hash[:16]}_{length}_{target_lang}_{flags}"
|
||||||
|
|
||||||
|
def get(
|
||||||
|
self,
|
||||||
|
filepath: Path,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
) -> tuple[str, str, int, int] | None:
|
||||||
|
"""Get cached Anki deck.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (anki_content, excerpt, num_words, max_rank) or None.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
||||||
|
metadata = self._load_metadata()
|
||||||
|
|
||||||
|
if key not in metadata:
|
||||||
|
return None
|
||||||
|
|
||||||
|
entry = metadata[key]
|
||||||
|
if entry.get("file_hash") != file_hash:
|
||||||
|
return None
|
||||||
|
|
||||||
|
deck_file = self.cache_dir / f"{key}.txt"
|
||||||
|
if not deck_file.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
content = deck_file.read_text(encoding="utf-8")
|
||||||
|
return (
|
||||||
|
content,
|
||||||
|
entry["excerpt"],
|
||||||
|
entry["num_words"],
|
||||||
|
entry["max_rank"],
|
||||||
|
)
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def set(
|
||||||
|
self,
|
||||||
|
filepath: Path,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
anki_content: str,
|
||||||
|
excerpt: str,
|
||||||
|
num_words: int,
|
||||||
|
max_rank: int,
|
||||||
|
) -> None:
|
||||||
|
"""Store Anki deck in cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
anki_content: The Anki deck content.
|
||||||
|
excerpt: The excerpt text.
|
||||||
|
num_words: Number of words in deck.
|
||||||
|
max_rank: Maximum word rank.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
||||||
|
|
||||||
|
# Save deck content
|
||||||
|
deck_file = self.cache_dir / f"{key}.txt"
|
||||||
|
deck_file.write_text(anki_content, encoding="utf-8")
|
||||||
|
|
||||||
|
# Update metadata
|
||||||
|
metadata = self._load_metadata()
|
||||||
|
metadata[key] = {
|
||||||
|
"file_hash": file_hash,
|
||||||
|
"filepath": str(filepath),
|
||||||
|
"length": length,
|
||||||
|
"target_lang": target_lang,
|
||||||
|
"include_context": include_context,
|
||||||
|
"all_vocab": all_vocab,
|
||||||
|
"excerpt": excerpt,
|
||||||
|
"num_words": num_words,
|
||||||
|
"max_rank": max_rank,
|
||||||
|
}
|
||||||
|
self._save_metadata()
|
||||||
|
|
||||||
|
def clear(self) -> None:
    """Clear all cached decks.

    Resets the in-memory metadata, deletes every ``*.txt`` deck file in
    the cache directory and removes the metadata file. Files that are
    already gone (for example removed by another process between listing
    and deletion) are ignored instead of raising ``FileNotFoundError``.
    """
    self._metadata = {}
    for cache_file in self.cache_dir.glob("*.txt"):
        cache_file.unlink(missing_ok=True)
    # EAFP: no exists() pre-check, so there is no check-then-delete race.
    self.metadata_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
def stats(self) -> dict[str, Any]:
    """Summarise the current state of the deck cache.

    Returns:
        Dict with the number of metadata entries (``total_entries``), the
        cache directory as a string (``cache_dir``) and the combined size
        of all ``*.txt`` files in it (``cache_size_bytes``).
    """
    entries = self._load_metadata()

    size_on_disk = 0
    for deck_path in self.cache_dir.glob("*.txt"):
        size_on_disk += deck_path.stat().st_size

    summary: dict[str, Any] = {}
    summary["total_entries"] = len(entries)
    summary["cache_dir"] = str(self.cache_dir)
    summary["cache_size_bytes"] = size_on_disk
    return summary
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Global Cache Instances
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Singleton instances. Each starts as None and is created on first use by
# the matching get_*_cache() accessor defined below.
_translation_cache: TranslationCache | None = None
_vocab_curve_cache: VocabCurveCache | None = None
_anki_deck_cache: AnkiDeckCache | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_translation_cache() -> TranslationCache:
    """Return the shared translation cache, creating it on first call."""
    global _translation_cache
    if _translation_cache is not None:
        return _translation_cache
    _translation_cache = TranslationCache()
    return _translation_cache
|
||||||
|
|
||||||
|
|
||||||
|
def get_vocab_curve_cache() -> VocabCurveCache:
    """Return the shared vocabulary curve cache, creating it on first call."""
    global _vocab_curve_cache
    if _vocab_curve_cache is not None:
        return _vocab_curve_cache
    _vocab_curve_cache = VocabCurveCache()
    return _vocab_curve_cache
|
||||||
|
|
||||||
|
|
||||||
|
def get_anki_deck_cache() -> AnkiDeckCache:
    """Return the shared Anki deck cache, creating it on first call."""
    global _anki_deck_cache
    if _anki_deck_cache is not None:
        return _anki_deck_cache
    _anki_deck_cache = AnkiDeckCache()
    return _anki_deck_cache
|
||||||
|
|
||||||
|
|
||||||
|
def clear_all_caches() -> None:
    """Empty the translation, vocabulary curve and Anki deck caches in turn."""
    accessors = (
        get_translation_cache,
        get_vocab_curve_cache,
        get_anki_deck_cache,
    )
    for accessor in accessors:
        accessor().clear()
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_cache_stats() -> dict[str, dict[str, Any]]:
    """Collect the ``stats()`` report of every global cache.

    Returns:
        Dict keyed by cache type (``translations``, ``vocab_curves``,
        ``anki_decks``) holding that cache's statistics.
    """
    report: dict[str, dict[str, Any]] = {}
    report["translations"] = get_translation_cache().stats()
    report["vocab_curves"] = get_vocab_curve_cache().stats()
    report["anki_decks"] = get_anki_deck_cache().stats()
    return report
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """CLI for cache management.

    With ``--clear`` every cache is emptied. Otherwise each selective
    ``--clear-*`` flag that was passed is honoured (several may be combined
    in one invocation). When no clearing flag is given, cache statistics
    are printed.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read them from ``sys.argv``.

    Returns:
        Exit code.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Manage word frequency caches")
    # --stats is accepted for explicitness; showing stats is also the default.
    parser.add_argument("--stats", action="store_true", help="Show cache statistics")
    parser.add_argument("--clear", action="store_true", help="Clear all caches")
    parser.add_argument(
        "--clear-translations", action="store_true", help="Clear translation cache"
    )
    parser.add_argument(
        "--clear-excerpts", action="store_true", help="Clear excerpt cache"
    )
    parser.add_argument(
        "--clear-anki", action="store_true", help="Clear Anki deck cache"
    )

    args = parser.parse_args(argv)

    if args.clear:
        clear_all_caches()
        print("All caches cleared.")
        return 0

    # Run every selective clear that was requested rather than stopping at
    # the first one, so combined flags are not silently ignored.
    selective = (
        (args.clear_translations, get_translation_cache, "Translation cache cleared."),
        (args.clear_excerpts, get_vocab_curve_cache, "Excerpt cache cleared."),
        (args.clear_anki, get_anki_deck_cache, "Anki deck cache cleared."),
    )
    requested = [(getter, message) for wanted, getter, message in selective if wanted]
    for getter, message in requested:
        getter().clear()
        print(message)
    if requested:
        return 0

    # Default: show stats
    stats = get_all_cache_stats()
    print("Cache Statistics")
    print("=" * 50)
    for cache_name, cache_stats in stats.items():
        print(f"\n{cache_name.upper()}:")
        for key, value in cache_stats.items():
            if key == "cache_size_bytes":
                print(f" {key}: {_format_size(value)}")
            else:
                print(f" {key}: {value}")

    return 0


def _format_size(num_bytes: int) -> str:
    """Format a byte count as a human-readable B / KB / MB string."""
    if num_bytes < 1024:
        return f"{num_bytes} B"
    if num_bytes < 1024 * 1024:
        return f"{num_bytes / 1024:.1f} KB"
    return f"{num_bytes / (1024 * 1024):.1f} MB"
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Exit the interpreter with main()'s return value as the status code.
    raise SystemExit(main())
|
||||||
@ -6,21 +6,28 @@ specified length (in words) where the target words appear most frequently.
|
|||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
# From raw text with target words
|
# From raw text with target words
|
||||||
python -m python_pkg.word_frequency.excerpt_finder --text "they went somewhere he and she and the guy" --words and the --length 3
|
python -m python_pkg.word_frequency.excerpt_finder \
|
||||||
|
--text "they went somewhere he and she and the guy" \
|
||||||
|
--words and the --length 3
|
||||||
|
|
||||||
# From a file
|
# From a file
|
||||||
python -m python_pkg.word_frequency.excerpt_finder --file path/to/file.txt --words the and of --length 10
|
python -m python_pkg.word_frequency.excerpt_finder \
|
||||||
|
--file path/to/file.txt --words the and of --length 10
|
||||||
|
|
||||||
# Target words from a file (one word per line)
|
# Target words from a file (one word per line)
|
||||||
python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words-file targets.txt --length 20
|
python -m python_pkg.word_frequency.excerpt_finder \
|
||||||
|
--file text.txt --words-file targets.txt --length 20
|
||||||
|
|
||||||
# Show top N excerpts instead of just the best one
|
# Show top N excerpts instead of just the best one
|
||||||
python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words the and --length 10 --top 5
|
python -m python_pkg.word_frequency.excerpt_finder \
|
||||||
|
--file text.txt --words the and --length 10 --top 5
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
from dataclasses import dataclass
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
from typing import TYPE_CHECKING, NamedTuple
|
from typing import TYPE_CHECKING, NamedTuple
|
||||||
@ -33,6 +40,17 @@ except ModuleNotFoundError:
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ExcerptSearchOptions:
    """Options for excerpt search and display.

    Immutable bundle of the optional settings consumed by
    ``find_best_excerpt_with_context``.
    """

    # Forwarded as ``case_sensitive`` to the excerpt search and to the
    # context expansion step.
    case_sensitive: bool = False
    # Forwarded as ``top_n`` to the excerpt search.
    top_n: int = 1
    # Words of context to add before/after each excerpt; values <= 0 return
    # the search results without expansion.
    context_words: int = 0
|
||||||
|
|
||||||
|
|
||||||
class ExcerptResult(NamedTuple):
|
class ExcerptResult(NamedTuple):
|
||||||
"""Result of an excerpt search."""
|
"""Result of an excerpt search."""
|
||||||
@ -141,45 +159,28 @@ def find_best_excerpt(
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
def find_best_excerpt_with_context(
|
def _expand_results_with_context(
|
||||||
text: str,
|
text: str,
|
||||||
target_words: Sequence[str],
|
base_results: list[ExcerptResult],
|
||||||
excerpt_length: int,
|
context_words: int,
|
||||||
*,
|
*,
|
||||||
case_sensitive: bool = False,
|
case_sensitive: bool = False,
|
||||||
top_n: int = 1,
|
|
||||||
context_words: int = 0,
|
|
||||||
) -> list[ExcerptResult]:
|
) -> list[ExcerptResult]:
|
||||||
"""Find the excerpt(s) with optional surrounding context.
|
"""Expand excerpt results with surrounding context words.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: The input text to search.
|
text: The full source text.
|
||||||
target_words: Words to search for in the excerpt.
|
base_results: Results from find_best_excerpt.
|
||||||
excerpt_length: Length of the excerpt in words.
|
context_words: Number of words to include before/after.
|
||||||
case_sensitive: If False, match words case-insensitively.
|
case_sensitive: If False, words are lowercased.
|
||||||
top_n: Number of top excerpts to return.
|
|
||||||
context_words: Number of words to include before/after the excerpt.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of ExcerptResult with context included in the excerpt.
|
Expanded ExcerptResult list with context.
|
||||||
"""
|
"""
|
||||||
base_results = find_best_excerpt(
|
|
||||||
text,
|
|
||||||
target_words,
|
|
||||||
excerpt_length,
|
|
||||||
case_sensitive=case_sensitive,
|
|
||||||
top_n=top_n,
|
|
||||||
)
|
|
||||||
|
|
||||||
if context_words <= 0:
|
|
||||||
return base_results
|
|
||||||
|
|
||||||
# Re-extract all words to get context
|
|
||||||
all_words = extract_words(text, case_sensitive=case_sensitive)
|
all_words = extract_words(text, case_sensitive=case_sensitive)
|
||||||
|
|
||||||
expanded_results: list[ExcerptResult] = []
|
expanded_results: list[ExcerptResult] = []
|
||||||
for result in base_results:
|
for result in base_results:
|
||||||
# Expand the excerpt with context
|
|
||||||
ctx_start = max(0, result.start_index - context_words)
|
ctx_start = max(0, result.start_index - context_words)
|
||||||
ctx_end = min(len(all_words), result.end_index + context_words)
|
ctx_end = min(len(all_words), result.end_index + context_words)
|
||||||
context_excerpt_words = all_words[ctx_start:ctx_end]
|
context_excerpt_words = all_words[ctx_start:ctx_end]
|
||||||
@ -198,6 +199,40 @@ def find_best_excerpt_with_context(
|
|||||||
return expanded_results
|
return expanded_results
|
||||||
|
|
||||||
|
|
||||||
|
def find_best_excerpt_with_context(
    text: str,
    target_words: Sequence[str],
    excerpt_length: int,
    options: ExcerptSearchOptions | None = None,
) -> list[ExcerptResult]:
    """Search for the best excerpt(s) and optionally widen them with context.

    Args:
        text: The input text to search.
        target_words: Words to search for in the excerpt.
        excerpt_length: Length of the excerpt in words.
        options: Search options (case_sensitive, top_n, context_words).
            Defaults are used when omitted.

    Returns:
        List of ExcerptResult with context included in the excerpt.
    """
    settings = options or ExcerptSearchOptions()

    found = find_best_excerpt(
        text,
        target_words,
        excerpt_length,
        case_sensitive=settings.case_sensitive,
        top_n=settings.top_n,
    )

    # Only expand when a positive amount of context was requested.
    if settings.context_words > 0:
        return _expand_results_with_context(
            text,
            found,
            settings.context_words,
            case_sensitive=settings.case_sensitive,
        )
    return found
|
||||||
|
|
||||||
|
|
||||||
def format_excerpt_results(
|
def format_excerpt_results(
|
||||||
results: list[ExcerptResult],
|
results: list[ExcerptResult],
|
||||||
target_words: Sequence[str],
|
target_words: Sequence[str],
|
||||||
@ -224,7 +259,8 @@ def format_excerpt_results(
|
|||||||
lines.append(f'Excerpt: "{result.excerpt}"')
|
lines.append(f'Excerpt: "{result.excerpt}"')
|
||||||
lines.append(f"Word position: {result.start_index} - {result.end_index - 1}")
|
lines.append(f"Word position: {result.start_index} - {result.end_index - 1}")
|
||||||
lines.append(
|
lines.append(
|
||||||
f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)"
|
f"Matches: {result.match_count}/{len(result.words)}"
|
||||||
|
f" ({result.match_percentage:.2f}%)"
|
||||||
)
|
)
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
@ -316,10 +352,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
# Get input text
|
# Get input text
|
||||||
if args.text:
|
text = args.text or read_file(args.file)
|
||||||
text = args.text
|
|
||||||
else:
|
|
||||||
text = read_file(args.file)
|
|
||||||
|
|
||||||
# Get target words
|
# Get target words
|
||||||
if args.words:
|
if args.words:
|
||||||
@ -329,7 +362,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
target_words = [w.strip() for w in words_content.splitlines() if w.strip()]
|
target_words = [w.strip() for w in words_content.splitlines() if w.strip()]
|
||||||
|
|
||||||
if not target_words:
|
if not target_words:
|
||||||
print("Error: No target words provided", file=sys.stderr)
|
logger.error("No target words provided")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Find excerpts
|
# Find excerpts
|
||||||
@ -337,9 +370,11 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
text,
|
text,
|
||||||
target_words,
|
target_words,
|
||||||
args.length,
|
args.length,
|
||||||
case_sensitive=args.case_sensitive,
|
ExcerptSearchOptions(
|
||||||
top_n=args.top,
|
case_sensitive=args.case_sensitive,
|
||||||
context_words=args.context,
|
top_n=args.top,
|
||||||
|
context_words=args.context,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Format and print results
|
# Format and print results
|
||||||
@ -347,15 +382,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
Path(args.output).write_text(output, encoding="utf-8")
|
Path(args.output).write_text(output, encoding="utf-8")
|
||||||
print(f"Output written to {args.output}")
|
logger.info("Output written to %s", args.output)
|
||||||
else:
|
else:
|
||||||
print(output)
|
logger.info("%s", output)
|
||||||
|
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError:
|
||||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
logger.exception("File not found")
|
||||||
return 1
|
return 1
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError:
|
||||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
logger.exception("Could not decode file as UTF-8")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@ -1,7 +1,8 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Learning pipe - combines word frequency analysis with excerpt finding for language learning.
|
r"""Learning pipe - combines word frequency analysis with excerpt finding.
|
||||||
|
|
||||||
|
Helps language learners by:
|
||||||
|
|
||||||
This script helps language learners by:
|
|
||||||
1. Analyzing a text to find the most common words
|
1. Analyzing a text to find the most common words
|
||||||
2. Finding excerpts where those common words are most prevalent
|
2. Finding excerpts where those common words are most prevalent
|
||||||
3. Creating a progressive learning experience in batches
|
3. Creating a progressive learning experience in batches
|
||||||
@ -11,26 +12,35 @@ The idea is to:
|
|||||||
- Then read excerpts that are dense with those words
|
- Then read excerpts that are dense with those words
|
||||||
- Progressively learn more words and more complex excerpts
|
- Progressively learn more words and more complex excerpts
|
||||||
|
|
||||||
Usage:
|
Usage::
|
||||||
# Basic usage - get top 20 words and find excerpts with them
|
|
||||||
python -m python_pkg.word_frequency.learning_pipe --file text.txt
|
# Basic usage
|
||||||
|
python -m python_pkg.word_frequency.learning_pipe \\
|
||||||
|
--file text.txt
|
||||||
|
|
||||||
# Custom batch size and excerpt length
|
# Custom batch size and excerpt length
|
||||||
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batch-size 30 --excerpt-length 50
|
python -m python_pkg.word_frequency.learning_pipe \\
|
||||||
|
--file text.txt --batch-size 30 --excerpt-length 50
|
||||||
|
|
||||||
# Multiple batches for progressive learning
|
# Multiple batches for progressive learning
|
||||||
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batches 5 --batch-size 20
|
python -m python_pkg.word_frequency.learning_pipe \\
|
||||||
|
--file text.txt --batches 5 --batch-size 20
|
||||||
|
|
||||||
# Output to file
|
# Output to file
|
||||||
python -m python_pkg.word_frequency.learning_pipe --file text.txt --output lesson.txt
|
python -m python_pkg.word_frequency.learning_pipe \\
|
||||||
|
--file text.txt --output lesson.txt
|
||||||
|
|
||||||
# Skip common words (like "the", "a", "is") using a stopwords file
|
# Skip common words using a stopwords file
|
||||||
python -m python_pkg.word_frequency.learning_pipe --file text.txt --stopwords stopwords.txt
|
python -m python_pkg.word_frequency.learning_pipe \\
|
||||||
|
--file text.txt --stopwords stopwords.txt
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from dataclasses import replace as _replace_dc
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
@ -53,6 +63,8 @@ except ModuleNotFoundError:
|
|||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
# Common stopwords for various languages (can be overridden with --stopwords)
|
# Common stopwords for various languages (can be overridden with --stopwords)
|
||||||
DEFAULT_STOPWORDS_EN = frozenset(
|
DEFAULT_STOPWORDS_EN = frozenset(
|
||||||
@ -181,57 +193,210 @@ def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LessonConfig:
    """Configuration for learning lesson generation.

    Immutable bundle of the settings read by ``generate_learning_lesson``
    and its helpers.
    """

    # Number of vocabulary words per learning batch.
    batch_size: int = 20
    # Maximum number of batches to generate.
    num_batches: int = 1
    # Length, in words, of each practice excerpt.
    excerpt_length: int = 30
    # Number of excerpts requested per batch (passed as ``top_n``).
    excerpts_per_batch: int = 3
    # Extra stopwords; combined with the defaults unless those are skipped.
    stopwords: frozenset[str] | None = None
    # When True, only ``stopwords`` is used and the default English set is
    # left out.
    skip_default_stopwords: bool = False
    # When True, words consisting only of digits are filtered out.
    skip_numbers: bool = True
    # Forwarded as ``case_sensitive`` to text analysis and excerpt search.
    case_sensitive: bool = False
    # Source language code; "auto" (or unset while ``translate_to`` is set)
    # triggers language detection.
    translate_from: str | None = None
    # Target language code; "en" is substituted when unset.
    translate_to: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
    """Return the effective stopword set for ``config``.

    Custom stopwords are always included; the default English stopwords
    are added unless ``config.skip_default_stopwords`` is set.
    """
    custom = config.stopwords or frozenset()
    if config.skip_default_stopwords:
        return custom
    return DEFAULT_STOPWORDS_EN | custom
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_translation_language(
    text: str,
    config: LessonConfig,
    lines: list[str],
) -> tuple[str | None, str | None]:
    """Work out the (source, target) language pair used for translation.

    The target falls back to ``"en"``. The source is auto-detected from
    ``text`` when it is ``"auto"`` or when only a target was configured;
    the detection outcome is appended to ``lines`` as a status message.
    """
    source_lang = config.translate_from
    target_lang = config.translate_to or "en"

    wants_detection = source_lang == "auto" or (
        config.translate_to and not config.translate_from
    )
    if not wants_detection:
        return source_lang, target_lang

    guess = detect_language(text)
    if guess:
        lines.append(f"Detected language: {guess}")
        return guess, target_lang

    # Detection failed: report it and signal "no source language".
    lines.append(
        "Warning: Could not detect language "
        "(install langdetect: pip install langdetect)"
    )
    return None, target_lang
|
||||||
|
|
||||||
|
|
||||||
|
def _format_word_list(
    batch_words: list[tuple[str, int]],
    start_idx: int,
    total_words: int,
    translations: dict[str, str],
) -> list[str]:
    """Render one numbered line per vocabulary word of a batch.

    Numbering starts at ``start_idx + 1``. Each line shows the word, its
    occurrence count and its share of ``total_words``; when any
    translations are available a translation column is added, with ``?``
    for words that have none.
    """
    rows: list[str] = []
    position = start_idx
    for word, count in batch_words:
        position += 1
        share = (count / total_words) * 100
        tail = f" ({count:,} occurrences, {share:.2f}%)"
        if translations:
            meaning = translations.get(word, "?")
            rows.append(f" {position:3}. {word:<20} -> {meaning:<20}" + tail)
        else:
            rows.append(f" {position:3}. {word:<20}" + tail)
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _LessonContext:
    """Shared context for batch generation.

    Groups the values every batch section needs so they can be passed to
    ``_generate_batch_section`` as a single argument.
    """

    # Full source text; searched for practice excerpts.
    text: str
    # Occurrence count per word; used for totals and coverage figures.
    word_counts: dict[str, int]
    # Lesson settings, read for batch size, excerpt and translation options.
    config: LessonConfig
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_batch_section(
    ctx: _LessonContext,
    batch_num: int,
    batch_words: list[tuple[str, int]],
    cumulative_words: list[str],
) -> list[str]:
    """Build the output lines for one learning batch.

    The section consists of a header, the vocabulary list (with
    translations when both languages are configured), the cumulative text
    coverage and the practice excerpts found for ``cumulative_words``.

    Args:
        ctx: Shared text, word counts and configuration.
        batch_num: Zero-based batch index.
        batch_words: ``(word, count)`` pairs introduced by this batch.
        cumulative_words: All words learned up to and including this batch.

    Returns:
        The lines of the batch section, in output order.
    """
    cfg = ctx.config
    counts = ctx.word_counts
    grand_total = sum(counts.values())
    first = batch_num * cfg.batch_size
    last = first + cfg.batch_size
    rule = "-" * 70

    section: list[str] = [
        rule,
        f"BATCH {batch_num + 1}: Words {first + 1} - "
        f"{min(last, first + len(batch_words))}",
        rule,
        "",
    ]

    # Translate only when both languages are known.
    glossary: dict[str, str] = {}
    if cfg.translate_from is not None and cfg.translate_to is not None:
        outcomes = translate_words_batch(
            [word for word, _ in batch_words],
            cfg.translate_from,
            cfg.translate_to,
        )
        glossary = {
            outcome.source_word: outcome.translated_word
            for outcome in outcomes
            if outcome.success
        }

    section += ["VOCABULARY TO LEARN:", ""]
    section += _format_word_list(batch_words, first, grand_total, glossary)
    section.append("")

    # Share of the whole text covered by everything learned so far.
    known = sum(counts[word] for word in cumulative_words if word in counts)
    coverage = (known / grand_total) * 100
    section += [
        f"After learning these words, you'll recognize ~{coverage:.1f}% of the text",
        "",
        "PRACTICE EXCERPTS:",
        "(Excerpts where your learned vocabulary is most concentrated)",
        "",
    ]

    best = find_best_excerpt(
        ctx.text,
        cumulative_words,
        cfg.excerpt_length,
        case_sensitive=cfg.case_sensitive,
        top_n=cfg.excerpts_per_batch,
    )
    for number, hit in enumerate(best, 1):
        section += [
            f" Excerpt {number} ({hit.match_percentage:.1f}% known words):",
            f' "{hit.excerpt}"',
            "",
        ]

    return section
|
||||||
|
|
||||||
|
|
||||||
def generate_learning_lesson(
|
def generate_learning_lesson(
|
||||||
text: str,
|
text: str,
|
||||||
*,
|
config: LessonConfig | None = None,
|
||||||
batch_size: int = 20,
|
|
||||||
num_batches: int = 1,
|
|
||||||
excerpt_length: int = 30,
|
|
||||||
excerpts_per_batch: int = 3,
|
|
||||||
stopwords: frozenset[str] | None = None,
|
|
||||||
skip_default_stopwords: bool = False,
|
|
||||||
skip_numbers: bool = True,
|
|
||||||
case_sensitive: bool = False,
|
|
||||||
context_words: int = 5,
|
|
||||||
translate_from: str | None = None,
|
|
||||||
translate_to: str | None = None,
|
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Generate a learning lesson from text.
|
"""Generate a learning lesson from text.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
text: The source text to analyze.
|
text: The source text to analyze.
|
||||||
batch_size: Number of words per learning batch.
|
config: Lesson configuration. Uses defaults if None.
|
||||||
num_batches: Number of batches to generate.
|
|
||||||
excerpt_length: Length of each excerpt in words.
|
|
||||||
excerpts_per_batch: Number of excerpts to find per batch.
|
|
||||||
stopwords: Custom stopwords to skip (in addition to defaults).
|
|
||||||
skip_default_stopwords: If True, don't filter out default English stopwords.
|
|
||||||
skip_numbers: If True, filter out numeric words (default: True).
|
|
||||||
case_sensitive: If True, treat words case-sensitively.
|
|
||||||
context_words: Words of context to include around excerpts.
|
|
||||||
translate_from: Source language code for translation (e.g., 'la', 'pl').
|
|
||||||
translate_to: Target language code for translation (e.g., 'en').
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Formatted learning lesson as a string.
|
Formatted learning lesson as a string.
|
||||||
"""
|
"""
|
||||||
# Combine stopwords
|
if config is None:
|
||||||
all_stopwords: frozenset[str]
|
config = LessonConfig()
|
||||||
if skip_default_stopwords:
|
|
||||||
all_stopwords = stopwords or frozenset()
|
|
||||||
else:
|
|
||||||
all_stopwords = DEFAULT_STOPWORDS_EN | (stopwords or frozenset())
|
|
||||||
|
|
||||||
# Analyze text for word frequencies
|
all_stopwords = _resolve_stopwords(config)
|
||||||
word_counts = analyze_text(text, case_sensitive=case_sensitive)
|
word_counts = analyze_text(
|
||||||
|
text, case_sensitive=config.case_sensitive,
|
||||||
|
)
|
||||||
|
|
||||||
# Filter out stopwords and get sorted words
|
|
||||||
filtered_words = [
|
filtered_words = [
|
||||||
(word, count)
|
(word, count)
|
||||||
for word, count in word_counts.most_common()
|
for word, count in word_counts.most_common()
|
||||||
if word.lower() not in all_stopwords
|
if word.lower() not in all_stopwords
|
||||||
and len(word) > 1
|
and len(word) > 1
|
||||||
and not (skip_numbers and word.isdigit())
|
and not (config.skip_numbers and word.isdigit())
|
||||||
]
|
]
|
||||||
|
|
||||||
total_words = sum(word_counts.values())
|
total_words = sum(word_counts.values())
|
||||||
@ -241,125 +406,62 @@ def generate_learning_lesson(
|
|||||||
lines.append("LANGUAGE LEARNING LESSON")
|
lines.append("LANGUAGE LEARNING LESSON")
|
||||||
lines.append("=" * 70)
|
lines.append("=" * 70)
|
||||||
lines.append(
|
lines.append(
|
||||||
f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
|
f"Source text: {total_words:,} total words, "
|
||||||
|
f"{len(word_counts):,} unique words"
|
||||||
)
|
)
|
||||||
if all_stopwords:
|
if all_stopwords:
|
||||||
lines.append(
|
lines.append(
|
||||||
f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
|
f"After filtering {len(all_stopwords)} "
|
||||||
|
f"stopwords: {len(filtered_words):,} "
|
||||||
|
"vocabulary words"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
lines.append(f"Vocabulary words: {len(filtered_words):,}")
|
lines.append(
|
||||||
|
f"Vocabulary words: {len(filtered_words):,}",
|
||||||
|
)
|
||||||
|
|
||||||
# Handle translation setup
|
actual_from, actual_to = _detect_translation_language(
|
||||||
actual_translate_from = translate_from
|
text, config, lines,
|
||||||
actual_translate_to = translate_to or "en" # Default to English
|
)
|
||||||
|
do_translate = (
|
||||||
# Auto-detect language if translation is enabled but source not specified
|
actual_from is not None and actual_to is not None
|
||||||
if translate_from == "auto" or (translate_to and not translate_from):
|
)
|
||||||
detected = detect_language(text)
|
|
||||||
if detected:
|
|
||||||
actual_translate_from = detected
|
|
||||||
lines.append(f"Detected language: {detected}")
|
|
||||||
# Note: langdetect doesn't support Latin (often detected as Italian)
|
|
||||||
# If detection seems wrong, use --translate-from to override
|
|
||||||
else:
|
|
||||||
lines.append(
|
|
||||||
"Warning: Could not detect language "
|
|
||||||
"(install langdetect: pip install langdetect)"
|
|
||||||
)
|
|
||||||
actual_translate_from = None
|
|
||||||
|
|
||||||
do_translate = actual_translate_from is not None and actual_translate_to is not None
|
|
||||||
if do_translate:
|
if do_translate:
|
||||||
lines.append(f"Translation: {actual_translate_from} -> {actual_translate_to}")
|
lines.append(
|
||||||
|
f"Translation: {actual_from} -> {actual_to}",
|
||||||
|
)
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
# Generate batches
|
# Create resolved config with detected translation
|
||||||
|
resolved_config = _replace_dc(
|
||||||
|
config,
|
||||||
|
translate_from=actual_from,
|
||||||
|
translate_to=actual_to,
|
||||||
|
)
|
||||||
|
ctx = _LessonContext(
|
||||||
|
text=text,
|
||||||
|
word_counts=word_counts,
|
||||||
|
config=resolved_config,
|
||||||
|
)
|
||||||
|
|
||||||
cumulative_words: list[str] = []
|
cumulative_words: list[str] = []
|
||||||
|
for batch_num in range(config.num_batches):
|
||||||
for batch_num in range(num_batches):
|
start_idx = batch_num * config.batch_size
|
||||||
start_idx = batch_num * batch_size
|
end_idx = start_idx + config.batch_size
|
||||||
end_idx = start_idx + batch_size
|
|
||||||
|
|
||||||
if start_idx >= len(filtered_words):
|
if start_idx >= len(filtered_words):
|
||||||
break
|
break
|
||||||
|
|
||||||
batch_words = filtered_words[start_idx:end_idx]
|
batch_words = filtered_words[start_idx:end_idx]
|
||||||
cumulative_words.extend(word for word, _ in batch_words)
|
cumulative_words.extend(word for word, _ in batch_words)
|
||||||
|
|
||||||
lines.append("-" * 70)
|
lines.extend(
|
||||||
lines.append(
|
_generate_batch_section(
|
||||||
f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
|
ctx,
|
||||||
)
|
batch_num,
|
||||||
lines.append("-" * 70)
|
batch_words,
|
||||||
lines.append("")
|
cumulative_words,
|
||||||
|
|
||||||
# Get translations if requested
|
|
||||||
translations: dict[str, str] = {}
|
|
||||||
if do_translate:
|
|
||||||
words_to_translate = [word for word, _ in batch_words]
|
|
||||||
translation_results = translate_words_batch(
|
|
||||||
words_to_translate,
|
|
||||||
actual_translate_from, # type: ignore[arg-type]
|
|
||||||
actual_translate_to, # type: ignore[arg-type]
|
|
||||||
)
|
)
|
||||||
translations = {
|
|
||||||
r.source_word: r.translated_word
|
|
||||||
for r in translation_results
|
|
||||||
if r.success
|
|
||||||
}
|
|
||||||
|
|
||||||
# Word list with frequencies
|
|
||||||
lines.append("VOCABULARY TO LEARN:")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
if do_translate and translations:
|
|
||||||
# Include translations in output
|
|
||||||
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
|
||||||
percentage = (count / total_words) * 100
|
|
||||||
trans = translations.get(word, "?")
|
|
||||||
lines.append(
|
|
||||||
f" {i:3}. {word:<20} -> {trans:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
|
||||||
percentage = (count / total_words) * 100
|
|
||||||
lines.append(
|
|
||||||
f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
|
||||||
)
|
|
||||||
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
# Calculate cumulative coverage
|
|
||||||
cumulative_count = sum(
|
|
||||||
word_counts[word] for word in cumulative_words if word in word_counts
|
|
||||||
)
|
)
|
||||||
coverage = (cumulative_count / total_words) * 100
|
|
||||||
lines.append(
|
|
||||||
f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
|
|
||||||
)
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
# Find excerpts using cumulative words
|
|
||||||
lines.append("PRACTICE EXCERPTS:")
|
|
||||||
lines.append("(Excerpts where your learned vocabulary is most concentrated)")
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
excerpts = find_best_excerpt(
|
|
||||||
text,
|
|
||||||
cumulative_words,
|
|
||||||
excerpt_length,
|
|
||||||
case_sensitive=case_sensitive,
|
|
||||||
top_n=excerpts_per_batch,
|
|
||||||
)
|
|
||||||
|
|
||||||
for j, excerpt in enumerate(excerpts, 1):
|
|
||||||
lines.append(
|
|
||||||
f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
|
|
||||||
)
|
|
||||||
lines.append(f' "{excerpt.excerpt}"')
|
|
||||||
lines.append("")
|
|
||||||
|
|
||||||
# Summary
|
# Summary
|
||||||
lines.append("=" * 70)
|
lines.append("=" * 70)
|
||||||
@ -368,14 +470,25 @@ def generate_learning_lesson(
|
|||||||
|
|
||||||
if cumulative_words:
|
if cumulative_words:
|
||||||
final_coverage = sum(
|
final_coverage = sum(
|
||||||
word_counts[word] for word in cumulative_words if word in word_counts
|
word_counts[w]
|
||||||
|
for w in cumulative_words
|
||||||
|
if w in word_counts
|
||||||
)
|
)
|
||||||
final_percentage = (final_coverage / total_words) * 100
|
final_pct = (final_coverage / total_words) * 100
|
||||||
lines.append(f"Total vocabulary words learned: {len(cumulative_words)}")
|
lines.append(
|
||||||
lines.append(f"Text coverage: {final_percentage:.1f}%")
|
"Total vocabulary words learned: "
|
||||||
|
f"{len(cumulative_words)}"
|
||||||
|
)
|
||||||
|
lines.append(f"Text coverage: {final_pct:.1f}%")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
lines.append("TIP: Focus on understanding the excerpts first, then read")
|
lines.append(
|
||||||
lines.append("more of the original text as your vocabulary grows!")
|
"TIP: Focus on understanding the excerpts "
|
||||||
|
"first, then read"
|
||||||
|
)
|
||||||
|
lines.append(
|
||||||
|
"more of the original text as your "
|
||||||
|
"vocabulary grows!"
|
||||||
|
)
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
@ -475,7 +588,10 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
"--translate-from",
|
"--translate-from",
|
||||||
type=str,
|
type=str,
|
||||||
metavar="LANG",
|
metavar="LANG",
|
||||||
help="Source language code (e.g., 'la', 'pl', 'de'). If omitted, auto-detected.",
|
help=(
|
||||||
|
"Source language code (e.g., 'la', 'pl'). "
|
||||||
|
"If omitted, auto-detected."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--translate-to",
|
"--translate-to",
|
||||||
@ -496,27 +612,22 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get input text
|
text = args.text or read_file(args.file)
|
||||||
if args.text:
|
|
||||||
text = args.text
|
|
||||||
else:
|
|
||||||
text = read_file(args.file)
|
|
||||||
|
|
||||||
# Load custom stopwords if provided
|
# Load custom stopwords if provided
|
||||||
custom_stopwords = load_stopwords(args.stopwords)
|
custom_stopwords = load_stopwords(args.stopwords)
|
||||||
|
|
||||||
# Determine translation settings
|
# Determine translation settings
|
||||||
# Translation enabled by default, --no-translate disables it
|
|
||||||
translate_from: str | None = None
|
translate_from: str | None = None
|
||||||
translate_to: str | None = None
|
translate_to: str | None = None
|
||||||
|
|
||||||
if not args.no_translate:
|
if not args.no_translate:
|
||||||
translate_from = args.translate_from or "auto" # "auto" triggers detection
|
translate_from = (
|
||||||
|
args.translate_from or "auto"
|
||||||
|
)
|
||||||
translate_to = args.translate_to
|
translate_to = args.translate_to
|
||||||
|
|
||||||
# Generate lesson
|
config = LessonConfig(
|
||||||
lesson = generate_learning_lesson(
|
|
||||||
text,
|
|
||||||
batch_size=args.batch_size,
|
batch_size=args.batch_size,
|
||||||
num_batches=args.batches,
|
num_batches=args.batches,
|
||||||
excerpt_length=args.excerpt_length,
|
excerpt_length=args.excerpt_length,
|
||||||
@ -528,19 +639,26 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
translate_from=translate_from,
|
translate_from=translate_from,
|
||||||
translate_to=translate_to,
|
translate_to=translate_to,
|
||||||
)
|
)
|
||||||
|
lesson = generate_learning_lesson(text, config)
|
||||||
|
|
||||||
# Output
|
# Output
|
||||||
if args.output:
|
if args.output:
|
||||||
Path(args.output).write_text(lesson, encoding="utf-8")
|
Path(args.output).write_text(
|
||||||
print(f"Lesson written to {args.output}")
|
lesson, encoding="utf-8",
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"Lesson written to %s", args.output,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(lesson)
|
logger.info(lesson)
|
||||||
|
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError:
|
||||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
logger.exception("Error: File not found")
|
||||||
return 1
|
return 1
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError:
|
||||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
logger.exception(
|
||||||
|
"Error: Could not decode file as UTF-8",
|
||||||
|
)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@ -3,8 +3,11 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from pathlib import Path
|
|
||||||
import time
|
import time
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -251,12 +254,13 @@ class TestMain:
|
|||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "Unique words: 3" in captured.out
|
assert "Unique words: 3" in captured.out
|
||||||
|
|
||||||
def test_file_not_found_error(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_file_not_found_error(
|
||||||
|
self, caplog: pytest.LogCaptureFixture
|
||||||
|
) -> None:
|
||||||
"""Test error handling for missing file."""
|
"""Test error handling for missing file."""
|
||||||
exit_code = main(["--file", "/nonexistent/file.txt"])
|
exit_code = main(["--file", "/nonexistent/file.txt"])
|
||||||
captured = capsys.readouterr()
|
|
||||||
assert exit_code == 1
|
assert exit_code == 1
|
||||||
assert "Error" in captured.err
|
assert "File not found" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
class TestPerformance:
|
class TestPerformance:
|
||||||
@ -283,7 +287,7 @@ class TestPerformance:
|
|||||||
assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"
|
assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"
|
||||||
assert "word0" in result # Most common word should be present
|
assert "word0" in result # Most common word should be present
|
||||||
|
|
||||||
def test_bible_sized_text_performance(self, tmp_path: Path) -> None:
|
def test_bible_sized_text_performance(self) -> None:
|
||||||
"""Test with Bible-sized text (~800k words)."""
|
"""Test with Bible-sized text (~800k words)."""
|
||||||
# Generate text similar in size to the Bible
|
# Generate text similar in size to the Bible
|
||||||
base_words = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]
|
base_words = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import pytest
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
from python_pkg.word_frequency.anki_generator import (
|
from python_pkg.word_frequency.anki_generator import (
|
||||||
|
DeckInput,
|
||||||
find_word_contexts,
|
find_word_contexts,
|
||||||
generate_anki_deck,
|
generate_anki_deck,
|
||||||
main,
|
main,
|
||||||
@ -20,6 +21,7 @@ except ImportError:
|
|||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
||||||
from python_pkg.word_frequency.anki_generator import (
|
from python_pkg.word_frequency.anki_generator import (
|
||||||
|
DeckInput,
|
||||||
find_word_contexts,
|
find_word_contexts,
|
||||||
generate_anki_deck,
|
generate_anki_deck,
|
||||||
main,
|
main,
|
||||||
@ -77,7 +79,7 @@ class TestParseVocabularyCurveOutput:
|
|||||||
|
|
||||||
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 1."""
|
"""Test parsing output for length 1."""
|
||||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
|
||||||
sample_vocabulary_output, 1
|
sample_vocabulary_output, 1
|
||||||
)
|
)
|
||||||
assert excerpt == "the"
|
assert excerpt == "the"
|
||||||
@ -85,7 +87,7 @@ class TestParseVocabularyCurveOutput:
|
|||||||
|
|
||||||
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 2."""
|
"""Test parsing output for length 2."""
|
||||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
|
||||||
sample_vocabulary_output, 2
|
sample_vocabulary_output, 2
|
||||||
)
|
)
|
||||||
assert excerpt == "the dog"
|
assert excerpt == "the dog"
|
||||||
@ -93,7 +95,7 @@ class TestParseVocabularyCurveOutput:
|
|||||||
|
|
||||||
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 3."""
|
"""Test parsing output for length 3."""
|
||||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
|
||||||
sample_vocabulary_output, 3
|
sample_vocabulary_output, 3
|
||||||
)
|
)
|
||||||
assert excerpt == "the quick fox"
|
assert excerpt == "the quick fox"
|
||||||
@ -104,7 +106,7 @@ class TestParseVocabularyCurveOutput:
|
|||||||
|
|
||||||
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for non-existent length."""
|
"""Test parsing output for non-existent length."""
|
||||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
excerpt, excerpt_words, _all_vocab = parse_vocabulary_curve_output(
|
||||||
sample_vocabulary_output, 100
|
sample_vocabulary_output, 100
|
||||||
)
|
)
|
||||||
assert excerpt == ""
|
assert excerpt == ""
|
||||||
@ -121,7 +123,7 @@ hello;1
|
|||||||
world;2
|
world;2
|
||||||
VOCAB_DUMP_END
|
VOCAB_DUMP_END
|
||||||
"""
|
"""
|
||||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
|
_excerpt, _excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
|
||||||
assert all_vocab == [("hello", 1), ("world", 2)]
|
assert all_vocab == [("hello", 1), ("world", 2)]
|
||||||
|
|
||||||
|
|
||||||
@ -168,10 +170,12 @@ class TestGenerateAnkiDeck:
|
|||||||
MagicMock(success=True, source_word="hello", translated_word="hola")
|
MagicMock(success=True, source_word="hello", translated_word="hola")
|
||||||
]
|
]
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("hello", 1)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("hello", 1)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
deck_name="TestDeck",
|
target_lang="es",
|
||||||
|
deck_name="TestDeck",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "#separator:semicolon" in result
|
assert "#separator:semicolon" in result
|
||||||
@ -188,9 +192,11 @@ class TestGenerateAnkiDeck:
|
|||||||
MagicMock(success=True, source_word="world", translated_word="mundo"),
|
MagicMock(success=True, source_word="world", translated_word="mundo"),
|
||||||
]
|
]
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("hello", 1), ("world", 2)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("hello", 1), ("world", 2)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
|
target_lang="es",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check that words and translations are present
|
# Check that words and translations are present
|
||||||
@ -208,9 +214,11 @@ class TestGenerateAnkiDeck:
|
|||||||
MagicMock(success=True, source_word="test", translated_word="prueba")
|
MagicMock(success=True, source_word="test", translated_word="prueba")
|
||||||
]
|
]
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("test", 42)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("test", 42)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
|
target_lang="es",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "#42" in result
|
assert "#42" in result
|
||||||
@ -226,9 +234,11 @@ class TestGenerateAnkiDeck:
|
|||||||
)
|
)
|
||||||
]
|
]
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("test;word", 1)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("test;word", 1)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
|
target_lang="es",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Semicolons should be replaced with commas
|
# Semicolons should be replaced with commas
|
||||||
@ -244,10 +254,12 @@ class TestGenerateAnkiDeck:
|
|||||||
]
|
]
|
||||||
contexts = {"hello": "...say hello to..."}
|
contexts = {"hello": "...say hello to..."}
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("hello", 1)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("hello", 1)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
contexts=contexts,
|
target_lang="es",
|
||||||
|
contexts=contexts,
|
||||||
|
),
|
||||||
include_context=True,
|
include_context=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -257,9 +269,11 @@ class TestGenerateAnkiDeck:
|
|||||||
def test_no_translate_flag(self) -> None:
|
def test_no_translate_flag(self) -> None:
|
||||||
"""Test that no_translate skips translation."""
|
"""Test that no_translate skips translation."""
|
||||||
result = generate_anki_deck(
|
result = generate_anki_deck(
|
||||||
[("hello", 1), ("world", 2)],
|
DeckInput(
|
||||||
source_lang="en",
|
words_with_ranks=[("hello", 1), ("world", 2)],
|
||||||
target_lang="es",
|
source_lang="en",
|
||||||
|
target_lang="es",
|
||||||
|
),
|
||||||
no_translate=True,
|
no_translate=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -280,7 +294,7 @@ class TestMain:
|
|||||||
result = main(["--file", "nonexistent.txt", "--length", "10"])
|
result = main(["--file", "nonexistent.txt", "--length", "10"])
|
||||||
assert result == 1
|
assert result == 1
|
||||||
|
|
||||||
def test_help_flag(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_help_flag(self) -> None:
|
||||||
"""Test that --help works."""
|
"""Test that --help works."""
|
||||||
with pytest.raises(SystemExit) as exc_info:
|
with pytest.raises(SystemExit) as exc_info:
|
||||||
main(["--help"])
|
main(["--help"])
|
||||||
@ -309,7 +323,7 @@ class TestIntegration:
|
|||||||
) as mock_translate:
|
) as mock_translate:
|
||||||
# Mock translation to avoid network calls
|
# Mock translation to avoid network calls
|
||||||
def mock_translate_fn(
|
def mock_translate_fn(
|
||||||
words: list[str], from_lang: str, to_lang: str
|
words: list[str], _from_lang: str, _to_lang: str
|
||||||
) -> list[MagicMock]:
|
) -> list[MagicMock]:
|
||||||
return [
|
return [
|
||||||
MagicMock(success=True, source_word=w, translated_word=f"[{w}]")
|
MagicMock(success=True, source_word=w, translated_word=f"[{w}]")
|
||||||
@ -324,6 +338,8 @@ class TestIntegration:
|
|||||||
str(sample_text_file),
|
str(sample_text_file),
|
||||||
"--length",
|
"--length",
|
||||||
"5",
|
"5",
|
||||||
|
"--from",
|
||||||
|
"en",
|
||||||
"--output",
|
"--output",
|
||||||
str(output_file),
|
str(output_file),
|
||||||
"--quiet",
|
"--quiet",
|
||||||
@ -337,9 +353,11 @@ class TestIntegration:
|
|||||||
assert "#separator:semicolon" in content
|
assert "#separator:semicolon" in content
|
||||||
|
|
||||||
def test_cli_with_sample_file(
|
def test_cli_with_sample_file(
|
||||||
self, sample_text_file: Path, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, sample_text_file: Path, tmp_path: Path, caplog: pytest.LogCaptureFixture
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test CLI with actual file."""
|
"""Test CLI with actual file."""
|
||||||
|
import logging
|
||||||
|
|
||||||
from python_pkg.word_frequency.anki_generator import C_EXECUTABLE
|
from python_pkg.word_frequency.anki_generator import C_EXECUTABLE
|
||||||
|
|
||||||
if not C_EXECUTABLE.exists():
|
if not C_EXECUTABLE.exists():
|
||||||
@ -347,9 +365,12 @@ class TestIntegration:
|
|||||||
|
|
||||||
output_file = tmp_path / "anki_output.txt"
|
output_file = tmp_path / "anki_output.txt"
|
||||||
|
|
||||||
with patch(
|
with (
|
||||||
"python_pkg.word_frequency.anki_generator.translate_words_batch"
|
caplog.at_level(logging.INFO),
|
||||||
) as mock_translate:
|
patch(
|
||||||
|
"python_pkg.word_frequency.anki_generator.translate_words_batch"
|
||||||
|
) as mock_translate,
|
||||||
|
):
|
||||||
mock_translate.return_value = [
|
mock_translate.return_value = [
|
||||||
MagicMock(success=True, source_word="the", translated_word="le")
|
MagicMock(success=True, source_word="the", translated_word="le")
|
||||||
]
|
]
|
||||||
@ -360,14 +381,15 @@ class TestIntegration:
|
|||||||
str(sample_text_file),
|
str(sample_text_file),
|
||||||
"--length",
|
"--length",
|
||||||
"1",
|
"1",
|
||||||
|
"--from",
|
||||||
|
"en",
|
||||||
"--output",
|
"--output",
|
||||||
str(output_file),
|
str(output_file),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
captured = capsys.readouterr()
|
assert "FLASHCARD GENERATION COMPLETE" in caplog.text
|
||||||
assert "FLASHCARD GENERATION COMPLETE" in captured.out
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@ -2,13 +2,18 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from python_pkg.word_frequency.excerpt_finder import (
|
from python_pkg.word_frequency.excerpt_finder import (
|
||||||
ExcerptResult,
|
ExcerptResult,
|
||||||
|
ExcerptSearchOptions,
|
||||||
find_best_excerpt,
|
find_best_excerpt,
|
||||||
find_best_excerpt_with_context,
|
find_best_excerpt_with_context,
|
||||||
format_excerpt_results,
|
format_excerpt_results,
|
||||||
@ -146,7 +151,8 @@ class TestFindBestExcerptWithContext:
|
|||||||
"""Test with zero context (should behave like find_best_excerpt)."""
|
"""Test with zero context (should behave like find_best_excerpt)."""
|
||||||
text = "a b c d e f g"
|
text = "a b c d e f g"
|
||||||
result = find_best_excerpt_with_context(
|
result = find_best_excerpt_with_context(
|
||||||
text, ["c"], excerpt_length=1, context_words=0
|
text, ["c"], excerpt_length=1,
|
||||||
|
options=ExcerptSearchOptions(context_words=0),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert result[0].excerpt == "c"
|
assert result[0].excerpt == "c"
|
||||||
@ -155,7 +161,8 @@ class TestFindBestExcerptWithContext:
|
|||||||
"""Test with context words."""
|
"""Test with context words."""
|
||||||
text = "a b c d e f g"
|
text = "a b c d e f g"
|
||||||
result = find_best_excerpt_with_context(
|
result = find_best_excerpt_with_context(
|
||||||
text, ["d"], excerpt_length=1, context_words=2
|
text, ["d"], excerpt_length=1,
|
||||||
|
options=ExcerptSearchOptions(context_words=2),
|
||||||
)
|
)
|
||||||
|
|
||||||
# "d" at index 3, with context should include 2 words before and after
|
# "d" at index 3, with context should include 2 words before and after
|
||||||
@ -167,7 +174,8 @@ class TestFindBestExcerptWithContext:
|
|||||||
"""Test context doesn't go before start of text."""
|
"""Test context doesn't go before start of text."""
|
||||||
text = "a b c d e"
|
text = "a b c d e"
|
||||||
result = find_best_excerpt_with_context(
|
result = find_best_excerpt_with_context(
|
||||||
text, ["a"], excerpt_length=1, context_words=3
|
text, ["a"], excerpt_length=1,
|
||||||
|
options=ExcerptSearchOptions(context_words=3),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Can't go before "a", so just get words after
|
# Can't go before "a", so just get words after
|
||||||
@ -178,7 +186,8 @@ class TestFindBestExcerptWithContext:
|
|||||||
"""Test context doesn't go beyond end of text."""
|
"""Test context doesn't go beyond end of text."""
|
||||||
text = "a b c d e"
|
text = "a b c d e"
|
||||||
result = find_best_excerpt_with_context(
|
result = find_best_excerpt_with_context(
|
||||||
text, ["e"], excerpt_length=1, context_words=3
|
text, ["e"], excerpt_length=1,
|
||||||
|
options=ExcerptSearchOptions(context_words=3),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Can't go beyond "e"
|
# Can't go beyond "e"
|
||||||
@ -240,33 +249,33 @@ class TestFormatExcerptResults:
|
|||||||
class TestMain:
|
class TestMain:
|
||||||
"""Tests for main CLI function."""
|
"""Tests for main CLI function."""
|
||||||
|
|
||||||
def test_text_and_words_input(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_text_and_words_input(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test --text and --words options."""
|
"""Test --text and --words options."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
["--text", "hello world hello", "--words", "hello", "--length", "2"]
|
exit_code = main(
|
||||||
)
|
["--text", "hello world hello", "--words", "hello", "--length", "2"]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "hello" in captured.out
|
assert "hello" in caplog.text
|
||||||
|
|
||||||
def test_file_input(
|
def test_file_input(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, tmp_path: Path, caplog: pytest.LogCaptureFixture
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test --file input option."""
|
"""Test --file input option."""
|
||||||
test_file = tmp_path / "test.txt"
|
test_file = tmp_path / "test.txt"
|
||||||
test_file.write_text("hello world hello world", encoding="utf-8")
|
test_file.write_text("hello world hello world", encoding="utf-8")
|
||||||
|
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
["--file", str(test_file), "--words", "hello", "--length", "2"]
|
exit_code = main(
|
||||||
)
|
["--file", str(test_file), "--words", "hello", "--length", "2"]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "hello" in captured.out
|
assert "hello" in caplog.text
|
||||||
|
|
||||||
def test_words_file_input(
|
def test_words_file_input(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, tmp_path: Path, caplog: pytest.LogCaptureFixture
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test --words-file option."""
|
"""Test --words-file option."""
|
||||||
text_file = tmp_path / "text.txt"
|
text_file = tmp_path / "text.txt"
|
||||||
@ -274,91 +283,91 @@ class TestMain:
|
|||||||
text_file.write_text("hello world hello world", encoding="utf-8")
|
text_file.write_text("hello world hello world", encoding="utf-8")
|
||||||
words_file.write_text("hello\nworld\n", encoding="utf-8")
|
words_file.write_text("hello\nworld\n", encoding="utf-8")
|
||||||
|
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--file",
|
[
|
||||||
str(text_file),
|
"--file",
|
||||||
"--words-file",
|
str(text_file),
|
||||||
str(words_file),
|
"--words-file",
|
||||||
"--length",
|
str(words_file),
|
||||||
"2",
|
"--length",
|
||||||
]
|
"2",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "100.00%" in captured.out # Both words match
|
assert "100.00%" in caplog.text # Both words match
|
||||||
|
|
||||||
def test_top_option(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_top_option(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test --top option."""
|
"""Test --top option."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--text",
|
[
|
||||||
"a b c d e f",
|
"--text",
|
||||||
"--words",
|
"a b c d e f",
|
||||||
"a",
|
"--words",
|
||||||
"b",
|
"a",
|
||||||
"--length",
|
"b",
|
||||||
"2",
|
"--length",
|
||||||
"--top",
|
"2",
|
||||||
"3",
|
"--top",
|
||||||
]
|
"3",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
# Should show multiple results
|
# Should show multiple results
|
||||||
assert "Result #1" in captured.out
|
assert "Result #1" in caplog.text
|
||||||
|
|
||||||
def test_context_option(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_context_option(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test --context option."""
|
"""Test --context option."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--text",
|
[
|
||||||
"a b c d e f g",
|
"--text",
|
||||||
"--words",
|
"a b c d e f g",
|
||||||
"d",
|
"--words",
|
||||||
"--length",
|
"d",
|
||||||
"1",
|
"--length",
|
||||||
"--context",
|
"1",
|
||||||
"2",
|
"--context",
|
||||||
]
|
"2",
|
||||||
)
|
]
|
||||||
capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
# Excerpt should include context words
|
# Excerpt should include context words
|
||||||
|
|
||||||
def test_case_sensitive_option(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_case_sensitive_option(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test --case-sensitive option."""
|
"""Test --case-sensitive option."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--text",
|
[
|
||||||
"Hello HELLO hello",
|
"--text",
|
||||||
"--words",
|
"Hello HELLO hello",
|
||||||
"hello",
|
"--words",
|
||||||
"--length",
|
"hello",
|
||||||
"1",
|
"--length",
|
||||||
"--case-sensitive",
|
"1",
|
||||||
]
|
"--case-sensitive",
|
||||||
)
|
]
|
||||||
capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
# Only lowercase "hello" should match
|
# Only lowercase "hello" should match
|
||||||
|
|
||||||
def test_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test error handling for missing file."""
|
"""Test error handling for missing file."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.ERROR):
|
||||||
["--file", "/nonexistent/file.txt", "--words", "hello", "--length", "2"]
|
exit_code = main(
|
||||||
)
|
["--file", "/nonexistent/file.txt", "--words", "hello", "--length", "2"]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 1
|
assert exit_code == 1
|
||||||
assert "Error" in captured.err
|
assert "Error" in caplog.text
|
||||||
|
|
||||||
def test_empty_words_file(
|
def test_empty_words_file(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, tmp_path: Path, caplog: pytest.LogCaptureFixture
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test error when words file is empty."""
|
"""Test error when words file is empty."""
|
||||||
text_file = tmp_path / "text.txt"
|
text_file = tmp_path / "text.txt"
|
||||||
@ -366,20 +375,20 @@ class TestMain:
|
|||||||
text_file.write_text("hello world", encoding="utf-8")
|
text_file.write_text("hello world", encoding="utf-8")
|
||||||
words_file.write_text("", encoding="utf-8")
|
words_file.write_text("", encoding="utf-8")
|
||||||
|
|
||||||
exit_code = main(
|
with caplog.at_level(logging.ERROR):
|
||||||
[
|
exit_code = main(
|
||||||
"--file",
|
[
|
||||||
str(text_file),
|
"--file",
|
||||||
"--words-file",
|
str(text_file),
|
||||||
str(words_file),
|
"--words-file",
|
||||||
"--length",
|
str(words_file),
|
||||||
"2",
|
"--length",
|
||||||
]
|
"2",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 1
|
assert exit_code == 1
|
||||||
assert "No target words" in captured.err
|
assert "No target words" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
class TestPerformance:
|
class TestPerformance:
|
||||||
|
|||||||
@ -2,16 +2,20 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from pathlib import Path
|
import logging
|
||||||
import time
|
import time
|
||||||
from typing import TYPE_CHECKING
|
from typing import TYPE_CHECKING
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||||
from python_pkg.word_frequency.learning_pipe import (
|
from python_pkg.word_frequency.learning_pipe import (
|
||||||
DEFAULT_STOPWORDS_EN,
|
DEFAULT_STOPWORDS_EN,
|
||||||
|
LessonConfig,
|
||||||
generate_learning_lesson,
|
generate_learning_lesson,
|
||||||
load_stopwords,
|
load_stopwords,
|
||||||
main,
|
main,
|
||||||
@ -23,7 +27,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_translation() -> Generator[MagicMock, None, None]:
|
def _mock_translation() -> Generator[MagicMock, None, None]:
|
||||||
"""Mock translation to avoid requiring argostranslate."""
|
"""Mock translation to avoid requiring argostranslate."""
|
||||||
|
|
||||||
def fake_batch_translate(
|
def fake_batch_translate(
|
||||||
@ -31,7 +35,7 @@ def mock_translation() -> Generator[MagicMock, None, None]:
|
|||||||
from_lang: str,
|
from_lang: str,
|
||||||
to_lang: str,
|
to_lang: str,
|
||||||
*,
|
*,
|
||||||
use_cache: bool = True,
|
_use_cache: bool = True,
|
||||||
) -> list[TranslationResult]:
|
) -> list[TranslationResult]:
|
||||||
"""Fake batch translation that returns word with prefix."""
|
"""Fake batch translation that returns word with prefix."""
|
||||||
return [
|
return [
|
||||||
@ -95,7 +99,7 @@ class TestGenerateLearningLesson:
|
|||||||
"""Test basic lesson generation."""
|
"""Test basic lesson generation."""
|
||||||
text = "hello world hello hello world test test test test"
|
text = "hello world hello hello world test test test test"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text, batch_size=3, num_batches=1, skip_default_stopwords=True
|
text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "LANGUAGE LEARNING LESSON" in result
|
assert "LANGUAGE LEARNING LESSON" in result
|
||||||
@ -106,7 +110,7 @@ class TestGenerateLearningLesson:
|
|||||||
"""Test generation with multiple batches."""
|
"""Test generation with multiple batches."""
|
||||||
text = " ".join(f"word{i}" * (100 - i) for i in range(20))
|
text = " ".join(f"word{i}" * (100 - i) for i in range(20))
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text, batch_size=5, num_batches=3, skip_default_stopwords=True
|
text, LessonConfig(batch_size=5, num_batches=3, skip_default_stopwords=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "BATCH 1" in result
|
assert "BATCH 1" in result
|
||||||
@ -116,7 +120,9 @@ class TestGenerateLearningLesson:
|
|||||||
def test_stopwords_filtering(self) -> None:
|
def test_stopwords_filtering(self) -> None:
|
||||||
"""Test that default stopwords are filtered."""
|
"""Test that default stopwords are filtered."""
|
||||||
text = "the the the hello world"
|
text = "the the the hello world"
|
||||||
result = generate_learning_lesson(text, batch_size=5, num_batches=1)
|
result = generate_learning_lesson(
|
||||||
|
text, LessonConfig(batch_size=5, num_batches=1)
|
||||||
|
)
|
||||||
|
|
||||||
# "the" should be filtered, "hello" and "world" should appear
|
# "the" should be filtered, "hello" and "world" should appear
|
||||||
lines = result.split("\n")
|
lines = result.split("\n")
|
||||||
@ -139,7 +145,7 @@ class TestGenerateLearningLesson:
|
|||||||
"""Test disabling default stopword filtering."""
|
"""Test disabling default stopword filtering."""
|
||||||
text = "the the the hello"
|
text = "the the the hello"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text, batch_size=5, num_batches=1, skip_default_stopwords=True
|
text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "the" in result.lower()
|
assert "the" in result.lower()
|
||||||
@ -148,7 +154,7 @@ class TestGenerateLearningLesson:
|
|||||||
"""Test that numbers are filtered by default."""
|
"""Test that numbers are filtered by default."""
|
||||||
text = "123 123 123 hello world"
|
text = "123 123 123 hello world"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text, batch_size=5, num_batches=1, skip_default_stopwords=True
|
text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Check vocabulary section doesn't include "123"
|
# Check vocabulary section doesn't include "123"
|
||||||
@ -162,10 +168,12 @@ class TestGenerateLearningLesson:
|
|||||||
text = "123 123 123 hello"
|
text = "123 123 123 hello"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=5,
|
LessonConfig(
|
||||||
num_batches=1,
|
batch_size=5,
|
||||||
skip_default_stopwords=True,
|
num_batches=1,
|
||||||
skip_numbers=False,
|
skip_default_stopwords=True,
|
||||||
|
skip_numbers=False,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "123" in result
|
assert "123" in result
|
||||||
@ -174,7 +182,7 @@ class TestGenerateLearningLesson:
|
|||||||
"""Test that coverage percentage is calculated."""
|
"""Test that coverage percentage is calculated."""
|
||||||
text = "hello hello hello world world test"
|
text = "hello hello hello world world test"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text, batch_size=3, num_batches=1, skip_default_stopwords=True
|
text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "recognize" in result.lower()
|
assert "recognize" in result.lower()
|
||||||
@ -185,11 +193,13 @@ class TestGenerateLearningLesson:
|
|||||||
text = "hello world hello world hello world test test test"
|
text = "hello world hello world hello world test test test"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=2,
|
LessonConfig(
|
||||||
num_batches=1,
|
batch_size=2,
|
||||||
excerpt_length=3,
|
num_batches=1,
|
||||||
excerpts_per_batch=2,
|
excerpt_length=3,
|
||||||
skip_default_stopwords=True,
|
excerpts_per_batch=2,
|
||||||
|
skip_default_stopwords=True,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "PRACTICE EXCERPTS" in result
|
assert "PRACTICE EXCERPTS" in result
|
||||||
@ -200,45 +210,45 @@ class TestMain:
|
|||||||
"""Tests for main CLI function."""
|
"""Tests for main CLI function."""
|
||||||
|
|
||||||
def test_basic_text_input(
|
def test_basic_text_input(
|
||||||
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
self, caplog: pytest.LogCaptureFixture, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test with text input."""
|
"""Test with text input."""
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--text",
|
[
|
||||||
"hello world hello world test test test",
|
"--text",
|
||||||
"--batch-size",
|
"hello world hello world test test test",
|
||||||
"3",
|
"--batch-size",
|
||||||
"--no-default-stopwords",
|
"3",
|
||||||
]
|
"--no-default-stopwords",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "LANGUAGE LEARNING LESSON" in captured.out
|
assert "LANGUAGE LEARNING LESSON" in caplog.text
|
||||||
|
|
||||||
def test_file_input(
|
def test_file_input(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
|
self, tmp_path: Path, caplog: pytest.LogCaptureFixture, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test with file input."""
|
"""Test with file input."""
|
||||||
test_file = tmp_path / "test.txt"
|
test_file = tmp_path / "test.txt"
|
||||||
test_file.write_text("hello world hello world test", encoding="utf-8")
|
test_file.write_text("hello world hello world test", encoding="utf-8")
|
||||||
|
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--file",
|
[
|
||||||
str(test_file),
|
"--file",
|
||||||
"--batch-size",
|
str(test_file),
|
||||||
"3",
|
"--batch-size",
|
||||||
"--no-default-stopwords",
|
"3",
|
||||||
]
|
"--no-default-stopwords",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "hello" in captured.out.lower()
|
assert "hello" in caplog.text.lower()
|
||||||
|
|
||||||
def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
|
def test_output_to_file(self, tmp_path: Path, _mock_translation: None) -> None:
|
||||||
"""Test outputting to file."""
|
"""Test outputting to file."""
|
||||||
output_file = tmp_path / "lesson.txt"
|
output_file = tmp_path / "lesson.txt"
|
||||||
|
|
||||||
@ -258,7 +268,7 @@ class TestMain:
|
|||||||
assert "LANGUAGE LEARNING LESSON" in content
|
assert "LANGUAGE LEARNING LESSON" in content
|
||||||
|
|
||||||
def test_custom_stopwords(
|
def test_custom_stopwords(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
|
self, tmp_path: Path, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test with custom stopwords file."""
|
"""Test with custom stopwords file."""
|
||||||
stopwords_file = tmp_path / "stop.txt"
|
stopwords_file = tmp_path / "stop.txt"
|
||||||
@ -275,41 +285,40 @@ class TestMain:
|
|||||||
"5",
|
"5",
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
capsys.readouterr()
|
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
# "hello" should be filtered by custom stopwords
|
# "hello" should be filtered by custom stopwords
|
||||||
|
|
||||||
def test_multiple_batches_option(
|
def test_multiple_batches_option(
|
||||||
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
self, caplog: pytest.LogCaptureFixture, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test --batches option."""
|
"""Test --batches option."""
|
||||||
text = " ".join(f"word{i}" * (50 - i) for i in range(30))
|
text = " ".join(f"word{i}" * (50 - i) for i in range(30))
|
||||||
exit_code = main(
|
with caplog.at_level(logging.INFO):
|
||||||
[
|
exit_code = main(
|
||||||
"--text",
|
[
|
||||||
text,
|
"--text",
|
||||||
"--batch-size",
|
text,
|
||||||
"5",
|
"--batch-size",
|
||||||
"--batches",
|
"5",
|
||||||
"3",
|
"--batches",
|
||||||
"--no-default-stopwords",
|
"3",
|
||||||
]
|
"--no-default-stopwords",
|
||||||
)
|
]
|
||||||
captured = capsys.readouterr()
|
)
|
||||||
|
|
||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "BATCH 1" in captured.out
|
assert "BATCH 1" in caplog.text
|
||||||
assert "BATCH 2" in captured.out
|
assert "BATCH 2" in caplog.text
|
||||||
assert "BATCH 3" in captured.out
|
assert "BATCH 3" in caplog.text
|
||||||
|
|
||||||
def test_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
|
||||||
"""Test error handling for missing file."""
|
"""Test error handling for missing file."""
|
||||||
exit_code = main(["--file", "/nonexistent/file.txt"])
|
with caplog.at_level(logging.ERROR):
|
||||||
captured = capsys.readouterr()
|
exit_code = main(["--file", "/nonexistent/file.txt"])
|
||||||
|
|
||||||
assert exit_code == 1
|
assert exit_code == 1
|
||||||
assert "Error" in captured.err
|
assert "Error" in caplog.text
|
||||||
|
|
||||||
|
|
||||||
class TestPerformance:
|
class TestPerformance:
|
||||||
@ -324,10 +333,12 @@ class TestPerformance:
|
|||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
large_text,
|
large_text,
|
||||||
batch_size=50,
|
LessonConfig(
|
||||||
num_batches=5,
|
batch_size=50,
|
||||||
excerpt_length=30,
|
num_batches=5,
|
||||||
skip_default_stopwords=True,
|
excerpt_length=30,
|
||||||
|
skip_default_stopwords=True,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
elapsed = time.perf_counter() - start_time
|
elapsed = time.perf_counter() - start_time
|
||||||
|
|
||||||
@ -358,9 +369,11 @@ class TestTranslationIntegration:
|
|||||||
text = "hello world hello world hello"
|
text = "hello world hello world hello"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=5,
|
LessonConfig(
|
||||||
num_batches=1,
|
batch_size=5,
|
||||||
skip_default_stopwords=True,
|
num_batches=1,
|
||||||
|
skip_default_stopwords=True,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert "hello" in result
|
assert "hello" in result
|
||||||
@ -368,17 +381,19 @@ class TestTranslationIntegration:
|
|||||||
# Should not have translation arrows
|
# Should not have translation arrows
|
||||||
assert " -> " not in result or "Translation" not in result
|
assert " -> " not in result or "Translation" not in result
|
||||||
|
|
||||||
def test_lesson_with_translation_params(self, mock_translation: None) -> None:
|
def test_lesson_with_translation_params(self, _mock_translation: None) -> None:
|
||||||
"""Test that translation params are accepted."""
|
"""Test that translation params are accepted."""
|
||||||
text = "hello world hello world hello"
|
text = "hello world hello world hello"
|
||||||
# This should work with mocked translation
|
# This should work with mocked translation
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=5,
|
LessonConfig(
|
||||||
num_batches=1,
|
batch_size=5,
|
||||||
skip_default_stopwords=True,
|
num_batches=1,
|
||||||
translate_from="en",
|
skip_default_stopwords=True,
|
||||||
translate_to="es",
|
translate_from="en",
|
||||||
|
translate_to="es",
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# The lesson should still be generated
|
# The lesson should still be generated
|
||||||
@ -386,7 +401,7 @@ class TestTranslationIntegration:
|
|||||||
assert "hello" in result
|
assert "hello" in result
|
||||||
|
|
||||||
def test_main_with_translate_flags(
|
def test_main_with_translate_flags(
|
||||||
self, tmp_path: Path, mock_translation: None
|
self, tmp_path: Path, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test that main accepts translation flags."""
|
"""Test that main accepts translation flags."""
|
||||||
text_file = tmp_path / "test.txt"
|
text_file = tmp_path / "test.txt"
|
||||||
@ -408,36 +423,42 @@ class TestTranslationIntegration:
|
|||||||
assert result == 0
|
assert result == 0
|
||||||
|
|
||||||
def test_translate_to_defaults_to_english(
|
def test_translate_to_defaults_to_english(
|
||||||
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
self, _mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test that translate_to defaults to 'en' when using auto-detection."""
|
"""Test that translate_to defaults to 'en' when using auto-detection."""
|
||||||
text = "hello world"
|
text = "hello world"
|
||||||
# When using --translate flag (translate_from="auto"), translate_to defaults to "en"
|
# When using --translate flag (translate_from="auto"),
|
||||||
result = generate_learning_lesson(
|
# translate_to defaults to "en"
|
||||||
text,
|
with patch.object(
|
||||||
batch_size=5,
|
learning_pipe_module, "detect_language", return_value="es"
|
||||||
num_batches=1,
|
):
|
||||||
skip_default_stopwords=True,
|
result = generate_learning_lesson(
|
||||||
translate_from="auto", # Auto-detect source language
|
text,
|
||||||
translate_to=None, # Should default to English
|
LessonConfig(
|
||||||
)
|
batch_size=5,
|
||||||
|
num_batches=1,
|
||||||
|
skip_default_stopwords=True,
|
||||||
|
translate_from="auto", # Auto-detect source language
|
||||||
|
translate_to=None, # Should default to English
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
# Should have translation output with auto-detected source -> en
|
# Should have translation output with auto-detected source -> en
|
||||||
assert "Detected language:" in result
|
assert "Detected language:" in result
|
||||||
assert " -> en" in result
|
assert " -> en" in result
|
||||||
|
|
||||||
def test_no_translation_when_both_none(
|
def test_no_translation_when_both_none(self) -> None:
|
||||||
self, capsys: pytest.CaptureFixture[str]
|
"""Test no translation when both translate params are None."""
|
||||||
) -> None:
|
|
||||||
"""Test no translation happens when both translate_from and translate_to are None."""
|
|
||||||
text = "hello world"
|
text = "hello world"
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=5,
|
LessonConfig(
|
||||||
num_batches=1,
|
batch_size=5,
|
||||||
skip_default_stopwords=True,
|
num_batches=1,
|
||||||
translate_from=None,
|
skip_default_stopwords=True,
|
||||||
translate_to=None,
|
translate_from=None,
|
||||||
|
translate_to=None,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Should not have translation output
|
# Should not have translation output
|
||||||
|
|||||||
@ -61,19 +61,16 @@ class ArgosAvailableMock:
|
|||||||
self.mock_translate_module = MagicMock()
|
self.mock_translate_module = MagicMock()
|
||||||
self.mock_package_module = MagicMock()
|
self.mock_package_module = MagicMock()
|
||||||
self.mock_parent = MagicMock()
|
self.mock_parent = MagicMock()
|
||||||
self.original_available = translator._argos_available
|
|
||||||
self._sys_modules_patcher: MagicMock | None = None
|
self._sys_modules_patcher: MagicMock | None = None
|
||||||
self._ensure_patcher: MagicMock | None = None
|
self._ensure_patcher: MagicMock | None = None
|
||||||
self._lang_patcher: MagicMock | None = None
|
self._lang_patcher: MagicMock | None = None
|
||||||
|
self._check_argos_patcher: MagicMock | None = None
|
||||||
|
self._argos_module_patcher: MagicMock | None = None
|
||||||
|
|
||||||
def __enter__(self) -> MagicMock:
|
def __enter__(self) -> MagicMock:
|
||||||
"""Set up the mocks."""
|
"""Set up the mocks."""
|
||||||
translator._argos_available = True
|
|
||||||
|
|
||||||
# Set up translate return value
|
# Set up translate return value
|
||||||
if isinstance(self.translate_returns, Exception) or isinstance(
|
if isinstance(self.translate_returns, (Exception, list)):
|
||||||
self.translate_returns, list
|
|
||||||
):
|
|
||||||
self.mock_translate_fn.side_effect = self.translate_returns
|
self.mock_translate_fn.side_effect = self.translate_returns
|
||||||
elif self.translate_returns is not None:
|
elif self.translate_returns is not None:
|
||||||
self.mock_translate_fn.return_value = self.translate_returns
|
self.mock_translate_fn.return_value = self.translate_returns
|
||||||
@ -96,41 +93,52 @@ class ArgosAvailableMock:
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Patch the module-level argostranslate reference in translator
|
||||||
|
self._argos_module_patcher = patch.object(
|
||||||
|
translator, "argostranslate", self.mock_parent, create=True
|
||||||
|
)
|
||||||
|
|
||||||
# Patch _ensure_argos_installed and _ensure_language_pair to no-op
|
# Patch _ensure_argos_installed and _ensure_language_pair to no-op
|
||||||
self._ensure_patcher = patch.object(
|
self._ensure_patcher = patch.object(
|
||||||
translator, "_ensure_argos_installed", lambda: None
|
translator, "_ensure_argos_installed", lambda: None
|
||||||
)
|
)
|
||||||
self._lang_patcher = patch.object(
|
self._lang_patcher = patch.object(
|
||||||
translator, "_ensure_language_pair", lambda f, t: None
|
translator, "_ensure_language_pair", lambda _f, _t: None
|
||||||
|
)
|
||||||
|
self._check_argos_patcher = patch.object(
|
||||||
|
translator, "_check_argos", return_value=True
|
||||||
)
|
)
|
||||||
|
|
||||||
self._sys_modules_patcher.start() # type: ignore[union-attr]
|
self._sys_modules_patcher.start() # type: ignore[union-attr]
|
||||||
|
self._argos_module_patcher.start() # type: ignore[union-attr]
|
||||||
self._ensure_patcher.start() # type: ignore[union-attr]
|
self._ensure_patcher.start() # type: ignore[union-attr]
|
||||||
self._lang_patcher.start() # type: ignore[union-attr]
|
self._lang_patcher.start() # type: ignore[union-attr]
|
||||||
|
self._check_argos_patcher.start() # type: ignore[union-attr]
|
||||||
|
|
||||||
return self.mock_translate_fn
|
return self.mock_translate_fn
|
||||||
|
|
||||||
def __exit__(self, *args: object) -> None:
|
def __exit__(self, *args: object) -> None:
|
||||||
"""Restore original state."""
|
"""Restore original state."""
|
||||||
|
if self._check_argos_patcher:
|
||||||
|
self._check_argos_patcher.stop()
|
||||||
if self._lang_patcher:
|
if self._lang_patcher:
|
||||||
self._lang_patcher.stop()
|
self._lang_patcher.stop()
|
||||||
if self._ensure_patcher:
|
if self._ensure_patcher:
|
||||||
self._ensure_patcher.stop()
|
self._ensure_patcher.stop()
|
||||||
|
if self._argos_module_patcher:
|
||||||
|
self._argos_module_patcher.stop()
|
||||||
if self._sys_modules_patcher:
|
if self._sys_modules_patcher:
|
||||||
self._sys_modules_patcher.stop()
|
self._sys_modules_patcher.stop()
|
||||||
translator._argos_available = self.original_available
|
|
||||||
|
|
||||||
|
|
||||||
# Fixtures
|
# Fixtures
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_argos_unavailable() -> Generator[None, None, None]:
|
def _mock_argos_unavailable() -> Generator[None, None, None]:
|
||||||
"""Mock argostranslate being unavailable (for legacy tests)."""
|
"""Mock argostranslate being unavailable (for legacy tests)."""
|
||||||
original_value = translator._argos_available
|
with patch.object(translator, "_check_argos", return_value=False):
|
||||||
translator._argos_available = False
|
yield
|
||||||
yield
|
|
||||||
translator._argos_available = original_value
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -178,7 +186,7 @@ class TestTranslationResult:
|
|||||||
|
|
||||||
def test_result_is_tuple(self) -> None:
|
def test_result_is_tuple(self) -> None:
|
||||||
"""Test that TranslationResult is a namedtuple."""
|
"""Test that TranslationResult is a namedtuple."""
|
||||||
result = TranslationResult("a", "b", "en", "es", True)
|
result = TranslationResult("a", "b", "en", "es", success=True)
|
||||||
assert isinstance(result, tuple)
|
assert isinstance(result, tuple)
|
||||||
assert len(result) == 6
|
assert len(result) == 6
|
||||||
|
|
||||||
@ -192,13 +200,15 @@ class TestTranslateWord:
|
|||||||
def test_translate_word_argos_unavailable_raises(self) -> None:
|
def test_translate_word_argos_unavailable_raises(self) -> None:
|
||||||
"""Test that translation raises ImportError when argos is unavailable."""
|
"""Test that translation raises ImportError when argos is unavailable."""
|
||||||
# Mock _ensure_argos_installed to raise ImportError
|
# Mock _ensure_argos_installed to raise ImportError
|
||||||
with patch.object(
|
with (
|
||||||
translator,
|
patch.object(
|
||||||
"_ensure_argos_installed",
|
translator,
|
||||||
side_effect=ImportError("argostranslate not available"),
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
),
|
||||||
|
pytest.raises(ImportError, match="argostranslate not available"),
|
||||||
):
|
):
|
||||||
with pytest.raises(ImportError, match="argostranslate not available"):
|
translate_word("hello", "en", "es", use_cache=False)
|
||||||
translate_word("hello", "en", "es", use_cache=False)
|
|
||||||
|
|
||||||
def test_translate_word_success(self) -> None:
|
def test_translate_word_success(self) -> None:
|
||||||
"""Test successful word translation."""
|
"""Test successful word translation."""
|
||||||
@ -243,13 +253,15 @@ class TestTranslateWords:
|
|||||||
|
|
||||||
def test_translate_words_argos_unavailable_raises(self) -> None:
|
def test_translate_words_argos_unavailable_raises(self) -> None:
|
||||||
"""Test that translating words raises ImportError when argos unavailable."""
|
"""Test that translating words raises ImportError when argos unavailable."""
|
||||||
with patch.object(
|
with (
|
||||||
translator,
|
patch.object(
|
||||||
"_ensure_argos_installed",
|
translator,
|
||||||
side_effect=ImportError("argostranslate not available"),
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
),
|
||||||
|
pytest.raises(ImportError, match="argostranslate not available"),
|
||||||
):
|
):
|
||||||
with pytest.raises(ImportError, match="argostranslate not available"):
|
translate_words(["hello", "world"], "en", "es", use_cache=False)
|
||||||
translate_words(["hello", "world"], "en", "es", use_cache=False)
|
|
||||||
|
|
||||||
|
|
||||||
# translate_words_batch tests
|
# translate_words_batch tests
|
||||||
@ -290,7 +302,7 @@ class TestTranslateWordsBatch:
|
|||||||
assert results[4].translated_word == "cinco"
|
assert results[4].translated_word == "cinco"
|
||||||
|
|
||||||
def test_batch_fallback_on_mismatch(self) -> None:
|
def test_batch_fallback_on_mismatch(self) -> None:
|
||||||
"""Test batch translation falls back to individual when result count mismatches."""
|
"""Test batch falls back to individual on result count mismatch."""
|
||||||
words = ["one", "two", "three", "four"]
|
words = ["one", "two", "three", "four"]
|
||||||
# First call (batch) returns wrong count, subsequent calls are individual
|
# First call (batch) returns wrong count, subsequent calls are individual
|
||||||
with ArgosAvailableMock(["wrong", "uno", "dos", "tres", "cuatro"]) as mock:
|
with ArgosAvailableMock(["wrong", "uno", "dos", "tres", "cuatro"]) as mock:
|
||||||
@ -313,10 +325,11 @@ class TestTranslateWordsBatch:
|
|||||||
mock_parent.translate = mock_translate_module
|
mock_parent.translate = mock_translate_module
|
||||||
mock_parent.package = mock_package_module
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
original = translator._argos_available
|
|
||||||
translator._argos_available = True
|
|
||||||
|
|
||||||
with (
|
with (
|
||||||
|
patch.object(translator, "_check_argos", return_value=True),
|
||||||
|
patch.object(
|
||||||
|
translator, "argostranslate", mock_parent, create=True
|
||||||
|
),
|
||||||
patch.dict(
|
patch.dict(
|
||||||
"sys.modules",
|
"sys.modules",
|
||||||
{
|
{
|
||||||
@ -326,22 +339,22 @@ class TestTranslateWordsBatch:
|
|||||||
},
|
},
|
||||||
),
|
),
|
||||||
patch.object(translator, "_ensure_argos_installed", lambda: None),
|
patch.object(translator, "_ensure_argos_installed", lambda: None),
|
||||||
patch.object(translator, "_ensure_language_pair", lambda f, t: None),
|
patch.object(translator, "_ensure_language_pair", lambda _f, _t: None),
|
||||||
pytest.raises(RuntimeError, match="Translation failed"),
|
pytest.raises(RuntimeError, match="Translation failed"),
|
||||||
):
|
):
|
||||||
translate_words_batch(words, "en", "es", use_cache=False)
|
translate_words_batch(words, "en", "es", use_cache=False)
|
||||||
|
|
||||||
translator._argos_available = original
|
|
||||||
|
|
||||||
def test_batch_argos_unavailable_raises(self) -> None:
|
def test_batch_argos_unavailable_raises(self) -> None:
|
||||||
"""Test that batch translation raises ImportError when argos unavailable."""
|
"""Test that batch translation raises ImportError when argos unavailable."""
|
||||||
with patch.object(
|
with (
|
||||||
translator,
|
patch.object(
|
||||||
"_ensure_argos_installed",
|
translator,
|
||||||
side_effect=ImportError("argostranslate not available"),
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
),
|
||||||
|
pytest.raises(ImportError, match="argostranslate not available"),
|
||||||
):
|
):
|
||||||
with pytest.raises(ImportError, match="argostranslate not available"):
|
translate_words_batch(["hello", "world"], "en", "es", use_cache=False)
|
||||||
translate_words_batch(["hello", "world"], "en", "es", use_cache=False)
|
|
||||||
|
|
||||||
|
|
||||||
# format_translations tests
|
# format_translations tests
|
||||||
@ -358,7 +371,7 @@ class TestFormatTranslations:
|
|||||||
def test_format_single_translation(self) -> None:
|
def test_format_single_translation(self) -> None:
|
||||||
"""Test formatting single translation."""
|
"""Test formatting single translation."""
|
||||||
results = [
|
results = [
|
||||||
TranslationResult("hello", "hola", "en", "es", True),
|
TranslationResult("hello", "hola", "en", "es", success=True),
|
||||||
]
|
]
|
||||||
output = format_translations(results)
|
output = format_translations(results)
|
||||||
|
|
||||||
@ -369,8 +382,8 @@ class TestFormatTranslations:
|
|||||||
def test_format_multiple_translations(self) -> None:
|
def test_format_multiple_translations(self) -> None:
|
||||||
"""Test formatting multiple translations."""
|
"""Test formatting multiple translations."""
|
||||||
results = [
|
results = [
|
||||||
TranslationResult("hello", "hola", "en", "es", True),
|
TranslationResult("hello", "hola", "en", "es", success=True),
|
||||||
TranslationResult("world", "mundo", "en", "es", True),
|
TranslationResult("world", "mundo", "en", "es", success=True),
|
||||||
]
|
]
|
||||||
output = format_translations(results)
|
output = format_translations(results)
|
||||||
|
|
||||||
@ -382,8 +395,10 @@ class TestFormatTranslations:
|
|||||||
def test_format_with_errors(self) -> None:
|
def test_format_with_errors(self) -> None:
|
||||||
"""Test formatting with failed translations."""
|
"""Test formatting with failed translations."""
|
||||||
results = [
|
results = [
|
||||||
TranslationResult("hello", "hola", "en", "es", True),
|
TranslationResult("hello", "hola", "en", "es", success=True),
|
||||||
TranslationResult("xyz", "", "en", "es", False, "Unknown word"),
|
TranslationResult(
|
||||||
|
"xyz", "", "en", "es", success=False, error="Unknown word"
|
||||||
|
),
|
||||||
]
|
]
|
||||||
output = format_translations(results, show_errors=True)
|
output = format_translations(results, show_errors=True)
|
||||||
|
|
||||||
@ -393,8 +408,10 @@ class TestFormatTranslations:
|
|||||||
def test_format_hide_errors(self) -> None:
|
def test_format_hide_errors(self) -> None:
|
||||||
"""Test formatting with errors hidden."""
|
"""Test formatting with errors hidden."""
|
||||||
results = [
|
results = [
|
||||||
TranslationResult("hello", "hola", "en", "es", True),
|
TranslationResult("hello", "hola", "en", "es", success=True),
|
||||||
TranslationResult("xyz", "", "en", "es", False, "Unknown word"),
|
TranslationResult(
|
||||||
|
"xyz", "", "en", "es", success=False, error="Unknown word"
|
||||||
|
),
|
||||||
]
|
]
|
||||||
output = format_translations(results, show_errors=False)
|
output = format_translations(results, show_errors=False)
|
||||||
|
|
||||||
@ -408,7 +425,7 @@ class TestFormatTranslations:
|
|||||||
class TestGetInstalledLanguages:
|
class TestGetInstalledLanguages:
|
||||||
"""Tests for get_installed_languages function."""
|
"""Tests for get_installed_languages function."""
|
||||||
|
|
||||||
def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
|
def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
|
||||||
"""Test when argos is unavailable."""
|
"""Test when argos is unavailable."""
|
||||||
result = get_installed_languages()
|
result = get_installed_languages()
|
||||||
assert result == []
|
assert result == []
|
||||||
@ -433,21 +450,22 @@ class TestGetInstalledLanguages:
|
|||||||
mock_parent.translate = mock_translate_module
|
mock_parent.translate = mock_translate_module
|
||||||
mock_parent.package = mock_package_module
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
original = translator._argos_available
|
with (
|
||||||
translator._argos_available = True
|
patch.object(translator, "_check_argos", return_value=True),
|
||||||
|
patch.object(
|
||||||
with patch.dict(
|
translator, "argostranslate", mock_parent, create=True
|
||||||
"sys.modules",
|
),
|
||||||
{
|
patch.dict(
|
||||||
"argostranslate": mock_parent,
|
"sys.modules",
|
||||||
"argostranslate.translate": mock_translate_module,
|
{
|
||||||
"argostranslate.package": mock_package_module,
|
"argostranslate": mock_parent,
|
||||||
},
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
),
|
||||||
):
|
):
|
||||||
result = get_installed_languages()
|
result = get_installed_languages()
|
||||||
|
|
||||||
translator._argos_available = original
|
|
||||||
|
|
||||||
assert ("en", "English") in result
|
assert ("en", "English") in result
|
||||||
assert ("es", "Spanish") in result
|
assert ("es", "Spanish") in result
|
||||||
|
|
||||||
@ -458,7 +476,7 @@ class TestGetInstalledLanguages:
|
|||||||
class TestGetAvailablePackages:
|
class TestGetAvailablePackages:
|
||||||
"""Tests for get_available_packages function."""
|
"""Tests for get_available_packages function."""
|
||||||
|
|
||||||
def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
|
def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
|
||||||
"""Test when argos is unavailable."""
|
"""Test when argos is unavailable."""
|
||||||
result = get_available_packages()
|
result = get_available_packages()
|
||||||
assert result == []
|
assert result == []
|
||||||
@ -470,7 +488,7 @@ class TestGetAvailablePackages:
|
|||||||
class TestDownloadLanguages:
|
class TestDownloadLanguages:
|
||||||
"""Tests for download_languages function."""
|
"""Tests for download_languages function."""
|
||||||
|
|
||||||
def test_argos_unavailable(self, mock_argos_unavailable: None) -> None:
|
def test_argos_unavailable(self, _mock_argos_unavailable: None) -> None:
|
||||||
"""Test when argos is unavailable."""
|
"""Test when argos is unavailable."""
|
||||||
result = download_languages(["en", "es"])
|
result = download_languages(["en", "es"])
|
||||||
assert result == {}
|
assert result == {}
|
||||||
@ -503,7 +521,7 @@ class TestReadFile:
|
|||||||
class TestMain:
|
class TestMain:
|
||||||
"""Tests for main CLI function."""
|
"""Tests for main CLI function."""
|
||||||
|
|
||||||
def test_argos_unavailable_error(self, mock_argos_unavailable: None) -> None:
|
def test_argos_unavailable_error(self, _mock_argos_unavailable: None) -> None:
|
||||||
"""Test error when argos not installed."""
|
"""Test error when argos not installed."""
|
||||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||||
assert result == 1
|
assert result == 1
|
||||||
@ -517,21 +535,22 @@ class TestMain:
|
|||||||
mock_parent.translate = mock_translate_module
|
mock_parent.translate = mock_translate_module
|
||||||
mock_parent.package = mock_package_module
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
original = translator._argos_available
|
with (
|
||||||
translator._argos_available = True
|
patch.object(translator, "_check_argos", return_value=True),
|
||||||
|
patch.object(
|
||||||
with patch.dict(
|
translator, "argostranslate", mock_parent, create=True
|
||||||
"sys.modules",
|
),
|
||||||
{
|
patch.dict(
|
||||||
"argostranslate": mock_parent,
|
"sys.modules",
|
||||||
"argostranslate.translate": mock_translate_module,
|
{
|
||||||
"argostranslate.package": mock_package_module,
|
"argostranslate": mock_parent,
|
||||||
},
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
),
|
||||||
):
|
):
|
||||||
result = main(["--list-languages"])
|
result = main(["--list-languages"])
|
||||||
|
|
||||||
translator._argos_available = original
|
|
||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert "No languages installed" in captured.out
|
assert "No languages installed" in captured.out
|
||||||
@ -551,21 +570,22 @@ class TestMain:
|
|||||||
mock_parent.translate = mock_translate_module
|
mock_parent.translate = mock_translate_module
|
||||||
mock_parent.package = mock_package_module
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
original = translator._argos_available
|
with (
|
||||||
translator._argos_available = True
|
patch.object(translator, "_check_argos", return_value=True),
|
||||||
|
patch.object(
|
||||||
with patch.dict(
|
translator, "argostranslate", mock_parent, create=True
|
||||||
"sys.modules",
|
),
|
||||||
{
|
patch.dict(
|
||||||
"argostranslate": mock_parent,
|
"sys.modules",
|
||||||
"argostranslate.translate": mock_translate_module,
|
{
|
||||||
"argostranslate.package": mock_package_module,
|
"argostranslate": mock_parent,
|
||||||
},
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
),
|
||||||
):
|
):
|
||||||
result = main(["--list-languages"])
|
result = main(["--list-languages"])
|
||||||
|
|
||||||
translator._argos_available = original
|
|
||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert "en" in captured.out
|
assert "en" in captured.out
|
||||||
@ -622,7 +642,6 @@ class TestMain:
|
|||||||
def test_translate_output_to_file(
|
def test_translate_output_to_file(
|
||||||
self,
|
self,
|
||||||
tmp_path: Path,
|
tmp_path: Path,
|
||||||
capsys: pytest.CaptureFixture[str],
|
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test outputting translations to file."""
|
"""Test outputting translations to file."""
|
||||||
output_file = tmp_path / "output.txt"
|
output_file = tmp_path / "output.txt"
|
||||||
@ -647,7 +666,9 @@ class TestMain:
|
|||||||
assert "hello" in content
|
assert "hello" in content
|
||||||
assert "hola" in content
|
assert "hola" in content
|
||||||
|
|
||||||
def test_no_input_shows_help(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_no_input_shows_help(
|
||||||
|
self,
|
||||||
|
) -> None:
|
||||||
"""Test that no input shows help."""
|
"""Test that no input shows help."""
|
||||||
with ArgosAvailableMock():
|
with ArgosAvailableMock():
|
||||||
result = main([])
|
result = main([])
|
||||||
|
|||||||
@ -89,7 +89,7 @@ class TestExcerptValidity:
|
|||||||
"""Tests that verify excerpts are actually found in the source text."""
|
"""Tests that verify excerpts are actually found in the source text."""
|
||||||
|
|
||||||
def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
|
def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
|
||||||
"""Test that each excerpt can be found in the source text as contiguous words."""
|
"""Test that each excerpt can be found in source text."""
|
||||||
import re
|
import re
|
||||||
|
|
||||||
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
||||||
|
|||||||
@ -1,149 +1,163 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Translator - translates words/text between languages.
|
r"""Translator - translates words/text between languages.
|
||||||
|
|
||||||
This module provides translation capabilities using either:
|
This module provides translation capabilities using either:
|
||||||
1. Argos Translate (offline, requires large downloads) - preferred if installed
|
|
||||||
2. deep-translator (online, uses Google Translate) - lightweight fallback
|
|
||||||
|
|
||||||
Usage:
|
1. Argos Translate (offline, requires large downloads)
|
||||||
|
2. deep-translator (online, uses Google Translate)
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
# Translate a single word
|
# Translate a single word
|
||||||
python -m python_pkg.word_frequency.translator --text "hello" --from en --to es
|
python -m python_pkg.word_frequency.translator \\
|
||||||
|
--text "hello" --from en --to es
|
||||||
|
|
||||||
# Translate multiple words
|
# Translate multiple words
|
||||||
python -m python_pkg.word_frequency.translator --words hello world goodbye --from en --to pl
|
python -m python_pkg.word_frequency.translator \\
|
||||||
|
--words hello world goodbye --from en --to pl
|
||||||
|
|
||||||
# Translate words from a file (one word per line)
|
# Translate words from a file (one word per line)
|
||||||
python -m python_pkg.word_frequency.translator --words-file words.txt --from la --to en
|
python -m python_pkg.word_frequency.translator \\
|
||||||
|
--words-file words.txt --from la --to en
|
||||||
|
|
||||||
# List available languages
|
# List available languages
|
||||||
python -m python_pkg.word_frequency.translator --list-languages
|
python -m python_pkg.word_frequency.translator \\
|
||||||
|
--list-languages
|
||||||
|
|
||||||
# Output to file
|
# Output to file
|
||||||
python -m python_pkg.word_frequency.translator --words-file vocab.txt --from pl --to en --output translations.txt
|
python -m python_pkg.word_frequency.translator \\
|
||||||
|
--words-file vocab.txt --from pl --to en \\
|
||||||
|
--output translations.txt
|
||||||
|
|
||||||
Dependencies (install one):
|
Dependencies (install one)::
|
||||||
pip install deep-translator # Lightweight, uses Google Translate (online)
|
|
||||||
pip install argostranslate # Offline translation (requires ~3GB downloads)
|
pip install deep-translator
|
||||||
|
pip install argostranslate
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
from typing import TYPE_CHECKING, NamedTuple
|
from typing import TYPE_CHECKING, NamedTuple
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
|
|
||||||
# Lazy imports for translation backends (may not be installed)
|
try:
|
||||||
_argos_available: bool | None = None
|
import torch
|
||||||
_deep_translator_available: bool | None = None
|
except ImportError:
|
||||||
_langdetect_available: bool | None = None
|
torch = None # type: ignore[assignment]
|
||||||
_gpu_initialized: bool = False
|
|
||||||
_gpu_available: bool | None = None
|
try:
|
||||||
|
import argostranslate.package
|
||||||
|
import argostranslate.translate
|
||||||
|
except ImportError:
|
||||||
|
argostranslate = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from deep_translator import GoogleTranslator
|
||||||
|
except ImportError:
|
||||||
|
GoogleTranslator = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
import langdetect
|
||||||
|
except ImportError:
|
||||||
|
langdetect = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from python_pkg.word_frequency.cache import (
|
||||||
|
get_translation_cache,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
get_translation_cache = None
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_LANG_DETECT_SAMPLE_SIZE = 5000
|
||||||
|
_BATCH_SIZE = 100
|
||||||
|
|
||||||
|
|
||||||
|
class _TranslatorState:
|
||||||
|
"""Holds module-level state for lazy-initialized backends."""
|
||||||
|
|
||||||
|
gpu_initialized: bool = False
|
||||||
|
|
||||||
|
|
||||||
def _check_cuda_available() -> bool:
|
def _check_cuda_available() -> bool:
|
||||||
"""Check if CUDA is available for GPU acceleration."""
|
"""Check if CUDA is available for GPU acceleration."""
|
||||||
global _gpu_available
|
return torch is not None and torch.cuda.is_available()
|
||||||
if _gpu_available is None:
|
|
||||||
try:
|
|
||||||
import torch
|
|
||||||
|
|
||||||
_gpu_available = torch.cuda.is_available()
|
|
||||||
except ImportError:
|
def _validate_gpu_device() -> str:
|
||||||
_gpu_available = False
|
"""Validate GPU device availability and return device name.
|
||||||
return _gpu_available
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If no GPU devices are found.
|
||||||
|
"""
|
||||||
|
device_count = torch.cuda.device_count()
|
||||||
|
if device_count == 0:
|
||||||
|
msg = "CUDA reports available but no GPU devices found"
|
||||||
|
raise RuntimeError(msg)
|
||||||
|
return torch.cuda.get_device_name(0)
|
||||||
|
|
||||||
|
|
||||||
def _init_gpu_if_available() -> None:
|
def _init_gpu_if_available() -> None:
|
||||||
"""Initialize GPU for argostranslate if CUDA is available.
|
"""Initialize GPU for argostranslate if CUDA is available.
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
RuntimeError: If CUDA is available but GPU initialization fails.
|
RuntimeError: If CUDA is available but GPU init fails.
|
||||||
"""
|
"""
|
||||||
global _gpu_initialized
|
if _TranslatorState.gpu_initialized:
|
||||||
if _gpu_initialized:
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if not _check_cuda_available():
|
if not _check_cuda_available():
|
||||||
_gpu_initialized = True
|
_TranslatorState.gpu_initialized = True
|
||||||
return
|
return
|
||||||
|
|
||||||
import sys
|
logger.info(
|
||||||
|
"CUDA detected, initializing GPU acceleration..."
|
||||||
print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import torch
|
device_name = _validate_gpu_device()
|
||||||
|
logger.info(" Using GPU: %s", device_name)
|
||||||
# Force CTranslate2 to use CUDA
|
|
||||||
device_count = torch.cuda.device_count()
|
|
||||||
if device_count == 0:
|
|
||||||
raise RuntimeError("CUDA reports available but no GPU devices found")
|
|
||||||
|
|
||||||
device_name = torch.cuda.get_device_name(0)
|
|
||||||
print(f" Using GPU: {device_name}", file=sys.stderr)
|
|
||||||
|
|
||||||
# Set environment variable to force GPU usage in argos
|
|
||||||
import os
|
|
||||||
|
|
||||||
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||||
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
|
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
|
||||||
|
|
||||||
_gpu_initialized = True
|
_TranslatorState.gpu_initialized = True
|
||||||
print(" GPU acceleration enabled.", file=sys.stderr)
|
logger.info(" GPU acceleration enabled.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(
|
msg = (
|
||||||
f"CUDA is available but GPU initialization failed: {e}\n"
|
f"CUDA is available but GPU initialization failed: "
|
||||||
f"This may be due to incompatible CUDA version or driver issues.\n"
|
f"{e}\nThis may be due to incompatible CUDA "
|
||||||
f"To disable GPU and use CPU only, set environment variable: CT2_FORCE_CPU=1"
|
"version or driver issues.\n"
|
||||||
) from e
|
"To disable GPU and use CPU only, set "
|
||||||
|
"environment variable: CT2_FORCE_CPU=1"
|
||||||
|
)
|
||||||
|
raise RuntimeError(msg) from e
|
||||||
|
|
||||||
|
|
||||||
def _check_argos() -> bool:
|
def _check_argos() -> bool:
|
||||||
"""Check if argostranslate is available."""
|
"""Check if argostranslate is available."""
|
||||||
global _argos_available
|
return argostranslate is not None
|
||||||
if _argos_available is None:
|
|
||||||
try:
|
|
||||||
import argostranslate.package
|
|
||||||
import argostranslate.translate
|
|
||||||
|
|
||||||
_ = (argostranslate.package, argostranslate.translate)
|
|
||||||
_argos_available = True
|
|
||||||
except ImportError:
|
|
||||||
_argos_available = False
|
|
||||||
return _argos_available
|
|
||||||
|
|
||||||
|
|
||||||
def _check_deep_translator() -> bool:
|
def _check_deep_translator() -> bool:
|
||||||
"""Check if deep-translator is available."""
|
"""Check if deep-translator is available."""
|
||||||
global _deep_translator_available
|
return GoogleTranslator is not None
|
||||||
if _deep_translator_available is None:
|
|
||||||
try:
|
|
||||||
from deep_translator import GoogleTranslator
|
|
||||||
|
|
||||||
_ = GoogleTranslator
|
|
||||||
_deep_translator_available = True
|
|
||||||
except ImportError:
|
|
||||||
_deep_translator_available = False
|
|
||||||
return _deep_translator_available
|
|
||||||
|
|
||||||
|
|
||||||
def _check_langdetect() -> bool:
|
def _check_langdetect() -> bool:
|
||||||
"""Check if langdetect is available."""
|
"""Check if langdetect is available."""
|
||||||
global _langdetect_available
|
return langdetect is not None
|
||||||
if _langdetect_available is None:
|
|
||||||
try:
|
|
||||||
import langdetect
|
|
||||||
|
|
||||||
_ = langdetect
|
|
||||||
_langdetect_available = True
|
|
||||||
except ImportError:
|
|
||||||
_langdetect_available = False
|
|
||||||
return _langdetect_available
|
|
||||||
|
|
||||||
|
|
||||||
def detect_language(text: str) -> str | None:
|
def detect_language(text: str) -> str | None:
|
||||||
@ -158,13 +172,14 @@ def detect_language(text: str) -> str | None:
|
|||||||
if not _check_langdetect():
|
if not _check_langdetect():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
import langdetect
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Use a sample of the text for detection (faster and more reliable)
|
sample = (
|
||||||
sample = text[:5000] if len(text) > 5000 else text
|
text[:_LANG_DETECT_SAMPLE_SIZE]
|
||||||
return langdetect.detect(sample) # type: ignore[no-any-return]
|
if len(text) > _LANG_DETECT_SAMPLE_SIZE
|
||||||
except langdetect.LangDetectException: # type: ignore[attr-defined]
|
else text
|
||||||
|
)
|
||||||
|
return langdetect.detect(sample) # type: ignore[no-any-return,union-attr]
|
||||||
|
except langdetect.LangDetectException: # type: ignore[attr-defined,union-attr]
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@ -188,8 +203,6 @@ def get_installed_languages() -> list[tuple[str, str]]:
|
|||||||
if not _check_argos():
|
if not _check_argos():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
import argostranslate.translate
|
|
||||||
|
|
||||||
languages = argostranslate.translate.get_installed_languages()
|
languages = argostranslate.translate.get_installed_languages()
|
||||||
return [(lang.code, lang.name) for lang in languages]
|
return [(lang.code, lang.name) for lang in languages]
|
||||||
|
|
||||||
@ -203,8 +216,6 @@ def get_available_packages() -> list[tuple[str, str, str, str]]:
|
|||||||
if not _check_argos():
|
if not _check_argos():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
import argostranslate.package
|
|
||||||
|
|
||||||
argostranslate.package.update_package_index()
|
argostranslate.package.update_package_index()
|
||||||
available = argostranslate.package.get_available_packages()
|
available = argostranslate.package.get_available_packages()
|
||||||
return [
|
return [
|
||||||
@ -227,12 +238,10 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
|||||||
if not _check_argos():
|
if not _check_argos():
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
import argostranslate.package
|
|
||||||
|
|
||||||
results: dict[str, bool] = {}
|
results: dict[str, bool] = {}
|
||||||
|
|
||||||
# Update package index
|
# Update package index
|
||||||
print("Updating package index...")
|
logger.info("Updating package index...")
|
||||||
argostranslate.package.update_package_index()
|
argostranslate.package.update_package_index()
|
||||||
available = argostranslate.package.get_available_packages()
|
available = argostranslate.package.get_available_packages()
|
||||||
|
|
||||||
@ -255,13 +264,26 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
|||||||
if pkg_key in available_lookup:
|
if pkg_key in available_lookup:
|
||||||
pkg = available_lookup[pkg_key]
|
pkg = available_lookup[pkg_key]
|
||||||
try:
|
try:
|
||||||
print(f"Downloading {from_code} -> {to_code}...")
|
logger.info(
|
||||||
|
"Downloading %s -> %s...",
|
||||||
|
from_code,
|
||||||
|
to_code,
|
||||||
|
)
|
||||||
argostranslate.package.install_from_path(pkg.download())
|
argostranslate.package.install_from_path(pkg.download())
|
||||||
results[key] = True
|
results[key] = True
|
||||||
print(f" ✓ Installed {from_code} -> {to_code}")
|
logger.info(
|
||||||
except Exception as e: # noqa: BLE001
|
" Installed %s -> %s",
|
||||||
|
from_code,
|
||||||
|
to_code,
|
||||||
|
)
|
||||||
|
except (OSError, RuntimeError, ValueError) as e:
|
||||||
results[key] = False
|
results[key] = False
|
||||||
print(f" ✗ Failed {from_code} -> {to_code}: {e}")
|
logger.info(
|
||||||
|
" Failed %s -> %s: %s",
|
||||||
|
from_code,
|
||||||
|
to_code,
|
||||||
|
e,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Package not available
|
# Package not available
|
||||||
results[key] = False
|
results[key] = False
|
||||||
@ -278,32 +300,38 @@ def _ensure_argos_installed() -> None:
|
|||||||
if _check_argos():
|
if _check_argos():
|
||||||
return
|
return
|
||||||
|
|
||||||
import subprocess
|
logger.info("argostranslate not found. Attempting to install...")
|
||||||
import sys
|
|
||||||
|
|
||||||
print("argostranslate not found. Attempting to install...")
|
|
||||||
try:
|
try:
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
[sys.executable, "-m", "pip", "install", "argostranslate"],
|
[sys.executable, "-m", "pip", "install", "argostranslate"],
|
||||||
check=True,
|
check=True,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
)
|
)
|
||||||
# Reset the check flag and verify
|
# Attempt runtime re-import
|
||||||
global _argos_available
|
importlib.import_module("argostranslate.package")
|
||||||
_argos_available = None
|
importlib.import_module("argostranslate.translate")
|
||||||
if not _check_argos():
|
logger.info("argostranslate installed successfully.")
|
||||||
raise ImportError("argostranslate installation succeeded but import failed")
|
|
||||||
print("argostranslate installed successfully.")
|
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
error_msg = e.stderr.decode() if e.stderr else str(e)
|
error_msg = e.stderr.decode() if e.stderr else str(e)
|
||||||
raise ImportError(
|
msg = (
|
||||||
f"argostranslate is required for offline translation.\n\n"
|
"argostranslate is required for offline "
|
||||||
f"Install manually with one of:\n"
|
"translation.\n\n"
|
||||||
f" pip install argostranslate # In a virtualenv\n"
|
"Install manually with one of:\n"
|
||||||
f" pipx install argostranslate # System-wide via pipx\n"
|
" pip install argostranslate"
|
||||||
f" pacman -S python-argostranslate # Arch Linux (if available)\n\n"
|
" # In a virtualenv\n"
|
||||||
|
" pipx install argostranslate"
|
||||||
|
" # System-wide via pipx\n"
|
||||||
|
" pacman -S python-argostranslate"
|
||||||
|
" # Arch Linux (if available)\n\n"
|
||||||
f"Original error: {error_msg}"
|
f"Original error: {error_msg}"
|
||||||
) from e
|
)
|
||||||
|
raise ImportError(msg) from e
|
||||||
|
except ImportError:
|
||||||
|
msg = (
|
||||||
|
"argostranslate installation succeeded but "
|
||||||
|
"import failed"
|
||||||
|
)
|
||||||
|
raise ImportError(msg) from None
|
||||||
|
|
||||||
|
|
||||||
def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
||||||
@ -316,11 +344,9 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
|||||||
Raises:
|
Raises:
|
||||||
ValueError: If language pair cannot be obtained.
|
ValueError: If language pair cannot be obtained.
|
||||||
"""
|
"""
|
||||||
import argostranslate.package
|
installed_languages = (
|
||||||
import argostranslate.translate
|
argostranslate.translate.get_installed_languages()
|
||||||
|
)
|
||||||
# Check if already installed
|
|
||||||
installed_languages = argostranslate.translate.get_installed_languages()
|
|
||||||
from_lang_obj = None
|
from_lang_obj = None
|
||||||
to_lang_obj = None
|
to_lang_obj = None
|
||||||
|
|
||||||
@ -337,37 +363,44 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
|||||||
return # Already available
|
return # Already available
|
||||||
|
|
||||||
# Need to download
|
# Need to download
|
||||||
import sys
|
logger.info(
|
||||||
|
"Downloading language pack: %s -> %s...",
|
||||||
print(
|
from_lang,
|
||||||
f"Downloading language pack: {from_lang} -> {to_lang}...",
|
to_lang,
|
||||||
file=sys.stderr,
|
|
||||||
)
|
)
|
||||||
print(" Fetching package index...", file=sys.stderr)
|
logger.info(" Fetching package index...")
|
||||||
argostranslate.package.update_package_index()
|
argostranslate.package.update_package_index()
|
||||||
available = argostranslate.package.get_available_packages()
|
available = argostranslate.package.get_available_packages()
|
||||||
|
|
||||||
pkg = next(
|
pkg = next(
|
||||||
(p for p in available if p.from_code == from_lang and p.to_code == to_lang),
|
(
|
||||||
|
p
|
||||||
|
for p in available
|
||||||
|
if p.from_code == from_lang and p.to_code == to_lang
|
||||||
|
),
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
||||||
if pkg is None:
|
if pkg is None:
|
||||||
raise ValueError(
|
msg = (
|
||||||
f"No language pack available for {from_lang} -> {to_lang}. "
|
f"No language pack available for "
|
||||||
f"Available pairs can be listed with --list-languages."
|
f"{from_lang} -> {to_lang}. "
|
||||||
|
"Available pairs can be listed with "
|
||||||
|
"--list-languages."
|
||||||
)
|
)
|
||||||
|
raise ValueError(msg)
|
||||||
|
|
||||||
print(
|
logger.info(
|
||||||
" Downloading package (~50-100MB, this may take a minute)...",
|
" Downloading package (~50-100MB, "
|
||||||
file=sys.stderr,
|
"this may take a minute)...",
|
||||||
)
|
)
|
||||||
download_path = pkg.download()
|
download_path = pkg.download()
|
||||||
print(" Installing language pack...", file=sys.stderr)
|
logger.info(" Installing language pack...")
|
||||||
argostranslate.package.install_from_path(download_path)
|
argostranslate.package.install_from_path(download_path)
|
||||||
print(
|
logger.info(
|
||||||
f"Language pack {from_lang} -> {to_lang} installed.",
|
"Language pack %s -> %s installed.",
|
||||||
file=sys.stderr,
|
from_lang,
|
||||||
|
to_lang,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -393,38 +426,30 @@ def translate_word(
|
|||||||
ImportError: If argostranslate is not available and cannot be installed.
|
ImportError: If argostranslate is not available and cannot be installed.
|
||||||
"""
|
"""
|
||||||
# Check cache first
|
# Check cache first
|
||||||
if use_cache:
|
if use_cache and get_translation_cache is not None:
|
||||||
try:
|
cache = get_translation_cache()
|
||||||
from python_pkg.word_frequency.cache import get_translation_cache
|
cached = cache.get(word, from_lang, to_lang)
|
||||||
|
if cached is not None:
|
||||||
cache = get_translation_cache()
|
return TranslationResult(
|
||||||
cached = cache.get(word, from_lang, to_lang)
|
source_word=word,
|
||||||
if cached is not None:
|
translated_word=cached,
|
||||||
return TranslationResult(
|
source_lang=from_lang,
|
||||||
source_word=word,
|
target_lang=to_lang,
|
||||||
translated_word=cached,
|
success=True,
|
||||||
source_lang=from_lang,
|
)
|
||||||
target_lang=to_lang,
|
|
||||||
success=True,
|
|
||||||
)
|
|
||||||
except ImportError:
|
|
||||||
pass # Cache not available
|
|
||||||
|
|
||||||
# Ensure argos is installed (will raise if it can't be)
|
# Ensure argos is installed (will raise if it can't be)
|
||||||
_ensure_argos_installed()
|
_ensure_argos_installed()
|
||||||
|
|
||||||
import argostranslate.translate
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
translated = argostranslate.translate.translate(word, from_lang, to_lang)
|
translated = argostranslate.translate.translate(
|
||||||
|
word, from_lang, to_lang,
|
||||||
|
)
|
||||||
# Cache the result
|
# Cache the result
|
||||||
if use_cache:
|
if use_cache and get_translation_cache is not None:
|
||||||
try:
|
get_translation_cache().set(
|
||||||
from python_pkg.word_frequency.cache import get_translation_cache
|
word, from_lang, to_lang, translated,
|
||||||
|
)
|
||||||
get_translation_cache().set(word, from_lang, to_lang, translated)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
return TranslationResult(
|
return TranslationResult(
|
||||||
source_word=word,
|
source_word=word,
|
||||||
translated_word=translated,
|
translated_word=translated,
|
||||||
@ -432,7 +457,7 @@ def translate_word(
|
|||||||
target_lang=to_lang,
|
target_lang=to_lang,
|
||||||
success=True,
|
success=True,
|
||||||
)
|
)
|
||||||
except Exception as e: # noqa: BLE001
|
except (OSError, RuntimeError, ValueError, TypeError) as e:
|
||||||
return TranslationResult(
|
return TranslationResult(
|
||||||
source_word=word,
|
source_word=word,
|
||||||
translated_word="",
|
translated_word="",
|
||||||
@ -483,8 +508,6 @@ def _translate_batch_worker(
|
|||||||
Returns:
|
Returns:
|
||||||
Tuple of (batch_idx, translations dict).
|
Tuple of (batch_idx, translations dict).
|
||||||
"""
|
"""
|
||||||
import argostranslate.translate
|
|
||||||
|
|
||||||
translations: dict[str, str] = {}
|
translations: dict[str, str] = {}
|
||||||
|
|
||||||
# Batch translate by joining with newlines
|
# Batch translate by joining with newlines
|
||||||
@ -507,6 +530,78 @@ def _translate_batch_worker(
|
|||||||
return batch_idx, translations
|
return batch_idx, translations
|
||||||
|
|
||||||
|
|
||||||
|
def _run_batch_translation(
|
||||||
|
words_to_translate: list[str],
|
||||||
|
from_lang: str,
|
||||||
|
to_lang: str,
|
||||||
|
) -> dict[str, str]:
|
||||||
|
"""Translate a list of words in batches with progress logging.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
words_to_translate: Words needing translation.
|
||||||
|
from_lang: Source language code.
|
||||||
|
to_lang: Target language code.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping lowercased words to translations.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If translation fails.
|
||||||
|
"""
|
||||||
|
new_translations: dict[str, str] = {}
|
||||||
|
num_to_translate = len(words_to_translate)
|
||||||
|
|
||||||
|
gpu_status = (
|
||||||
|
" (GPU)" if _check_cuda_available() else " (CPU)"
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"Translating %d words from %s to %s%s...",
|
||||||
|
num_to_translate,
|
||||||
|
from_lang,
|
||||||
|
to_lang,
|
||||||
|
gpu_status,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
batches = [
|
||||||
|
words_to_translate[i : i + _BATCH_SIZE]
|
||||||
|
for i in range(0, num_to_translate, _BATCH_SIZE)
|
||||||
|
]
|
||||||
|
total_batches = len(batches)
|
||||||
|
|
||||||
|
for batch_idx, batch_words in enumerate(batches):
|
||||||
|
words_done = min(
|
||||||
|
(batch_idx + 1) * _BATCH_SIZE,
|
||||||
|
num_to_translate,
|
||||||
|
)
|
||||||
|
pct = int(words_done / num_to_translate * 100)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
" [%3d%%] Translating batch %d/%d "
|
||||||
|
"(%d/%d words)...",
|
||||||
|
pct,
|
||||||
|
batch_idx + 1,
|
||||||
|
total_batches,
|
||||||
|
words_done,
|
||||||
|
num_to_translate,
|
||||||
|
)
|
||||||
|
|
||||||
|
_, batch_translations = _translate_batch_worker(
|
||||||
|
batch_words, from_lang, to_lang, batch_idx,
|
||||||
|
)
|
||||||
|
new_translations.update(batch_translations)
|
||||||
|
|
||||||
|
logger.info(" Translation complete.")
|
||||||
|
except Exception as e:
|
||||||
|
msg = (
|
||||||
|
f"Translation failed for "
|
||||||
|
f"{from_lang} -> {to_lang}: {e}"
|
||||||
|
)
|
||||||
|
raise RuntimeError(msg) from e
|
||||||
|
|
||||||
|
return new_translations
|
||||||
|
|
||||||
|
|
||||||
def translate_words_batch(
|
def translate_words_batch(
|
||||||
words: Sequence[str],
|
words: Sequence[str],
|
||||||
from_lang: str,
|
from_lang: str,
|
||||||
@ -535,90 +630,36 @@ def translate_words_batch(
|
|||||||
if not words:
|
if not words:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Ensure argos is installed (will raise if it can't be)
|
|
||||||
_ensure_argos_installed()
|
_ensure_argos_installed()
|
||||||
|
|
||||||
# Initialize GPU if available (will raise if CUDA available but fails)
|
|
||||||
_init_gpu_if_available()
|
_init_gpu_if_available()
|
||||||
|
|
||||||
# Ensure language pair is available
|
|
||||||
_ensure_language_pair(from_lang, to_lang)
|
_ensure_language_pair(from_lang, to_lang)
|
||||||
|
|
||||||
# Check cache for already-translated words
|
# Check cache for already-translated words
|
||||||
cached_results: dict[str, str] = {}
|
cached_results: dict[str, str] = {}
|
||||||
words_to_translate: list[str] = []
|
if use_cache and get_translation_cache is not None:
|
||||||
|
cache = get_translation_cache()
|
||||||
if use_cache:
|
cached_results = cache.get_many(
|
||||||
try:
|
list(words), from_lang, to_lang,
|
||||||
from python_pkg.word_frequency.cache import get_translation_cache
|
)
|
||||||
|
|
||||||
cache = get_translation_cache()
|
|
||||||
cached_results = cache.get_many(list(words), from_lang, to_lang)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Find words that still need translation
|
# Find words that still need translation
|
||||||
for word in words:
|
words_to_translate = [
|
||||||
if word.lower() not in cached_results:
|
word for word in words
|
||||||
words_to_translate.append(word)
|
if word.lower() not in cached_results
|
||||||
|
]
|
||||||
|
|
||||||
# Translate uncached words using argos batch
|
# Translate uncached words using argos batch
|
||||||
new_translations: dict[str, str] = {}
|
new_translations: dict[str, str] = {}
|
||||||
if words_to_translate:
|
if words_to_translate:
|
||||||
import sys
|
new_translations = _run_batch_translation(
|
||||||
|
words_to_translate, from_lang, to_lang,
|
||||||
num_to_translate = len(words_to_translate)
|
|
||||||
|
|
||||||
# Check if GPU is being used
|
|
||||||
gpu_status = " (GPU)" if _gpu_available else " (CPU)"
|
|
||||||
print(
|
|
||||||
f"Translating {num_to_translate} words from {from_lang} to {to_lang}{gpu_status}...",
|
|
||||||
file=sys.stderr,
|
|
||||||
flush=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
|
||||||
# Split into batches - larger batches are faster but show progress less often
|
|
||||||
BATCH_SIZE = 100
|
|
||||||
batches: list[list[str]] = []
|
|
||||||
for i in range(0, num_to_translate, BATCH_SIZE):
|
|
||||||
batches.append(words_to_translate[i : i + BATCH_SIZE])
|
|
||||||
|
|
||||||
total_batches = len(batches)
|
|
||||||
|
|
||||||
# Sequential translation with progress
|
|
||||||
# (argostranslate is not thread-safe - uses global model)
|
|
||||||
for batch_idx, batch_words in enumerate(batches):
|
|
||||||
words_done = (batch_idx + 1) * BATCH_SIZE
|
|
||||||
words_done = min(words_done, num_to_translate)
|
|
||||||
pct = int(words_done / num_to_translate * 100)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f" [{pct:3d}%] Translating batch {batch_idx + 1}/{total_batches} "
|
|
||||||
f"({words_done}/{num_to_translate} words)...",
|
|
||||||
file=sys.stderr,
|
|
||||||
flush=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
_, batch_translations = _translate_batch_worker(
|
|
||||||
batch_words, from_lang, to_lang, batch_idx
|
|
||||||
)
|
|
||||||
new_translations.update(batch_translations)
|
|
||||||
|
|
||||||
print(" Translation complete.", file=sys.stderr, flush=True)
|
|
||||||
except Exception as e:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Translation failed for {from_lang} -> {to_lang}: {e}"
|
|
||||||
) from e
|
|
||||||
|
|
||||||
# Cache new translations
|
# Cache new translations
|
||||||
if use_cache and new_translations:
|
if use_cache and get_translation_cache is not None:
|
||||||
try:
|
get_translation_cache().set_many(
|
||||||
from python_pkg.word_frequency.cache import get_translation_cache
|
new_translations, from_lang, to_lang,
|
||||||
|
)
|
||||||
get_translation_cache().set_many(new_translations, from_lang, to_lang)
|
|
||||||
except ImportError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Merge cached and new translations
|
# Merge cached and new translations
|
||||||
all_translations = {**cached_results, **new_translations}
|
all_translations = {**cached_results, **new_translations}
|
||||||
@ -694,22 +735,14 @@ def read_file(filepath: str | Path) -> str:
|
|||||||
return Path(filepath).read_text(encoding="utf-8")
|
return Path(filepath).read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
def main(argv: Sequence[str] | None = None) -> int:
|
def _build_parser() -> argparse.ArgumentParser:
|
||||||
"""Main entry point for the translator.
|
"""Build the argument parser for the translator CLI."""
|
||||||
|
|
||||||
Args:
|
|
||||||
argv: Command line arguments.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Exit code.
|
|
||||||
"""
|
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Offline translator using Argos Translate.",
|
description="Offline translator using Argos Translate.",
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
epilog=__doc__,
|
epilog=__doc__,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Actions
|
|
||||||
action_group = parser.add_mutually_exclusive_group()
|
action_group = parser.add_mutually_exclusive_group()
|
||||||
action_group.add_argument(
|
action_group.add_argument(
|
||||||
"--list-languages",
|
"--list-languages",
|
||||||
@ -728,10 +761,12 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
"-d",
|
"-d",
|
||||||
nargs="+",
|
nargs="+",
|
||||||
metavar="LANG",
|
metavar="LANG",
|
||||||
help="Download language packs (e.g., --download en es pl)",
|
help=(
|
||||||
|
"Download language packs "
|
||||||
|
"(e.g., --download en es pl)"
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Input
|
|
||||||
input_group = parser.add_mutually_exclusive_group()
|
input_group = parser.add_mutually_exclusive_group()
|
||||||
input_group.add_argument(
|
input_group.add_argument(
|
||||||
"--text",
|
"--text",
|
||||||
@ -752,7 +787,6 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
help="File with words to translate (one per line)",
|
help="File with words to translate (one per line)",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Language options
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--from",
|
"--from",
|
||||||
"-f",
|
"-f",
|
||||||
@ -769,8 +803,6 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
default="en",
|
default="en",
|
||||||
help="Target language code (default: en)",
|
help="Target language code (default: en)",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Output
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--output",
|
"--output",
|
||||||
"-o",
|
"-o",
|
||||||
@ -778,87 +810,142 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
help="Output file path",
|
help="Output file path",
|
||||||
)
|
)
|
||||||
|
|
||||||
args = parser.parse_args(argv)
|
return parser
|
||||||
|
|
||||||
# Check if argostranslate is available
|
|
||||||
if not _check_argos():
|
def _handle_list_languages() -> int:
|
||||||
print(
|
"""Handle --list-languages command."""
|
||||||
"Error: argostranslate is not installed.\n"
|
langs = get_installed_languages()
|
||||||
"Install it with: pip install argostranslate",
|
if not langs:
|
||||||
file=sys.stderr,
|
sys.stdout.write("No languages installed.\n")
|
||||||
|
sys.stdout.write(
|
||||||
|
"Download some with: --download en es pl de fr\n",
|
||||||
)
|
)
|
||||||
return 1
|
else:
|
||||||
|
sys.stdout.write("Installed languages:\n")
|
||||||
|
for code, name in sorted(langs):
|
||||||
|
sys.stdout.write(f" {code}: {name}\n")
|
||||||
|
return 0
|
||||||
|
|
||||||
# Handle list-languages
|
|
||||||
if args.list_languages:
|
|
||||||
langs = get_installed_languages()
|
|
||||||
if not langs:
|
|
||||||
print("No languages installed.")
|
|
||||||
print("Download some with: --download en es pl de fr")
|
|
||||||
else:
|
|
||||||
print("Installed languages:")
|
|
||||||
for code, name in sorted(langs):
|
|
||||||
print(f" {code}: {name}")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
# Handle list-available
|
def _handle_list_available() -> int:
|
||||||
if args.list_available:
|
"""Handle --list-available command."""
|
||||||
packages = get_available_packages()
|
packages = get_available_packages()
|
||||||
if not packages:
|
if not packages:
|
||||||
print("No packages available (check internet connection).")
|
sys.stdout.write(
|
||||||
else:
|
"No packages available "
|
||||||
print("Available language packages:")
|
"(check internet connection).\n",
|
||||||
for from_code, from_name, to_code, to_name in sorted(packages):
|
)
|
||||||
print(f" {from_code} ({from_name}) -> {to_code} ({to_name})")
|
else:
|
||||||
return 0
|
sys.stdout.write("Available language packages:\n")
|
||||||
|
for from_code, from_name, to_code, to_name in sorted(
|
||||||
|
packages,
|
||||||
|
):
|
||||||
|
sys.stdout.write(
|
||||||
|
f" {from_code} ({from_name})"
|
||||||
|
f" -> {to_code} ({to_name})\n",
|
||||||
|
)
|
||||||
|
return 0
|
||||||
|
|
||||||
# Handle download
|
|
||||||
if args.download:
|
|
||||||
download_results = download_languages(args.download)
|
|
||||||
success_count = sum(1 for v in download_results.values() if v)
|
|
||||||
print(f"\nDownloaded {success_count}/{len(download_results)} language pairs.")
|
|
||||||
return 0 if success_count > 0 else 1
|
|
||||||
|
|
||||||
# Handle translation
|
def _handle_download(lang_codes: list[str]) -> int:
|
||||||
words: list[str] = []
|
"""Handle --download command."""
|
||||||
|
download_results = download_languages(lang_codes)
|
||||||
|
success_count = sum(
|
||||||
|
1 for v in download_results.values() if v
|
||||||
|
)
|
||||||
|
sys.stdout.write(
|
||||||
|
f"\nDownloaded {success_count}/"
|
||||||
|
f"{len(download_results)} language pairs.\n",
|
||||||
|
)
|
||||||
|
return 0 if success_count > 0 else 1
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_words(
|
||||||
|
args: argparse.Namespace,
|
||||||
|
) -> list[str] | None:
|
||||||
|
"""Collect words from args. Returns None on error."""
|
||||||
if args.text:
|
if args.text:
|
||||||
words = [args.text]
|
return [args.text]
|
||||||
elif args.words:
|
if args.words:
|
||||||
words = args.words
|
return args.words
|
||||||
elif args.words_file:
|
if args.words_file:
|
||||||
try:
|
try:
|
||||||
content = read_file(args.words_file)
|
content = read_file(args.words_file)
|
||||||
words = [w.strip() for w in content.splitlines() if w.strip()]
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print(f"Error: File not found: {args.words_file}", file=sys.stderr)
|
sys.stderr.write(
|
||||||
return 1
|
f"Error: File not found: {args.words_file}\n",
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
return [
|
||||||
|
w.strip()
|
||||||
|
for w in content.splitlines()
|
||||||
|
if w.strip()
|
||||||
|
]
|
||||||
|
return []
|
||||||
|
|
||||||
if not words:
|
|
||||||
parser.print_help()
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# Translate
|
def _handle_translation(args: argparse.Namespace) -> int:
|
||||||
|
"""Handle the translation action."""
|
||||||
try:
|
try:
|
||||||
results = translate_words_batch(words, args.from_lang, args.to_lang)
|
results = translate_words_batch(
|
||||||
except ImportError as e:
|
args.words, args.from_lang, args.to_lang,
|
||||||
print(f"Error: {e}", file=sys.stderr)
|
)
|
||||||
|
except ImportError:
|
||||||
|
logger.exception("Translation import error")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
output = format_translations(results)
|
output = format_translations(results)
|
||||||
|
|
||||||
# Output
|
|
||||||
if args.output:
|
if args.output:
|
||||||
Path(args.output).write_text(output, encoding="utf-8")
|
Path(args.output).write_text(output, encoding="utf-8")
|
||||||
print(f"Translations written to {args.output}")
|
sys.stdout.write(
|
||||||
|
f"Translations written to {args.output}\n",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
print(output)
|
sys.stdout.write(output + "\n")
|
||||||
|
|
||||||
# Return error if any translation failed
|
|
||||||
if any(not r.success for r in results):
|
if any(not r.success for r in results):
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: Sequence[str] | None = None) -> int:
|
||||||
|
"""Main entry point for the translator.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
argv: Command line arguments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Exit code.
|
||||||
|
"""
|
||||||
|
parser = _build_parser()
|
||||||
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
|
if not _check_argos():
|
||||||
|
sys.stderr.write(
|
||||||
|
"Error: argostranslate is not installed.\n"
|
||||||
|
"Install it with: pip install argostranslate\n",
|
||||||
|
)
|
||||||
|
return 1
|
||||||
|
|
||||||
|
if args.list_languages:
|
||||||
|
return _handle_list_languages()
|
||||||
|
if args.list_available:
|
||||||
|
return _handle_list_available()
|
||||||
|
if args.download:
|
||||||
|
return _handle_download(args.download)
|
||||||
|
|
||||||
|
words = _collect_words(args)
|
||||||
|
if not words:
|
||||||
|
if words is not None:
|
||||||
|
parser.print_help()
|
||||||
|
return 1
|
||||||
|
|
||||||
|
args.words = words
|
||||||
|
return _handle_translation(args)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|||||||
@ -14,7 +14,9 @@ Usage:
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
from typing import TYPE_CHECKING, NamedTuple
|
from typing import TYPE_CHECKING, NamedTuple
|
||||||
|
|
||||||
@ -27,6 +29,9 @@ except ImportError:
|
|||||||
from analyzer import analyze_text, read_file
|
from analyzer import analyze_text, read_file
|
||||||
|
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class ExcerptAnalysis(NamedTuple):
|
class ExcerptAnalysis(NamedTuple):
|
||||||
"""Analysis result for an excerpt length."""
|
"""Analysis result for an excerpt length."""
|
||||||
|
|
||||||
@ -111,8 +116,6 @@ def find_optimal_excerpts(
|
|||||||
ranked_words = [word for word, _ in word_counts.most_common()]
|
ranked_words = [word for word, _ in word_counts.most_common()]
|
||||||
|
|
||||||
# Extract all words from text (preserving order)
|
# Extract all words from text (preserving order)
|
||||||
import re
|
|
||||||
|
|
||||||
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
||||||
if not case_sensitive:
|
if not case_sensitive:
|
||||||
all_words = [w.lower() for w in all_words]
|
all_words = [w.lower() for w in all_words]
|
||||||
@ -150,6 +153,9 @@ def find_optimal_excerpts(
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_EXCERPT_DISPLAY_LEN = 50
|
||||||
|
|
||||||
|
|
||||||
def format_results(
|
def format_results(
|
||||||
results: list[ExcerptAnalysis],
|
results: list[ExcerptAnalysis],
|
||||||
*,
|
*,
|
||||||
@ -198,7 +204,7 @@ def format_results(
|
|||||||
if show_excerpts:
|
if show_excerpts:
|
||||||
# Truncate long excerpts
|
# Truncate long excerpts
|
||||||
excerpt = r.best_excerpt
|
excerpt = r.best_excerpt
|
||||||
if len(excerpt) > 50:
|
if len(excerpt) > _MAX_EXCERPT_DISPLAY_LEN:
|
||||||
excerpt = excerpt[:47] + "..."
|
excerpt = excerpt[:47] + "..."
|
||||||
lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>5} {excerpt}")
|
lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>5} {excerpt}")
|
||||||
else:
|
else:
|
||||||
@ -285,10 +291,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if args.text:
|
text = args.text or read_file(args.file)
|
||||||
text = args.text
|
|
||||||
else:
|
|
||||||
text = read_file(args.file)
|
|
||||||
|
|
||||||
results = find_optimal_excerpts(
|
results = find_optimal_excerpts(
|
||||||
text,
|
text,
|
||||||
@ -304,15 +307,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
Path(args.output).write_text(output, encoding="utf-8")
|
Path(args.output).write_text(output, encoding="utf-8")
|
||||||
print(f"Output written to {args.output}")
|
logger.info("Output written to %s", args.output)
|
||||||
else:
|
else:
|
||||||
print(output)
|
logger.info("%s", output)
|
||||||
|
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError:
|
||||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
logger.exception("File not found")
|
||||||
return 1
|
return 1
|
||||||
except UnicodeDecodeError as e:
|
except UnicodeDecodeError:
|
||||||
print(f"Error: Could not decode file - {e}", file=sys.stderr)
|
logger.exception("Could not decode file")
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user