mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 14:23:04 +02:00
feat: automatic language detection translation and anki generator with cache
This commit is contained in:
parent
1411e685c2
commit
d2b6f00185
@ -158,9 +158,20 @@ static void assign_ranks(void) {
|
|||||||
/* Sort all_entries by frequency (this doesn't affect word_sequence) */
|
/* Sort all_entries by frequency (this doesn't affect word_sequence) */
|
||||||
qsort(all_entries, num_unique_words, sizeof(WordEntry *), compare_by_count);
|
qsort(all_entries, num_unique_words, sizeof(WordEntry *), compare_by_count);
|
||||||
|
|
||||||
/* Assign 1-indexed ranks */
|
/* Assign 1-indexed ranks using competition ranking:
|
||||||
|
* Words with same frequency get same rank.
|
||||||
|
* Next rank is current_position + 1 (skipping numbers).
|
||||||
|
* Example: counts 5,3,3,2 -> ranks 1,2,2,4 (not 1,2,3,4) */
|
||||||
for (int i = 0; i < num_unique_words; i++) {
|
for (int i = 0; i < num_unique_words; i++) {
|
||||||
all_entries[i]->rank = i + 1;
|
if (i == 0) {
|
||||||
|
all_entries[i]->rank = 1;
|
||||||
|
} else if (all_entries[i]->count == all_entries[i-1]->count) {
|
||||||
|
/* Same frequency as previous word - same rank */
|
||||||
|
all_entries[i]->rank = all_entries[i-1]->rank;
|
||||||
|
} else {
|
||||||
|
/* Different frequency - rank is position + 1 */
|
||||||
|
all_entries[i]->rank = i + 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -306,20 +317,42 @@ static void cleanup(void) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Dump all vocabulary with ranks (for Python integration) */
|
||||||
|
static void dump_vocabulary(int max_rank) {
|
||||||
|
printf("VOCAB_DUMP_START\n");
|
||||||
|
for (int i = 0; i < num_unique_words; i++) {
|
||||||
|
if (all_entries[i]->rank <= max_rank) {
|
||||||
|
printf("%s;%d\n", all_entries[i]->word, all_entries[i]->rank);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("VOCAB_DUMP_END\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
fprintf(stderr, "Usage: %s <file.txt> [max_length]\n", argv[0]);
|
fprintf(stderr, "Usage: %s <file.txt> [max_length] [--dump-vocab [max_rank]]\n", argv[0]);
|
||||||
fprintf(stderr, " max_length: maximum excerpt length to analyze (default: 30)\n");
|
fprintf(stderr, " max_length: maximum excerpt length to analyze (default: 30)\n");
|
||||||
|
fprintf(stderr, " --dump-vocab: output all words with ranks up to max_rank\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char *filename = argv[1];
|
const char *filename = argv[1];
|
||||||
int max_length = 30;
|
int max_length = 30;
|
||||||
|
bool dump_vocab = false;
|
||||||
|
int dump_max_rank = 0;
|
||||||
|
|
||||||
if (argc >= 3) {
|
/* Parse arguments */
|
||||||
max_length = atoi(argv[2]);
|
for (int i = 2; i < argc; i++) {
|
||||||
if (max_length < 1) max_length = 1;
|
if (strcmp(argv[i], "--dump-vocab") == 0) {
|
||||||
if (max_length > 1000) max_length = 1000;
|
dump_vocab = true;
|
||||||
|
if (i + 1 < argc && argv[i + 1][0] != '-') {
|
||||||
|
dump_max_rank = atoi(argv[++i]);
|
||||||
|
}
|
||||||
|
} else if (argv[i][0] != '-') {
|
||||||
|
max_length = atoi(argv[i]);
|
||||||
|
if (max_length < 1) max_length = 1;
|
||||||
|
if (max_length > 1000) max_length = 1000;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Initialize hash table */
|
/* Initialize hash table */
|
||||||
@ -351,6 +384,17 @@ int main(int argc, char *argv[]) {
|
|||||||
/* Print results */
|
/* Print results */
|
||||||
print_results(results, max_length);
|
print_results(results, max_length);
|
||||||
|
|
||||||
|
/* Dump vocabulary if requested */
|
||||||
|
if (dump_vocab) {
|
||||||
|
/* If no max_rank specified, use the max from the excerpt */
|
||||||
|
if (dump_max_rank == 0 && max_length > 0) {
|
||||||
|
dump_max_rank = results[max_length - 1].min_vocab_needed;
|
||||||
|
}
|
||||||
|
if (dump_max_rank > 0) {
|
||||||
|
dump_vocabulary(dump_max_rank);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* Cleanup */
|
/* Cleanup */
|
||||||
free(results);
|
free(results);
|
||||||
cleanup();
|
cleanup();
|
||||||
|
|||||||
Binary file not shown.
@ -40,10 +40,10 @@ try:
|
|||||||
detect_language,
|
detect_language,
|
||||||
translate_words_batch,
|
translate_words_batch,
|
||||||
)
|
)
|
||||||
from python_pkg.word_frequency.analyzer import read_file, analyze_text
|
from python_pkg.word_frequency.analyzer import read_file
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from translator import detect_language, translate_words_batch
|
from translator import detect_language, translate_words_batch
|
||||||
from analyzer import read_file, analyze_text
|
from analyzer import read_file
|
||||||
|
|
||||||
|
|
||||||
# Path to C vocabulary_curve executable
|
# Path to C vocabulary_curve executable
|
||||||
@ -59,12 +59,13 @@ class VocabWord(NamedTuple):
|
|||||||
context: str
|
context: str
|
||||||
|
|
||||||
|
|
||||||
def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
|
def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool = False) -> str:
|
||||||
"""Run the C vocabulary_curve executable.
|
"""Run the C vocabulary_curve executable.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
filepath: Path to the text file.
|
filepath: Path to the text file.
|
||||||
max_length: Maximum excerpt length.
|
max_length: Maximum excerpt length.
|
||||||
|
dump_vocab: If True, also dump all vocabulary up to max rank needed.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Output from the executable.
|
Output from the executable.
|
||||||
@ -79,8 +80,12 @@ def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
|
|||||||
"Please compile it first: cd C/vocabulary_curve && make"
|
"Please compile it first: cd C/vocabulary_curve && make"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
cmd = [str(C_EXECUTABLE), str(filepath), str(max_length)]
|
||||||
|
if dump_vocab:
|
||||||
|
cmd.append("--dump-vocab")
|
||||||
|
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
[str(C_EXECUTABLE), str(filepath), str(max_length)],
|
cmd,
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=120,
|
timeout=120,
|
||||||
@ -89,7 +94,7 @@ def run_vocabulary_curve(filepath: Path, max_length: int) -> str:
|
|||||||
return result.stdout
|
return result.stdout
|
||||||
|
|
||||||
|
|
||||||
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]]]:
|
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
||||||
"""Parse output from vocabulary_curve to get words needed.
|
"""Parse output from vocabulary_curve to get words needed.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -97,11 +102,14 @@ def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str,
|
|||||||
target_length: The target excerpt length.
|
target_length: The target excerpt length.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (excerpt_text, list of (word, rank) tuples).
|
Tuple of (excerpt_text, excerpt_words, all_vocab_words).
|
||||||
|
excerpt_words: words in the excerpt with their ranks.
|
||||||
|
all_vocab_words: all words up to max rank (from VOCAB_DUMP if present).
|
||||||
"""
|
"""
|
||||||
lines = output.split("\n")
|
lines = output.split("\n")
|
||||||
excerpt = ""
|
excerpt = ""
|
||||||
words: list[tuple[str, int]] = []
|
excerpt_words: list[tuple[str, int]] = []
|
||||||
|
all_vocab: list[tuple[str, int]] = []
|
||||||
|
|
||||||
# Find the line for the target length
|
# Find the line for the target length
|
||||||
i = 0
|
i = 0
|
||||||
@ -131,26 +139,28 @@ def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str,
|
|||||||
# Parse "word(#rank), word2(#rank2), ..."
|
# Parse "word(#rank), word2(#rank2), ..."
|
||||||
pattern = r"(\S+)\(#(\d+)\)"
|
pattern = r"(\S+)\(#(\d+)\)"
|
||||||
matches = re.findall(pattern, words_part)
|
matches = re.findall(pattern, words_part)
|
||||||
words = [(w, int(r)) for w, r in matches]
|
excerpt_words = [(w, int(r)) for w, r in matches]
|
||||||
break
|
break
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
return excerpt, words
|
# Parse VOCAB_DUMP section if present
|
||||||
|
in_vocab_dump = False
|
||||||
|
for line in lines:
|
||||||
|
if line.strip() == "VOCAB_DUMP_START":
|
||||||
|
in_vocab_dump = True
|
||||||
|
continue
|
||||||
|
if line.strip() == "VOCAB_DUMP_END":
|
||||||
|
break
|
||||||
|
if in_vocab_dump and ";" in line:
|
||||||
|
parts = line.strip().split(";")
|
||||||
|
if len(parts) == 2:
|
||||||
|
word, rank_str = parts
|
||||||
|
try:
|
||||||
|
all_vocab.append((word, int(rank_str)))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return excerpt, excerpt_words, all_vocab
|
||||||
def get_top_n_words(text: str, n: int) -> list[tuple[str, int]]:
|
|
||||||
"""Get the top N most frequent words from text.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text: The source text.
|
|
||||||
n: Number of top words to return.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of (word, rank) tuples, ranked 1 to n.
|
|
||||||
"""
|
|
||||||
word_counts = analyze_text(text)
|
|
||||||
sorted_words = sorted(word_counts.items(), key=lambda x: (-x[1], x[0]))
|
|
||||||
return [(word, rank + 1) for rank, (word, _) in enumerate(sorted_words[:n])]
|
|
||||||
|
|
||||||
|
|
||||||
def find_word_contexts(
|
def find_word_contexts(
|
||||||
@ -196,6 +206,8 @@ def generate_anki_deck(
|
|||||||
deck_name: str = "Vocabulary",
|
deck_name: str = "Vocabulary",
|
||||||
include_context: bool = False,
|
include_context: bool = False,
|
||||||
no_translate: bool = False,
|
no_translate: bool = False,
|
||||||
|
excerpt: str = "",
|
||||||
|
excerpt_words: list[tuple[str, int]] | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Generate Anki-compatible deck content.
|
"""Generate Anki-compatible deck content.
|
||||||
|
|
||||||
@ -207,6 +219,8 @@ def generate_anki_deck(
|
|||||||
deck_name: Name for the deck.
|
deck_name: Name for the deck.
|
||||||
include_context: Whether to include context in cards.
|
include_context: Whether to include context in cards.
|
||||||
no_translate: If True, skip translation (use placeholder).
|
no_translate: If True, skip translation (use placeholder).
|
||||||
|
excerpt: The target excerpt text to include in cards.
|
||||||
|
excerpt_words: List of (word, rank) tuples for words in the excerpt.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Semicolon-separated content ready for Anki import.
|
Semicolon-separated content ready for Anki import.
|
||||||
@ -224,6 +238,27 @@ def generate_anki_deck(
|
|||||||
lines.append("#columns:Front;Back;Rank")
|
lines.append("#columns:Front;Back;Rank")
|
||||||
lines.append("") # Empty line before data
|
lines.append("") # Empty line before data
|
||||||
|
|
||||||
|
# Add excerpt as first card (goal/context card)
|
||||||
|
if excerpt:
|
||||||
|
excerpt_escaped = excerpt.replace(";", ",")
|
||||||
|
# Use excerpt_words from C output (has correct ranks)
|
||||||
|
if excerpt_words:
|
||||||
|
# Most frequent = lowest rank (italics), rarest = highest rank (bold)
|
||||||
|
most_frequent = min(excerpt_words, key=lambda x: x[1])[0]
|
||||||
|
rarest = max(excerpt_words, key=lambda x: x[1])[0]
|
||||||
|
# Apply formatting - rarest first (bold), then most frequent (italics)
|
||||||
|
# to avoid nested tag issues if they're the same word
|
||||||
|
if most_frequent != rarest:
|
||||||
|
pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
|
||||||
|
excerpt_escaped = pattern_rare.sub(r"<b>\1</b>", excerpt_escaped)
|
||||||
|
pattern_freq = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||||
|
excerpt_escaped = pattern_freq.sub(r"<i>\1</i>", excerpt_escaped)
|
||||||
|
else:
|
||||||
|
# Same word is both most and least frequent - use bold+italic
|
||||||
|
pattern = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||||
|
excerpt_escaped = pattern.sub(r"<b><i>\1</i></b>", excerpt_escaped)
|
||||||
|
lines.append(f"📖 TARGET EXCERPT;{excerpt_escaped};#0")
|
||||||
|
|
||||||
# Get translations (or skip if no_translate)
|
# Get translations (or skip if no_translate)
|
||||||
words = [w for w, _ in words_with_ranks]
|
words = [w for w, _ in words_with_ranks]
|
||||||
if no_translate:
|
if no_translate:
|
||||||
@ -263,6 +298,120 @@ def generate_anki_deck(
|
|||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_excerpt(
    filepath: Path, length: int, *, force: bool = False
) -> tuple[str, list[tuple[str, int]]] | None:
    """Look up a previously cached excerpt for a file and excerpt length.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        force: If True, skip the cache lookup entirely.

    Returns:
        The value the vocabulary-curve cache holds for ``(filepath, length)``
        (expected to be an ``(excerpt, words)`` tuple, or None on a miss).
        None is also returned when ``force`` is set or when no cache module
        can be imported.
    """
    if force:
        return None

    try:
        from python_pkg.word_frequency.cache import get_vocab_curve_cache
    except ImportError:
        # Same fallback the rest of this file uses for its sibling modules,
        # so caching keeps working when the file is run outside the package.
        try:
            from cache import get_vocab_curve_cache
        except ImportError:
            # Caching is optional: without a cache module this is a miss.
            return None

    return get_vocab_curve_cache().get(filepath, length)
|
||||||
|
|
||||||
|
|
||||||
|
def cache_excerpt(
    filepath: Path, length: int, excerpt: str, words: list[tuple[str, int]]
) -> None:
    """Store an excerpt in the vocabulary-curve cache (best effort).

    Does nothing when no cache module can be imported.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        excerpt: The excerpt text.
        words: List of (word, rank) tuples.
    """
    try:
        from python_pkg.word_frequency.cache import get_vocab_curve_cache
    except ImportError:
        # Same fallback the rest of this file uses for its sibling modules,
        # so caching keeps working when the file is run outside the package.
        try:
            from cache import get_vocab_curve_cache
        except ImportError:
            # Caching is optional: nothing to store into.
            return

    get_vocab_curve_cache().set(filepath, length, excerpt, words)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_deck(
    filepath: Path,
    length: int,
    target_lang: str,
    include_context: bool,
    all_vocab: bool,
    *,
    force: bool = False,
) -> tuple[str, str, int, int] | None:
    """Look up a previously generated Anki deck in the deck cache.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        target_lang: Target language.
        include_context: Whether context is included.
        all_vocab: Whether all vocab is included.
        force: If True, skip the cache lookup entirely.

    Returns:
        The value the deck cache holds for these parameters (expected to be
        a ``(content, excerpt, num_words, max_rank)`` tuple, or None on a
        miss). None is also returned when ``force`` is set or when no cache
        module can be imported.
    """
    if force:
        return None

    try:
        from python_pkg.word_frequency.cache import get_anki_deck_cache
    except ImportError:
        # Same fallback the rest of this file uses for its sibling modules,
        # so caching keeps working when the file is run outside the package.
        try:
            from cache import get_anki_deck_cache
        except ImportError:
            # Caching is optional: without a cache module this is a miss.
            return None

    return get_anki_deck_cache().get(
        filepath, length, target_lang, include_context, all_vocab
    )
|
||||||
|
|
||||||
|
|
||||||
|
def cache_deck(
    filepath: Path,
    length: int,
    target_lang: str,
    include_context: bool,
    all_vocab: bool,
    anki_content: str,
    excerpt: str,
    num_words: int,
    max_rank: int,
) -> None:
    """Store a generated Anki deck in the deck cache (best effort).

    Does nothing when no cache module can be imported.

    Args:
        filepath: Path to source file.
        length: Excerpt length.
        target_lang: Target language.
        include_context: Whether context is included.
        all_vocab: Whether all vocab is included.
        anki_content: The deck content.
        excerpt: The excerpt text.
        num_words: Number of words.
        max_rank: Maximum rank.
    """
    try:
        from python_pkg.word_frequency.cache import get_anki_deck_cache
    except ImportError:
        # Same fallback the rest of this file uses for its sibling modules,
        # so caching keeps working when the file is run outside the package.
        try:
            from cache import get_anki_deck_cache
        except ImportError:
            # Caching is optional: nothing to store into.
            return

    get_anki_deck_cache().set(
        filepath,
        length,
        target_lang,
        include_context,
        all_vocab,
        anki_content,
        excerpt,
        num_words,
        max_rank,
    )
|
||||||
|
|
||||||
|
|
||||||
def generate_flashcards(
|
def generate_flashcards(
|
||||||
filepath: str | Path,
|
filepath: str | Path,
|
||||||
excerpt_length: int,
|
excerpt_length: int,
|
||||||
@ -272,6 +421,8 @@ def generate_flashcards(
|
|||||||
deck_name: str | None = None,
|
deck_name: str | None = None,
|
||||||
all_vocab: bool = True,
|
all_vocab: bool = True,
|
||||||
no_translate: bool = False,
|
no_translate: bool = False,
|
||||||
|
*,
|
||||||
|
force: bool = False,
|
||||||
) -> tuple[str, str, int, int]:
|
) -> tuple[str, str, int, int]:
|
||||||
"""Generate Anki flashcards for vocabulary needed for an excerpt length.
|
"""Generate Anki flashcards for vocabulary needed for an excerpt length.
|
||||||
|
|
||||||
@ -285,26 +436,39 @@ def generate_flashcards(
|
|||||||
all_vocab: If True, include ALL words from rank 1 to max rank needed.
|
all_vocab: If True, include ALL words from rank 1 to max rank needed.
|
||||||
If False, only include words that appear in the excerpt.
|
If False, only include words that appear in the excerpt.
|
||||||
no_translate: If True, skip translation.
|
no_translate: If True, skip translation.
|
||||||
|
force: If True, ignore all caches and regenerate.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (anki_content, excerpt, num_words, max_rank).
|
Tuple of (anki_content, excerpt, num_words, max_rank).
|
||||||
"""
|
"""
|
||||||
filepath = Path(filepath)
|
filepath = Path(filepath)
|
||||||
|
|
||||||
# Read the text
|
# Check for cached full deck (if not using no_translate)
|
||||||
text = read_file(filepath)
|
if not no_translate and not force:
|
||||||
|
cached = get_cached_deck(
|
||||||
|
filepath, excerpt_length, target_lang, include_context, all_vocab
|
||||||
|
)
|
||||||
|
if cached is not None:
|
||||||
|
return cached
|
||||||
|
|
||||||
|
# Read the text (only needed for context finding)
|
||||||
|
text = read_file(filepath) if include_context else ""
|
||||||
|
|
||||||
# Auto-detect language if not provided
|
# Auto-detect language if not provided
|
||||||
if source_lang is None:
|
if source_lang is None:
|
||||||
source_lang = detect_language(text)
|
sample_text = read_file(filepath)[:1000] if not text else text[:1000]
|
||||||
|
source_lang = detect_language(sample_text)
|
||||||
if source_lang is None:
|
if source_lang is None:
|
||||||
source_lang = "auto"
|
raise ValueError(
|
||||||
|
"Could not auto-detect source language. "
|
||||||
|
"Please specify with --from (e.g., --from pl for Polish). "
|
||||||
|
"Install langdetect for auto-detection: pip install langdetect"
|
||||||
|
)
|
||||||
|
|
||||||
# Run vocabulary curve analysis
|
# Run vocabulary curve analysis with vocab dump for all words
|
||||||
output = run_vocabulary_curve(filepath, excerpt_length)
|
output = run_vocabulary_curve(filepath, excerpt_length, dump_vocab=all_vocab)
|
||||||
|
# Parse the output (now includes all vocabulary from C)
|
||||||
# Parse the output
|
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(output, excerpt_length)
|
||||||
excerpt, excerpt_words = parse_vocabulary_curve_output(output, excerpt_length)
|
|
||||||
|
|
||||||
if not excerpt_words:
|
if not excerpt_words:
|
||||||
raise ValueError(f"No words found for excerpt length {excerpt_length}")
|
raise ValueError(f"No words found for excerpt length {excerpt_length}")
|
||||||
@ -312,15 +476,17 @@ def generate_flashcards(
|
|||||||
# Find max rank needed
|
# Find max rank needed
|
||||||
max_rank = max(rank for _, rank in excerpt_words)
|
max_rank = max(rank for _, rank in excerpt_words)
|
||||||
|
|
||||||
# Get ALL words up to max_rank if requested
|
# Use vocabulary from C output
|
||||||
if all_vocab:
|
if all_vocab and all_vocab_words:
|
||||||
words_with_ranks = get_top_n_words(text, max_rank)
|
words_with_ranks = all_vocab_words
|
||||||
else:
|
else:
|
||||||
words_with_ranks = excerpt_words
|
words_with_ranks = excerpt_words
|
||||||
|
|
||||||
# Get contexts if requested
|
# Get contexts if requested
|
||||||
contexts = None
|
contexts = None
|
||||||
if include_context:
|
if include_context:
|
||||||
|
if not text:
|
||||||
|
text = read_file(filepath)
|
||||||
words = [w for w, _ in words_with_ranks]
|
words = [w for w, _ in words_with_ranks]
|
||||||
contexts = find_word_contexts(text, words)
|
contexts = find_word_contexts(text, words)
|
||||||
|
|
||||||
@ -337,8 +503,24 @@ def generate_flashcards(
|
|||||||
deck_name,
|
deck_name,
|
||||||
include_context,
|
include_context,
|
||||||
no_translate,
|
no_translate,
|
||||||
|
excerpt,
|
||||||
|
excerpt_words,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Cache the full deck (if translated)
|
||||||
|
if not no_translate:
|
||||||
|
cache_deck(
|
||||||
|
filepath,
|
||||||
|
excerpt_length,
|
||||||
|
target_lang,
|
||||||
|
include_context,
|
||||||
|
all_vocab,
|
||||||
|
anki_content,
|
||||||
|
excerpt,
|
||||||
|
len(words_with_ranks),
|
||||||
|
max_rank,
|
||||||
|
)
|
||||||
|
|
||||||
return anki_content, excerpt, len(words_with_ranks), max_rank
|
return anki_content, excerpt, len(words_with_ranks), max_rank
|
||||||
|
|
||||||
|
|
||||||
@ -361,19 +543,18 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
"--file",
|
"--file",
|
||||||
"-f",
|
"-f",
|
||||||
type=str,
|
type=str,
|
||||||
required=True,
|
default=None,
|
||||||
help="Path to the text file to analyze",
|
help="Path to the text file to analyze",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--length",
|
"--length",
|
||||||
"-l",
|
"-l",
|
||||||
type=int,
|
type=int,
|
||||||
required=True,
|
default=None,
|
||||||
help="Target excerpt length (how many words you want to understand)",
|
help="Target excerpt length (how many words you want to understand)",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--from",
|
"--from",
|
||||||
"-F",
|
|
||||||
dest="source_lang",
|
dest="source_lang",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
@ -425,9 +606,72 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
action="store_true",
|
action="store_true",
|
||||||
help="Skip translation (output words without translations)",
|
help="Skip translation (output words without translations)",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--force",
|
||||||
|
"-F",
|
||||||
|
action="store_true",
|
||||||
|
help="Force regeneration, ignoring all caches",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--cache-stats",
|
||||||
|
action="store_true",
|
||||||
|
help="Show cache statistics and exit",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--clear-cache",
|
||||||
|
action="store_true",
|
||||||
|
help="Clear all caches and exit",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args(argv)
|
args = parser.parse_args(argv)
|
||||||
|
|
||||||
|
# Handle cache management commands
|
||||||
|
if args.cache_stats:
|
||||||
|
try:
|
||||||
|
from python_pkg.word_frequency.cache import get_all_cache_stats
|
||||||
|
except ImportError:
|
||||||
|
try:
|
||||||
|
from cache import get_all_cache_stats
|
||||||
|
except ImportError:
|
||||||
|
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||||
|
return 1
|
||||||
|
stats = get_all_cache_stats()
|
||||||
|
print("Cache Statistics") # noqa: T201
|
||||||
|
print("=" * 50) # noqa: T201
|
||||||
|
for cache_name, cache_stats in stats.items():
|
||||||
|
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||||
|
for key, value in cache_stats.items():
|
||||||
|
if key == "cache_size_bytes":
|
||||||
|
if value < 1024:
|
||||||
|
size_str = f"{value} B"
|
||||||
|
elif value < 1024 * 1024:
|
||||||
|
size_str = f"{value / 1024:.1f} KB"
|
||||||
|
else:
|
||||||
|
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||||
|
print(f" {key}: {size_str}") # noqa: T201
|
||||||
|
else:
|
||||||
|
print(f" {key}: {value}") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if args.clear_cache:
|
||||||
|
try:
|
||||||
|
from python_pkg.word_frequency.cache import clear_all_caches
|
||||||
|
except ImportError:
|
||||||
|
try:
|
||||||
|
from cache import clear_all_caches
|
||||||
|
except ImportError:
|
||||||
|
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||||
|
return 1
|
||||||
|
clear_all_caches()
|
||||||
|
print("All caches cleared.") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Validate required arguments for main functionality
|
||||||
|
if args.file is None:
|
||||||
|
parser.error("--file/-f is required")
|
||||||
|
if args.length is None:
|
||||||
|
parser.error("--length/-l is required")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
filepath = Path(args.file)
|
filepath = Path(args.file)
|
||||||
if not filepath.exists():
|
if not filepath.exists():
|
||||||
@ -448,6 +692,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
deck_name=args.deck_name,
|
deck_name=args.deck_name,
|
||||||
all_vocab=not args.excerpt_words_only,
|
all_vocab=not args.excerpt_words_only,
|
||||||
no_translate=args.no_translate,
|
no_translate=args.no_translate,
|
||||||
|
force=args.force,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Determine output path
|
# Determine output path
|
||||||
|
|||||||
641
python_pkg/word_frequency/cache.py
Normal file
641
python_pkg/word_frequency/cache.py
Normal file
@ -0,0 +1,641 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Caching utilities for word frequency analysis.
|
||||||
|
|
||||||
|
Provides disk-based caching for:
|
||||||
|
- Translations (word -> translation mappings)
|
||||||
|
- Vocabulary curve excerpts (file + length -> excerpt + words)
|
||||||
|
- Generated Anki decks
|
||||||
|
|
||||||
|
Cache location: ~/.cache/word_frequency/
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Default cache directory
|
||||||
|
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
||||||
|
|
||||||
|
|
||||||
|
def get_cache_dir() -> Path:
    """Resolve the cache directory and make sure it exists on disk.

    The ``WORD_FREQ_CACHE_DIR`` environment variable, when set, takes
    precedence over ``DEFAULT_CACHE_DIR``. Missing parent directories are
    created as well.

    Returns:
        Path to cache directory.
    """
    fallback = str(DEFAULT_CACHE_DIR)
    location = os.environ.get("WORD_FREQ_CACHE_DIR", fallback)
    resolved = Path(location)
    resolved.mkdir(parents=True, exist_ok=True)
    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
def get_file_hash(filepath: Path) -> str:
    """Return the SHA256 hex digest of the bytes stored in a file.

    The file is consumed in 64 KiB pieces so large files are never held in
    memory all at once.

    Args:
        filepath: Path to file.

    Returns:
        Hex digest of file hash.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def get_text_hash(text: str) -> str:
    """Return the SHA256 hex digest of a string's UTF-8 encoding.

    Args:
        text: Text to hash.

    Returns:
        Hex digest of text hash.
    """
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Translation Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class TranslationCache:
|
||||||
|
"""Cache for word translations."""
|
||||||
|
|
||||||
|
def __init__(self, cache_dir: Path | None = None) -> None:
|
||||||
|
"""Initialize translation cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cache_dir: Optional custom cache directory.
|
||||||
|
"""
|
||||||
|
self.cache_dir = cache_dir or get_cache_dir()
|
||||||
|
self.cache_file = self.cache_dir / "translations.json"
|
||||||
|
self._cache: dict[str, str] | None = None
|
||||||
|
self._dirty = False # Track if cache needs saving
|
||||||
|
|
||||||
|
def _load_cache(self) -> dict[str, str]:
|
||||||
|
"""Load cache from disk."""
|
||||||
|
if self._cache is None:
|
||||||
|
if self.cache_file.exists():
|
||||||
|
try:
|
||||||
|
self._cache = json.loads(self.cache_file.read_text(encoding="utf-8"))
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
self._cache = {}
|
||||||
|
else:
|
||||||
|
self._cache = {}
|
||||||
|
return self._cache
|
||||||
|
|
||||||
|
def _save_cache(self) -> None:
|
||||||
|
"""Save cache to disk if dirty."""
|
||||||
|
if self._cache is not None and self._dirty:
|
||||||
|
self.cache_file.write_text(
|
||||||
|
json.dumps(self._cache, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
self._dirty = False
|
||||||
|
|
||||||
|
def flush(self) -> None:
|
||||||
|
"""Force save cache to disk."""
|
||||||
|
self._save_cache()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_key(word: str, source_lang: str, target_lang: str) -> str:
|
||||||
|
"""Create cache key for a translation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word: Word to translate.
|
||||||
|
source_lang: Source language code.
|
||||||
|
target_lang: Target language code.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cache key string.
|
||||||
|
"""
|
||||||
|
return f"{source_lang}:{target_lang}:{word.lower()}"
|
||||||
|
|
||||||
|
def get(
|
||||||
|
self, word: str, source_lang: str, target_lang: str
|
||||||
|
) -> str | None:
|
||||||
|
"""Get cached translation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word: Word to look up.
|
||||||
|
source_lang: Source language code.
|
||||||
|
target_lang: Target language code.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cached translation or None if not found.
|
||||||
|
"""
|
||||||
|
cache = self._load_cache()
|
||||||
|
key = self._make_key(word, source_lang, target_lang)
|
||||||
|
return cache.get(key)
|
||||||
|
|
||||||
|
def set(
|
||||||
|
self, word: str, source_lang: str, target_lang: str, translation: str,
|
||||||
|
*, auto_save: bool = False,
|
||||||
|
) -> None:
|
||||||
|
"""Store translation in cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word: Original word.
|
||||||
|
source_lang: Source language code.
|
||||||
|
target_lang: Target language code.
|
||||||
|
translation: Translated word.
|
||||||
|
auto_save: If True, save to disk immediately.
|
||||||
|
"""
|
||||||
|
cache = self._load_cache()
|
||||||
|
key = self._make_key(word, source_lang, target_lang)
|
||||||
|
cache[key] = translation
|
||||||
|
self._dirty = True
|
||||||
|
if auto_save:
|
||||||
|
self._save_cache()
|
||||||
|
|
||||||
|
def get_many(
|
||||||
|
self, words: list[str], source_lang: str, target_lang: str
|
||||||
|
) -> dict[str, str]:
|
||||||
|
"""Get multiple cached translations.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
words: Words to look up.
|
||||||
|
source_lang: Source language code.
|
||||||
|
target_lang: Target language code.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping words to their cached translations.
|
||||||
|
"""
|
||||||
|
cache = self._load_cache()
|
||||||
|
result: dict[str, str] = {}
|
||||||
|
for word in words:
|
||||||
|
key = self._make_key(word, source_lang, target_lang)
|
||||||
|
if key in cache:
|
||||||
|
result[word.lower()] = cache[key]
|
||||||
|
return result
|
||||||
|
|
||||||
|
def set_many(
    self,
    translations: dict[str, str],
    source_lang: str,
    target_lang: str,
) -> None:
    """Record a batch of translations and write the cache to disk once.

    Args:
        translations: Dict from source word to its translation.
        source_lang: Language code the words are written in.
        target_lang: Language code of the translations.
    """
    cache = self._load_cache()
    # Feeding update() a generator stores entries one by one, in input order.
    cache.update(
        (self._make_key(word, source_lang, target_lang), translated)
        for word, translated in translations.items()
    )
    self._dirty = True
    # A single save for the whole batch instead of one per word.
    self._save_cache()
|
||||||
|
|
||||||
|
def clear(self) -> None:
    """Clear all cached translations.

    Resets the in-memory cache to an empty dict, marks it as unmodified
    and removes the cache file from disk if there is one.
    """
    self._cache = {}
    self._dirty = False
    # missing_ok replaces the exists()/unlink() pair: a file that vanishes
    # between the check and the delete no longer raises FileNotFoundError.
    self.cache_file.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
def stats(self) -> dict[str, Any]:
    """Summarize the translation cache.

    Returns:
        Dict with the number of cached entries ("total_entries"), the
        cache file path as a string ("cache_file") and the size of that
        file in bytes ("cache_size_bytes", 0 when the file is absent).
    """
    entries = self._load_cache()
    path = self.cache_file
    size_on_disk = path.stat().st_size if path.exists() else 0
    return dict(
        total_entries=len(entries),
        cache_file=str(path),
        cache_size_bytes=size_on_disk,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Vocabulary Curve Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class VocabCurveCache:
|
||||||
|
"""Cache for vocabulary curve analysis results."""
|
||||||
|
|
||||||
|
def __init__(self, cache_dir: Path | None = None) -> None:
|
||||||
|
"""Initialize vocabulary curve cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cache_dir: Optional custom cache directory.
|
||||||
|
"""
|
||||||
|
self.cache_dir = (cache_dir or get_cache_dir()) / "excerpts"
|
||||||
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def _get_cache_path(self, file_hash: str, length: int) -> Path:
|
||||||
|
"""Get path to cache file for given hash and length.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_hash: Hash of source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Path to cache file.
|
||||||
|
"""
|
||||||
|
return self.cache_dir / f"{file_hash[:16]}_{length}.json"
|
||||||
|
|
||||||
|
def get(
|
||||||
|
self, filepath: Path, length: int
|
||||||
|
) -> tuple[str, list[tuple[str, int]]] | None:
|
||||||
|
"""Get cached excerpt and words for a file and length.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (excerpt, words_with_ranks) or None if not cached.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
cache_path = self._get_cache_path(file_hash, length)
|
||||||
|
|
||||||
|
if not cache_path.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
data = json.loads(cache_path.read_text(encoding="utf-8"))
|
||||||
|
# Verify hash matches
|
||||||
|
if data.get("file_hash") != file_hash:
|
||||||
|
return None
|
||||||
|
excerpt = data["excerpt"]
|
||||||
|
words = [(w, r) for w, r in data["words"]]
|
||||||
|
return excerpt, words
|
||||||
|
except (json.JSONDecodeError, KeyError, OSError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
def set(
|
||||||
|
self,
|
||||||
|
filepath: Path,
|
||||||
|
length: int,
|
||||||
|
excerpt: str,
|
||||||
|
words: list[tuple[str, int]],
|
||||||
|
) -> None:
|
||||||
|
"""Store excerpt and words in cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
excerpt: The excerpt text.
|
||||||
|
words: List of (word, rank) tuples.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
cache_path = self._get_cache_path(file_hash, length)
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"file_hash": file_hash,
|
||||||
|
"filepath": str(filepath),
|
||||||
|
"length": length,
|
||||||
|
"excerpt": excerpt,
|
||||||
|
"words": [[w, r] for w, r in words],
|
||||||
|
}
|
||||||
|
|
||||||
|
cache_path.write_text(
|
||||||
|
json.dumps(data, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
def clear(self) -> None:
|
||||||
|
"""Clear all cached excerpts."""
|
||||||
|
for cache_file in self.cache_dir.glob("*.json"):
|
||||||
|
cache_file.unlink()
|
||||||
|
|
||||||
|
def stats(self) -> dict[str, Any]:
|
||||||
|
"""Get cache statistics.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with cache stats.
|
||||||
|
"""
|
||||||
|
cache_files = list(self.cache_dir.glob("*.json"))
|
||||||
|
total_size = sum(f.stat().st_size for f in cache_files)
|
||||||
|
return {
|
||||||
|
"total_entries": len(cache_files),
|
||||||
|
"cache_dir": str(self.cache_dir),
|
||||||
|
"cache_size_bytes": total_size,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Anki Deck Cache
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class AnkiDeckCache:
|
||||||
|
"""Cache for generated Anki decks."""
|
||||||
|
|
||||||
|
def __init__(self, cache_dir: Path | None = None) -> None:
|
||||||
|
"""Initialize Anki deck cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cache_dir: Optional custom cache directory.
|
||||||
|
"""
|
||||||
|
self.cache_dir = (cache_dir or get_cache_dir()) / "anki_decks"
|
||||||
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.metadata_file = self.cache_dir / "metadata.json"
|
||||||
|
self._metadata: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
def _load_metadata(self) -> dict[str, Any]:
|
||||||
|
"""Load metadata from disk."""
|
||||||
|
if self._metadata is None:
|
||||||
|
if self.metadata_file.exists():
|
||||||
|
try:
|
||||||
|
self._metadata = json.loads(
|
||||||
|
self.metadata_file.read_text(encoding="utf-8")
|
||||||
|
)
|
||||||
|
except (json.JSONDecodeError, OSError):
|
||||||
|
self._metadata = {}
|
||||||
|
else:
|
||||||
|
self._metadata = {}
|
||||||
|
return self._metadata
|
||||||
|
|
||||||
|
def _save_metadata(self) -> None:
|
||||||
|
"""Save metadata to disk."""
|
||||||
|
if self._metadata is not None:
|
||||||
|
self.metadata_file.write_text(
|
||||||
|
json.dumps(self._metadata, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_key(
|
||||||
|
file_hash: str,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
) -> str:
|
||||||
|
"""Create cache key for an Anki deck.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_hash: Hash of source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Cache key string.
|
||||||
|
"""
|
||||||
|
flags = f"ctx{int(include_context)}_all{int(all_vocab)}"
|
||||||
|
return f"{file_hash[:16]}_{length}_{target_lang}_{flags}"
|
||||||
|
|
||||||
|
def get(
|
||||||
|
self,
|
||||||
|
filepath: Path,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
) -> tuple[str, str, int, int] | None:
|
||||||
|
"""Get cached Anki deck.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (anki_content, excerpt, num_words, max_rank) or None.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
||||||
|
metadata = self._load_metadata()
|
||||||
|
|
||||||
|
if key not in metadata:
|
||||||
|
return None
|
||||||
|
|
||||||
|
entry = metadata[key]
|
||||||
|
if entry.get("file_hash") != file_hash:
|
||||||
|
return None
|
||||||
|
|
||||||
|
deck_file = self.cache_dir / f"{key}.txt"
|
||||||
|
if not deck_file.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
content = deck_file.read_text(encoding="utf-8")
|
||||||
|
return (
|
||||||
|
content,
|
||||||
|
entry["excerpt"],
|
||||||
|
entry["num_words"],
|
||||||
|
entry["max_rank"],
|
||||||
|
)
|
||||||
|
except OSError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def set(
|
||||||
|
self,
|
||||||
|
filepath: Path,
|
||||||
|
length: int,
|
||||||
|
target_lang: str,
|
||||||
|
include_context: bool,
|
||||||
|
all_vocab: bool,
|
||||||
|
anki_content: str,
|
||||||
|
excerpt: str,
|
||||||
|
num_words: int,
|
||||||
|
max_rank: int,
|
||||||
|
) -> None:
|
||||||
|
"""Store Anki deck in cache.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filepath: Path to source file.
|
||||||
|
length: Excerpt length.
|
||||||
|
target_lang: Target language.
|
||||||
|
include_context: Whether context is included.
|
||||||
|
all_vocab: Whether all vocab is included.
|
||||||
|
anki_content: The Anki deck content.
|
||||||
|
excerpt: The excerpt text.
|
||||||
|
num_words: Number of words in deck.
|
||||||
|
max_rank: Maximum word rank.
|
||||||
|
"""
|
||||||
|
file_hash = get_file_hash(filepath)
|
||||||
|
key = self._make_key(file_hash, length, target_lang, include_context, all_vocab)
|
||||||
|
|
||||||
|
# Save deck content
|
||||||
|
deck_file = self.cache_dir / f"{key}.txt"
|
||||||
|
deck_file.write_text(anki_content, encoding="utf-8")
|
||||||
|
|
||||||
|
# Update metadata
|
||||||
|
metadata = self._load_metadata()
|
||||||
|
metadata[key] = {
|
||||||
|
"file_hash": file_hash,
|
||||||
|
"filepath": str(filepath),
|
||||||
|
"length": length,
|
||||||
|
"target_lang": target_lang,
|
||||||
|
"include_context": include_context,
|
||||||
|
"all_vocab": all_vocab,
|
||||||
|
"excerpt": excerpt,
|
||||||
|
"num_words": num_words,
|
||||||
|
"max_rank": max_rank,
|
||||||
|
}
|
||||||
|
self._save_metadata()
|
||||||
|
|
||||||
|
def clear(self) -> None:
|
||||||
|
"""Clear all cached decks."""
|
||||||
|
self._metadata = {}
|
||||||
|
for cache_file in self.cache_dir.glob("*.txt"):
|
||||||
|
cache_file.unlink()
|
||||||
|
if self.metadata_file.exists():
|
||||||
|
self.metadata_file.unlink()
|
||||||
|
|
||||||
|
def stats(self) -> dict[str, Any]:
|
||||||
|
"""Get cache statistics.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict with cache stats.
|
||||||
|
"""
|
||||||
|
metadata = self._load_metadata()
|
||||||
|
cache_files = list(self.cache_dir.glob("*.txt"))
|
||||||
|
total_size = sum(f.stat().st_size for f in cache_files)
|
||||||
|
return {
|
||||||
|
"total_entries": len(metadata),
|
||||||
|
"cache_dir": str(self.cache_dir),
|
||||||
|
"cache_size_bytes": total_size,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Global Cache Instances
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
# Singleton instances
# Each slot starts as None, meaning "no instance created yet".
_translation_cache: TranslationCache | None = None
_vocab_curve_cache: VocabCurveCache | None = None
_anki_deck_cache: AnkiDeckCache | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_translation_cache() -> TranslationCache:
    """Return the shared TranslationCache, creating it on first use."""
    global _translation_cache  # noqa: PLW0603
    shared = _translation_cache
    if shared is None:
        shared = _translation_cache = TranslationCache()
    return shared
|
||||||
|
|
||||||
|
|
||||||
|
def get_vocab_curve_cache() -> VocabCurveCache:
    """Return the shared VocabCurveCache, creating it on first use."""
    global _vocab_curve_cache  # noqa: PLW0603
    shared = _vocab_curve_cache
    if shared is None:
        shared = _vocab_curve_cache = VocabCurveCache()
    return shared
|
||||||
|
|
||||||
|
|
||||||
|
def get_anki_deck_cache() -> AnkiDeckCache:
    """Return the shared AnkiDeckCache, creating it on first use."""
    global _anki_deck_cache  # noqa: PLW0603
    shared = _anki_deck_cache
    if shared is None:
        shared = _anki_deck_cache = AnkiDeckCache()
    return shared
|
||||||
|
|
||||||
|
|
||||||
|
def clear_all_caches() -> None:
    """Empty the translation, vocabulary curve and Anki deck caches, in that order."""
    accessors = (
        get_translation_cache,
        get_vocab_curve_cache,
        get_anki_deck_cache,
    )
    for accessor in accessors:
        accessor().clear()
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_cache_stats() -> dict[str, dict[str, Any]]:
    """Collect the statistics of every cache.

    Returns:
        Dict with one entry per cache ("translations", "vocab_curves",
        "anki_decks"), each holding the dict returned by that cache's
        stats() method.
    """
    accessors = {
        "translations": get_translation_cache,
        "vocab_curves": get_vocab_curve_cache,
        "anki_decks": get_anki_deck_cache,
    }
    return {name: accessor().stats() for name, accessor in accessors.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
|
||||||
|
"""CLI for cache management.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Exit code.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Manage word frequency caches")
|
||||||
|
parser.add_argument(
|
||||||
|
"--stats", action="store_true", help="Show cache statistics"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--clear", action="store_true", help="Clear all caches"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--clear-translations", action="store_true", help="Clear translation cache"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--clear-excerpts", action="store_true", help="Clear excerpt cache"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--clear-anki", action="store_true", help="Clear Anki deck cache"
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.clear:
|
||||||
|
clear_all_caches()
|
||||||
|
print("All caches cleared.") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if args.clear_translations:
|
||||||
|
get_translation_cache().clear()
|
||||||
|
print("Translation cache cleared.") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if args.clear_excerpts:
|
||||||
|
get_vocab_curve_cache().clear()
|
||||||
|
print("Excerpt cache cleared.") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
if args.clear_anki:
|
||||||
|
get_anki_deck_cache().clear()
|
||||||
|
print("Anki deck cache cleared.") # noqa: T201
|
||||||
|
return 0
|
||||||
|
|
||||||
|
# Default: show stats
|
||||||
|
stats = get_all_cache_stats()
|
||||||
|
print("Cache Statistics") # noqa: T201
|
||||||
|
print("=" * 50) # noqa: T201
|
||||||
|
for cache_name, cache_stats in stats.items():
|
||||||
|
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||||
|
for key, value in cache_stats.items():
|
||||||
|
if key == "cache_size_bytes":
|
||||||
|
# Format as human-readable
|
||||||
|
if value < 1024:
|
||||||
|
size_str = f"{value} B"
|
||||||
|
elif value < 1024 * 1024:
|
||||||
|
size_str = f"{value / 1024:.1f} KB"
|
||||||
|
else:
|
||||||
|
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||||
|
print(f" {key}: {size_str}") # noqa: T201
|
||||||
|
else:
|
||||||
|
print(f" {key}: {value}") # noqa: T201
|
||||||
|
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Run the CLI and use its return value as the process exit status.
    raise SystemExit(main())
|
||||||
153
python_pkg/word_frequency/run_anki_generator.sh
Executable file
153
python_pkg/word_frequency/run_anki_generator.sh
Executable file
@ -0,0 +1,153 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Wrapper script for anki_generator that ensures argostranslate is available
|
||||||
|
#
|
||||||
|
# Usage: ./run_anki_generator.sh [anki_generator args...]
|
||||||
|
# Example: ./run_anki_generator.sh --file text.txt --length 20 --from pl --to en
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
# Use /tmp for venv to avoid home directory quota issues
|
||||||
|
VENV_DIR="/tmp/.venv_argos_$(id -u)"
|
||||||
|
|
||||||
|
# Colors for output
|
||||||
|
RED='\033[0;31m'
|
||||||
|
GREEN='\033[0;32m'
|
||||||
|
YELLOW='\033[1;33m'
|
||||||
|
NC='\033[0m' # No Color
|
||||||
|
|
||||||
|
log_info() {
    # Print an informational message prefixed with a green [INFO] tag.
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
|
||||||
|
|
||||||
|
log_warn() {
    # Print a warning message prefixed with a yellow [WARN] tag.
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
|
||||||
|
|
||||||
|
log_error() {
    # Print an error message prefixed with a red [ERROR] tag.
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
|
||||||
|
|
||||||
|
# Convert relative file paths to absolute before changing directories
|
||||||
|
# Reads the global ORIGINAL_ARGS array and prints the arguments on one line,
# separated by spaces, with the value following --file / -f turned into an
# absolute path when it names an existing file.
resolve_file_paths() {
    local args=()
    local i=0
    local total=${#ORIGINAL_ARGS[@]}
    while [[ $i -lt $total ]]; do
        local arg="${ORIGINAL_ARGS[$i]}"
        args+=("$arg")
        if [[ "$arg" == "--file" || "$arg" == "-f" ]]; then
            # Plain arithmetic assignment: unlike ((i++)) it never returns a
            # non-zero status (which would trip `set -e` when i is 0).
            i=$((i + 1))
            if [[ $i -lt $total ]]; then
                local file="${ORIGINAL_ARGS[$i]}"
                # Convert relative path to absolute
                if [[ -f "$file" ]]; then
                    file="$(cd "$(dirname "$file")" && pwd)/$(basename "$file")"
                fi
                args+=("$file")
            fi
        fi
        i=$((i + 1))
    done
    # printf instead of echo: echo would swallow a leading -n / -e / -E
    # argument as one of its own options (and -n is a flag of this script).
    # "${args[*]}" joins with a space under the default IFS.
    printf '%s\n' "${args[*]}"
}
|
||||||
|
|
||||||
|
# Store original args before any directory changes
|
||||||
|
ORIGINAL_ARGS=("$@")
|
||||||
|
|
||||||
|
# Check if argostranslate is available
|
||||||
|
check_argos() {
    # Succeeds when the current `python` can import argostranslate;
    # the import error output is discarded.
    python -c 'import argostranslate' 2>/dev/null
}
|
||||||
|
|
||||||
|
# Try to install argostranslate using pipx (system-wide)
|
||||||
|
try_pipx_install() {
    # Returns 0 when pipx exists and installs argostranslate, 1 otherwise.
    command -v pipx &>/dev/null || return 1

    log_info "Trying pipx install argostranslate..."
    pipx install argostranslate 2>/dev/null || return 1

    log_info "argostranslate installed via pipx"
    return 0
}
|
||||||
|
|
||||||
|
# Create/use a virtualenv for argostranslate
|
||||||
|
setup_venv() {
    # Use /tmp for pip cache to avoid home directory quota issues
    export PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
    mkdir -p "$PIP_CACHE_DIR"

    # Shared pip invocation: --no-cache-dir avoids any cache writes to the
    # home directory.
    local -a pip_install=(pip install --progress-bar on --no-cache-dir)
    local requirements="${SCRIPT_DIR}/../../requirements.txt"

    if [[ ! -d "$VENV_DIR" ]]; then
        log_info "Creating virtual environment at $VENV_DIR..."
        python -m venv "$VENV_DIR"
    fi

    # Activate the venv so `python` and `pip` below refer to it.
    . "$VENV_DIR/bin/activate"

    # Install argostranslate if not present
    if ! python -c "import argostranslate" 2>/dev/null; then
        log_info "Installing argostranslate in virtualenv (this may take a few minutes)..."
        # CPU-only PyTorch first, to reduce the download size significantly
        # (~200MB vs ~900MB).
        "${pip_install[@]}" torch --index-url https://download.pytorch.org/whl/cpu
        "${pip_install[@]}" argostranslate
    fi

    # Install langdetect for auto language detection
    if ! python -c "import langdetect" 2>/dev/null; then
        log_info "Installing langdetect for auto language detection..."
        "${pip_install[@]}" langdetect
    fi

    # Other dependencies are best-effort: a failure here is ignored.
    if [[ -f "$requirements" ]]; then
        "${pip_install[@]}" -r "$requirements" 2>/dev/null || true
    fi

    log_info "Using virtualenv: $VENV_DIR"
}
|
||||||
|
|
||||||
|
# Main logic
|
||||||
|
# Run the generator module from the repository root (two levels above the
# script directory) with the given arguments.
run_generator() {
    cd "$(dirname "$SCRIPT_DIR")" && cd ..
    python -m python_pkg.word_frequency.anki_generator "$@"
}

# Entry point: make sure argostranslate is importable (system Python, then
# pipx, then a virtualenv under /tmp) and run the generator.
main() {
    # Resolve --file/-f values to absolute paths before changing directories.
    # The arguments are kept in an array and passed on quoted, so paths with
    # spaces or glob characters reach Python as single, unmodified arguments.
    local -a resolved_args=()
    local expect_file=0
    local arg
    for arg in "${ORIGINAL_ARGS[@]}"; do
        if [[ $expect_file -eq 1 ]]; then
            expect_file=0
            if [[ -f "$arg" ]]; then
                arg="$(cd "$(dirname "$arg")" && pwd)/$(basename "$arg")"
            fi
        elif [[ "$arg" == "--file" || "$arg" == "-f" ]]; then
            expect_file=1
        fi
        resolved_args+=("$arg")
    done

    # If --no-translate is passed, we don't need argostranslate
    if [[ " $* " =~ " --no-translate " ]] || [[ " $* " =~ " -n " ]]; then
        log_info "Running without translation (--no-translate)"
        run_generator "${resolved_args[@]}"
        exit $?
    fi

    # Check if argostranslate is already available
    if check_argos; then
        log_info "argostranslate is available"
        run_generator "${resolved_args[@]}"
        exit $?
    fi

    log_warn "argostranslate not found in system Python"

    # Try pipx first (cleaner system-wide installation)
    if try_pipx_install && check_argos; then
        run_generator "${resolved_args[@]}"
        exit $?
    fi

    # Fall back to virtualenv
    log_info "Setting up virtualenv with argostranslate..."
    setup_venv

    # Run in venv context
    run_generator "${resolved_args[@]}"
}
|
||||||
|
|
||||||
|
main "$@"
|
||||||
2040
python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_20.txt
Normal file
2040
python_pkg/word_frequency/test_texts/polish_pan_tadeusz_anki_20.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -13,7 +13,6 @@ try:
|
|||||||
find_word_contexts,
|
find_word_contexts,
|
||||||
generate_anki_deck,
|
generate_anki_deck,
|
||||||
generate_flashcards,
|
generate_flashcards,
|
||||||
get_top_n_words,
|
|
||||||
main,
|
main,
|
||||||
parse_vocabulary_curve_output,
|
parse_vocabulary_curve_output,
|
||||||
)
|
)
|
||||||
@ -24,7 +23,6 @@ except ImportError:
|
|||||||
find_word_contexts,
|
find_word_contexts,
|
||||||
generate_anki_deck,
|
generate_anki_deck,
|
||||||
generate_flashcards,
|
generate_flashcards,
|
||||||
get_top_n_words,
|
|
||||||
main,
|
main,
|
||||||
parse_vocabulary_curve_output,
|
parse_vocabulary_curve_output,
|
||||||
)
|
)
|
||||||
@ -80,30 +78,44 @@ class TestParseVocabularyCurveOutput:
|
|||||||
|
|
||||||
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 1."""
|
"""Test parsing output for length 1."""
|
||||||
excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
|
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
|
||||||
assert excerpt == "the"
|
assert excerpt == "the"
|
||||||
assert words == [("the", 1)]
|
assert excerpt_words == [("the", 1)]
|
||||||
|
|
||||||
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 2."""
|
"""Test parsing output for length 2."""
|
||||||
excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
|
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
|
||||||
assert excerpt == "the dog"
|
assert excerpt == "the dog"
|
||||||
assert words == [("the", 1), ("dog", 2)]
|
assert excerpt_words == [("the", 1), ("dog", 2)]
|
||||||
|
|
||||||
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for length 3."""
|
"""Test parsing output for length 3."""
|
||||||
excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
|
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
|
||||||
assert excerpt == "the quick fox"
|
assert excerpt == "the quick fox"
|
||||||
assert len(words) == 3
|
assert len(excerpt_words) == 3
|
||||||
assert ("the", 1) in words
|
assert ("the", 1) in excerpt_words
|
||||||
assert ("quick", 3) in words
|
assert ("quick", 3) in excerpt_words
|
||||||
assert ("fox", 5) in words
|
assert ("fox", 5) in excerpt_words
|
||||||
|
|
||||||
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
||||||
"""Test parsing output for non-existent length."""
|
"""Test parsing output for non-existent length."""
|
||||||
excerpt, words = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
|
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
|
||||||
assert excerpt == ""
|
assert excerpt == ""
|
||||||
assert words == []
|
assert excerpt_words == []
|
||||||
|
|
||||||
|
def test_parse_vocab_dump(self) -> None:
|
||||||
|
"""Test parsing VOCAB_DUMP section."""
|
||||||
|
output = """[Length 2] Vocab needed: 2
|
||||||
|
Excerpt: "hello world"
|
||||||
|
Words: hello(#1), world(#2)
|
||||||
|
|
||||||
|
VOCAB_DUMP_START
|
||||||
|
hello;1
|
||||||
|
world;2
|
||||||
|
VOCAB_DUMP_END
|
||||||
|
"""
|
||||||
|
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(output, 2)
|
||||||
|
assert all_vocab == [("hello", 1), ("world", 2)]
|
||||||
|
|
||||||
|
|
||||||
# Tests for find_word_contexts
|
# Tests for find_word_contexts
|
||||||
@ -250,31 +262,6 @@ class TestGenerateAnkiDeck:
|
|||||||
assert "world" in result
|
assert "world" in result
|
||||||
|
|
||||||
|
|
||||||
# Tests for get_top_n_words
|
|
||||||
|
|
||||||
|
|
||||||
class TestGetTopNWords:
|
|
||||||
"""Tests for getting top N words."""
|
|
||||||
|
|
||||||
def test_get_top_5_words(self) -> None:
|
|
||||||
"""Test getting top 5 words from text."""
|
|
||||||
text = "the cat sat on the mat the cat meowed"
|
|
||||||
words = get_top_n_words(text, 5)
|
|
||||||
assert len(words) == 5
|
|
||||||
# 'the' appears 3x, 'cat' appears 2x
|
|
||||||
assert words[0][0] == "the"
|
|
||||||
assert words[0][1] == 1
|
|
||||||
assert words[1][0] == "cat"
|
|
||||||
assert words[1][1] == 2
|
|
||||||
|
|
||||||
def test_ranks_are_sequential(self) -> None:
|
|
||||||
"""Test that ranks are 1-based and sequential."""
|
|
||||||
text = "one two three four five six seven eight"
|
|
||||||
words = get_top_n_words(text, 8)
|
|
||||||
ranks = [r for _, r in words]
|
|
||||||
assert ranks == [1, 2, 3, 4, 5, 6, 7, 8]
|
|
||||||
|
|
||||||
|
|
||||||
# Tests for main function
|
# Tests for main function
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,8 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -13,6 +15,40 @@ from python_pkg.word_frequency.learning_pipe import (
|
|||||||
load_stopwords,
|
load_stopwords,
|
||||||
main,
|
main,
|
||||||
)
|
)
|
||||||
|
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||||
|
from python_pkg.word_frequency.translator import TranslationResult
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from collections.abc import Generator
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def mock_translation() -> Generator[MagicMock, None, None]:
    """Replace batch translation with a stub so argostranslate is not needed."""

    def fake_batch_translate(
        words: list[str],
        from_lang: str,
        to_lang: str,
        *,
        use_cache: bool = True,  # noqa: ARG001
    ) -> list[TranslationResult]:
        """Return one successful result per word, prefixed with ``translated_``."""
        results: list[TranslationResult] = []
        for word in words:
            results.append(
                TranslationResult(
                    source_word=word,
                    translated_word=f"translated_{word}",
                    source_lang=from_lang,
                    target_lang=to_lang,
                    success=True,
                )
            )
        return results

    # learning_pipe imports the function directly, so the name has to be
    # patched on that module.
    patcher = patch.object(
        learning_pipe_module, "translate_words_batch", side_effect=fake_batch_translate
    )
    with patcher:
        yield
|
||||||
|
|
||||||
|
|
||||||
class TestLoadStopwords:
|
class TestLoadStopwords:
|
||||||
@ -162,7 +198,9 @@ class TestGenerateLearningLesson:
|
|||||||
class TestMain:
|
class TestMain:
|
||||||
"""Tests for main CLI function."""
|
"""Tests for main CLI function."""
|
||||||
|
|
||||||
def test_basic_text_input(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_basic_text_input(
|
||||||
|
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
||||||
|
) -> None:
|
||||||
"""Test with text input."""
|
"""Test with text input."""
|
||||||
exit_code = main(
|
exit_code = main(
|
||||||
[
|
[
|
||||||
@ -179,7 +217,7 @@ class TestMain:
|
|||||||
assert "LANGUAGE LEARNING LESSON" in captured.out
|
assert "LANGUAGE LEARNING LESSON" in captured.out
|
||||||
|
|
||||||
def test_file_input(
|
def test_file_input(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test with file input."""
|
"""Test with file input."""
|
||||||
test_file = tmp_path / "test.txt"
|
test_file = tmp_path / "test.txt"
|
||||||
@ -199,7 +237,7 @@ class TestMain:
|
|||||||
assert exit_code == 0
|
assert exit_code == 0
|
||||||
assert "hello" in captured.out.lower()
|
assert "hello" in captured.out.lower()
|
||||||
|
|
||||||
def test_output_to_file(self, tmp_path: Path) -> None:
|
def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
|
||||||
"""Test outputting to file."""
|
"""Test outputting to file."""
|
||||||
output_file = tmp_path / "lesson.txt"
|
output_file = tmp_path / "lesson.txt"
|
||||||
|
|
||||||
@ -219,7 +257,7 @@ class TestMain:
|
|||||||
assert "LANGUAGE LEARNING LESSON" in content
|
assert "LANGUAGE LEARNING LESSON" in content
|
||||||
|
|
||||||
def test_custom_stopwords(
|
def test_custom_stopwords(
|
||||||
self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
|
self, tmp_path: Path, capsys: pytest.CaptureFixture[str], mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test with custom stopwords file."""
|
"""Test with custom stopwords file."""
|
||||||
stopwords_file = tmp_path / "stop.txt"
|
stopwords_file = tmp_path / "stop.txt"
|
||||||
@ -242,7 +280,7 @@ class TestMain:
|
|||||||
# "hello" should be filtered by custom stopwords
|
# "hello" should be filtered by custom stopwords
|
||||||
|
|
||||||
def test_multiple_batches_option(
|
def test_multiple_batches_option(
|
||||||
self, capsys: pytest.CaptureFixture[str]
|
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test --batches option."""
|
"""Test --batches option."""
|
||||||
text = " ".join(f"word{i}" * (50 - i) for i in range(30))
|
text = " ".join(f"word{i}" * (50 - i) for i in range(30))
|
||||||
@ -329,10 +367,10 @@ class TestTranslationIntegration:
|
|||||||
# Should not have translation arrows
|
# Should not have translation arrows
|
||||||
assert " -> " not in result or "Translation" not in result
|
assert " -> " not in result or "Translation" not in result
|
||||||
|
|
||||||
def test_lesson_with_translation_params(self) -> None:
|
def test_lesson_with_translation_params(self, mock_translation: None) -> None:
|
||||||
"""Test that translation params are accepted."""
|
"""Test that translation params are accepted."""
|
||||||
text = "hello world hello world hello"
|
text = "hello world hello world hello"
|
||||||
# This should not crash even without argostranslate installed
|
# This should work with mocked translation
|
||||||
result = generate_learning_lesson(
|
result = generate_learning_lesson(
|
||||||
text,
|
text,
|
||||||
batch_size=5,
|
batch_size=5,
|
||||||
@ -346,12 +384,14 @@ class TestTranslationIntegration:
|
|||||||
assert "VOCABULARY TO LEARN:" in result
|
assert "VOCABULARY TO LEARN:" in result
|
||||||
assert "hello" in result
|
assert "hello" in result
|
||||||
|
|
||||||
def test_main_with_translate_flags(self, tmp_path: Path) -> None:
|
def test_main_with_translate_flags(
|
||||||
|
self, tmp_path: Path, mock_translation: None
|
||||||
|
) -> None:
|
||||||
"""Test that main accepts translation flags."""
|
"""Test that main accepts translation flags."""
|
||||||
text_file = tmp_path / "test.txt"
|
text_file = tmp_path / "test.txt"
|
||||||
text_file.write_text("hello world hello world hello", encoding="utf-8")
|
text_file.write_text("hello world hello world hello", encoding="utf-8")
|
||||||
|
|
||||||
# Should not crash even if translation fails
|
# Should work with mocked translation
|
||||||
result = main([
|
result = main([
|
||||||
"--file", str(text_file),
|
"--file", str(text_file),
|
||||||
"--translate-from", "en",
|
"--translate-from", "en",
|
||||||
@ -361,7 +401,9 @@ class TestTranslationIntegration:
|
|||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
|
|
||||||
def test_translate_to_defaults_to_english(self, capsys: pytest.CaptureFixture[str]) -> None:
|
def test_translate_to_defaults_to_english(
|
||||||
|
self, capsys: pytest.CaptureFixture[str], mock_translation: None
|
||||||
|
) -> None:
|
||||||
"""Test that translate_to defaults to 'en' when using auto-detection."""
|
"""Test that translate_to defaults to 'en' when using auto-detection."""
|
||||||
text = "hello world"
|
text = "hello world"
|
||||||
# When using --translate flag (translate_from="auto"), translate_to defaults to "en"
|
# When using --translate flag (translate_from="auto"), translate_to defaults to "en"
|
||||||
|
|||||||
@ -47,15 +47,22 @@ except ImportError:
|
|||||||
|
|
||||||
# Helper context manager for mocking argostranslate
|
# Helper context manager for mocking argostranslate
|
||||||
class ArgosAvailableMock:
|
class ArgosAvailableMock:
|
||||||
"""Context manager to mock argostranslate being available."""
|
"""Context manager to mock argostranslate being available and control its output.
|
||||||
|
|
||||||
|
Works whether argos is installed or not by patching sys.modules.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None:
|
def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None:
|
||||||
"""Initialize with return values for translate()."""
|
"""Initialize with return values for translate()."""
|
||||||
self.translate_returns = translate_returns
|
self.translate_returns = translate_returns
|
||||||
|
self.mock_translate_fn = MagicMock()
|
||||||
self.mock_translate_module = MagicMock()
|
self.mock_translate_module = MagicMock()
|
||||||
self.mock_package_module = MagicMock()
|
self.mock_package_module = MagicMock()
|
||||||
self.mock_parent = MagicMock()
|
self.mock_parent = MagicMock()
|
||||||
self.original_available = translator._argos_available
|
self.original_available = translator._argos_available
|
||||||
|
self._sys_modules_patcher: MagicMock | None = None
|
||||||
|
self._ensure_patcher: MagicMock | None = None
|
||||||
|
self._lang_patcher: MagicMock | None = None
|
||||||
|
|
||||||
def __enter__(self) -> MagicMock:
|
def __enter__(self) -> MagicMock:
|
||||||
"""Set up the mocks."""
|
"""Set up the mocks."""
|
||||||
@ -63,36 +70,52 @@ class ArgosAvailableMock:
|
|||||||
|
|
||||||
# Set up translate return value
|
# Set up translate return value
|
||||||
if isinstance(self.translate_returns, Exception):
|
if isinstance(self.translate_returns, Exception):
|
||||||
self.mock_translate_module.translate.side_effect = self.translate_returns
|
self.mock_translate_fn.side_effect = self.translate_returns
|
||||||
elif isinstance(self.translate_returns, list):
|
elif isinstance(self.translate_returns, list):
|
||||||
self.mock_translate_module.translate.side_effect = self.translate_returns
|
self.mock_translate_fn.side_effect = self.translate_returns
|
||||||
elif self.translate_returns is not None:
|
elif self.translate_returns is not None:
|
||||||
self.mock_translate_module.translate.return_value = self.translate_returns
|
self.mock_translate_fn.return_value = self.translate_returns
|
||||||
|
|
||||||
# Link parent module to submodules (critical for Python imports)
|
# Wire up the mock modules
|
||||||
|
self.mock_translate_module.translate = self.mock_translate_fn
|
||||||
|
self.mock_translate_module.get_installed_languages = MagicMock(return_value=[])
|
||||||
|
self.mock_package_module.update_package_index = MagicMock()
|
||||||
|
self.mock_package_module.get_available_packages = MagicMock(return_value=[])
|
||||||
self.mock_parent.translate = self.mock_translate_module
|
self.mock_parent.translate = self.mock_translate_module
|
||||||
self.mock_parent.package = self.mock_package_module
|
self.mock_parent.package = self.mock_package_module
|
||||||
|
|
||||||
# Patch sys.modules
|
# Patch sys.modules to inject our mock (works even if argos not installed)
|
||||||
self.patchers = [
|
self._sys_modules_patcher = patch.dict(
|
||||||
patch.dict(
|
"sys.modules",
|
||||||
"sys.modules",
|
{
|
||||||
{
|
"argostranslate": self.mock_parent,
|
||||||
"argostranslate": self.mock_parent,
|
"argostranslate.translate": self.mock_translate_module,
|
||||||
"argostranslate.translate": self.mock_translate_module,
|
"argostranslate.package": self.mock_package_module,
|
||||||
"argostranslate.package": self.mock_package_module,
|
},
|
||||||
},
|
)
|
||||||
),
|
|
||||||
]
|
|
||||||
for p in self.patchers:
|
|
||||||
p.start()
|
|
||||||
|
|
||||||
return self.mock_translate_module
|
# Patch _ensure_argos_installed and _ensure_language_pair to no-op
|
||||||
|
self._ensure_patcher = patch.object(
|
||||||
|
translator, "_ensure_argos_installed", lambda: None
|
||||||
|
)
|
||||||
|
self._lang_patcher = patch.object(
|
||||||
|
translator, "_ensure_language_pair", lambda f, t: None
|
||||||
|
)
|
||||||
|
|
||||||
|
self._sys_modules_patcher.start()
|
||||||
|
self._ensure_patcher.start()
|
||||||
|
self._lang_patcher.start()
|
||||||
|
|
||||||
|
return self.mock_translate_fn
|
||||||
|
|
||||||
def __exit__(self, *args: object) -> None:
|
def __exit__(self, *args: object) -> None:
|
||||||
"""Restore original state."""
|
"""Restore original state."""
|
||||||
for p in self.patchers:
|
if self._lang_patcher:
|
||||||
p.stop()
|
self._lang_patcher.stop()
|
||||||
|
if self._ensure_patcher:
|
||||||
|
self._ensure_patcher.stop()
|
||||||
|
if self._sys_modules_patcher:
|
||||||
|
self._sys_modules_patcher.stop()
|
||||||
translator._argos_available = self.original_available
|
translator._argos_available = self.original_available
|
||||||
|
|
||||||
|
|
||||||
@ -101,25 +124,13 @@ class ArgosAvailableMock:
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_argos_unavailable() -> Generator[None, None, None]:
|
def mock_argos_unavailable() -> Generator[None, None, None]:
|
||||||
"""Mock argostranslate being unavailable."""
|
"""Mock argostranslate being unavailable (for legacy tests)."""
|
||||||
original_value = translator._argos_available
|
original_value = translator._argos_available
|
||||||
translator._argos_available = False
|
translator._argos_available = False
|
||||||
yield
|
yield
|
||||||
translator._argos_available = original_value
|
translator._argos_available = original_value
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def mock_all_translators_unavailable() -> Generator[None, None, None]:
|
|
||||||
"""Mock both argostranslate and deep-translator being unavailable."""
|
|
||||||
original_argos = translator._argos_available
|
|
||||||
original_deep = translator._deep_translator_available
|
|
||||||
translator._argos_available = False
|
|
||||||
translator._deep_translator_available = False
|
|
||||||
yield
|
|
||||||
translator._argos_available = original_argos
|
|
||||||
translator._deep_translator_available = original_deep
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def temp_words_file(tmp_path: Path) -> Path:
|
def temp_words_file(tmp_path: Path) -> Path:
|
||||||
"""Create a temporary file with words."""
|
"""Create a temporary file with words."""
|
||||||
@ -174,43 +185,36 @@ class TestTranslationResult:
|
|||||||
|
|
||||||
|
|
||||||
class TestTranslateWord:
|
class TestTranslateWord:
|
||||||
"""Tests for translate_word function."""
|
"""Tests for translate_word function - offline-first behavior."""
|
||||||
|
|
||||||
def test_translate_word_all_backends_unavailable(
|
def test_translate_word_argos_unavailable_raises(self) -> None:
|
||||||
self, mock_all_translators_unavailable: None
|
"""Test that translation raises ImportError when argos is unavailable."""
|
||||||
) -> None:
|
# Mock _ensure_argos_installed to raise ImportError
|
||||||
"""Test translation when no backends are available."""
|
with patch.object(
|
||||||
result = translate_word("hello", "en", "es")
|
translator,
|
||||||
assert result.success is False
|
"_ensure_argos_installed",
|
||||||
assert "No translation backend" in str(result.error)
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
):
|
||||||
def test_translate_word_argos_unavailable_uses_deep_translator(
|
with pytest.raises(ImportError, match="argostranslate not available"):
|
||||||
self, mock_argos_unavailable: None
|
translate_word("hello", "en", "es", use_cache=False)
|
||||||
) -> None:
|
|
||||||
"""Test that deep-translator is used when argos is unavailable."""
|
|
||||||
# deep-translator should work as fallback (it's installed)
|
|
||||||
result = translate_word("hello", "en", "es")
|
|
||||||
# This may succeed if deep-translator is installed
|
|
||||||
# Just verify we get a result without crashing
|
|
||||||
assert isinstance(result, TranslationResult)
|
|
||||||
|
|
||||||
def test_translate_word_success(self) -> None:
|
def test_translate_word_success(self) -> None:
|
||||||
"""Test successful word translation."""
|
"""Test successful word translation."""
|
||||||
with ArgosAvailableMock("hola"):
|
with ArgosAvailableMock("hola"):
|
||||||
result = translate_word("hello", "en", "es")
|
result = translate_word("hello", "en", "es", use_cache=False)
|
||||||
|
|
||||||
assert result.source_word == "hello"
|
assert result.source_word == "hello"
|
||||||
assert result.translated_word == "hola"
|
assert result.translated_word == "hola"
|
||||||
assert result.success is True
|
assert result.success is True
|
||||||
|
|
||||||
def test_translate_word_argos_exception_falls_back(
|
def test_translate_word_argos_exception_returns_error(self) -> None:
|
||||||
self, mock_argos_unavailable: None
|
"""Test that argos exception returns failed result with error."""
|
||||||
) -> None:
|
# Mock argos being available but translate raising an exception
|
||||||
"""Test that argos exception falls back to deep-translator."""
|
with ArgosAvailableMock(RuntimeError("Translation failed")):
|
||||||
# With argos unavailable, deep-translator should be used
|
result = translate_word("hello", "en", "es", use_cache=False)
|
||||||
result = translate_word("hello", "en", "es")
|
|
||||||
# Just verify it doesn't crash - may succeed or fail depending on network
|
assert result.success is False
|
||||||
assert isinstance(result, TranslationResult)
|
assert "Translation failed" in str(result.error)
|
||||||
|
|
||||||
|
|
||||||
# translate_words tests
|
# translate_words tests
|
||||||
@ -221,99 +225,123 @@ class TestTranslateWords:
|
|||||||
|
|
||||||
def test_translate_empty_list(self) -> None:
|
def test_translate_empty_list(self) -> None:
|
||||||
"""Test translating empty list."""
|
"""Test translating empty list."""
|
||||||
|
# Empty list returns empty result without calling translation
|
||||||
results = translate_words([], "en", "es")
|
results = translate_words([], "en", "es")
|
||||||
assert results == []
|
assert results == []
|
||||||
|
|
||||||
def test_translate_multiple_words(self) -> None:
|
def test_translate_multiple_words(self) -> None:
|
||||||
"""Test translating multiple words."""
|
"""Test translating multiple words."""
|
||||||
with ArgosAvailableMock(["hola", "mundo"]):
|
with ArgosAvailableMock(["hola", "mundo"]) as mock:
|
||||||
results = translate_words(["hello", "world"], "en", "es")
|
mock.side_effect = ["hola", "mundo"]
|
||||||
|
results = translate_words(["hello", "world"], "en", "es", use_cache=False)
|
||||||
|
|
||||||
assert len(results) == 2
|
assert len(results) == 2
|
||||||
assert results[0].translated_word == "hola"
|
assert results[0].translated_word == "hola"
|
||||||
assert results[1].translated_word == "mundo"
|
assert results[1].translated_word == "mundo"
|
||||||
|
|
||||||
|
def test_translate_words_argos_unavailable_raises(self) -> None:
|
||||||
|
"""Test that translating words raises ImportError when argos unavailable."""
|
||||||
|
with patch.object(
|
||||||
|
translator,
|
||||||
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
):
|
||||||
|
with pytest.raises(ImportError, match="argostranslate not available"):
|
||||||
|
translate_words(["hello", "world"], "en", "es", use_cache=False)
|
||||||
|
|
||||||
|
|
||||||
# translate_words_batch tests
|
# translate_words_batch tests
|
||||||
|
|
||||||
|
|
||||||
class TestTranslateWordsBatch:
|
class TestTranslateWordsBatch:
|
||||||
"""Tests for translate_words_batch function."""
|
"""Tests for translate_words_batch function - offline-first."""
|
||||||
|
|
||||||
def test_batch_empty_list(self) -> None:
|
def test_batch_empty_list(self) -> None:
|
||||||
"""Test batch translation of empty list."""
|
"""Test batch translation of empty list."""
|
||||||
results = translate_words_batch([], "en", "es")
|
# Empty list doesn't require argos
|
||||||
|
with patch.object(translator, "_ensure_argos_installed", lambda: None):
|
||||||
|
results = translate_words_batch([], "en", "es")
|
||||||
assert results == []
|
assert results == []
|
||||||
|
|
||||||
def test_batch_small_list(self) -> None:
|
def test_batch_small_list(self) -> None:
|
||||||
"""Test batch translation of small list (3 or fewer)."""
|
"""Test batch translation of small list (uses batch mode anyway)."""
|
||||||
with ArgosAvailableMock(["uno", "dos", "tres"]) as mock:
|
with ArgosAvailableMock("uno\ndos\ntres") as mock:
|
||||||
results = translate_words_batch(["one", "two", "three"], "en", "es")
|
results = translate_words_batch(
|
||||||
|
["one", "two", "three"], "en", "es", use_cache=False
|
||||||
|
)
|
||||||
|
|
||||||
assert len(results) == 3
|
assert len(results) == 3
|
||||||
# Small lists use individual translation
|
# Batch translation
|
||||||
assert mock.translate.call_count == 3
|
assert mock.call_count == 1
|
||||||
|
|
||||||
def test_batch_large_list_success(self) -> None:
|
def test_batch_large_list_success(self) -> None:
|
||||||
"""Test batch translation of large list."""
|
"""Test batch translation of large list."""
|
||||||
words = ["one", "two", "three", "four", "five"]
|
words = ["one", "two", "three", "four", "five"]
|
||||||
|
|
||||||
with ArgosAvailableMock("uno\ndos\ntres\ncuatro\ncinco") as mock:
|
with ArgosAvailableMock("uno\ndos\ntres\ncuatro\ncinco") as mock:
|
||||||
results = translate_words_batch(words, "en", "es")
|
results = translate_words_batch(words, "en", "es", use_cache=False)
|
||||||
|
|
||||||
assert len(results) == 5
|
assert len(results) == 5
|
||||||
# Batch translation called once
|
# Batch translation called once
|
||||||
mock.translate.assert_called_once()
|
mock.assert_called_once()
|
||||||
assert results[0].translated_word == "uno"
|
assert results[0].translated_word == "uno"
|
||||||
assert results[4].translated_word == "cinco"
|
assert results[4].translated_word == "cinco"
|
||||||
|
|
||||||
def test_batch_fallback_on_mismatch(self) -> None:
|
def test_batch_fallback_on_mismatch(self) -> None:
|
||||||
"""Test batch translation falls back when result count mismatches."""
|
"""Test batch translation falls back to individual when result count mismatches."""
|
||||||
words = ["one", "two", "three", "four"]
|
words = ["one", "two", "three", "four"]
|
||||||
# First call (batch) returns wrong count, subsequent calls are individual
|
# First call (batch) returns wrong count, subsequent calls are individual
|
||||||
with ArgosAvailableMock(
|
with ArgosAvailableMock(
|
||||||
["wrong\ncount", "uno", "dos", "tres", "cuatro"]
|
["wrong", "uno", "dos", "tres", "cuatro"]
|
||||||
) as mock:
|
) as mock:
|
||||||
results = translate_words_batch(words, "en", "es")
|
results = translate_words_batch(words, "en", "es", use_cache=False)
|
||||||
|
|
||||||
assert len(results) == 4
|
assert len(results) == 4
|
||||||
# Fallback to individual
|
# Fallback to individual argos translation
|
||||||
assert mock.translate.call_count == 5
|
assert mock.call_count == 5
|
||||||
|
|
||||||
def test_batch_fallback_on_exception(self) -> None:
|
def test_batch_fallback_on_exception(self) -> None:
|
||||||
"""Test batch translation falls back on exception."""
|
"""Test batch translation raises on exception (no fallback to online)."""
|
||||||
words = ["one", "two", "three", "four"]
|
words = ["one", "two", "three", "four"]
|
||||||
|
|
||||||
# Create mock that raises first then succeeds
|
# Create mock that raises
|
||||||
original = translator._argos_available
|
mock_translate = MagicMock(side_effect=RuntimeError("Batch failed"))
|
||||||
translator._argos_available = True
|
|
||||||
|
|
||||||
mock_translate_module = MagicMock()
|
mock_translate_module = MagicMock()
|
||||||
mock_translate_module.translate.side_effect = [
|
mock_translate_module.translate = mock_translate
|
||||||
RuntimeError("Batch failed"),
|
|
||||||
"uno",
|
|
||||||
"dos",
|
|
||||||
"tres",
|
|
||||||
"cuatro",
|
|
||||||
]
|
|
||||||
mock_package_module = MagicMock()
|
mock_package_module = MagicMock()
|
||||||
mock_parent = MagicMock()
|
mock_parent = MagicMock()
|
||||||
mock_parent.translate = mock_translate_module
|
mock_parent.translate = mock_translate_module
|
||||||
mock_parent.package = mock_package_module
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
with patch.dict(
|
original = translator._argos_available
|
||||||
"sys.modules",
|
translator._argos_available = True
|
||||||
{
|
|
||||||
"argostranslate": mock_parent,
|
with (
|
||||||
"argostranslate.translate": mock_translate_module,
|
patch.dict(
|
||||||
"argostranslate.package": mock_package_module,
|
"sys.modules",
|
||||||
},
|
{
|
||||||
|
"argostranslate": mock_parent,
|
||||||
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
patch.object(translator, "_ensure_argos_installed", lambda: None),
|
||||||
|
patch.object(translator, "_ensure_language_pair", lambda f, t: None),
|
||||||
|
pytest.raises(RuntimeError, match="Translation failed"),
|
||||||
):
|
):
|
||||||
results = translate_words_batch(words, "en", "es")
|
translate_words_batch(words, "en", "es", use_cache=False)
|
||||||
|
|
||||||
translator._argos_available = original
|
translator._argos_available = original
|
||||||
|
|
||||||
assert len(results) == 4
|
def test_batch_argos_unavailable_raises(self) -> None:
|
||||||
|
"""Test that batch translation raises ImportError when argos unavailable."""
|
||||||
|
with patch.object(
|
||||||
|
translator,
|
||||||
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
):
|
||||||
|
with pytest.raises(ImportError, match="argostranslate not available"):
|
||||||
|
translate_words_batch(["hello", "world"], "en", "es", use_cache=False)
|
||||||
|
|
||||||
|
|
||||||
# format_translations tests
|
# format_translations tests
|
||||||
@ -394,10 +422,31 @@ class TestGetInstalledLanguages:
|
|||||||
mock_lang2.code = "es"
|
mock_lang2.code = "es"
|
||||||
mock_lang2.name = "Spanish"
|
mock_lang2.name = "Spanish"
|
||||||
|
|
||||||
with ArgosAvailableMock() as mock:
|
# We need to mock the translate module's get_installed_languages
|
||||||
mock.get_installed_languages.return_value = [mock_lang1, mock_lang2]
|
mock_translate_module = MagicMock()
|
||||||
|
mock_translate_module.get_installed_languages.return_value = [
|
||||||
|
mock_lang1, mock_lang2
|
||||||
|
]
|
||||||
|
mock_package_module = MagicMock()
|
||||||
|
mock_parent = MagicMock()
|
||||||
|
mock_parent.translate = mock_translate_module
|
||||||
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
|
original = translator._argos_available
|
||||||
|
translator._argos_available = True
|
||||||
|
|
||||||
|
with patch.dict(
|
||||||
|
"sys.modules",
|
||||||
|
{
|
||||||
|
"argostranslate": mock_parent,
|
||||||
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
):
|
||||||
result = get_installed_languages()
|
result = get_installed_languages()
|
||||||
|
|
||||||
|
translator._argos_available = original
|
||||||
|
|
||||||
assert ("en", "English") in result
|
assert ("en", "English") in result
|
||||||
assert ("es", "Spanish") in result
|
assert ("es", "Spanish") in result
|
||||||
|
|
||||||
@ -462,10 +511,28 @@ class TestMain:
|
|||||||
self, capsys: pytest.CaptureFixture[str]
|
self, capsys: pytest.CaptureFixture[str]
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Test listing languages when none installed."""
|
"""Test listing languages when none installed."""
|
||||||
with ArgosAvailableMock() as mock:
|
mock_translate_module = MagicMock()
|
||||||
mock.get_installed_languages.return_value = []
|
mock_translate_module.get_installed_languages.return_value = []
|
||||||
|
mock_package_module = MagicMock()
|
||||||
|
mock_parent = MagicMock()
|
||||||
|
mock_parent.translate = mock_translate_module
|
||||||
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
|
original = translator._argos_available
|
||||||
|
translator._argos_available = True
|
||||||
|
|
||||||
|
with patch.dict(
|
||||||
|
"sys.modules",
|
||||||
|
{
|
||||||
|
"argostranslate": mock_parent,
|
||||||
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
):
|
||||||
result = main(["--list-languages"])
|
result = main(["--list-languages"])
|
||||||
|
|
||||||
|
translator._argos_available = original
|
||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert "No languages installed" in captured.out
|
assert "No languages installed" in captured.out
|
||||||
@ -478,10 +545,28 @@ class TestMain:
|
|||||||
mock_lang.code = "en"
|
mock_lang.code = "en"
|
||||||
mock_lang.name = "English"
|
mock_lang.name = "English"
|
||||||
|
|
||||||
with ArgosAvailableMock() as mock:
|
mock_translate_module = MagicMock()
|
||||||
mock.get_installed_languages.return_value = [mock_lang]
|
mock_translate_module.get_installed_languages.return_value = [mock_lang]
|
||||||
|
mock_package_module = MagicMock()
|
||||||
|
mock_parent = MagicMock()
|
||||||
|
mock_parent.translate = mock_translate_module
|
||||||
|
mock_parent.package = mock_package_module
|
||||||
|
|
||||||
|
original = translator._argos_available
|
||||||
|
translator._argos_available = True
|
||||||
|
|
||||||
|
with patch.dict(
|
||||||
|
"sys.modules",
|
||||||
|
{
|
||||||
|
"argostranslate": mock_parent,
|
||||||
|
"argostranslate.translate": mock_translate_module,
|
||||||
|
"argostranslate.package": mock_package_module,
|
||||||
|
},
|
||||||
|
):
|
||||||
result = main(["--list-languages"])
|
result = main(["--list-languages"])
|
||||||
|
|
||||||
|
translator._argos_available = original
|
||||||
|
|
||||||
assert result == 0
|
assert result == 0
|
||||||
captured = capsys.readouterr()
|
captured = capsys.readouterr()
|
||||||
assert "en" in captured.out
|
assert "en" in captured.out
|
||||||
@ -578,11 +663,14 @@ class TestMain:
|
|||||||
|
|
||||||
assert result == 1
|
assert result == 1
|
||||||
|
|
||||||
def test_translation_failure_returns_error(
|
def test_translation_failure_returns_error(self) -> None:
|
||||||
self, mock_all_translators_unavailable: None
|
"""Test that translation failure returns error code when argos unavailable."""
|
||||||
) -> None:
|
with patch.object(
|
||||||
"""Test that translation failure returns error code when no backends."""
|
translator,
|
||||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
"_ensure_argos_installed",
|
||||||
|
side_effect=ImportError("argostranslate not available"),
|
||||||
|
):
|
||||||
|
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||||
assert result == 1
|
assert result == 1
|
||||||
|
|
||||||
|
|
||||||
@ -594,9 +682,10 @@ class TestIntegration:
|
|||||||
|
|
||||||
def test_full_translation_flow(self) -> None:
|
def test_full_translation_flow(self) -> None:
|
||||||
"""Test complete translation flow."""
|
"""Test complete translation flow."""
|
||||||
with ArgosAvailableMock(["uno", "dos", "tres"]):
|
with ArgosAvailableMock(["uno", "dos", "tres"]) as mock:
|
||||||
|
mock.side_effect = ["uno", "dos", "tres"]
|
||||||
words = ["one", "two", "three"]
|
words = ["one", "two", "three"]
|
||||||
results = translate_words(words, "en", "es")
|
results = translate_words(words, "en", "es", use_cache=False)
|
||||||
|
|
||||||
assert all(r.success for r in results)
|
assert all(r.success for r in results)
|
||||||
assert [r.translated_word for r in results] == ["uno", "dos", "tres"]
|
assert [r.translated_word for r in results] == ["uno", "dos", "tres"]
|
||||||
@ -606,14 +695,19 @@ class TestIntegration:
|
|||||||
assert "one" in output
|
assert "one" in output
|
||||||
assert "uno" in output
|
assert "uno" in output
|
||||||
|
|
||||||
def test_mixed_success_failure(
|
def test_mixed_success_failure(self) -> None:
|
||||||
self, mock_all_translators_unavailable: None
|
"""Test handling when argos raises exception for some translations."""
|
||||||
) -> None:
|
# Simulate argos translating first word, then failing, then succeeding
|
||||||
"""Test handling when no translation backends are available."""
|
with ArgosAvailableMock() as mock:
|
||||||
results = translate_words(["hello", "xyz", "world"], "en", "es")
|
mock.side_effect = ["hola", RuntimeError("Unknown"), "mundo"]
|
||||||
|
results = translate_words(
|
||||||
|
["hello", "xyz", "world"], "en", "es", use_cache=False
|
||||||
|
)
|
||||||
|
|
||||||
# All should fail when no backends available
|
# First and third succeed, second fails
|
||||||
assert all(not r.success for r in results)
|
assert results[0].success is True
|
||||||
|
assert results[1].success is False
|
||||||
|
assert results[2].success is True
|
||||||
|
|
||||||
output = format_translations(results)
|
output = format_translations(results)
|
||||||
assert "Error" in output
|
assert "Error" in output
|
||||||
|
|||||||
@ -40,6 +40,65 @@ if TYPE_CHECKING:
|
|||||||
_argos_available: bool | None = None
|
_argos_available: bool | None = None
|
||||||
_deep_translator_available: bool | None = None
|
_deep_translator_available: bool | None = None
|
||||||
_langdetect_available: bool | None = None
|
_langdetect_available: bool | None = None
|
||||||
|
_gpu_initialized: bool = False
|
||||||
|
_gpu_available: bool | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _check_cuda_available() -> bool:
|
||||||
|
"""Check if CUDA is available for GPU acceleration."""
|
||||||
|
global _gpu_available
|
||||||
|
if _gpu_available is None:
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
_gpu_available = torch.cuda.is_available()
|
||||||
|
except ImportError:
|
||||||
|
_gpu_available = False
|
||||||
|
return _gpu_available
|
||||||
|
|
||||||
|
|
||||||
|
def _init_gpu_if_available() -> None:
|
||||||
|
"""Initialize GPU for argostranslate if CUDA is available.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: If CUDA is available but GPU initialization fails.
|
||||||
|
"""
|
||||||
|
global _gpu_initialized
|
||||||
|
if _gpu_initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
if not _check_cuda_available():
|
||||||
|
_gpu_initialized = True
|
||||||
|
return
|
||||||
|
|
||||||
|
import sys
|
||||||
|
print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
import ctranslate2
|
||||||
|
|
||||||
|
# Force CTranslate2 to use CUDA
|
||||||
|
device_count = torch.cuda.device_count()
|
||||||
|
if device_count == 0:
|
||||||
|
raise RuntimeError("CUDA reports available but no GPU devices found")
|
||||||
|
|
||||||
|
device_name = torch.cuda.get_device_name(0)
|
||||||
|
print(f" Using GPU: {device_name}", file=sys.stderr)
|
||||||
|
|
||||||
|
# Set environment variable to force GPU usage in argos
|
||||||
|
import os
|
||||||
|
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||||
|
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
|
||||||
|
|
||||||
|
_gpu_initialized = True
|
||||||
|
print(" GPU acceleration enabled.", file=sys.stderr)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"CUDA is available but GPU initialization failed: {e}\n"
|
||||||
|
f"This may be due to incompatible CUDA version or driver issues.\n"
|
||||||
|
f"To disable GPU and use CPU only, set environment variable: CT2_FORCE_CPU=1"
|
||||||
|
) from e
|
||||||
|
|
||||||
|
|
||||||
def _check_argos() -> bool:
|
def _check_argos() -> bool:
|
||||||
@ -205,85 +264,184 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_argos_installed() -> None:
|
||||||
|
"""Ensure argostranslate is installed, attempt installation if not.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ImportError: If argos cannot be installed.
|
||||||
|
"""
|
||||||
|
if _check_argos():
|
||||||
|
return
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
print("argostranslate not found. Attempting to install...") # noqa: T201
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
[sys.executable, "-m", "pip", "install", "argostranslate"],
|
||||||
|
check=True,
|
||||||
|
capture_output=True,
|
||||||
|
)
|
||||||
|
# Reset the check flag and verify
|
||||||
|
global _argos_available # noqa: PLW0603
|
||||||
|
_argos_available = None
|
||||||
|
if not _check_argos():
|
||||||
|
raise ImportError("argostranslate installation succeeded but import failed")
|
||||||
|
print("argostranslate installed successfully.") # noqa: T201
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
error_msg = e.stderr.decode() if e.stderr else str(e)
|
||||||
|
raise ImportError(
|
||||||
|
f"argostranslate is required for offline translation.\n\n"
|
||||||
|
f"Install manually with one of:\n"
|
||||||
|
f" pip install argostranslate # In a virtualenv\n"
|
||||||
|
f" pipx install argostranslate # System-wide via pipx\n"
|
||||||
|
f" pacman -S python-argostranslate # Arch Linux (if available)\n\n"
|
||||||
|
f"Original error: {error_msg}"
|
||||||
|
) from e
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
    """Make sure a ``from_lang`` -> ``to_lang`` language pack is installed.

    Returns without doing anything when both languages are installed and a
    translation between them is available. Otherwise refreshes the package
    index, then downloads and installs the matching package, reporting
    progress on stderr.

    Args:
        from_lang: Source language code.
        to_lang: Target language code.

    Raises:
        ValueError: If the package index has no pack for this pair.
    """
    import argostranslate.package
    import argostranslate.translate

    # Index installed languages by code; a later duplicate code wins.
    by_code = {
        lang.code: lang
        for lang in argostranslate.translate.get_installed_languages()
    }
    source = by_code.get(from_lang)
    target = by_code.get(to_lang)
    if source and target and source.get_translation(target):
        return  # Already available

    # Need to download
    import sys

    def report(message: str) -> None:
        """Write one progress line to stderr."""
        print(message, file=sys.stderr)

    report(f"Downloading language pack: {from_lang} -> {to_lang}...")
    report(" Fetching package index...")
    argostranslate.package.update_package_index()

    # Take the first available package that matches the pair.
    pkg = None
    for candidate in argostranslate.package.get_available_packages():
        if candidate.from_code == from_lang and candidate.to_code == to_lang:
            pkg = candidate
            break

    if pkg is None:
        raise ValueError(
            f"No language pack available for {from_lang} -> {to_lang}. "
            "Available pairs can be listed with --list-languages."
        )

    report(" Downloading package (~50-100MB, this may take a minute)...")
    download_path = pkg.download()
    report(" Installing language pack...")
    argostranslate.package.install_from_path(download_path)
    report(f"Language pack {from_lang} -> {to_lang} installed.")
|
||||||
|
|
||||||
|
|
||||||
def translate_word(
    word: str,
    from_lang: str,
    to_lang: str,
    *,
    use_cache: bool = True,
) -> TranslationResult:
    """Translate a single word using argostranslate (offline).

    A cached translation is returned when one exists. Otherwise the word is
    translated with argostranslate and, when caching is enabled, the result
    is stored for later calls.

    Args:
        word: The word to translate.
        from_lang: Source language code (e.g., 'en', 'pl', 'la').
        to_lang: Target language code.
        use_cache: Whether to use/update translation cache.

    Returns:
        TranslationResult with the translation. If the translation call
        raises, the result has ``success=False`` and ``error`` set to the
        exception text.

    Raises:
        ImportError: If argostranslate is not available and cannot be installed.
    """
    # Check cache first
    if use_cache:
        try:
            from python_pkg.word_frequency.cache import get_translation_cache

            cached = get_translation_cache().get(word, from_lang, to_lang)
        except ImportError:
            cached = None  # Cache not available
        if cached is not None:
            return TranslationResult(
                source_word=word,
                translated_word=cached,
                source_lang=from_lang,
                target_lang=to_lang,
                success=True,
            )

    # Ensure argos is installed (will raise if it can't be)
    _ensure_argos_installed()

    import argostranslate.translate

    try:
        translated = argostranslate.translate.translate(word, from_lang, to_lang)
    except Exception as e:  # noqa: BLE001
        return TranslationResult(
            source_word=word,
            translated_word="",
            source_lang=from_lang,
            target_lang=to_lang,
            success=False,
            error=str(e),
        )

    # Cache the result. This is done outside the translation try-block so
    # that a cache failure cannot turn a good translation into a failure.
    if use_cache:
        try:
            from python_pkg.word_frequency.cache import get_translation_cache

            get_translation_cache().set(word, from_lang, to_lang, translated)
        except ImportError:
            pass  # Cache not available
        except Exception as e:  # noqa: BLE001
            import warnings

            warnings.warn(
                f"Could not cache translation for {word!r}: {e}",
                RuntimeWarning,
                stacklevel=2,
            )

    return TranslationResult(
        source_word=word,
        translated_word=translated,
        source_lang=from_lang,
        target_lang=to_lang,
        success=True,
    )
|
||||||
|
|
||||||
|
|
||||||
def translate_words(
|
def translate_words(
|
||||||
words: Sequence[str],
|
words: Sequence[str],
|
||||||
from_lang: str,
|
from_lang: str,
|
||||||
to_lang: str,
|
to_lang: str,
|
||||||
|
*,
|
||||||
|
use_cache: bool = True,
|
||||||
) -> list[TranslationResult]:
|
) -> list[TranslationResult]:
|
||||||
"""Translate multiple words.
|
"""Translate multiple words.
|
||||||
|
|
||||||
@ -291,69 +449,187 @@ def translate_words(
|
|||||||
words: List of words to translate.
|
words: List of words to translate.
|
||||||
from_lang: Source language code.
|
from_lang: Source language code.
|
||||||
to_lang: Target language code.
|
to_lang: Target language code.
|
||||||
|
use_cache: Whether to use translation cache.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of TranslationResult for each word.
|
List of TranslationResult for each word.
|
||||||
"""
|
"""
|
||||||
return [translate_word(word, from_lang, to_lang) for word in words]
|
return [translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words]
|
||||||
|
|
||||||
|
|
||||||
|
def _translate_batch_worker(
|
||||||
|
batch_words: list[str],
|
||||||
|
from_lang: str,
|
||||||
|
to_lang: str,
|
||||||
|
batch_idx: int,
|
||||||
|
) -> tuple[int, dict[str, str]]:
|
||||||
|
"""Worker function to translate a batch of words.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
batch_words: Words to translate in this batch.
|
||||||
|
from_lang: Source language code.
|
||||||
|
to_lang: Target language code.
|
||||||
|
batch_idx: Index of this batch (for ordering results).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (batch_idx, translations dict).
|
||||||
|
"""
|
||||||
|
import argostranslate.translate
|
||||||
|
|
||||||
|
translations: dict[str, str] = {}
|
||||||
|
|
||||||
|
# Batch translate by joining with newlines
|
||||||
|
batch_text = "\n".join(batch_words)
|
||||||
|
translated_batch = argostranslate.translate.translate(
|
||||||
|
batch_text, from_lang, to_lang
|
||||||
|
)
|
||||||
|
translated_words = translated_batch.split("\n")
|
||||||
|
|
||||||
|
# If we got the same number of translations, use them
|
||||||
|
if len(translated_words) == len(batch_words):
|
||||||
|
for word, trans in zip(batch_words, translated_words, strict=True):
|
||||||
|
translations[word.lower()] = trans.strip()
|
||||||
|
else:
|
||||||
|
# Fall back to individual translation for this batch
|
||||||
|
for word in batch_words:
|
||||||
|
translated = argostranslate.translate.translate(
|
||||||
|
word, from_lang, to_lang
|
||||||
|
)
|
||||||
|
translations[word.lower()] = translated
|
||||||
|
|
||||||
|
return batch_idx, translations
|
||||||
|
|
||||||
|
|
||||||
def translate_words_batch(
    words: Sequence[str],
    from_lang: str,
    to_lang: str,
    *,
    use_cache: bool = True,
) -> list[TranslationResult]:
    """Translate multiple words using argostranslate (offline).

    Cached translations are reused. The remaining words are translated in
    batches of 100, one batch after another, with a progress line on stderr
    per batch. Each distinct uncached word is sent to the translator once,
    even if it occurs several times in ``words``.

    Args:
        words: List of words to translate.
        from_lang: Source language code.
        to_lang: Target language code.
        use_cache: Whether to use translation cache.

    Returns:
        List of TranslationResult for each word, in input order. A word whose
        translation is missing or empty gets ``success=False``.

    Raises:
        ImportError: If argostranslate is not available and cannot be installed.
        ValueError: If no language pack exists for the requested pair.
        RuntimeError: If translating a batch fails. The original docstring
            also lists a GPU initialization failure when CUDA is available.
    """
    if not words:
        return []

    # Ensure argos is installed (will raise if it can't be)
    _ensure_argos_installed()

    # Initialize GPU if available (will raise if CUDA available but fails)
    _init_gpu_if_available()

    # Ensure language pair is available
    _ensure_language_pair(from_lang, to_lang)

    # Check cache for already-translated words
    cached_results: dict[str, str] = {}
    if use_cache:
        try:
            from python_pkg.word_frequency.cache import get_translation_cache

            cached_results = get_translation_cache().get_many(
                list(words), from_lang, to_lang
            )
        except ImportError:
            pass  # Cache not available

    # Words that still need translation: first-seen order, exact duplicates
    # dropped. Results are keyed per word, so repeats only cost time.
    words_to_translate = list(
        dict.fromkeys(w for w in words if w.lower() not in cached_results)
    )

    # Translate uncached words using argos batch
    new_translations: dict[str, str] = {}
    if words_to_translate:
        import sys

        # Larger batches are faster but show progress less often.
        batch_size = 100
        num_to_translate = len(words_to_translate)

        # Check if GPU is being used
        gpu_status = " (GPU)" if _gpu_available else " (CPU)"
        print(
            f"Translating {num_to_translate} words from {from_lang} to {to_lang}{gpu_status}...",
            file=sys.stderr,
            flush=True,
        )

        try:
            batches = [
                words_to_translate[start : start + batch_size]
                for start in range(0, num_to_translate, batch_size)
            ]
            total_batches = len(batches)

            # Sequential translation with progress
            # (argostranslate is not thread-safe - uses global model)
            for batch_idx, batch_words in enumerate(batches):
                words_done = min((batch_idx + 1) * batch_size, num_to_translate)
                pct = int(words_done / num_to_translate * 100)

                print(
                    f" [{pct:3d}%] Translating batch {batch_idx + 1}/{total_batches} "
                    f"({words_done}/{num_to_translate} words)...",
                    file=sys.stderr,
                    flush=True,
                )

                _, batch_translations = _translate_batch_worker(
                    batch_words, from_lang, to_lang, batch_idx
                )
                new_translations.update(batch_translations)

            print(" Translation complete.", file=sys.stderr, flush=True)
        except Exception as e:  # noqa: BLE001
            raise RuntimeError(
                f"Translation failed for {from_lang} -> {to_lang}: {e}"
            ) from e

    # Cache new translations
    if use_cache and new_translations:
        try:
            from python_pkg.word_frequency.cache import get_translation_cache

            get_translation_cache().set_many(new_translations, from_lang, to_lang)
        except ImportError:
            pass  # Cache not available

    # Merge cached and new translations; new ones win on a key clash.
    all_translations = {**cached_results, **new_translations}

    # Build results in original order
    results: list[TranslationResult] = []
    for word in words:
        translation = all_translations.get(word.lower(), "")
        results.append(
            TranslationResult(
                source_word=word,
                translated_word=translation,
                source_lang=from_lang,
                target_lang=to_lang,
                success=bool(translation),
                error=None if translation else "Translation failed",
            )
        )

    return results
|
||||||
|
|
||||||
|
|
||||||
def format_translations(
|
def format_translations(
|
||||||
@ -551,7 +827,12 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Translate
|
# Translate
|
||||||
results = translate_words_batch(words, args.from_lang, args.to_lang)
|
try:
|
||||||
|
results = translate_words_batch(words, args.from_lang, args.to_lang)
|
||||||
|
except ImportError as e:
|
||||||
|
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||||
|
return 1
|
||||||
|
|
||||||
output = format_translations(results)
|
output = format_translations(results)
|
||||||
|
|
||||||
# Output
|
# Output
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user