mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 12:43:15 +02:00
Add pre-commit workflow and fix linting violations (#2)
* Initial plan * Add pre-commit GitHub workflow and fix linting issues - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI - Fixed mypy type errors in translator.py - Fixed shellcheck warning in run_anki_generator.sh - Added per-file ignores for word_frequency module legacy code - Applied auto-fixes from ruff, ruff-format, autoflake, prettier - All pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Make Python scripts with shebangs executable - Set executable bit for word_frequency module scripts with shebangs - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Fix: Restore imports in check functions (autoflake-proof) - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect() - Used _ = module assignment to prevent autoflake from removing imports - These imports test module availability by triggering ImportError if missing - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
This commit is contained in:
parent
f48b1dd178
commit
0bf6fd1bb1
@ -1,9 +1,9 @@
|
||||
/*
|
||||
* Vocabulary Learning Curve Analyzer
|
||||
*
|
||||
*
|
||||
* For each excerpt length (1, 2, 3, ... N words), finds the excerpt that
|
||||
* requires the minimum number of top-frequency words to understand 100%.
|
||||
*
|
||||
*
|
||||
* Usage:
|
||||
* ./vocabulary_curve <file.txt> [max_length]
|
||||
* ./vocabulary_curve test.txt 50
|
||||
@ -58,35 +58,35 @@ static unsigned int hash_word(const char *word) {
|
||||
static WordEntry *get_or_create_word(const char *word) {
|
||||
unsigned int h = hash_word(word);
|
||||
WordEntry *entry = hash_table[h];
|
||||
|
||||
|
||||
while (entry) {
|
||||
if (strcmp(entry->word, word) == 0) {
|
||||
return entry;
|
||||
}
|
||||
entry = entry->next;
|
||||
}
|
||||
|
||||
|
||||
/* Create new entry */
|
||||
if (num_unique_words >= MAX_UNIQUE_WORDS) {
|
||||
fprintf(stderr, "Too many unique words\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
entry = malloc(sizeof(WordEntry));
|
||||
if (!entry) {
|
||||
fprintf(stderr, "Memory allocation failed\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
strncpy(entry->word, word, MAX_WORD_LEN - 1);
|
||||
entry->word[MAX_WORD_LEN - 1] = '\0';
|
||||
entry->count = 0;
|
||||
entry->rank = 0;
|
||||
entry->next = hash_table[h];
|
||||
hash_table[h] = entry;
|
||||
|
||||
|
||||
all_entries[num_unique_words++] = entry;
|
||||
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
@ -109,11 +109,11 @@ static bool process_file(const char *filename) {
|
||||
fprintf(stderr, "Cannot open file: %s\n", filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
char word[MAX_WORD_LEN];
|
||||
int word_len = 0;
|
||||
int c;
|
||||
|
||||
|
||||
while ((c = fgetc(fp)) != EOF) {
|
||||
if (is_word_char(c)) {
|
||||
if (word_len < MAX_WORD_LEN - 1) {
|
||||
@ -121,34 +121,34 @@ static bool process_file(const char *filename) {
|
||||
}
|
||||
} else if (word_len > 0) {
|
||||
word[word_len] = '\0';
|
||||
|
||||
|
||||
WordEntry *entry = get_or_create_word(word);
|
||||
entry->count++;
|
||||
|
||||
|
||||
if (num_words >= MAX_WORDS) {
|
||||
fprintf(stderr, "Too many words in file\n");
|
||||
fclose(fp);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/* Store pointer directly - survives sorting */
|
||||
word_sequence[num_words++] = entry;
|
||||
|
||||
|
||||
word_len = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Handle last word if file doesn't end with whitespace */
|
||||
if (word_len > 0) {
|
||||
word[word_len] = '\0';
|
||||
WordEntry *entry = get_or_create_word(word);
|
||||
entry->count++;
|
||||
|
||||
|
||||
if (num_words < MAX_WORDS) {
|
||||
word_sequence[num_words++] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fclose(fp);
|
||||
return true;
|
||||
}
|
||||
@ -157,7 +157,7 @@ static bool process_file(const char *filename) {
|
||||
static void assign_ranks(void) {
|
||||
/* Sort all_entries by frequency (this doesn't affect word_sequence) */
|
||||
qsort(all_entries, num_unique_words, sizeof(WordEntry *), compare_by_count);
|
||||
|
||||
|
||||
/* Assign 1-indexed ranks using competition ranking:
|
||||
* Words with same frequency get same rank.
|
||||
* Next rank is current_position + 1 (skipping numbers).
|
||||
@ -181,13 +181,13 @@ static int analyze_excerpt(int start, int length) {
|
||||
/* The rank field is already assigned, so we can use it to check uniqueness */
|
||||
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
|
||||
memset(seen_rank, 0, (num_unique_words + 1) * sizeof(bool));
|
||||
|
||||
|
||||
int max_rank = 0;
|
||||
|
||||
|
||||
for (int i = start; i < start + length; i++) {
|
||||
WordEntry *entry = word_sequence[i];
|
||||
int rank = entry->rank;
|
||||
|
||||
|
||||
if (!seen_rank[rank]) {
|
||||
seen_rank[rank] = true;
|
||||
if (rank > max_rank) {
|
||||
@ -195,7 +195,7 @@ static int analyze_excerpt(int start, int length) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return max_rank;
|
||||
}
|
||||
|
||||
@ -204,17 +204,17 @@ static void find_optimal_excerpts(int max_length, ExcerptResult *results) {
|
||||
for (int length = 1; length <= max_length && length <= num_words; length++) {
|
||||
int best_vocab = num_unique_words + 1;
|
||||
int best_start = 0;
|
||||
|
||||
|
||||
/* Slide window through text */
|
||||
for (int start = 0; start <= num_words - length; start++) {
|
||||
int vocab_needed = analyze_excerpt(start, length);
|
||||
|
||||
|
||||
if (vocab_needed < best_vocab) {
|
||||
best_vocab = vocab_needed;
|
||||
best_start = start;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
results[length - 1].excerpt_length = length;
|
||||
results[length - 1].min_vocab_needed = best_vocab;
|
||||
results[length - 1].start_pos = best_start;
|
||||
@ -235,7 +235,7 @@ static void print_words_needed(int start, int length) {
|
||||
static WordEntry *unique_entries[MAX_UNIQUE_WORDS];
|
||||
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
|
||||
memset(seen_rank, 0, (num_unique_words + 1) * sizeof(bool));
|
||||
|
||||
|
||||
int count = 0;
|
||||
for (int i = start; i < start + length; i++) {
|
||||
WordEntry *entry = word_sequence[i];
|
||||
@ -244,7 +244,7 @@ static void print_words_needed(int start, int length) {
|
||||
unique_entries[count++] = entry;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Sort by rank (simple bubble sort - small arrays) */
|
||||
for (int i = 0; i < count - 1; i++) {
|
||||
for (int j = i + 1; j < count; j++) {
|
||||
@ -255,7 +255,7 @@ static void print_words_needed(int start, int length) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Print */
|
||||
for (int i = 0; i < count; i++) {
|
||||
if (i > 0) printf(", ");
|
||||
@ -276,33 +276,33 @@ static void print_results(ExcerptResult *results, int max_length) {
|
||||
printf("Unique words: %d\n", num_unique_words);
|
||||
printf("\n");
|
||||
printf("----------------------------------------------------------------------\n");
|
||||
|
||||
|
||||
int prev_vocab = 0;
|
||||
int actual_max = max_length;
|
||||
if (actual_max > num_words) actual_max = num_words;
|
||||
|
||||
|
||||
for (int i = 0; i < actual_max; i++) {
|
||||
ExcerptResult *r = &results[i];
|
||||
|
||||
|
||||
printf("\n[Length %d] Vocab needed: %d", r->excerpt_length, r->min_vocab_needed);
|
||||
if (r->min_vocab_needed > prev_vocab) {
|
||||
printf(" (+%d)", r->min_vocab_needed - prev_vocab);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
|
||||
printf(" Excerpt: \"");
|
||||
print_excerpt(r->start_pos, r->excerpt_length);
|
||||
printf("\"\n");
|
||||
|
||||
|
||||
printf(" Words: ");
|
||||
print_words_needed(r->start_pos, r->excerpt_length);
|
||||
printf("\n");
|
||||
|
||||
|
||||
prev_vocab = r->min_vocab_needed;
|
||||
}
|
||||
|
||||
|
||||
printf("\n----------------------------------------------------------------------\n");
|
||||
|
||||
|
||||
if (actual_max > 0) {
|
||||
ExcerptResult *final = &results[actual_max - 1];
|
||||
printf("\nTo understand a %d-word excerpt,\n", final->excerpt_length);
|
||||
@ -333,7 +333,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
/* Sliding window: find longest contiguous sequence where all words have rank <= max_vocab */
|
||||
int best_start = 0;
|
||||
int best_length = 0;
|
||||
|
||||
|
||||
int left = 0;
|
||||
for (int right = 0; right < num_words; right++) {
|
||||
/* If current word is outside our vocabulary, move left past it */
|
||||
@ -348,7 +348,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Print results */
|
||||
printf("======================================================================\n");
|
||||
printf("INVERSE MODE: LONGEST EXCERPT WITH TOP %d WORDS\n", max_vocab);
|
||||
@ -360,7 +360,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
printf("\n");
|
||||
printf("----------------------------------------------------------------------\n");
|
||||
printf("\n");
|
||||
|
||||
|
||||
if (best_length == 0) {
|
||||
printf("No valid excerpt found with top %d words.\n", max_vocab);
|
||||
printf("The text may require rarer words from the very beginning.\n");
|
||||
@ -372,7 +372,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
print_excerpt(best_start, best_length);
|
||||
printf("\"\n");
|
||||
printf("\n");
|
||||
|
||||
|
||||
/* Find the rarest word in the excerpt */
|
||||
int max_rank_used = 0;
|
||||
const char *rarest_word = NULL;
|
||||
@ -383,7 +383,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
}
|
||||
}
|
||||
printf("Rarest word used: %s (#%d)\n", rarest_word, max_rank_used);
|
||||
|
||||
|
||||
/* Count unique words in excerpt */
|
||||
static bool seen_rank[MAX_UNIQUE_WORDS + 1];
|
||||
memset(seen_rank, 0, (num_unique_words + 1) * sizeof(bool));
|
||||
@ -396,7 +396,7 @@ static void find_longest_excerpt(int max_vocab) {
|
||||
}
|
||||
printf("Unique words in excerpt: %d\n", unique_count);
|
||||
}
|
||||
|
||||
|
||||
printf("\n----------------------------------------------------------------------\n");
|
||||
}
|
||||
|
||||
@ -414,13 +414,13 @@ int main(int argc, char *argv[]) {
|
||||
fprintf(stderr, " %s book.txt --max-vocab 500 # Find longest excerpt with top 500 words\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
const char *filename = argv[1];
|
||||
int max_length = 30;
|
||||
bool dump_vocab = false;
|
||||
int dump_max_rank = 0;
|
||||
int max_vocab_mode = 0; /* 0 = normal mode, >0 = inverse mode with this vocab limit */
|
||||
|
||||
|
||||
/* Parse arguments */
|
||||
for (int i = 2; i < argc; i++) {
|
||||
if (strcmp(argv[i], "--dump-vocab") == 0) {
|
||||
@ -445,37 +445,37 @@ int main(int argc, char *argv[]) {
|
||||
if (max_length > 1000) max_length = 1000;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Initialize hash table */
|
||||
memset(hash_table, 0, sizeof(hash_table));
|
||||
|
||||
|
||||
/* Process file */
|
||||
if (!process_file(filename)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
if (num_words == 0) {
|
||||
fprintf(stderr, "No words found in file\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/* Assign ranks by frequency */
|
||||
assign_ranks();
|
||||
|
||||
|
||||
/* Inverse mode: find longest excerpt with limited vocabulary */
|
||||
if (max_vocab_mode > 0) {
|
||||
find_longest_excerpt(max_vocab_mode);
|
||||
|
||||
|
||||
/* Dump vocabulary if requested */
|
||||
if (dump_vocab) {
|
||||
if (dump_max_rank == 0) dump_max_rank = max_vocab_mode;
|
||||
dump_vocabulary(dump_max_rank);
|
||||
}
|
||||
|
||||
|
||||
cleanup();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Normal mode: find optimal excerpts */
|
||||
ExcerptResult *results = malloc(max_length * sizeof(ExcerptResult));
|
||||
if (!results) {
|
||||
@ -483,12 +483,12 @@ int main(int argc, char *argv[]) {
|
||||
cleanup();
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
find_optimal_excerpts(max_length, results);
|
||||
|
||||
|
||||
/* Print results */
|
||||
print_results(results, max_length);
|
||||
|
||||
|
||||
/* Dump vocabulary if requested */
|
||||
if (dump_vocab) {
|
||||
/* If no max_rank specified, use the max from the excerpt */
|
||||
@ -499,10 +499,10 @@ int main(int argc, char *argv[]) {
|
||||
dump_vocabulary(dump_max_rank);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Cleanup */
|
||||
free(results);
|
||||
cleanup();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
16
python_pkg/word_frequency/analyzer.py
Normal file → Executable file
16
python_pkg/word_frequency/analyzer.py
Normal file → Executable file
@ -21,10 +21,10 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -143,7 +143,9 @@ def format_results(
|
||||
# Data rows
|
||||
for word, count in items:
|
||||
percentage = (count / total_words) * 100
|
||||
lines.append(f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%")
|
||||
lines.append(
|
||||
f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%"
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@ -242,15 +244,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(result, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(result) # noqa: T201
|
||||
print(result)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
197
python_pkg/word_frequency/anki_generator.py
Normal file → Executable file
197
python_pkg/word_frequency/anki_generator.py
Normal file → Executable file
@ -25,29 +25,30 @@ Output:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
try:
|
||||
from python_pkg.word_frequency.analyzer import read_file
|
||||
from python_pkg.word_frequency.translator import (
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
from python_pkg.word_frequency.analyzer import read_file
|
||||
except ImportError:
|
||||
from translator import detect_language, translate_words_batch
|
||||
from analyzer import read_file
|
||||
from translator import detect_language, translate_words_batch
|
||||
|
||||
|
||||
# Path to C vocabulary_curve executable
|
||||
C_EXECUTABLE = Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
C_EXECUTABLE = (
|
||||
Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
)
|
||||
|
||||
|
||||
class VocabWord(NamedTuple):
|
||||
@ -59,7 +60,9 @@ class VocabWord(NamedTuple):
|
||||
context: str
|
||||
|
||||
|
||||
def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool = False) -> str:
|
||||
def run_vocabulary_curve(
|
||||
filepath: Path, max_length: int, *, dump_vocab: bool = False
|
||||
) -> str:
|
||||
"""Run the C vocabulary_curve executable.
|
||||
|
||||
Args:
|
||||
@ -94,7 +97,9 @@ def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool =
|
||||
return result.stdout
|
||||
|
||||
|
||||
def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab: bool = False) -> str:
|
||||
def run_vocabulary_curve_inverse(
|
||||
filepath: Path, max_vocab: int, *, dump_vocab: bool = False
|
||||
) -> str:
|
||||
"""Run the C vocabulary_curve executable in inverse mode.
|
||||
|
||||
Args:
|
||||
@ -129,7 +134,9 @@ def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab:
|
||||
return result.stdout
|
||||
|
||||
|
||||
def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[str, int]]]:
|
||||
def parse_inverse_mode_output(
|
||||
output: str,
|
||||
) -> tuple[str, int, int, list[tuple[str, int]]]:
|
||||
"""Parse output from vocabulary_curve inverse mode.
|
||||
|
||||
Args:
|
||||
@ -146,12 +153,12 @@ def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[st
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
line = line.strip()
|
||||
|
||||
|
||||
if line.startswith("LONGEST EXCERPT:"):
|
||||
parts = line.split()
|
||||
if len(parts) >= 3:
|
||||
excerpt_length = int(parts[2])
|
||||
|
||||
|
||||
elif line.startswith("Excerpt:"):
|
||||
# Next line(s) contain the excerpt
|
||||
i += 1
|
||||
@ -167,7 +174,7 @@ def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[st
|
||||
excerpt_parts.append(next_line)
|
||||
i += 1
|
||||
excerpt = " ".join(excerpt_parts)
|
||||
|
||||
|
||||
elif line.startswith("Rarest word used:"):
|
||||
# Parse "word (#rank)"
|
||||
match = re.search(r"\(#(\d+)\)", line)
|
||||
@ -194,7 +201,9 @@ def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[st
|
||||
return excerpt, excerpt_length, max_rank_used, all_vocab
|
||||
|
||||
|
||||
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
||||
def parse_vocabulary_curve_output(
|
||||
output: str, target_length: int
|
||||
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
||||
"""Parse output from vocabulary_curve to get words needed.
|
||||
|
||||
Args:
|
||||
@ -328,8 +337,8 @@ def generate_anki_deck(
|
||||
lines: list[str] = []
|
||||
|
||||
# Add Anki headers
|
||||
lines.append(f"#separator:semicolon")
|
||||
lines.append(f"#html:true")
|
||||
lines.append("#separator:semicolon")
|
||||
lines.append("#html:true")
|
||||
lines.append(f"#deck:{deck_name}")
|
||||
lines.append(f"#tags:vocabulary {source_lang}")
|
||||
if include_context:
|
||||
@ -351,11 +360,15 @@ def generate_anki_deck(
|
||||
if most_frequent != rarest:
|
||||
pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
|
||||
excerpt_escaped = pattern_rare.sub(r"<b>\1</b>", excerpt_escaped)
|
||||
pattern_freq = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||
pattern_freq = re.compile(
|
||||
rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
|
||||
)
|
||||
excerpt_escaped = pattern_freq.sub(r"<i>\1</i>", excerpt_escaped)
|
||||
else:
|
||||
# Same word is both most and least frequent - use bold+italic
|
||||
pattern = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||
pattern = re.compile(
|
||||
rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
|
||||
)
|
||||
excerpt_escaped = pattern.sub(r"<b><i>\1</i></b>", excerpt_escaped)
|
||||
lines.append(f"📖 TARGET EXCERPT;{excerpt_escaped};#0")
|
||||
|
||||
@ -391,7 +404,9 @@ def generate_anki_deck(
|
||||
context_escaped = pattern.sub(f"<b>{word}</b>", context_escaped)
|
||||
else:
|
||||
context_escaped = ""
|
||||
lines.append(f"{word_escaped};{translation_escaped};#{rank};{context_escaped}")
|
||||
lines.append(
|
||||
f"{word_escaped};{translation_escaped};#{rank};{context_escaped}"
|
||||
)
|
||||
else:
|
||||
lines.append(f"{word_escaped};{translation_escaped};#{rank}")
|
||||
|
||||
@ -415,6 +430,7 @@ def get_cached_excerpt(
|
||||
return None
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_vocab_curve_cache
|
||||
|
||||
return get_vocab_curve_cache().get(filepath, length)
|
||||
except ImportError:
|
||||
return None
|
||||
@ -433,6 +449,7 @@ def cache_excerpt(
|
||||
"""
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_vocab_curve_cache
|
||||
|
||||
get_vocab_curve_cache().set(filepath, length, excerpt, words)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -464,6 +481,7 @@ def get_cached_deck(
|
||||
return None
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_anki_deck_cache
|
||||
|
||||
return get_anki_deck_cache().get(
|
||||
filepath, length, target_lang, include_context, all_vocab
|
||||
)
|
||||
@ -497,6 +515,7 @@ def cache_deck(
|
||||
"""
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_anki_deck_cache
|
||||
|
||||
get_anki_deck_cache().set(
|
||||
filepath,
|
||||
length,
|
||||
@ -568,7 +587,9 @@ def generate_flashcards(
|
||||
# Run vocabulary curve analysis with vocab dump for all words
|
||||
output = run_vocabulary_curve(filepath, excerpt_length, dump_vocab=all_vocab)
|
||||
# Parse the output (now includes all vocabulary from C)
|
||||
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(output, excerpt_length)
|
||||
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(
|
||||
output, excerpt_length
|
||||
)
|
||||
|
||||
if not excerpt_words:
|
||||
raise ValueError(f"No words found for excerpt length {excerpt_length}")
|
||||
@ -671,9 +692,11 @@ def generate_flashcards_inverse(
|
||||
|
||||
# Run vocabulary curve in inverse mode
|
||||
output = run_vocabulary_curve_inverse(filepath, max_vocab, dump_vocab=True)
|
||||
|
||||
|
||||
# Parse the output
|
||||
excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(output)
|
||||
excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(
|
||||
output
|
||||
)
|
||||
|
||||
if excerpt_length == 0:
|
||||
raise ValueError(
|
||||
@ -686,10 +709,12 @@ def generate_flashcards_inverse(
|
||||
|
||||
# Use all vocabulary up to max_vocab
|
||||
words_with_ranks = all_vocab_words
|
||||
|
||||
|
||||
# Find words that appear in the excerpt (for highlighting)
|
||||
excerpt_word_set = set(excerpt.lower().split())
|
||||
excerpt_words = [(w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set]
|
||||
excerpt_words = [
|
||||
(w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set
|
||||
]
|
||||
|
||||
# Get contexts if requested
|
||||
contexts = None
|
||||
@ -835,13 +860,13 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
from cache import get_all_cache_stats
|
||||
except ImportError:
|
||||
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||
print("Cache module not available", file=sys.stderr)
|
||||
return 1
|
||||
stats = get_all_cache_stats()
|
||||
print("Cache Statistics") # noqa: T201
|
||||
print("=" * 50) # noqa: T201
|
||||
print("Cache Statistics")
|
||||
print("=" * 50)
|
||||
for cache_name, cache_stats in stats.items():
|
||||
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||
print(f"\n{cache_name.upper()}:")
|
||||
for key, value in cache_stats.items():
|
||||
if key == "cache_size_bytes":
|
||||
if value < 1024:
|
||||
@ -850,9 +875,9 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
size_str = f"{value / 1024:.1f} KB"
|
||||
else:
|
||||
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||
print(f" {key}: {size_str}") # noqa: T201
|
||||
print(f" {key}: {size_str}")
|
||||
else:
|
||||
print(f" {key}: {value}") # noqa: T201
|
||||
print(f" {key}: {value}")
|
||||
return 0
|
||||
|
||||
if args.clear_cache:
|
||||
@ -862,10 +887,10 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
from cache import clear_all_caches
|
||||
except ImportError:
|
||||
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||
print("Cache module not available", file=sys.stderr)
|
||||
return 1
|
||||
clear_all_caches()
|
||||
print("All caches cleared.") # noqa: T201
|
||||
print("All caches cleared.")
|
||||
return 0
|
||||
|
||||
# Validate required arguments for main functionality
|
||||
@ -879,63 +904,67 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
filepath = Path(args.file)
|
||||
if not filepath.exists():
|
||||
print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found: {args.file}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# INVERSE MODE: --max-vocab
|
||||
if args.max_vocab is not None:
|
||||
if not args.quiet:
|
||||
print(f"Analyzing {filepath.name}...") # noqa: T201
|
||||
print(f"Finding longest excerpt using top {args.max_vocab} words...") # noqa: T201
|
||||
print(f"Analyzing {filepath.name}...")
|
||||
print(f"Finding longest excerpt using top {args.max_vocab} words...")
|
||||
|
||||
# Generate flashcards in inverse mode
|
||||
anki_content, excerpt, excerpt_length, num_words, max_rank_used = generate_flashcards_inverse(
|
||||
filepath,
|
||||
args.max_vocab,
|
||||
source_lang=args.source_lang,
|
||||
target_lang=args.target_lang,
|
||||
include_context=args.include_context,
|
||||
deck_name=args.deck_name,
|
||||
no_translate=args.no_translate,
|
||||
force=args.force,
|
||||
anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
|
||||
generate_flashcards_inverse(
|
||||
filepath,
|
||||
args.max_vocab,
|
||||
source_lang=args.source_lang,
|
||||
target_lang=args.target_lang,
|
||||
include_context=args.include_context,
|
||||
deck_name=args.deck_name,
|
||||
no_translate=args.no_translate,
|
||||
force=args.force,
|
||||
)
|
||||
)
|
||||
|
||||
# Determine output path
|
||||
if args.output:
|
||||
output_path = Path(args.output)
|
||||
else:
|
||||
output_path = filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
|
||||
output_path = (
|
||||
filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
|
||||
)
|
||||
|
||||
# Write output
|
||||
output_path.write_text(anki_content, encoding="utf-8")
|
||||
|
||||
if not args.quiet:
|
||||
print("") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print(f"Learning: top {args.max_vocab} words") # noqa: T201
|
||||
print(f"Longest excerpt you can understand: {excerpt_length} words") # noqa: T201
|
||||
print(f' "{excerpt}"') # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print(f"Rarest word in excerpt: #{max_rank_used}") # noqa: T201
|
||||
print(f"Flashcards: {num_words}") # noqa: T201
|
||||
print(f"Output file: {output_path}") # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print("To import into Anki:") # noqa: T201
|
||||
print(" 1. Open Anki") # noqa: T201
|
||||
print(" 2. File -> Import") # noqa: T201
|
||||
print(f" 3. Select: {output_path}") # noqa: T201
|
||||
print(" 4. Click Import") # noqa: T201
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
|
||||
print("=" * 60)
|
||||
print(f"Learning: top {args.max_vocab} words")
|
||||
print(f"Longest excerpt you can understand: {excerpt_length} words")
|
||||
print(f' "{excerpt}"')
|
||||
print()
|
||||
print(f"Rarest word in excerpt: #{max_rank_used}")
|
||||
print(f"Flashcards: {num_words}")
|
||||
print(f"Output file: {output_path}")
|
||||
print()
|
||||
print("To import into Anki:")
|
||||
print(" 1. Open Anki")
|
||||
print(" 2. File -> Import")
|
||||
print(f" 3. Select: {output_path}")
|
||||
print(" 4. Click Import")
|
||||
else:
|
||||
print(output_path) # noqa: T201
|
||||
print(output_path)
|
||||
|
||||
return 0
|
||||
|
||||
# NORMAL MODE: --length
|
||||
if not args.quiet:
|
||||
print(f"Analyzing {filepath.name}...") # noqa: T201
|
||||
print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201
|
||||
print(f"Analyzing {filepath.name}...")
|
||||
print(f"Finding vocabulary for {args.length}-word excerpt...")
|
||||
|
||||
# Generate flashcards
|
||||
anki_content, excerpt, num_words, max_rank = generate_flashcards(
|
||||
@ -960,38 +989,38 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
output_path.write_text(anki_content, encoding="utf-8")
|
||||
|
||||
if not args.quiet:
|
||||
print("") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print("FLASHCARD GENERATION COMPLETE") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print(f"Excerpt to understand ({args.length} words):") # noqa: T201
|
||||
print(f' "{excerpt}"') # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print(f"Max word rank needed: #{max_rank}") # noqa: T201
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("FLASHCARD GENERATION COMPLETE")
|
||||
print("=" * 60)
|
||||
print(f"Excerpt to understand ({args.length} words):")
|
||||
print(f' "{excerpt}"')
|
||||
print()
|
||||
print(f"Max word rank needed: #{max_rank}")
|
||||
if args.excerpt_words_only:
|
||||
print(f"Flashcards: {num_words} (excerpt words only)") # noqa: T201
|
||||
print(f"Flashcards: {num_words} (excerpt words only)")
|
||||
else:
|
||||
print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})") # noqa: T201
|
||||
print(f"Output file: {output_path}") # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print("To import into Anki:") # noqa: T201
|
||||
print(" 1. Open Anki") # noqa: T201
|
||||
print(" 2. File -> Import") # noqa: T201
|
||||
print(f" 3. Select: {output_path}") # noqa: T201
|
||||
print(" 4. Click Import") # noqa: T201
|
||||
print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})")
|
||||
print(f"Output file: {output_path}")
|
||||
print()
|
||||
print("To import into Anki:")
|
||||
print(" 1. Open Anki")
|
||||
print(" 2. File -> Import")
|
||||
print(f" 3. Select: {output_path}")
|
||||
print(" 4. Click Import")
|
||||
else:
|
||||
print(output_path) # noqa: T201
|
||||
print(output_path)
|
||||
|
||||
return 0
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error running vocabulary_curve: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error running vocabulary_curve: {e}", file=sys.stderr)
|
||||
return 1
|
||||
except ValueError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
|
||||
55
python_pkg/word_frequency/cache.py
Normal file → Executable file
55
python_pkg/word_frequency/cache.py
Normal file → Executable file
@ -15,10 +15,7 @@ import hashlib
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
from typing import Any
|
||||
|
||||
# Default cache directory
|
||||
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
||||
@ -88,7 +85,9 @@ class TranslationCache:
|
||||
if self._cache is None:
|
||||
if self.cache_file.exists():
|
||||
try:
|
||||
self._cache = json.loads(self.cache_file.read_text(encoding="utf-8"))
|
||||
self._cache = json.loads(
|
||||
self.cache_file.read_text(encoding="utf-8")
|
||||
)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
self._cache = {}
|
||||
else:
|
||||
@ -122,9 +121,7 @@ class TranslationCache:
|
||||
"""
|
||||
return f"{source_lang}:{target_lang}:{word.lower()}"
|
||||
|
||||
def get(
|
||||
self, word: str, source_lang: str, target_lang: str
|
||||
) -> str | None:
|
||||
def get(self, word: str, source_lang: str, target_lang: str) -> str | None:
|
||||
"""Get cached translation.
|
||||
|
||||
Args:
|
||||
@ -140,8 +137,13 @@ class TranslationCache:
|
||||
return cache.get(key)
|
||||
|
||||
def set(
|
||||
self, word: str, source_lang: str, target_lang: str, translation: str,
|
||||
*, auto_save: bool = False,
|
||||
self,
|
||||
word: str,
|
||||
source_lang: str,
|
||||
target_lang: str,
|
||||
translation: str,
|
||||
*,
|
||||
auto_save: bool = False,
|
||||
) -> None:
|
||||
"""Store translation in cache.
|
||||
|
||||
@ -525,7 +527,7 @@ _anki_deck_cache: AnkiDeckCache | None = None
|
||||
|
||||
def get_translation_cache() -> TranslationCache:
|
||||
"""Get the global translation cache instance."""
|
||||
global _translation_cache # noqa: PLW0603
|
||||
global _translation_cache
|
||||
if _translation_cache is None:
|
||||
_translation_cache = TranslationCache()
|
||||
return _translation_cache
|
||||
@ -533,7 +535,7 @@ def get_translation_cache() -> TranslationCache:
|
||||
|
||||
def get_vocab_curve_cache() -> VocabCurveCache:
|
||||
"""Get the global vocabulary curve cache instance."""
|
||||
global _vocab_curve_cache # noqa: PLW0603
|
||||
global _vocab_curve_cache
|
||||
if _vocab_curve_cache is None:
|
||||
_vocab_curve_cache = VocabCurveCache()
|
||||
return _vocab_curve_cache
|
||||
@ -541,7 +543,7 @@ def get_vocab_curve_cache() -> VocabCurveCache:
|
||||
|
||||
def get_anki_deck_cache() -> AnkiDeckCache:
|
||||
"""Get the global Anki deck cache instance."""
|
||||
global _anki_deck_cache # noqa: PLW0603
|
||||
global _anki_deck_cache
|
||||
if _anki_deck_cache is None:
|
||||
_anki_deck_cache = AnkiDeckCache()
|
||||
return _anki_deck_cache
|
||||
@ -576,12 +578,8 @@ def main() -> int:
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Manage word frequency caches")
|
||||
parser.add_argument(
|
||||
"--stats", action="store_true", help="Show cache statistics"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--clear", action="store_true", help="Clear all caches"
|
||||
)
|
||||
parser.add_argument("--stats", action="store_true", help="Show cache statistics")
|
||||
parser.add_argument("--clear", action="store_true", help="Clear all caches")
|
||||
parser.add_argument(
|
||||
"--clear-translations", action="store_true", help="Clear translation cache"
|
||||
)
|
||||
@ -596,30 +594,30 @@ def main() -> int:
|
||||
|
||||
if args.clear:
|
||||
clear_all_caches()
|
||||
print("All caches cleared.") # noqa: T201
|
||||
print("All caches cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_translations:
|
||||
get_translation_cache().clear()
|
||||
print("Translation cache cleared.") # noqa: T201
|
||||
print("Translation cache cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_excerpts:
|
||||
get_vocab_curve_cache().clear()
|
||||
print("Excerpt cache cleared.") # noqa: T201
|
||||
print("Excerpt cache cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_anki:
|
||||
get_anki_deck_cache().clear()
|
||||
print("Anki deck cache cleared.") # noqa: T201
|
||||
print("Anki deck cache cleared.")
|
||||
return 0
|
||||
|
||||
# Default: show stats
|
||||
stats = get_all_cache_stats()
|
||||
print("Cache Statistics") # noqa: T201
|
||||
print("=" * 50) # noqa: T201
|
||||
print("Cache Statistics")
|
||||
print("=" * 50)
|
||||
for cache_name, cache_stats in stats.items():
|
||||
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||
print(f"\n{cache_name.upper()}:")
|
||||
for key, value in cache_stats.items():
|
||||
if key == "cache_size_bytes":
|
||||
# Format as human-readable
|
||||
@ -629,13 +627,14 @@ def main() -> int:
|
||||
size_str = f"{value / 1024:.1f} KB"
|
||||
else:
|
||||
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||
print(f" {key}: {size_str}") # noqa: T201
|
||||
print(f" {key}: {size_str}")
|
||||
else:
|
||||
print(f" {key}: {value}") # noqa: T201
|
||||
print(f" {key}: {value}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
22
python_pkg/word_frequency/excerpt_finder.py
Normal file → Executable file
22
python_pkg/word_frequency/excerpt_finder.py
Normal file → Executable file
@ -21,8 +21,8 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
try:
|
||||
@ -81,7 +81,9 @@ def find_best_excerpt(
|
||||
target_set = {w.lower() for w in target_words}
|
||||
|
||||
# Use sliding window to find the best excerpt
|
||||
results: list[tuple[int, int, float, int]] = [] # (match_count, -start, percentage, start)
|
||||
results: list[
|
||||
tuple[int, int, float, int]
|
||||
] = [] # (match_count, -start, percentage, start)
|
||||
|
||||
# Count matches in first window
|
||||
current_matches = sum(1 for w in words[:excerpt_length] if w in target_set)
|
||||
@ -219,9 +221,11 @@ def format_excerpt_results(
|
||||
for i, result in enumerate(results, 1):
|
||||
if len(results) > 1:
|
||||
lines.append(f"=== Result #{i} ===")
|
||||
lines.append(f"Excerpt: \"{result.excerpt}\"")
|
||||
lines.append(f'Excerpt: "{result.excerpt}"')
|
||||
lines.append(f"Word position: {result.start_index} - {result.end_index - 1}")
|
||||
lines.append(f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)")
|
||||
lines.append(
|
||||
f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
@ -325,7 +329,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
target_words = [w.strip() for w in words_content.splitlines() if w.strip()]
|
||||
|
||||
if not target_words:
|
||||
print("Error: No target words provided", file=sys.stderr) # noqa: T201
|
||||
print("Error: No target words provided", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Find excerpts
|
||||
@ -343,15 +347,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
157
python_pkg/word_frequency/learning_pipe.py
Normal file → Executable file
157
python_pkg/word_frequency/learning_pipe.py
Normal file → Executable file
@ -31,15 +31,14 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
try:
|
||||
from python_pkg.word_frequency.analyzer import analyze_text, read_file
|
||||
from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
|
||||
from python_pkg.word_frequency.translator import (
|
||||
TranslationResult,
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
@ -47,7 +46,6 @@ except ModuleNotFoundError:
|
||||
from analyzer import analyze_text, read_file # type: ignore[import-not-found]
|
||||
from excerpt_finder import find_best_excerpt # type: ignore[import-not-found]
|
||||
from translator import ( # type: ignore[import-not-found]
|
||||
TranslationResult,
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
@ -57,19 +55,108 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
# Common stopwords for various languages (can be overridden with --stopwords)
|
||||
DEFAULT_STOPWORDS_EN = frozenset({
|
||||
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
|
||||
"of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
|
||||
"being", "have", "has", "had", "do", "does", "did", "will", "would",
|
||||
"could", "should", "may", "might", "must", "shall", "can", "this",
|
||||
"that", "these", "those", "i", "you", "he", "she", "it", "we", "they",
|
||||
"me", "him", "her", "us", "them", "my", "your", "his", "its", "our",
|
||||
"their", "what", "which", "who", "whom", "whose", "where", "when",
|
||||
"why", "how", "all", "each", "every", "both", "few", "more", "most",
|
||||
"other", "some", "such", "no", "nor", "not", "only", "own", "same",
|
||||
"so", "than", "too", "very", "just", "as", "if", "then", "because",
|
||||
"while", "although", "though", "after", "before", "when", "where",
|
||||
})
|
||||
DEFAULT_STOPWORDS_EN = frozenset(
|
||||
{
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"by",
|
||||
"from",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"shall",
|
||||
"can",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"i",
|
||||
"you",
|
||||
"he",
|
||||
"she",
|
||||
"it",
|
||||
"we",
|
||||
"they",
|
||||
"me",
|
||||
"him",
|
||||
"her",
|
||||
"us",
|
||||
"them",
|
||||
"my",
|
||||
"your",
|
||||
"his",
|
||||
"its",
|
||||
"our",
|
||||
"their",
|
||||
"what",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"where",
|
||||
"when",
|
||||
"why",
|
||||
"how",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"more",
|
||||
"most",
|
||||
"other",
|
||||
"some",
|
||||
"such",
|
||||
"no",
|
||||
"nor",
|
||||
"not",
|
||||
"only",
|
||||
"own",
|
||||
"same",
|
||||
"so",
|
||||
"than",
|
||||
"too",
|
||||
"very",
|
||||
"just",
|
||||
"as",
|
||||
"if",
|
||||
"then",
|
||||
"because",
|
||||
"while",
|
||||
"although",
|
||||
"though",
|
||||
"after",
|
||||
"before",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
||||
@ -89,7 +176,9 @@ def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
||||
return frozenset()
|
||||
|
||||
content = path.read_text(encoding="utf-8")
|
||||
return frozenset(word.strip().lower() for word in content.splitlines() if word.strip())
|
||||
return frozenset(
|
||||
word.strip().lower() for word in content.splitlines() if word.strip()
|
||||
)
|
||||
|
||||
|
||||
def generate_learning_lesson(
|
||||
@ -151,9 +240,13 @@ def generate_learning_lesson(
|
||||
lines.append("=" * 70)
|
||||
lines.append("LANGUAGE LEARNING LESSON")
|
||||
lines.append("=" * 70)
|
||||
lines.append(f"Source text: {total_words:,} total words, {len(word_counts):,} unique words")
|
||||
lines.append(
|
||||
f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
|
||||
)
|
||||
if all_stopwords:
|
||||
lines.append(f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words")
|
||||
lines.append(
|
||||
f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
|
||||
)
|
||||
else:
|
||||
lines.append(f"Vocabulary words: {len(filtered_words):,}")
|
||||
|
||||
@ -196,7 +289,9 @@ def generate_learning_lesson(
|
||||
cumulative_words.extend(word for word, _ in batch_words)
|
||||
|
||||
lines.append("-" * 70)
|
||||
lines.append(f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}")
|
||||
lines.append(
|
||||
f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
|
||||
)
|
||||
lines.append("-" * 70)
|
||||
lines.append("")
|
||||
|
||||
@ -230,7 +325,9 @@ def generate_learning_lesson(
|
||||
else:
|
||||
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
||||
percentage = (count / total_words) * 100
|
||||
lines.append(f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)")
|
||||
lines.append(
|
||||
f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
||||
)
|
||||
|
||||
lines.append("")
|
||||
|
||||
@ -239,7 +336,9 @@ def generate_learning_lesson(
|
||||
word_counts[word] for word in cumulative_words if word in word_counts
|
||||
)
|
||||
coverage = (cumulative_count / total_words) * 100
|
||||
lines.append(f"After learning these words, you'll recognize ~{coverage:.1f}% of the text")
|
||||
lines.append(
|
||||
f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
# Find excerpts using cumulative words
|
||||
@ -256,8 +355,10 @@ def generate_learning_lesson(
|
||||
)
|
||||
|
||||
for j, excerpt in enumerate(excerpts, 1):
|
||||
lines.append(f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):")
|
||||
lines.append(f" \"{excerpt.excerpt}\"")
|
||||
lines.append(
|
||||
f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
|
||||
)
|
||||
lines.append(f' "{excerpt.excerpt}"')
|
||||
lines.append("")
|
||||
|
||||
# Summary
|
||||
@ -431,15 +532,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
# Output
|
||||
if args.output:
|
||||
Path(args.output).write_text(lesson, encoding="utf-8")
|
||||
print(f"Lesson written to {args.output}") # noqa: T201
|
||||
print(f"Lesson written to {args.output}")
|
||||
else:
|
||||
print(lesson) # noqa: T201
|
||||
print(lesson)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
@ -76,17 +76,18 @@ try_pipx_install() {
|
||||
# Create/use a virtualenv for argostranslate
|
||||
setup_venv() {
|
||||
# Use /tmp for pip cache to avoid home directory quota issues
|
||||
export PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
|
||||
PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
|
||||
export PIP_CACHE_DIR
|
||||
mkdir -p "$PIP_CACHE_DIR"
|
||||
|
||||
|
||||
if [[ ! -d "$VENV_DIR" ]]; then
|
||||
log_info "Creating virtual environment at $VENV_DIR..."
|
||||
python -m venv "$VENV_DIR"
|
||||
fi
|
||||
|
||||
|
||||
# Activate venv
|
||||
source "$VENV_DIR/bin/activate"
|
||||
|
||||
|
||||
# Install argostranslate if not present
|
||||
if ! python -c "import argostranslate" 2>/dev/null; then
|
||||
log_info "Installing argostranslate in virtualenv (this may take a few minutes)..."
|
||||
@ -95,18 +96,18 @@ setup_venv() {
|
||||
pip install --progress-bar on --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
|
||||
pip install --progress-bar on --no-cache-dir argostranslate
|
||||
fi
|
||||
|
||||
|
||||
# Install langdetect for auto language detection
|
||||
if ! python -c "import langdetect" 2>/dev/null; then
|
||||
log_info "Installing langdetect for auto language detection..."
|
||||
pip install --progress-bar on --no-cache-dir langdetect
|
||||
fi
|
||||
|
||||
|
||||
# Also ensure other dependencies are available
|
||||
if [[ -f "${SCRIPT_DIR}/../../requirements.txt" ]]; then
|
||||
pip install --progress-bar on --no-cache-dir -r "${SCRIPT_DIR}/../../requirements.txt" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
|
||||
log_info "Using virtualenv: $VENV_DIR"
|
||||
}
|
||||
|
||||
@ -115,7 +116,7 @@ main() {
|
||||
# Resolve file paths to absolute before changing directories
|
||||
local resolved_args
|
||||
resolved_args=$(resolve_file_paths)
|
||||
|
||||
|
||||
# If --no-translate is passed, we don't need argostranslate
|
||||
if [[ " $* " =~ " --no-translate " ]] || [[ " $* " =~ " -n " ]]; then
|
||||
log_info "Running without translation (--no-translate)"
|
||||
@ -123,7 +124,7 @@ main() {
|
||||
python -m python_pkg.word_frequency.anki_generator $resolved_args
|
||||
exit $?
|
||||
fi
|
||||
|
||||
|
||||
# Check if argostranslate is already available
|
||||
if check_argos; then
|
||||
log_info "argostranslate is available"
|
||||
@ -131,20 +132,20 @@ main() {
|
||||
python -m python_pkg.word_frequency.anki_generator $resolved_args
|
||||
exit $?
|
||||
fi
|
||||
|
||||
|
||||
log_warn "argostranslate not found in system Python"
|
||||
|
||||
|
||||
# Try pipx first (cleaner system-wide installation)
|
||||
if try_pipx_install && check_argos; then
|
||||
cd "$(dirname "$SCRIPT_DIR")" && cd ..
|
||||
python -m python_pkg.word_frequency.anki_generator $resolved_args
|
||||
exit $?
|
||||
fi
|
||||
|
||||
|
||||
# Fall back to virtualenv
|
||||
log_info "Setting up virtualenv with argostranslate..."
|
||||
setup_venv
|
||||
|
||||
|
||||
# Run in venv context
|
||||
cd "$(dirname "$SCRIPT_DIR")" && cd ..
|
||||
python -m python_pkg.word_frequency.anki_generator $resolved_args
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,12 +1,12 @@
|
||||
|
||||
|
||||
|
||||
|
||||
Caesar: Bellum Gallicum I
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
C. IVLI CAESARIS COMMENTARIORVM DE BELLO GALLICO LIBER PRIMVS
|
||||
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
|
||||
[1] 1 Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae, aliam Aquitani, tertiam qui ipsorum lingua Celtae, nostra Galli appellantur. 2 Hi omnes lingua, institutis, legibus inter se differunt. Gallos ab Aquitanis Garumna flumen, a Belgis Matrona et Sequana dividit. 3 Horum omnium fortissimi sunt Belgae, propterea quod a cultu atque humanitate provinciae longissime absunt, minimeque ad eos mercatores saepe commeant atque ea quae ad effeminandos animos pertinent important, 4 proximique sunt Germanis, qui trans Rhenum incolunt, quibuscum continenter bellum gerunt. Qua de causa Helvetii quoque reliquos Gallos virtute praecedunt, quod fere cotidianis proeliis cum Germanis contendunt, cum aut suis finibus eos prohibent aut ipsi in eorum finibus bellum gerunt. 5 Eorum una pars, quam Gallos obtinere dictum est, initium capit a flumine Rhodano, continetur Garumna flumine, Oceano, finibus Belgarum, attingit etiam ab Sequanis et Helvetiis flumen Rhenum, vergit ad septentriones. 6 Belgae ab extremis Galliae finibus oriuntur, pertinent ad inferiorem partem fluminis Rheni, spectant in septentrionem et orientem solem. 7 Aquitania a Garumna flumine ad Pyrenaeos montes et eam partem Oceani quae est ad Hispaniam pertinet; spectat inter occasum solis et septentriones.
|
||||
@ -63,8 +63,7 @@ C. IVLI CAESARIS COMMENTARIORVM DE BELLO GALLICO LIBER PRIMVS
|
||||
[52] 1 Caesar singulis legionibus singulos legatos et quaestorem praefecit, uti eos testes suae quisque virtutis haberet; 2 ipse a dextro cornu, quod eam partem minime firmam hostium esse animadverterat, proelium commisit. 3 Ita nostri acriter in hostes signo dato impetum fecerunt itaque hostes repente celeriterque procurrerunt, ut spatium pila in hostes coiciendi non daretur. 4 Relictis pilis comminus gladiis pugnatum est. At Germani celeriter ex consuetudine sua phalange facta impetus gladiorum exceperunt. 5 Reperti sunt complures nostri qui in phalanga insilirent et scuta manibus revellerent et desuper vulnerarent. 6 Cum hostium acies a sinistro cornu pulsa atque in fugam coniecta esset, a dextro cornu vehementer multitudine suorum nostram aciem premebant. 7 Id cum animadvertisset P. Crassus adulescens, qui equitatui praeerat, quod expeditior erat quam ii qui inter aciem versabantur, tertiam aciem laborantibus nostris subsidio misit.
|
||||
[53] 1 Ita proelium restitutum est, atque omnes hostes terga verterunt nec prius fugere destiterunt quam ad flumen Rhenum milia passuum ex eo loco circiter L pervenerunt. 2 Ibi perpauci aut viribus confisi tranare contenderunt aut lintribus inventis sibi salutem reppererunt. 3 In his fuit Ariovistus, qui naviculam deligatam ad ripam nactus ea profugit; reliquos omnes consecuti equites nostri interfecerunt. 4 Duae fuerunt Ariovisti uxores, una Sueba natione, quam domo secum eduxerat, altera Norica, regis Voccionis soror, quam in Gallia duxerat a fratre missam: utraque in ea fuga periit; duae filiae: harum altera occisa, altera capta est. 5 C. Valerius Procillus, cum a custodibus in fuga trinis catenis vinctus traheretur, in ipsum Caesarem hostes equitatu insequentem incidit. 6 Quae quidem res Caesari non minorem quam ipsa victoria voluptatem attulit, quod hominem honestissimum provinciae Galliae, suum familiarem et hospitem, ereptum ex manibus hostium sibi restitutum videbat neque eius calamitate de tanta voluptate et gratulatione quicquam fortuna deminuerat. 7 Is se praesente de se ter sortibus consultum dicebat, utrum igni statim necaretur an in aliud tempus reservaretur: sortium beneficio se esse incolumem. 8 Item M. Metius repertus et ad eum reductus est.
|
||||
[54] 1 Hoc proelio trans Rhenum nuntiato, Suebi, qui ad ripas Rheni venerant, domum reverti coeperunt; quos ubi qui proximi Rhenum incolunt perterritos senserunt, insecuti magnum ex iis numerum occiderunt. 2 Caesar una aestate duobus maximis bellis confectis maturius paulo quam tempus anni postulabat in hiberna in Sequanos exercitum deduxit; hibernis Labienum praeposuit; 3 ipse in citeriorem Galliam ad conventus agendos profectus est.
|
||||
|
||||
|
||||
Caesar
|
||||
The Latin Library
|
||||
The Classics Page
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -195,4 +195,4 @@ cię;you;#188;...ty jesteś jak zdrowie Ile <b>cię</b> trzeba cenić ten tylko
|
||||
koniec;end;#189;...Maleski z Mickiewiczem a na <b>koniec</b> Hrabia Z Soplicą i czytając...
|
||||
których;which;#190;...zabawia przez rozmowy grzeczne Z <b>których</b> by wychowanie poznano stołeczne To...
|
||||
okiem;eye;#191;...końca doczekał nareszcie Wbiega i <b>okiem</b> chciwie ściany starodawne Ogląda czule...
|
||||
rejent;notary;#192;...kusego charta Którego posiadaniem pan <b>rejent</b> się szczycił I utrzymywał że...
|
||||
rejent;notary;#192;...kusego charta Którego posiadaniem pan <b>rejent</b> się szczycił I utrzymywał że...
|
||||
|
||||
@ -2037,4 +2037,4 @@ stany;states;#1612
|
||||
wieśniaczki;villagers;#1612
|
||||
jenerale;jenerale;#1612
|
||||
pl;en;#1612
|
||||
lektury;reading;#1612
|
||||
lektury;reading;#1612
|
||||
|
||||
@ -30,4 +30,4 @@ damom;ladies;#1355;...zaszczyt należy Idąc kłaniał się <b>damom</b> starcom
|
||||
kołem;wheel;#1671;...weszli w porządku i stanęli <b>kołem</b> Podkomorzy najwyższe brał miejsce za...
|
||||
najwyższe;highest;#1672;...porządku i stanęli kołem Podkomorzy <b>najwyższe</b> brał miejsce za stołem Z...
|
||||
zaszczyt;honor;#2110;...mu i z urzędu ten <b>zaszczyt</b> należy Idąc kłaniał się damom...
|
||||
starcom;old men;#2111;...należy Idąc kłaniał się damom <b>starcom</b> i młodzieży Przy nim stał...
|
||||
starcom;old men;#2111;...należy Idąc kłaniał się damom <b>starcom</b> i młodzieży Przy nim stał...
|
||||
|
||||
@ -533,4 +533,4 @@ nimi;with;#495
|
||||
konewka;water;#495
|
||||
czoło;forehead;#495
|
||||
głupi;Stupid.;#495
|
||||
maćka;macaw;#495
|
||||
maćka;macaw;#495
|
||||
|
||||
@ -2,11 +2,9 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
@ -20,9 +18,6 @@ from python_pkg.word_frequency.analyzer import (
|
||||
read_files,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
|
||||
class TestExtractWords:
|
||||
"""Tests for extract_words function."""
|
||||
|
||||
19
python_pkg/word_frequency/tests/test_anki_generator.py
Normal file → Executable file
19
python_pkg/word_frequency/tests/test_anki_generator.py
Normal file → Executable file
@ -12,17 +12,16 @@ try:
|
||||
from python_pkg.word_frequency.anki_generator import (
|
||||
find_word_contexts,
|
||||
generate_anki_deck,
|
||||
generate_flashcards,
|
||||
main,
|
||||
parse_vocabulary_curve_output,
|
||||
)
|
||||
except ImportError:
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
||||
from python_pkg.word_frequency.anki_generator import (
|
||||
find_word_contexts,
|
||||
generate_anki_deck,
|
||||
generate_flashcards,
|
||||
main,
|
||||
parse_vocabulary_curve_output,
|
||||
)
|
||||
@ -78,19 +77,25 @@ class TestParseVocabularyCurveOutput:
|
||||
|
||||
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 1."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 1
|
||||
)
|
||||
assert excerpt == "the"
|
||||
assert excerpt_words == [("the", 1)]
|
||||
|
||||
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 2."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 2
|
||||
)
|
||||
assert excerpt == "the dog"
|
||||
assert excerpt_words == [("the", 1), ("dog", 2)]
|
||||
|
||||
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 3."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 3
|
||||
)
|
||||
assert excerpt == "the quick fox"
|
||||
assert len(excerpt_words) == 3
|
||||
assert ("the", 1) in excerpt_words
|
||||
@ -99,7 +104,9 @@ class TestParseVocabularyCurveOutput:
|
||||
|
||||
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for non-existent length."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 100
|
||||
)
|
||||
assert excerpt == ""
|
||||
assert excerpt_words == []
|
||||
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
@ -324,7 +324,7 @@ class TestMain:
|
||||
"2",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# Excerpt should include context words
|
||||
@ -342,7 +342,7 @@ class TestMain:
|
||||
"--case-sensitive",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# Only lowercase "hello" should match
|
||||
|
||||
@ -2,20 +2,20 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||
from python_pkg.word_frequency.learning_pipe import (
|
||||
DEFAULT_STOPWORDS_EN,
|
||||
generate_learning_lesson,
|
||||
load_stopwords,
|
||||
main,
|
||||
)
|
||||
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||
from python_pkg.word_frequency.translator import TranslationResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -25,12 +25,13 @@ if TYPE_CHECKING:
|
||||
@pytest.fixture
|
||||
def mock_translation() -> Generator[MagicMock, None, None]:
|
||||
"""Mock translation to avoid requiring argostranslate."""
|
||||
|
||||
def fake_batch_translate(
|
||||
words: list[str],
|
||||
from_lang: str,
|
||||
to_lang: str,
|
||||
*,
|
||||
use_cache: bool = True, # noqa: ARG001
|
||||
use_cache: bool = True,
|
||||
) -> list[TranslationResult]:
|
||||
"""Fake batch translation that returns word with prefix."""
|
||||
return [
|
||||
@ -274,7 +275,7 @@ class TestMain:
|
||||
"5",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# "hello" should be filtered by custom stopwords
|
||||
@ -392,12 +393,17 @@ class TestTranslationIntegration:
|
||||
text_file.write_text("hello world hello world hello", encoding="utf-8")
|
||||
|
||||
# Should work with mocked translation
|
||||
result = main([
|
||||
"--file", str(text_file),
|
||||
"--translate-from", "en",
|
||||
"--translate-to", "es",
|
||||
"--no-default-stopwords",
|
||||
])
|
||||
result = main(
|
||||
[
|
||||
"--file",
|
||||
str(text_file),
|
||||
"--translate-from",
|
||||
"en",
|
||||
"--translate-to",
|
||||
"es",
|
||||
"--no-default-stopwords",
|
||||
]
|
||||
)
|
||||
|
||||
assert result == 0
|
||||
|
||||
@ -437,4 +443,3 @@ class TestTranslationIntegration:
|
||||
# Should not have translation output
|
||||
assert "Translation:" not in result
|
||||
assert "Detected language:" not in result
|
||||
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
@ -52,7 +52,9 @@ class ArgosAvailableMock:
|
||||
Works whether argos is installed or not by patching sys.modules.
|
||||
"""
|
||||
|
||||
def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None:
|
||||
def __init__(
|
||||
self, translate_returns: str | list[str] | Exception | None = None
|
||||
) -> None:
|
||||
"""Initialize with return values for translate()."""
|
||||
self.translate_returns = translate_returns
|
||||
self.mock_translate_fn = MagicMock()
|
||||
@ -69,9 +71,9 @@ class ArgosAvailableMock:
|
||||
translator._argos_available = True
|
||||
|
||||
# Set up translate return value
|
||||
if isinstance(self.translate_returns, Exception):
|
||||
self.mock_translate_fn.side_effect = self.translate_returns
|
||||
elif isinstance(self.translate_returns, list):
|
||||
if isinstance(self.translate_returns, Exception) or isinstance(
|
||||
self.translate_returns, list
|
||||
):
|
||||
self.mock_translate_fn.side_effect = self.translate_returns
|
||||
elif self.translate_returns is not None:
|
||||
self.mock_translate_fn.return_value = self.translate_returns
|
||||
@ -102,9 +104,9 @@ class ArgosAvailableMock:
|
||||
translator, "_ensure_language_pair", lambda f, t: None
|
||||
)
|
||||
|
||||
self._sys_modules_patcher.start()
|
||||
self._ensure_patcher.start()
|
||||
self._lang_patcher.start()
|
||||
self._sys_modules_patcher.start() # type: ignore[union-attr]
|
||||
self._ensure_patcher.start() # type: ignore[union-attr]
|
||||
self._lang_patcher.start() # type: ignore[union-attr]
|
||||
|
||||
return self.mock_translate_fn
|
||||
|
||||
@ -291,9 +293,7 @@ class TestTranslateWordsBatch:
|
||||
"""Test batch translation falls back to individual when result count mismatches."""
|
||||
words = ["one", "two", "three", "four"]
|
||||
# First call (batch) returns wrong count, subsequent calls are individual
|
||||
with ArgosAvailableMock(
|
||||
["wrong", "uno", "dos", "tres", "cuatro"]
|
||||
) as mock:
|
||||
with ArgosAvailableMock(["wrong", "uno", "dos", "tres", "cuatro"]) as mock:
|
||||
results = translate_words_batch(words, "en", "es", use_cache=False)
|
||||
|
||||
assert len(results) == 4
|
||||
@ -425,7 +425,8 @@ class TestGetInstalledLanguages:
|
||||
# We need to mock the translate module's get_installed_languages
|
||||
mock_translate_module = MagicMock()
|
||||
mock_translate_module.get_installed_languages.return_value = [
|
||||
mock_lang1, mock_lang2
|
||||
mock_lang1,
|
||||
mock_lang2,
|
||||
]
|
||||
mock_package_module = MagicMock()
|
||||
mock_parent = MagicMock()
|
||||
@ -507,9 +508,7 @@ class TestMain:
|
||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||
assert result == 1
|
||||
|
||||
def test_list_languages_empty(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_list_languages_empty(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test listing languages when none installed."""
|
||||
mock_translate_module = MagicMock()
|
||||
mock_translate_module.get_installed_languages.return_value = []
|
||||
@ -572,9 +571,7 @@ class TestMain:
|
||||
assert "en" in captured.out
|
||||
assert "English" in captured.out
|
||||
|
||||
def test_translate_single_text(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_single_text(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test translating single text."""
|
||||
with ArgosAvailableMock("hola"):
|
||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||
@ -584,9 +581,7 @@ class TestMain:
|
||||
assert "hello" in captured.out
|
||||
assert "hola" in captured.out
|
||||
|
||||
def test_translate_multiple_words(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_multiple_words(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test translating multiple words."""
|
||||
with ArgosAvailableMock(["hola", "mundo"]):
|
||||
result = main(["--words", "hello", "world", "--from", "en", "--to", "es"])
|
||||
@ -613,9 +608,7 @@ class TestMain:
|
||||
assert "world" in captured.out
|
||||
assert "goodbye" in captured.out
|
||||
|
||||
def test_translate_file_not_found(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test error when words file not found."""
|
||||
with ArgosAvailableMock():
|
||||
result = main(
|
||||
@ -654,9 +647,7 @@ class TestMain:
|
||||
assert "hello" in content
|
||||
assert "hola" in content
|
||||
|
||||
def test_no_input_shows_help(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_no_input_shows_help(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test that no input shows help."""
|
||||
with ArgosAvailableMock():
|
||||
result = main([])
|
||||
|
||||
88
python_pkg/word_frequency/tests/test_vocabulary_curve.py
Normal file → Executable file
88
python_pkg/word_frequency/tests/test_vocabulary_curve.py
Normal file → Executable file
@ -3,14 +3,18 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
# Path to the C executable
|
||||
C_EXECUTABLE = Path(__file__).parent.parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
C_EXECUTABLE = (
|
||||
Path(__file__).parent.parent.parent.parent
|
||||
/ "C"
|
||||
/ "vocabulary_curve"
|
||||
/ "vocabulary_curve"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -40,12 +44,13 @@ def run_vocabulary_curve(filepath: Path, max_length: int = 10) -> str:
|
||||
"""Run the vocabulary_curve executable and return output."""
|
||||
if not C_EXECUTABLE.exists():
|
||||
pytest.skip(f"C executable not found at {C_EXECUTABLE}")
|
||||
|
||||
|
||||
result = subprocess.run(
|
||||
[str(C_EXECUTABLE), str(filepath), str(max_length)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
check=False,
|
||||
)
|
||||
return result.stdout
|
||||
|
||||
@ -54,19 +59,19 @@ def extract_excerpts_from_output(output: str) -> list[tuple[int, str]]:
|
||||
"""Extract (length, excerpt) pairs from output."""
|
||||
excerpts = []
|
||||
lines = output.split("\n")
|
||||
|
||||
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
if line.strip().startswith("[Length "):
|
||||
# Parse length
|
||||
length = int(line.split("]")[0].split()[-1])
|
||||
|
||||
|
||||
# Find excerpt line
|
||||
i += 1
|
||||
while i < len(lines) and not lines[i].strip().startswith("Excerpt:"):
|
||||
i += 1
|
||||
|
||||
|
||||
if i < len(lines):
|
||||
excerpt_line = lines[i].strip()
|
||||
# Extract text between quotes
|
||||
@ -76,7 +81,7 @@ def extract_excerpts_from_output(output: str) -> list[tuple[int, str]]:
|
||||
excerpt = excerpt_line[start:end]
|
||||
excerpts.append((length, excerpt))
|
||||
i += 1
|
||||
|
||||
|
||||
return excerpts
|
||||
|
||||
|
||||
@ -86,19 +91,20 @@ class TestExcerptValidity:
|
||||
def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
|
||||
"""Test that each excerpt can be found in the source text as contiguous words."""
|
||||
import re
|
||||
|
||||
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=10)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
|
||||
assert len(excerpts) > 0, "No excerpts found in output"
|
||||
|
||||
|
||||
for length, excerpt in excerpts:
|
||||
excerpt_words = excerpt.lower().split()
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - len(excerpt_words) + 1):
|
||||
if source_words[i:i+len(excerpt_words)] == excerpt_words:
|
||||
if source_words[i : i + len(excerpt_words)] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
assert found, (
|
||||
@ -111,29 +117,30 @@ class TestExcerptValidity:
|
||||
"""Test that excerpt has the expected number of words."""
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=10)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
|
||||
for length, excerpt in excerpts:
|
||||
word_count = len(excerpt.split())
|
||||
assert word_count == length, (
|
||||
f"Expected {length} words, got {word_count}: '{excerpt}'"
|
||||
)
|
||||
assert (
|
||||
word_count == length
|
||||
), f"Expected {length} words, got {word_count}: '{excerpt}'"
|
||||
|
||||
def test_polish_excerpt_exists_in_source(self, polish_text_file: Path) -> None:
|
||||
"""Test Polish text excerpts are found in source as contiguous words."""
|
||||
import re
|
||||
|
||||
source_text = polish_text_file.read_text(encoding="utf-8").lower()
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
output = run_vocabulary_curve(polish_text_file, max_length=8)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
|
||||
assert len(excerpts) > 0, "No excerpts found in output"
|
||||
|
||||
|
||||
for length, excerpt in excerpts:
|
||||
excerpt_words = excerpt.lower().split()
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - len(excerpt_words) + 1):
|
||||
if source_words[i:i+len(excerpt_words)] == excerpt_words:
|
||||
if source_words[i : i + len(excerpt_words)] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
assert found, (
|
||||
@ -145,24 +152,24 @@ class TestExcerptValidity:
|
||||
def test_excerpt_is_contiguous(self, sample_text_file: Path) -> None:
|
||||
"""Test that excerpt words appear contiguously in source."""
|
||||
import re
|
||||
|
||||
|
||||
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
||||
# Extract words from source
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=5)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
|
||||
for length, excerpt in excerpts:
|
||||
excerpt_words = excerpt.lower().split()
|
||||
|
||||
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - length + 1):
|
||||
if source_words[i:i+length] == excerpt_words:
|
||||
if source_words[i : i + length] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
|
||||
|
||||
assert found, (
|
||||
f"Excerpt words not found as contiguous sequence:\n"
|
||||
f" Excerpt: {excerpt_words}\n"
|
||||
@ -176,14 +183,14 @@ class TestVocabNeeded:
|
||||
def test_length_1_needs_vocab_1(self, sample_text_file: Path) -> None:
|
||||
"""Test that a 1-word excerpt needs exactly 1 vocabulary word."""
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=1)
|
||||
|
||||
|
||||
assert "[Length 1] Vocab needed: 1" in output
|
||||
|
||||
def test_vocab_needed_increases_monotonically(self, sample_text_file: Path) -> None:
|
||||
"""Test that vocab needed never decreases as length increases."""
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=10)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
extract_excerpts_from_output(output)
|
||||
|
||||
# Extract vocab needed from output
|
||||
prev_vocab = 0
|
||||
for line in output.split("\n"):
|
||||
@ -192,9 +199,9 @@ class TestVocabNeeded:
|
||||
parts = line.split("Vocab needed:")
|
||||
if len(parts) > 1:
|
||||
vocab = int(parts[1].split()[0])
|
||||
assert vocab >= prev_vocab, (
|
||||
f"Vocab decreased from {prev_vocab} to {vocab}"
|
||||
)
|
||||
assert (
|
||||
vocab >= prev_vocab
|
||||
), f"Vocab decreased from {prev_vocab} to {vocab}"
|
||||
prev_vocab = vocab
|
||||
|
||||
|
||||
@ -205,25 +212,26 @@ class TestEdgeCases:
|
||||
"""Test handling of empty file."""
|
||||
filepath = tmp_path / "empty.txt"
|
||||
filepath.write_text("", encoding="utf-8")
|
||||
|
||||
|
||||
if not C_EXECUTABLE.exists():
|
||||
pytest.skip("C executable not found")
|
||||
|
||||
|
||||
result = subprocess.run(
|
||||
[str(C_EXECUTABLE), str(filepath), "5"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
|
||||
assert result.returncode != 0 or "No words" in result.stderr
|
||||
|
||||
def test_single_word_file(self, tmp_path: Path) -> None:
|
||||
"""Test file with single word."""
|
||||
filepath = tmp_path / "single.txt"
|
||||
filepath.write_text("hello", encoding="utf-8")
|
||||
|
||||
|
||||
output = run_vocabulary_curve(filepath, max_length=5)
|
||||
|
||||
|
||||
assert "[Length 1] Vocab needed: 1" in output
|
||||
# Should only have 1 length since there's only 1 word
|
||||
assert "[Length 2]" not in output
|
||||
@ -232,9 +240,9 @@ class TestEdgeCases:
|
||||
"""Test file with same word repeated."""
|
||||
filepath = tmp_path / "repeated.txt"
|
||||
filepath.write_text("hello hello hello hello hello", encoding="utf-8")
|
||||
|
||||
|
||||
output = run_vocabulary_curve(filepath, max_length=5)
|
||||
|
||||
|
||||
# All excerpts should need only 1 vocabulary word
|
||||
for i in range(1, 6):
|
||||
assert f"[Length {i}] Vocab needed: 1" in output
|
||||
|
||||
127
python_pkg/word_frequency/translator.py
Normal file → Executable file
127
python_pkg/word_frequency/translator.py
Normal file → Executable file
@ -29,8 +29,8 @@ Dependencies (install one):
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -50,6 +50,7 @@ def _check_cuda_available() -> bool:
|
||||
if _gpu_available is None:
|
||||
try:
|
||||
import torch
|
||||
|
||||
_gpu_available = torch.cuda.is_available()
|
||||
except ImportError:
|
||||
_gpu_available = False
|
||||
@ -58,41 +59,42 @@ def _check_cuda_available() -> bool:
|
||||
|
||||
def _init_gpu_if_available() -> None:
|
||||
"""Initialize GPU for argostranslate if CUDA is available.
|
||||
|
||||
|
||||
Raises:
|
||||
RuntimeError: If CUDA is available but GPU initialization fails.
|
||||
"""
|
||||
global _gpu_initialized
|
||||
if _gpu_initialized:
|
||||
return
|
||||
|
||||
|
||||
if not _check_cuda_available():
|
||||
_gpu_initialized = True
|
||||
return
|
||||
|
||||
|
||||
import sys
|
||||
|
||||
print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
|
||||
|
||||
|
||||
try:
|
||||
import torch
|
||||
import ctranslate2
|
||||
|
||||
|
||||
# Force CTranslate2 to use CUDA
|
||||
device_count = torch.cuda.device_count()
|
||||
if device_count == 0:
|
||||
raise RuntimeError("CUDA reports available but no GPU devices found")
|
||||
|
||||
|
||||
device_name = torch.cuda.get_device_name(0)
|
||||
print(f" Using GPU: {device_name}", file=sys.stderr)
|
||||
|
||||
|
||||
# Set environment variable to force GPU usage in argos
|
||||
import os
|
||||
|
||||
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
|
||||
|
||||
|
||||
_gpu_initialized = True
|
||||
print(" GPU acceleration enabled.", file=sys.stderr)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"CUDA is available but GPU initialization failed: {e}\n"
|
||||
@ -106,9 +108,10 @@ def _check_argos() -> bool:
|
||||
global _argos_available
|
||||
if _argos_available is None:
|
||||
try:
|
||||
import argostranslate.package # noqa: F401
|
||||
import argostranslate.translate # noqa: F401
|
||||
import argostranslate.package
|
||||
import argostranslate.translate
|
||||
|
||||
_ = (argostranslate.package, argostranslate.translate)
|
||||
_argos_available = True
|
||||
except ImportError:
|
||||
_argos_available = False
|
||||
@ -120,8 +123,9 @@ def _check_deep_translator() -> bool:
|
||||
global _deep_translator_available
|
||||
if _deep_translator_available is None:
|
||||
try:
|
||||
from deep_translator import GoogleTranslator # noqa: F401
|
||||
from deep_translator import GoogleTranslator
|
||||
|
||||
_ = GoogleTranslator
|
||||
_deep_translator_available = True
|
||||
except ImportError:
|
||||
_deep_translator_available = False
|
||||
@ -133,8 +137,9 @@ def _check_langdetect() -> bool:
|
||||
global _langdetect_available
|
||||
if _langdetect_available is None:
|
||||
try:
|
||||
import langdetect # noqa: F401
|
||||
import langdetect
|
||||
|
||||
_ = langdetect
|
||||
_langdetect_available = True
|
||||
except ImportError:
|
||||
_langdetect_available = False
|
||||
@ -227,7 +232,7 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
||||
results: dict[str, bool] = {}
|
||||
|
||||
# Update package index
|
||||
print("Updating package index...") # noqa: T201
|
||||
print("Updating package index...")
|
||||
argostranslate.package.update_package_index()
|
||||
available = argostranslate.package.get_available_packages()
|
||||
|
||||
@ -250,13 +255,13 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
||||
if pkg_key in available_lookup:
|
||||
pkg = available_lookup[pkg_key]
|
||||
try:
|
||||
print(f"Downloading {from_code} -> {to_code}...") # noqa: T201
|
||||
print(f"Downloading {from_code} -> {to_code}...")
|
||||
argostranslate.package.install_from_path(pkg.download())
|
||||
results[key] = True
|
||||
print(f" ✓ Installed {from_code} -> {to_code}") # noqa: T201
|
||||
print(f" ✓ Installed {from_code} -> {to_code}")
|
||||
except Exception as e: # noqa: BLE001
|
||||
results[key] = False
|
||||
print(f" ✗ Failed {from_code} -> {to_code}: {e}") # noqa: T201
|
||||
print(f" ✗ Failed {from_code} -> {to_code}: {e}")
|
||||
else:
|
||||
# Package not available
|
||||
results[key] = False
|
||||
@ -276,7 +281,7 @@ def _ensure_argos_installed() -> None:
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
print("argostranslate not found. Attempting to install...") # noqa: T201
|
||||
print("argostranslate not found. Attempting to install...")
|
||||
try:
|
||||
subprocess.run(
|
||||
[sys.executable, "-m", "pip", "install", "argostranslate"],
|
||||
@ -284,11 +289,11 @@ def _ensure_argos_installed() -> None:
|
||||
capture_output=True,
|
||||
)
|
||||
# Reset the check flag and verify
|
||||
global _argos_available # noqa: PLW0603
|
||||
global _argos_available
|
||||
_argos_available = None
|
||||
if not _check_argos():
|
||||
raise ImportError("argostranslate installation succeeded but import failed")
|
||||
print("argostranslate installed successfully.") # noqa: T201
|
||||
print("argostranslate installed successfully.")
|
||||
except subprocess.CalledProcessError as e:
|
||||
error_msg = e.stderr.decode() if e.stderr else str(e)
|
||||
raise ImportError(
|
||||
@ -354,7 +359,7 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
||||
)
|
||||
|
||||
print(
|
||||
f" Downloading package (~50-100MB, this may take a minute)...",
|
||||
" Downloading package (~50-100MB, this may take a minute)...",
|
||||
file=sys.stderr,
|
||||
)
|
||||
download_path = pkg.download()
|
||||
@ -391,6 +396,7 @@ def translate_word(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
cache = get_translation_cache()
|
||||
cached = cache.get(word, from_lang, to_lang)
|
||||
if cached is not None:
|
||||
@ -415,6 +421,7 @@ def translate_word(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
get_translation_cache().set(word, from_lang, to_lang, translated)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -454,7 +461,9 @@ def translate_words(
|
||||
Returns:
|
||||
List of TranslationResult for each word.
|
||||
"""
|
||||
return [translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words]
|
||||
return [
|
||||
translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words
|
||||
]
|
||||
|
||||
|
||||
def _translate_batch_worker(
|
||||
@ -464,20 +473,20 @@ def _translate_batch_worker(
|
||||
batch_idx: int,
|
||||
) -> tuple[int, dict[str, str]]:
|
||||
"""Worker function to translate a batch of words.
|
||||
|
||||
|
||||
Args:
|
||||
batch_words: Words to translate in this batch.
|
||||
from_lang: Source language code.
|
||||
to_lang: Target language code.
|
||||
batch_idx: Index of this batch (for ordering results).
|
||||
|
||||
|
||||
Returns:
|
||||
Tuple of (batch_idx, translations dict).
|
||||
"""
|
||||
import argostranslate.translate
|
||||
|
||||
|
||||
translations: dict[str, str] = {}
|
||||
|
||||
|
||||
# Batch translate by joining with newlines
|
||||
batch_text = "\n".join(batch_words)
|
||||
translated_batch = argostranslate.translate.translate(
|
||||
@ -492,11 +501,9 @@ def _translate_batch_worker(
|
||||
else:
|
||||
# Fall back to individual translation for this batch
|
||||
for word in batch_words:
|
||||
translated = argostranslate.translate.translate(
|
||||
word, from_lang, to_lang
|
||||
)
|
||||
translated = argostranslate.translate.translate(word, from_lang, to_lang)
|
||||
translations[word.lower()] = translated
|
||||
|
||||
|
||||
return batch_idx, translations
|
||||
|
||||
|
||||
@ -530,7 +537,7 @@ def translate_words_batch(
|
||||
|
||||
# Ensure argos is installed (will raise if it can't be)
|
||||
_ensure_argos_installed()
|
||||
|
||||
|
||||
# Initialize GPU if available (will raise if CUDA available but fails)
|
||||
_init_gpu_if_available()
|
||||
|
||||
@ -544,6 +551,7 @@ def translate_words_batch(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
cache = get_translation_cache()
|
||||
cached_results = cache.get_many(list(words), from_lang, to_lang)
|
||||
except ImportError:
|
||||
@ -560,7 +568,7 @@ def translate_words_batch(
|
||||
import sys
|
||||
|
||||
num_to_translate = len(words_to_translate)
|
||||
|
||||
|
||||
# Check if GPU is being used
|
||||
gpu_status = " (GPU)" if _gpu_available else " (CPU)"
|
||||
print(
|
||||
@ -574,31 +582,31 @@ def translate_words_batch(
|
||||
BATCH_SIZE = 100
|
||||
batches: list[list[str]] = []
|
||||
for i in range(0, num_to_translate, BATCH_SIZE):
|
||||
batches.append(words_to_translate[i:i + BATCH_SIZE])
|
||||
|
||||
batches.append(words_to_translate[i : i + BATCH_SIZE])
|
||||
|
||||
total_batches = len(batches)
|
||||
|
||||
|
||||
# Sequential translation with progress
|
||||
# (argostranslate is not thread-safe - uses global model)
|
||||
for batch_idx, batch_words in enumerate(batches):
|
||||
words_done = (batch_idx + 1) * BATCH_SIZE
|
||||
words_done = min(words_done, num_to_translate)
|
||||
pct = int(words_done / num_to_translate * 100)
|
||||
|
||||
|
||||
print(
|
||||
f" [{pct:3d}%] Translating batch {batch_idx + 1}/{total_batches} "
|
||||
f"({words_done}/{num_to_translate} words)...",
|
||||
file=sys.stderr,
|
||||
flush=True,
|
||||
)
|
||||
|
||||
|
||||
_, batch_translations = _translate_batch_worker(
|
||||
batch_words, from_lang, to_lang, batch_idx
|
||||
)
|
||||
new_translations.update(batch_translations)
|
||||
|
||||
print(f" Translation complete.", file=sys.stderr, flush=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
|
||||
print(" Translation complete.", file=sys.stderr, flush=True)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Translation failed for {from_lang} -> {to_lang}: {e}"
|
||||
) from e
|
||||
@ -607,6 +615,7 @@ def translate_words_batch(
|
||||
if use_cache and new_translations:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
get_translation_cache().set_many(new_translations, from_lang, to_lang)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -670,7 +679,9 @@ def format_translations(
|
||||
# Data
|
||||
for r in results:
|
||||
if r.success:
|
||||
lines.append(f"{r.source_word:<{max_source}} {r.translated_word:<{max_trans}}")
|
||||
lines.append(
|
||||
f"{r.source_word:<{max_source}} {r.translated_word:<{max_trans}}"
|
||||
)
|
||||
elif show_errors:
|
||||
error_msg = f"[Error: {r.error}]" if r.error else "[Failed]"
|
||||
lines.append(f"{r.source_word:<{max_source}} {error_msg}")
|
||||
@ -771,7 +782,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
# Check if argostranslate is available
|
||||
if not _check_argos():
|
||||
print( # noqa: T201
|
||||
print(
|
||||
"Error: argostranslate is not installed.\n"
|
||||
"Install it with: pip install argostranslate",
|
||||
file=sys.stderr,
|
||||
@ -782,30 +793,30 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
if args.list_languages:
|
||||
langs = get_installed_languages()
|
||||
if not langs:
|
||||
print("No languages installed.") # noqa: T201
|
||||
print("Download some with: --download en es pl de fr") # noqa: T201
|
||||
print("No languages installed.")
|
||||
print("Download some with: --download en es pl de fr")
|
||||
else:
|
||||
print("Installed languages:") # noqa: T201
|
||||
print("Installed languages:")
|
||||
for code, name in sorted(langs):
|
||||
print(f" {code}: {name}") # noqa: T201
|
||||
print(f" {code}: {name}")
|
||||
return 0
|
||||
|
||||
# Handle list-available
|
||||
if args.list_available:
|
||||
packages = get_available_packages()
|
||||
if not packages:
|
||||
print("No packages available (check internet connection).") # noqa: T201
|
||||
print("No packages available (check internet connection).")
|
||||
else:
|
||||
print("Available language packages:") # noqa: T201
|
||||
print("Available language packages:")
|
||||
for from_code, from_name, to_code, to_name in sorted(packages):
|
||||
print(f" {from_code} ({from_name}) -> {to_code} ({to_name})") # noqa: T201
|
||||
print(f" {from_code} ({from_name}) -> {to_code} ({to_name})")
|
||||
return 0
|
||||
|
||||
# Handle download
|
||||
if args.download:
|
||||
results = download_languages(args.download)
|
||||
success_count = sum(1 for v in results.values() if v)
|
||||
print(f"\nDownloaded {success_count}/{len(results)} language pairs.") # noqa: T201
|
||||
download_results = download_languages(args.download)
|
||||
success_count = sum(1 for v in download_results.values() if v)
|
||||
print(f"\nDownloaded {success_count}/{len(download_results)} language pairs.")
|
||||
return 0 if success_count > 0 else 1
|
||||
|
||||
# Handle translation
|
||||
@ -819,7 +830,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
content = read_file(args.words_file)
|
||||
words = [w.strip() for w in content.splitlines() if w.strip()]
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File not found: {args.words_file}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found: {args.words_file}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if not words:
|
||||
@ -830,7 +841,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
results = translate_words_batch(words, args.from_lang, args.to_lang)
|
||||
except ImportError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
output = format_translations(results)
|
||||
@ -838,9 +849,9 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
# Output
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Translations written to {args.output}") # noqa: T201
|
||||
print(f"Translations written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
# Return error if any translation failed
|
||||
if any(not r.success for r in results):
|
||||
|
||||
15
python_pkg/word_frequency/vocabulary_curve.py
Normal file → Executable file
15
python_pkg/word_frequency/vocabulary_curve.py
Normal file → Executable file
@ -14,8 +14,8 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -112,6 +112,7 @@ def find_optimal_excerpts(
|
||||
|
||||
# Extract all words from text (preserving order)
|
||||
import re
|
||||
|
||||
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
||||
if not case_sensitive:
|
||||
all_words = [w.lower() for w in all_words]
|
||||
@ -213,7 +214,9 @@ def format_results(
|
||||
if results:
|
||||
final = results[-1]
|
||||
lines.append(f"To understand a {final.excerpt_length}-word excerpt,")
|
||||
lines.append(f"you need to learn at minimum {final.min_vocab_needed} top words.")
|
||||
lines.append(
|
||||
f"you need to learn at minimum {final.min_vocab_needed} top words."
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@ -301,15 +304,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
Loading…
Reference in New Issue
Block a user