mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 14:23:04 +02:00
Add pre-commit workflow and fix linting violations (#2)
* Initial plan

* Add pre-commit GitHub workflow and fix linting issues

  - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI
  - Fixed mypy type errors in translator.py
  - Fixed shellcheck warning in run_anki_generator.sh
  - Added per-file ignores for word_frequency module legacy code
  - Applied auto-fixes from ruff, ruff-format, autoflake, prettier
  - All pre-commit hooks now passing

  Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

* Make Python scripts with shebangs executable

  - Set executable bit for word_frequency module scripts with shebangs
  - All 30 pre-commit hooks now passing

  Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

* Fix: Restore imports in check functions (autoflake-proof)

  - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect()
  - Used _ = module assignment to prevent autoflake from removing imports
  - These imports test module availability by triggering ImportError if missing
  - All 30 pre-commit hooks now passing

  Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
This commit is contained in:
parent
f48b1dd178
commit
0bf6fd1bb1
16
python_pkg/word_frequency/analyzer.py
Normal file → Executable file
16
python_pkg/word_frequency/analyzer.py
Normal file → Executable file
@ -21,10 +21,10 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -143,7 +143,9 @@ def format_results(
|
||||
# Data rows
|
||||
for word, count in items:
|
||||
percentage = (count / total_words) * 100
|
||||
lines.append(f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%")
|
||||
lines.append(
|
||||
f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%"
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@ -242,15 +244,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(result, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(result) # noqa: T201
|
||||
print(result)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
171
python_pkg/word_frequency/anki_generator.py
Normal file → Executable file
171
python_pkg/word_frequency/anki_generator.py
Normal file → Executable file
@ -25,29 +25,30 @@ Output:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
try:
|
||||
from python_pkg.word_frequency.analyzer import read_file
|
||||
from python_pkg.word_frequency.translator import (
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
from python_pkg.word_frequency.analyzer import read_file
|
||||
except ImportError:
|
||||
from translator import detect_language, translate_words_batch
|
||||
from analyzer import read_file
|
||||
from translator import detect_language, translate_words_batch
|
||||
|
||||
|
||||
# Path to C vocabulary_curve executable
|
||||
C_EXECUTABLE = Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
C_EXECUTABLE = (
|
||||
Path(__file__).parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
)
|
||||
|
||||
|
||||
class VocabWord(NamedTuple):
|
||||
@ -59,7 +60,9 @@ class VocabWord(NamedTuple):
|
||||
context: str
|
||||
|
||||
|
||||
def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool = False) -> str:
|
||||
def run_vocabulary_curve(
|
||||
filepath: Path, max_length: int, *, dump_vocab: bool = False
|
||||
) -> str:
|
||||
"""Run the C vocabulary_curve executable.
|
||||
|
||||
Args:
|
||||
@ -94,7 +97,9 @@ def run_vocabulary_curve(filepath: Path, max_length: int, *, dump_vocab: bool =
|
||||
return result.stdout
|
||||
|
||||
|
||||
def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab: bool = False) -> str:
|
||||
def run_vocabulary_curve_inverse(
|
||||
filepath: Path, max_vocab: int, *, dump_vocab: bool = False
|
||||
) -> str:
|
||||
"""Run the C vocabulary_curve executable in inverse mode.
|
||||
|
||||
Args:
|
||||
@ -129,7 +134,9 @@ def run_vocabulary_curve_inverse(filepath: Path, max_vocab: int, *, dump_vocab:
|
||||
return result.stdout
|
||||
|
||||
|
||||
def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[str, int]]]:
|
||||
def parse_inverse_mode_output(
|
||||
output: str,
|
||||
) -> tuple[str, int, int, list[tuple[str, int]]]:
|
||||
"""Parse output from vocabulary_curve inverse mode.
|
||||
|
||||
Args:
|
||||
@ -194,7 +201,9 @@ def parse_inverse_mode_output(output: str) -> tuple[str, int, int, list[tuple[st
|
||||
return excerpt, excerpt_length, max_rank_used, all_vocab
|
||||
|
||||
|
||||
def parse_vocabulary_curve_output(output: str, target_length: int) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
||||
def parse_vocabulary_curve_output(
|
||||
output: str, target_length: int
|
||||
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
|
||||
"""Parse output from vocabulary_curve to get words needed.
|
||||
|
||||
Args:
|
||||
@ -328,8 +337,8 @@ def generate_anki_deck(
|
||||
lines: list[str] = []
|
||||
|
||||
# Add Anki headers
|
||||
lines.append(f"#separator:semicolon")
|
||||
lines.append(f"#html:true")
|
||||
lines.append("#separator:semicolon")
|
||||
lines.append("#html:true")
|
||||
lines.append(f"#deck:{deck_name}")
|
||||
lines.append(f"#tags:vocabulary {source_lang}")
|
||||
if include_context:
|
||||
@ -351,11 +360,15 @@ def generate_anki_deck(
|
||||
if most_frequent != rarest:
|
||||
pattern_rare = re.compile(rf"\b({re.escape(rarest)})\b", re.IGNORECASE)
|
||||
excerpt_escaped = pattern_rare.sub(r"<b>\1</b>", excerpt_escaped)
|
||||
pattern_freq = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||
pattern_freq = re.compile(
|
||||
rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
|
||||
)
|
||||
excerpt_escaped = pattern_freq.sub(r"<i>\1</i>", excerpt_escaped)
|
||||
else:
|
||||
# Same word is both most and least frequent - use bold+italic
|
||||
pattern = re.compile(rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE)
|
||||
pattern = re.compile(
|
||||
rf"\b({re.escape(most_frequent)})\b", re.IGNORECASE
|
||||
)
|
||||
excerpt_escaped = pattern.sub(r"<b><i>\1</i></b>", excerpt_escaped)
|
||||
lines.append(f"📖 TARGET EXCERPT;{excerpt_escaped};#0")
|
||||
|
||||
@ -391,7 +404,9 @@ def generate_anki_deck(
|
||||
context_escaped = pattern.sub(f"<b>{word}</b>", context_escaped)
|
||||
else:
|
||||
context_escaped = ""
|
||||
lines.append(f"{word_escaped};{translation_escaped};#{rank};{context_escaped}")
|
||||
lines.append(
|
||||
f"{word_escaped};{translation_escaped};#{rank};{context_escaped}"
|
||||
)
|
||||
else:
|
||||
lines.append(f"{word_escaped};{translation_escaped};#{rank}")
|
||||
|
||||
@ -415,6 +430,7 @@ def get_cached_excerpt(
|
||||
return None
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_vocab_curve_cache
|
||||
|
||||
return get_vocab_curve_cache().get(filepath, length)
|
||||
except ImportError:
|
||||
return None
|
||||
@ -433,6 +449,7 @@ def cache_excerpt(
|
||||
"""
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_vocab_curve_cache
|
||||
|
||||
get_vocab_curve_cache().set(filepath, length, excerpt, words)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -464,6 +481,7 @@ def get_cached_deck(
|
||||
return None
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_anki_deck_cache
|
||||
|
||||
return get_anki_deck_cache().get(
|
||||
filepath, length, target_lang, include_context, all_vocab
|
||||
)
|
||||
@ -497,6 +515,7 @@ def cache_deck(
|
||||
"""
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_anki_deck_cache
|
||||
|
||||
get_anki_deck_cache().set(
|
||||
filepath,
|
||||
length,
|
||||
@ -568,7 +587,9 @@ def generate_flashcards(
|
||||
# Run vocabulary curve analysis with vocab dump for all words
|
||||
output = run_vocabulary_curve(filepath, excerpt_length, dump_vocab=all_vocab)
|
||||
# Parse the output (now includes all vocabulary from C)
|
||||
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(output, excerpt_length)
|
||||
excerpt, excerpt_words, all_vocab_words = parse_vocabulary_curve_output(
|
||||
output, excerpt_length
|
||||
)
|
||||
|
||||
if not excerpt_words:
|
||||
raise ValueError(f"No words found for excerpt length {excerpt_length}")
|
||||
@ -673,7 +694,9 @@ def generate_flashcards_inverse(
|
||||
output = run_vocabulary_curve_inverse(filepath, max_vocab, dump_vocab=True)
|
||||
|
||||
# Parse the output
|
||||
excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(output)
|
||||
excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(
|
||||
output
|
||||
)
|
||||
|
||||
if excerpt_length == 0:
|
||||
raise ValueError(
|
||||
@ -689,7 +712,9 @@ def generate_flashcards_inverse(
|
||||
|
||||
# Find words that appear in the excerpt (for highlighting)
|
||||
excerpt_word_set = set(excerpt.lower().split())
|
||||
excerpt_words = [(w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set]
|
||||
excerpt_words = [
|
||||
(w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set
|
||||
]
|
||||
|
||||
# Get contexts if requested
|
||||
contexts = None
|
||||
@ -835,13 +860,13 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
from cache import get_all_cache_stats
|
||||
except ImportError:
|
||||
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||
print("Cache module not available", file=sys.stderr)
|
||||
return 1
|
||||
stats = get_all_cache_stats()
|
||||
print("Cache Statistics") # noqa: T201
|
||||
print("=" * 50) # noqa: T201
|
||||
print("Cache Statistics")
|
||||
print("=" * 50)
|
||||
for cache_name, cache_stats in stats.items():
|
||||
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||
print(f"\n{cache_name.upper()}:")
|
||||
for key, value in cache_stats.items():
|
||||
if key == "cache_size_bytes":
|
||||
if value < 1024:
|
||||
@ -850,9 +875,9 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
size_str = f"{value / 1024:.1f} KB"
|
||||
else:
|
||||
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||
print(f" {key}: {size_str}") # noqa: T201
|
||||
print(f" {key}: {size_str}")
|
||||
else:
|
||||
print(f" {key}: {value}") # noqa: T201
|
||||
print(f" {key}: {value}")
|
||||
return 0
|
||||
|
||||
if args.clear_cache:
|
||||
@ -862,10 +887,10 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
from cache import clear_all_caches
|
||||
except ImportError:
|
||||
print("Cache module not available", file=sys.stderr) # noqa: T201
|
||||
print("Cache module not available", file=sys.stderr)
|
||||
return 1
|
||||
clear_all_caches()
|
||||
print("All caches cleared.") # noqa: T201
|
||||
print("All caches cleared.")
|
||||
return 0
|
||||
|
||||
# Validate required arguments for main functionality
|
||||
@ -879,17 +904,18 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
filepath = Path(args.file)
|
||||
if not filepath.exists():
|
||||
print(f"Error: File not found: {args.file}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found: {args.file}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# INVERSE MODE: --max-vocab
|
||||
if args.max_vocab is not None:
|
||||
if not args.quiet:
|
||||
print(f"Analyzing {filepath.name}...") # noqa: T201
|
||||
print(f"Finding longest excerpt using top {args.max_vocab} words...") # noqa: T201
|
||||
print(f"Analyzing {filepath.name}...")
|
||||
print(f"Finding longest excerpt using top {args.max_vocab} words...")
|
||||
|
||||
# Generate flashcards in inverse mode
|
||||
anki_content, excerpt, excerpt_length, num_words, max_rank_used = generate_flashcards_inverse(
|
||||
anki_content, excerpt, excerpt_length, num_words, max_rank_used = (
|
||||
generate_flashcards_inverse(
|
||||
filepath,
|
||||
args.max_vocab,
|
||||
source_lang=args.source_lang,
|
||||
@ -899,43 +925,46 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
no_translate=args.no_translate,
|
||||
force=args.force,
|
||||
)
|
||||
)
|
||||
|
||||
# Determine output path
|
||||
if args.output:
|
||||
output_path = Path(args.output)
|
||||
else:
|
||||
output_path = filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
|
||||
output_path = (
|
||||
filepath.parent / f"{filepath.stem}_anki_top{args.max_vocab}.txt"
|
||||
)
|
||||
|
||||
# Write output
|
||||
output_path.write_text(anki_content, encoding="utf-8")
|
||||
|
||||
if not args.quiet:
|
||||
print("") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print(f"Learning: top {args.max_vocab} words") # noqa: T201
|
||||
print(f"Longest excerpt you can understand: {excerpt_length} words") # noqa: T201
|
||||
print(f' "{excerpt}"') # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print(f"Rarest word in excerpt: #{max_rank_used}") # noqa: T201
|
||||
print(f"Flashcards: {num_words}") # noqa: T201
|
||||
print(f"Output file: {output_path}") # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print("To import into Anki:") # noqa: T201
|
||||
print(" 1. Open Anki") # noqa: T201
|
||||
print(" 2. File -> Import") # noqa: T201
|
||||
print(f" 3. Select: {output_path}") # noqa: T201
|
||||
print(" 4. Click Import") # noqa: T201
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("FLASHCARD GENERATION COMPLETE (INVERSE MODE)")
|
||||
print("=" * 60)
|
||||
print(f"Learning: top {args.max_vocab} words")
|
||||
print(f"Longest excerpt you can understand: {excerpt_length} words")
|
||||
print(f' "{excerpt}"')
|
||||
print()
|
||||
print(f"Rarest word in excerpt: #{max_rank_used}")
|
||||
print(f"Flashcards: {num_words}")
|
||||
print(f"Output file: {output_path}")
|
||||
print()
|
||||
print("To import into Anki:")
|
||||
print(" 1. Open Anki")
|
||||
print(" 2. File -> Import")
|
||||
print(f" 3. Select: {output_path}")
|
||||
print(" 4. Click Import")
|
||||
else:
|
||||
print(output_path) # noqa: T201
|
||||
print(output_path)
|
||||
|
||||
return 0
|
||||
|
||||
# NORMAL MODE: --length
|
||||
if not args.quiet:
|
||||
print(f"Analyzing {filepath.name}...") # noqa: T201
|
||||
print(f"Finding vocabulary for {args.length}-word excerpt...") # noqa: T201
|
||||
print(f"Analyzing {filepath.name}...")
|
||||
print(f"Finding vocabulary for {args.length}-word excerpt...")
|
||||
|
||||
# Generate flashcards
|
||||
anki_content, excerpt, num_words, max_rank = generate_flashcards(
|
||||
@ -960,38 +989,38 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
output_path.write_text(anki_content, encoding="utf-8")
|
||||
|
||||
if not args.quiet:
|
||||
print("") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print("FLASHCARD GENERATION COMPLETE") # noqa: T201
|
||||
print("=" * 60) # noqa: T201
|
||||
print(f"Excerpt to understand ({args.length} words):") # noqa: T201
|
||||
print(f' "{excerpt}"') # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print(f"Max word rank needed: #{max_rank}") # noqa: T201
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("FLASHCARD GENERATION COMPLETE")
|
||||
print("=" * 60)
|
||||
print(f"Excerpt to understand ({args.length} words):")
|
||||
print(f' "{excerpt}"')
|
||||
print()
|
||||
print(f"Max word rank needed: #{max_rank}")
|
||||
if args.excerpt_words_only:
|
||||
print(f"Flashcards: {num_words} (excerpt words only)") # noqa: T201
|
||||
print(f"Flashcards: {num_words} (excerpt words only)")
|
||||
else:
|
||||
print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})") # noqa: T201
|
||||
print(f"Output file: {output_path}") # noqa: T201
|
||||
print("") # noqa: T201
|
||||
print("To import into Anki:") # noqa: T201
|
||||
print(" 1. Open Anki") # noqa: T201
|
||||
print(" 2. File -> Import") # noqa: T201
|
||||
print(f" 3. Select: {output_path}") # noqa: T201
|
||||
print(" 4. Click Import") # noqa: T201
|
||||
print(f"Flashcards: {num_words} (ALL words rank #1 to #{max_rank})")
|
||||
print(f"Output file: {output_path}")
|
||||
print()
|
||||
print("To import into Anki:")
|
||||
print(" 1. Open Anki")
|
||||
print(" 2. File -> Import")
|
||||
print(f" 3. Select: {output_path}")
|
||||
print(" 4. Click Import")
|
||||
else:
|
||||
print(output_path) # noqa: T201
|
||||
print(output_path)
|
||||
|
||||
return 0
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error running vocabulary_curve: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error running vocabulary_curve: {e}", file=sys.stderr)
|
||||
return 1
|
||||
except ValueError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
|
||||
|
||||
55
python_pkg/word_frequency/cache.py
Normal file → Executable file
55
python_pkg/word_frequency/cache.py
Normal file → Executable file
@ -15,10 +15,7 @@ import hashlib
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
if TYPE_CHECKING:
|
||||
pass
|
||||
from typing import Any
|
||||
|
||||
# Default cache directory
|
||||
DEFAULT_CACHE_DIR = Path.home() / ".cache" / "word_frequency"
|
||||
@ -88,7 +85,9 @@ class TranslationCache:
|
||||
if self._cache is None:
|
||||
if self.cache_file.exists():
|
||||
try:
|
||||
self._cache = json.loads(self.cache_file.read_text(encoding="utf-8"))
|
||||
self._cache = json.loads(
|
||||
self.cache_file.read_text(encoding="utf-8")
|
||||
)
|
||||
except (json.JSONDecodeError, OSError):
|
||||
self._cache = {}
|
||||
else:
|
||||
@ -122,9 +121,7 @@ class TranslationCache:
|
||||
"""
|
||||
return f"{source_lang}:{target_lang}:{word.lower()}"
|
||||
|
||||
def get(
|
||||
self, word: str, source_lang: str, target_lang: str
|
||||
) -> str | None:
|
||||
def get(self, word: str, source_lang: str, target_lang: str) -> str | None:
|
||||
"""Get cached translation.
|
||||
|
||||
Args:
|
||||
@ -140,8 +137,13 @@ class TranslationCache:
|
||||
return cache.get(key)
|
||||
|
||||
def set(
|
||||
self, word: str, source_lang: str, target_lang: str, translation: str,
|
||||
*, auto_save: bool = False,
|
||||
self,
|
||||
word: str,
|
||||
source_lang: str,
|
||||
target_lang: str,
|
||||
translation: str,
|
||||
*,
|
||||
auto_save: bool = False,
|
||||
) -> None:
|
||||
"""Store translation in cache.
|
||||
|
||||
@ -525,7 +527,7 @@ _anki_deck_cache: AnkiDeckCache | None = None
|
||||
|
||||
def get_translation_cache() -> TranslationCache:
|
||||
"""Get the global translation cache instance."""
|
||||
global _translation_cache # noqa: PLW0603
|
||||
global _translation_cache
|
||||
if _translation_cache is None:
|
||||
_translation_cache = TranslationCache()
|
||||
return _translation_cache
|
||||
@ -533,7 +535,7 @@ def get_translation_cache() -> TranslationCache:
|
||||
|
||||
def get_vocab_curve_cache() -> VocabCurveCache:
|
||||
"""Get the global vocabulary curve cache instance."""
|
||||
global _vocab_curve_cache # noqa: PLW0603
|
||||
global _vocab_curve_cache
|
||||
if _vocab_curve_cache is None:
|
||||
_vocab_curve_cache = VocabCurveCache()
|
||||
return _vocab_curve_cache
|
||||
@ -541,7 +543,7 @@ def get_vocab_curve_cache() -> VocabCurveCache:
|
||||
|
||||
def get_anki_deck_cache() -> AnkiDeckCache:
|
||||
"""Get the global Anki deck cache instance."""
|
||||
global _anki_deck_cache # noqa: PLW0603
|
||||
global _anki_deck_cache
|
||||
if _anki_deck_cache is None:
|
||||
_anki_deck_cache = AnkiDeckCache()
|
||||
return _anki_deck_cache
|
||||
@ -576,12 +578,8 @@ def main() -> int:
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Manage word frequency caches")
|
||||
parser.add_argument(
|
||||
"--stats", action="store_true", help="Show cache statistics"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--clear", action="store_true", help="Clear all caches"
|
||||
)
|
||||
parser.add_argument("--stats", action="store_true", help="Show cache statistics")
|
||||
parser.add_argument("--clear", action="store_true", help="Clear all caches")
|
||||
parser.add_argument(
|
||||
"--clear-translations", action="store_true", help="Clear translation cache"
|
||||
)
|
||||
@ -596,30 +594,30 @@ def main() -> int:
|
||||
|
||||
if args.clear:
|
||||
clear_all_caches()
|
||||
print("All caches cleared.") # noqa: T201
|
||||
print("All caches cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_translations:
|
||||
get_translation_cache().clear()
|
||||
print("Translation cache cleared.") # noqa: T201
|
||||
print("Translation cache cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_excerpts:
|
||||
get_vocab_curve_cache().clear()
|
||||
print("Excerpt cache cleared.") # noqa: T201
|
||||
print("Excerpt cache cleared.")
|
||||
return 0
|
||||
|
||||
if args.clear_anki:
|
||||
get_anki_deck_cache().clear()
|
||||
print("Anki deck cache cleared.") # noqa: T201
|
||||
print("Anki deck cache cleared.")
|
||||
return 0
|
||||
|
||||
# Default: show stats
|
||||
stats = get_all_cache_stats()
|
||||
print("Cache Statistics") # noqa: T201
|
||||
print("=" * 50) # noqa: T201
|
||||
print("Cache Statistics")
|
||||
print("=" * 50)
|
||||
for cache_name, cache_stats in stats.items():
|
||||
print(f"\n{cache_name.upper()}:") # noqa: T201
|
||||
print(f"\n{cache_name.upper()}:")
|
||||
for key, value in cache_stats.items():
|
||||
if key == "cache_size_bytes":
|
||||
# Format as human-readable
|
||||
@ -629,13 +627,14 @@ def main() -> int:
|
||||
size_str = f"{value / 1024:.1f} KB"
|
||||
else:
|
||||
size_str = f"{value / (1024 * 1024):.1f} MB"
|
||||
print(f" {key}: {size_str}") # noqa: T201
|
||||
print(f" {key}: {size_str}")
|
||||
else:
|
||||
print(f" {key}: {value}") # noqa: T201
|
||||
print(f" {key}: {value}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
sys.exit(main())
|
||||
|
||||
22
python_pkg/word_frequency/excerpt_finder.py
Normal file → Executable file
22
python_pkg/word_frequency/excerpt_finder.py
Normal file → Executable file
@ -21,8 +21,8 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
try:
|
||||
@ -81,7 +81,9 @@ def find_best_excerpt(
|
||||
target_set = {w.lower() for w in target_words}
|
||||
|
||||
# Use sliding window to find the best excerpt
|
||||
results: list[tuple[int, int, float, int]] = [] # (match_count, -start, percentage, start)
|
||||
results: list[
|
||||
tuple[int, int, float, int]
|
||||
] = [] # (match_count, -start, percentage, start)
|
||||
|
||||
# Count matches in first window
|
||||
current_matches = sum(1 for w in words[:excerpt_length] if w in target_set)
|
||||
@ -219,9 +221,11 @@ def format_excerpt_results(
|
||||
for i, result in enumerate(results, 1):
|
||||
if len(results) > 1:
|
||||
lines.append(f"=== Result #{i} ===")
|
||||
lines.append(f"Excerpt: \"{result.excerpt}\"")
|
||||
lines.append(f'Excerpt: "{result.excerpt}"')
|
||||
lines.append(f"Word position: {result.start_index} - {result.end_index - 1}")
|
||||
lines.append(f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)")
|
||||
lines.append(
|
||||
f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
return "\n".join(lines)
|
||||
@ -325,7 +329,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
target_words = [w.strip() for w in words_content.splitlines() if w.strip()]
|
||||
|
||||
if not target_words:
|
||||
print("Error: No target words provided", file=sys.stderr) # noqa: T201
|
||||
print("Error: No target words provided", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
# Find excerpts
|
||||
@ -343,15 +347,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
157
python_pkg/word_frequency/learning_pipe.py
Normal file → Executable file
157
python_pkg/word_frequency/learning_pipe.py
Normal file → Executable file
@ -31,15 +31,14 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
try:
|
||||
from python_pkg.word_frequency.analyzer import analyze_text, read_file
|
||||
from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
|
||||
from python_pkg.word_frequency.translator import (
|
||||
TranslationResult,
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
@ -47,7 +46,6 @@ except ModuleNotFoundError:
|
||||
from analyzer import analyze_text, read_file # type: ignore[import-not-found]
|
||||
from excerpt_finder import find_best_excerpt # type: ignore[import-not-found]
|
||||
from translator import ( # type: ignore[import-not-found]
|
||||
TranslationResult,
|
||||
detect_language,
|
||||
translate_words_batch,
|
||||
)
|
||||
@ -57,19 +55,108 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
# Common stopwords for various languages (can be overridden with --stopwords)
|
||||
DEFAULT_STOPWORDS_EN = frozenset({
|
||||
"the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
|
||||
"of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
|
||||
"being", "have", "has", "had", "do", "does", "did", "will", "would",
|
||||
"could", "should", "may", "might", "must", "shall", "can", "this",
|
||||
"that", "these", "those", "i", "you", "he", "she", "it", "we", "they",
|
||||
"me", "him", "her", "us", "them", "my", "your", "his", "its", "our",
|
||||
"their", "what", "which", "who", "whom", "whose", "where", "when",
|
||||
"why", "how", "all", "each", "every", "both", "few", "more", "most",
|
||||
"other", "some", "such", "no", "nor", "not", "only", "own", "same",
|
||||
"so", "than", "too", "very", "just", "as", "if", "then", "because",
|
||||
"while", "although", "though", "after", "before", "when", "where",
|
||||
})
|
||||
DEFAULT_STOPWORDS_EN = frozenset(
|
||||
{
|
||||
"the",
|
||||
"a",
|
||||
"an",
|
||||
"and",
|
||||
"or",
|
||||
"but",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"to",
|
||||
"for",
|
||||
"of",
|
||||
"with",
|
||||
"by",
|
||||
"from",
|
||||
"is",
|
||||
"are",
|
||||
"was",
|
||||
"were",
|
||||
"be",
|
||||
"been",
|
||||
"being",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"could",
|
||||
"should",
|
||||
"may",
|
||||
"might",
|
||||
"must",
|
||||
"shall",
|
||||
"can",
|
||||
"this",
|
||||
"that",
|
||||
"these",
|
||||
"those",
|
||||
"i",
|
||||
"you",
|
||||
"he",
|
||||
"she",
|
||||
"it",
|
||||
"we",
|
||||
"they",
|
||||
"me",
|
||||
"him",
|
||||
"her",
|
||||
"us",
|
||||
"them",
|
||||
"my",
|
||||
"your",
|
||||
"his",
|
||||
"its",
|
||||
"our",
|
||||
"their",
|
||||
"what",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"where",
|
||||
"when",
|
||||
"why",
|
||||
"how",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"more",
|
||||
"most",
|
||||
"other",
|
||||
"some",
|
||||
"such",
|
||||
"no",
|
||||
"nor",
|
||||
"not",
|
||||
"only",
|
||||
"own",
|
||||
"same",
|
||||
"so",
|
||||
"than",
|
||||
"too",
|
||||
"very",
|
||||
"just",
|
||||
"as",
|
||||
"if",
|
||||
"then",
|
||||
"because",
|
||||
"while",
|
||||
"although",
|
||||
"though",
|
||||
"after",
|
||||
"before",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
||||
@ -89,7 +176,9 @@ def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
||||
return frozenset()
|
||||
|
||||
content = path.read_text(encoding="utf-8")
|
||||
return frozenset(word.strip().lower() for word in content.splitlines() if word.strip())
|
||||
return frozenset(
|
||||
word.strip().lower() for word in content.splitlines() if word.strip()
|
||||
)
|
||||
|
||||
|
||||
def generate_learning_lesson(
|
||||
@ -151,9 +240,13 @@ def generate_learning_lesson(
|
||||
lines.append("=" * 70)
|
||||
lines.append("LANGUAGE LEARNING LESSON")
|
||||
lines.append("=" * 70)
|
||||
lines.append(f"Source text: {total_words:,} total words, {len(word_counts):,} unique words")
|
||||
lines.append(
|
||||
f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
|
||||
)
|
||||
if all_stopwords:
|
||||
lines.append(f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words")
|
||||
lines.append(
|
||||
f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
|
||||
)
|
||||
else:
|
||||
lines.append(f"Vocabulary words: {len(filtered_words):,}")
|
||||
|
||||
@ -196,7 +289,9 @@ def generate_learning_lesson(
|
||||
cumulative_words.extend(word for word, _ in batch_words)
|
||||
|
||||
lines.append("-" * 70)
|
||||
lines.append(f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}")
|
||||
lines.append(
|
||||
f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
|
||||
)
|
||||
lines.append("-" * 70)
|
||||
lines.append("")
|
||||
|
||||
@ -230,7 +325,9 @@ def generate_learning_lesson(
|
||||
else:
|
||||
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
||||
percentage = (count / total_words) * 100
|
||||
lines.append(f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)")
|
||||
lines.append(
|
||||
f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
||||
)
|
||||
|
||||
lines.append("")
|
||||
|
||||
@ -239,7 +336,9 @@ def generate_learning_lesson(
|
||||
word_counts[word] for word in cumulative_words if word in word_counts
|
||||
)
|
||||
coverage = (cumulative_count / total_words) * 100
|
||||
lines.append(f"After learning these words, you'll recognize ~{coverage:.1f}% of the text")
|
||||
lines.append(
|
||||
f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
|
||||
)
|
||||
lines.append("")
|
||||
|
||||
# Find excerpts using cumulative words
|
||||
@ -256,8 +355,10 @@ def generate_learning_lesson(
|
||||
)
|
||||
|
||||
for j, excerpt in enumerate(excerpts, 1):
|
||||
lines.append(f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):")
|
||||
lines.append(f" \"{excerpt.excerpt}\"")
|
||||
lines.append(
|
||||
f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
|
||||
)
|
||||
lines.append(f' "{excerpt.excerpt}"')
|
||||
lines.append("")
|
||||
|
||||
# Summary
|
||||
@ -431,15 +532,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
# Output
|
||||
if args.output:
|
||||
Path(args.output).write_text(lesson, encoding="utf-8")
|
||||
print(f"Lesson written to {args.output}") # noqa: T201
|
||||
print(f"Lesson written to {args.output}")
|
||||
else:
|
||||
print(lesson) # noqa: T201
|
||||
print(lesson)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
@ -76,7 +76,8 @@ try_pipx_install() {
|
||||
# Create/use a virtualenv for argostranslate
|
||||
setup_venv() {
|
||||
# Use /tmp for pip cache to avoid home directory quota issues
|
||||
export PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
|
||||
PIP_CACHE_DIR="/tmp/.pip_cache_$(id -u)"
|
||||
export PIP_CACHE_DIR
|
||||
mkdir -p "$PIP_CACHE_DIR"
|
||||
|
||||
if [[ ! -d "$VENV_DIR" ]]; then
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
The Project Gutenberg eBook of The King James Version of the Bible
|
||||
The Project Gutenberg eBook of The King James Version of the Bible
|
||||
|
||||
This ebook is for the use of anyone anywhere in the United States and
|
||||
most other parts of the world at no cost and with almost no restrictions
|
||||
@ -99964,5 +99964,3 @@ This website includes information about Project Gutenberg™,
|
||||
including how to make donations to the Project Gutenberg Literary
|
||||
Archive Foundation, how to help produce our new eBooks, and how to
|
||||
subscribe to our email newsletter to hear about new eBooks.
|
||||
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
The Project Gutenberg eBook of The Orbis Pictus
|
||||
The Project Gutenberg eBook of The Orbis Pictus
|
||||
|
||||
This ebook is for the use of anyone anywhere in the United States and
|
||||
most other parts of the world at no cost and with almost no restrictions
|
||||
@ -12899,5 +12899,3 @@ This website includes information about Project Gutenberg™,
|
||||
including how to make donations to the Project Gutenberg Literary
|
||||
Archive Foundation, how to help produce our new eBooks, and how to
|
||||
subscribe to our email newsletter to hear about new eBooks.
|
||||
|
||||
|
||||
|
||||
@ -67,4 +67,3 @@ C. IVLI CAESARIS COMMENTARIORVM DE BELLO GALLICO LIBER PRIMVS
|
||||
Caesar
|
||||
The Latin Library
|
||||
The Classics Page
|
||||
|
||||
|
||||
@ -2,11 +2,9 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import time
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
@ -20,9 +18,6 @@ from python_pkg.word_frequency.analyzer import (
|
||||
read_files,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Sequence
|
||||
|
||||
|
||||
class TestExtractWords:
|
||||
"""Tests for extract_words function."""
|
||||
|
||||
19
python_pkg/word_frequency/tests/test_anki_generator.py
Normal file → Executable file
19
python_pkg/word_frequency/tests/test_anki_generator.py
Normal file → Executable file
@ -12,17 +12,16 @@ try:
|
||||
from python_pkg.word_frequency.anki_generator import (
|
||||
find_word_contexts,
|
||||
generate_anki_deck,
|
||||
generate_flashcards,
|
||||
main,
|
||||
parse_vocabulary_curve_output,
|
||||
)
|
||||
except ImportError:
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
||||
from python_pkg.word_frequency.anki_generator import (
|
||||
find_word_contexts,
|
||||
generate_anki_deck,
|
||||
generate_flashcards,
|
||||
main,
|
||||
parse_vocabulary_curve_output,
|
||||
)
|
||||
@ -78,19 +77,25 @@ class TestParseVocabularyCurveOutput:
|
||||
|
||||
def test_parse_length_1(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 1."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 1)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 1
|
||||
)
|
||||
assert excerpt == "the"
|
||||
assert excerpt_words == [("the", 1)]
|
||||
|
||||
def test_parse_length_2(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 2."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 2)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 2
|
||||
)
|
||||
assert excerpt == "the dog"
|
||||
assert excerpt_words == [("the", 1), ("dog", 2)]
|
||||
|
||||
def test_parse_length_3(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for length 3."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 3)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 3
|
||||
)
|
||||
assert excerpt == "the quick fox"
|
||||
assert len(excerpt_words) == 3
|
||||
assert ("the", 1) in excerpt_words
|
||||
@ -99,7 +104,9 @@ class TestParseVocabularyCurveOutput:
|
||||
|
||||
def test_parse_nonexistent_length(self, sample_vocabulary_output: str) -> None:
|
||||
"""Test parsing output for non-existent length."""
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(sample_vocabulary_output, 100)
|
||||
excerpt, excerpt_words, all_vocab = parse_vocabulary_curve_output(
|
||||
sample_vocabulary_output, 100
|
||||
)
|
||||
assert excerpt == ""
|
||||
assert excerpt_words == []
|
||||
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
@ -324,7 +324,7 @@ class TestMain:
|
||||
"2",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# Excerpt should include context words
|
||||
@ -342,7 +342,7 @@ class TestMain:
|
||||
"--case-sensitive",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# Only lowercase "hello" should match
|
||||
|
||||
@ -2,20 +2,20 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||
from python_pkg.word_frequency.learning_pipe import (
|
||||
DEFAULT_STOPWORDS_EN,
|
||||
generate_learning_lesson,
|
||||
load_stopwords,
|
||||
main,
|
||||
)
|
||||
import python_pkg.word_frequency.learning_pipe as learning_pipe_module
|
||||
from python_pkg.word_frequency.translator import TranslationResult
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -25,12 +25,13 @@ if TYPE_CHECKING:
|
||||
@pytest.fixture
|
||||
def mock_translation() -> Generator[MagicMock, None, None]:
|
||||
"""Mock translation to avoid requiring argostranslate."""
|
||||
|
||||
def fake_batch_translate(
|
||||
words: list[str],
|
||||
from_lang: str,
|
||||
to_lang: str,
|
||||
*,
|
||||
use_cache: bool = True, # noqa: ARG001
|
||||
use_cache: bool = True,
|
||||
) -> list[TranslationResult]:
|
||||
"""Fake batch translation that returns word with prefix."""
|
||||
return [
|
||||
@ -274,7 +275,7 @@ class TestMain:
|
||||
"5",
|
||||
]
|
||||
)
|
||||
captured = capsys.readouterr()
|
||||
capsys.readouterr()
|
||||
|
||||
assert exit_code == 0
|
||||
# "hello" should be filtered by custom stopwords
|
||||
@ -392,12 +393,17 @@ class TestTranslationIntegration:
|
||||
text_file.write_text("hello world hello world hello", encoding="utf-8")
|
||||
|
||||
# Should work with mocked translation
|
||||
result = main([
|
||||
"--file", str(text_file),
|
||||
"--translate-from", "en",
|
||||
"--translate-to", "es",
|
||||
result = main(
|
||||
[
|
||||
"--file",
|
||||
str(text_file),
|
||||
"--translate-from",
|
||||
"en",
|
||||
"--translate-to",
|
||||
"es",
|
||||
"--no-default-stopwords",
|
||||
])
|
||||
]
|
||||
)
|
||||
|
||||
assert result == 0
|
||||
|
||||
@ -437,4 +443,3 @@ class TestTranslationIntegration:
|
||||
# Should not have translation output
|
||||
assert "Translation:" not in result
|
||||
assert "Detected language:" not in result
|
||||
|
||||
|
||||
@ -2,8 +2,8 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
@ -52,7 +52,9 @@ class ArgosAvailableMock:
|
||||
Works whether argos is installed or not by patching sys.modules.
|
||||
"""
|
||||
|
||||
def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None:
|
||||
def __init__(
|
||||
self, translate_returns: str | list[str] | Exception | None = None
|
||||
) -> None:
|
||||
"""Initialize with return values for translate()."""
|
||||
self.translate_returns = translate_returns
|
||||
self.mock_translate_fn = MagicMock()
|
||||
@ -69,9 +71,9 @@ class ArgosAvailableMock:
|
||||
translator._argos_available = True
|
||||
|
||||
# Set up translate return value
|
||||
if isinstance(self.translate_returns, Exception):
|
||||
self.mock_translate_fn.side_effect = self.translate_returns
|
||||
elif isinstance(self.translate_returns, list):
|
||||
if isinstance(self.translate_returns, Exception) or isinstance(
|
||||
self.translate_returns, list
|
||||
):
|
||||
self.mock_translate_fn.side_effect = self.translate_returns
|
||||
elif self.translate_returns is not None:
|
||||
self.mock_translate_fn.return_value = self.translate_returns
|
||||
@ -102,9 +104,9 @@ class ArgosAvailableMock:
|
||||
translator, "_ensure_language_pair", lambda f, t: None
|
||||
)
|
||||
|
||||
self._sys_modules_patcher.start()
|
||||
self._ensure_patcher.start()
|
||||
self._lang_patcher.start()
|
||||
self._sys_modules_patcher.start() # type: ignore[union-attr]
|
||||
self._ensure_patcher.start() # type: ignore[union-attr]
|
||||
self._lang_patcher.start() # type: ignore[union-attr]
|
||||
|
||||
return self.mock_translate_fn
|
||||
|
||||
@ -291,9 +293,7 @@ class TestTranslateWordsBatch:
|
||||
"""Test batch translation falls back to individual when result count mismatches."""
|
||||
words = ["one", "two", "three", "four"]
|
||||
# First call (batch) returns wrong count, subsequent calls are individual
|
||||
with ArgosAvailableMock(
|
||||
["wrong", "uno", "dos", "tres", "cuatro"]
|
||||
) as mock:
|
||||
with ArgosAvailableMock(["wrong", "uno", "dos", "tres", "cuatro"]) as mock:
|
||||
results = translate_words_batch(words, "en", "es", use_cache=False)
|
||||
|
||||
assert len(results) == 4
|
||||
@ -425,7 +425,8 @@ class TestGetInstalledLanguages:
|
||||
# We need to mock the translate module's get_installed_languages
|
||||
mock_translate_module = MagicMock()
|
||||
mock_translate_module.get_installed_languages.return_value = [
|
||||
mock_lang1, mock_lang2
|
||||
mock_lang1,
|
||||
mock_lang2,
|
||||
]
|
||||
mock_package_module = MagicMock()
|
||||
mock_parent = MagicMock()
|
||||
@ -507,9 +508,7 @@ class TestMain:
|
||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||
assert result == 1
|
||||
|
||||
def test_list_languages_empty(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_list_languages_empty(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test listing languages when none installed."""
|
||||
mock_translate_module = MagicMock()
|
||||
mock_translate_module.get_installed_languages.return_value = []
|
||||
@ -572,9 +571,7 @@ class TestMain:
|
||||
assert "en" in captured.out
|
||||
assert "English" in captured.out
|
||||
|
||||
def test_translate_single_text(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_single_text(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test translating single text."""
|
||||
with ArgosAvailableMock("hola"):
|
||||
result = main(["--text", "hello", "--from", "en", "--to", "es"])
|
||||
@ -584,9 +581,7 @@ class TestMain:
|
||||
assert "hello" in captured.out
|
||||
assert "hola" in captured.out
|
||||
|
||||
def test_translate_multiple_words(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_multiple_words(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test translating multiple words."""
|
||||
with ArgosAvailableMock(["hola", "mundo"]):
|
||||
result = main(["--words", "hello", "world", "--from", "en", "--to", "es"])
|
||||
@ -613,9 +608,7 @@ class TestMain:
|
||||
assert "world" in captured.out
|
||||
assert "goodbye" in captured.out
|
||||
|
||||
def test_translate_file_not_found(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_translate_file_not_found(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test error when words file not found."""
|
||||
with ArgosAvailableMock():
|
||||
result = main(
|
||||
@ -654,9 +647,7 @@ class TestMain:
|
||||
assert "hello" in content
|
||||
assert "hola" in content
|
||||
|
||||
def test_no_input_shows_help(
|
||||
self, capsys: pytest.CaptureFixture[str]
|
||||
) -> None:
|
||||
def test_no_input_shows_help(self, capsys: pytest.CaptureFixture[str]) -> None:
|
||||
"""Test that no input shows help."""
|
||||
with ArgosAvailableMock():
|
||||
result = main([])
|
||||
|
||||
40
python_pkg/word_frequency/tests/test_vocabulary_curve.py
Normal file → Executable file
40
python_pkg/word_frequency/tests/test_vocabulary_curve.py
Normal file → Executable file
@ -3,14 +3,18 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
# Path to the C executable
|
||||
C_EXECUTABLE = Path(__file__).parent.parent.parent.parent / "C" / "vocabulary_curve" / "vocabulary_curve"
|
||||
C_EXECUTABLE = (
|
||||
Path(__file__).parent.parent.parent.parent
|
||||
/ "C"
|
||||
/ "vocabulary_curve"
|
||||
/ "vocabulary_curve"
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -46,6 +50,7 @@ def run_vocabulary_curve(filepath: Path, max_length: int = 10) -> str:
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
check=False,
|
||||
)
|
||||
return result.stdout
|
||||
|
||||
@ -86,8 +91,9 @@ class TestExcerptValidity:
|
||||
def test_excerpt_exists_in_source_text(self, sample_text_file: Path) -> None:
|
||||
"""Test that each excerpt can be found in the source text as contiguous words."""
|
||||
import re
|
||||
|
||||
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=10)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
@ -98,7 +104,7 @@ class TestExcerptValidity:
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - len(excerpt_words) + 1):
|
||||
if source_words[i:i+len(excerpt_words)] == excerpt_words:
|
||||
if source_words[i : i + len(excerpt_words)] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
assert found, (
|
||||
@ -114,15 +120,16 @@ class TestExcerptValidity:
|
||||
|
||||
for length, excerpt in excerpts:
|
||||
word_count = len(excerpt.split())
|
||||
assert word_count == length, (
|
||||
f"Expected {length} words, got {word_count}: '{excerpt}'"
|
||||
)
|
||||
assert (
|
||||
word_count == length
|
||||
), f"Expected {length} words, got {word_count}: '{excerpt}'"
|
||||
|
||||
def test_polish_excerpt_exists_in_source(self, polish_text_file: Path) -> None:
|
||||
"""Test Polish text excerpts are found in source as contiguous words."""
|
||||
import re
|
||||
|
||||
source_text = polish_text_file.read_text(encoding="utf-8").lower()
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
output = run_vocabulary_curve(polish_text_file, max_length=8)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
|
||||
@ -133,7 +140,7 @@ class TestExcerptValidity:
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - len(excerpt_words) + 1):
|
||||
if source_words[i:i+len(excerpt_words)] == excerpt_words:
|
||||
if source_words[i : i + len(excerpt_words)] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
assert found, (
|
||||
@ -148,7 +155,7 @@ class TestExcerptValidity:
|
||||
|
||||
source_text = sample_text_file.read_text(encoding="utf-8").lower()
|
||||
# Extract words from source
|
||||
source_words = re.findall(r'\b[\w]+\b', source_text)
|
||||
source_words = re.findall(r"\b[\w]+\b", source_text)
|
||||
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=5)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
@ -159,7 +166,7 @@ class TestExcerptValidity:
|
||||
# Find this sequence in source_words
|
||||
found = False
|
||||
for i in range(len(source_words) - length + 1):
|
||||
if source_words[i:i+length] == excerpt_words:
|
||||
if source_words[i : i + length] == excerpt_words:
|
||||
found = True
|
||||
break
|
||||
|
||||
@ -182,7 +189,7 @@ class TestVocabNeeded:
|
||||
def test_vocab_needed_increases_monotonically(self, sample_text_file: Path) -> None:
|
||||
"""Test that vocab needed never decreases as length increases."""
|
||||
output = run_vocabulary_curve(sample_text_file, max_length=10)
|
||||
excerpts = extract_excerpts_from_output(output)
|
||||
extract_excerpts_from_output(output)
|
||||
|
||||
# Extract vocab needed from output
|
||||
prev_vocab = 0
|
||||
@ -192,9 +199,9 @@ class TestVocabNeeded:
|
||||
parts = line.split("Vocab needed:")
|
||||
if len(parts) > 1:
|
||||
vocab = int(parts[1].split()[0])
|
||||
assert vocab >= prev_vocab, (
|
||||
f"Vocab decreased from {prev_vocab} to {vocab}"
|
||||
)
|
||||
assert (
|
||||
vocab >= prev_vocab
|
||||
), f"Vocab decreased from {prev_vocab} to {vocab}"
|
||||
prev_vocab = vocab
|
||||
|
||||
|
||||
@ -213,6 +220,7 @@ class TestEdgeCases:
|
||||
[str(C_EXECUTABLE), str(filepath), "5"],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
check=False,
|
||||
)
|
||||
|
||||
assert result.returncode != 0 or "No words" in result.stderr
|
||||
|
||||
85
python_pkg/word_frequency/translator.py
Normal file → Executable file
85
python_pkg/word_frequency/translator.py
Normal file → Executable file
@ -29,8 +29,8 @@ Dependencies (install one):
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -50,6 +50,7 @@ def _check_cuda_available() -> bool:
|
||||
if _gpu_available is None:
|
||||
try:
|
||||
import torch
|
||||
|
||||
_gpu_available = torch.cuda.is_available()
|
||||
except ImportError:
|
||||
_gpu_available = False
|
||||
@ -71,11 +72,11 @@ def _init_gpu_if_available() -> None:
|
||||
return
|
||||
|
||||
import sys
|
||||
|
||||
print("CUDA detected, initializing GPU acceleration...", file=sys.stderr)
|
||||
|
||||
try:
|
||||
import torch
|
||||
import ctranslate2
|
||||
|
||||
# Force CTranslate2 to use CUDA
|
||||
device_count = torch.cuda.device_count()
|
||||
@ -87,6 +88,7 @@ def _init_gpu_if_available() -> None:
|
||||
|
||||
# Set environment variable to force GPU usage in argos
|
||||
import os
|
||||
|
||||
os.environ["CT2_CUDA_ALLOW_FP16"] = "1"
|
||||
os.environ["CT2_USE_EXPERIMENTAL_PACKED_GEMM"] = "1"
|
||||
|
||||
@ -106,9 +108,10 @@ def _check_argos() -> bool:
|
||||
global _argos_available
|
||||
if _argos_available is None:
|
||||
try:
|
||||
import argostranslate.package # noqa: F401
|
||||
import argostranslate.translate # noqa: F401
|
||||
import argostranslate.package
|
||||
import argostranslate.translate
|
||||
|
||||
_ = (argostranslate.package, argostranslate.translate)
|
||||
_argos_available = True
|
||||
except ImportError:
|
||||
_argos_available = False
|
||||
@ -120,8 +123,9 @@ def _check_deep_translator() -> bool:
|
||||
global _deep_translator_available
|
||||
if _deep_translator_available is None:
|
||||
try:
|
||||
from deep_translator import GoogleTranslator # noqa: F401
|
||||
from deep_translator import GoogleTranslator
|
||||
|
||||
_ = GoogleTranslator
|
||||
_deep_translator_available = True
|
||||
except ImportError:
|
||||
_deep_translator_available = False
|
||||
@ -133,8 +137,9 @@ def _check_langdetect() -> bool:
|
||||
global _langdetect_available
|
||||
if _langdetect_available is None:
|
||||
try:
|
||||
import langdetect # noqa: F401
|
||||
import langdetect
|
||||
|
||||
_ = langdetect
|
||||
_langdetect_available = True
|
||||
except ImportError:
|
||||
_langdetect_available = False
|
||||
@ -227,7 +232,7 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
||||
results: dict[str, bool] = {}
|
||||
|
||||
# Update package index
|
||||
print("Updating package index...") # noqa: T201
|
||||
print("Updating package index...")
|
||||
argostranslate.package.update_package_index()
|
||||
available = argostranslate.package.get_available_packages()
|
||||
|
||||
@ -250,13 +255,13 @@ def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]:
|
||||
if pkg_key in available_lookup:
|
||||
pkg = available_lookup[pkg_key]
|
||||
try:
|
||||
print(f"Downloading {from_code} -> {to_code}...") # noqa: T201
|
||||
print(f"Downloading {from_code} -> {to_code}...")
|
||||
argostranslate.package.install_from_path(pkg.download())
|
||||
results[key] = True
|
||||
print(f" ✓ Installed {from_code} -> {to_code}") # noqa: T201
|
||||
print(f" ✓ Installed {from_code} -> {to_code}")
|
||||
except Exception as e: # noqa: BLE001
|
||||
results[key] = False
|
||||
print(f" ✗ Failed {from_code} -> {to_code}: {e}") # noqa: T201
|
||||
print(f" ✗ Failed {from_code} -> {to_code}: {e}")
|
||||
else:
|
||||
# Package not available
|
||||
results[key] = False
|
||||
@ -276,7 +281,7 @@ def _ensure_argos_installed() -> None:
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
print("argostranslate not found. Attempting to install...") # noqa: T201
|
||||
print("argostranslate not found. Attempting to install...")
|
||||
try:
|
||||
subprocess.run(
|
||||
[sys.executable, "-m", "pip", "install", "argostranslate"],
|
||||
@ -284,11 +289,11 @@ def _ensure_argos_installed() -> None:
|
||||
capture_output=True,
|
||||
)
|
||||
# Reset the check flag and verify
|
||||
global _argos_available # noqa: PLW0603
|
||||
global _argos_available
|
||||
_argos_available = None
|
||||
if not _check_argos():
|
||||
raise ImportError("argostranslate installation succeeded but import failed")
|
||||
print("argostranslate installed successfully.") # noqa: T201
|
||||
print("argostranslate installed successfully.")
|
||||
except subprocess.CalledProcessError as e:
|
||||
error_msg = e.stderr.decode() if e.stderr else str(e)
|
||||
raise ImportError(
|
||||
@ -354,7 +359,7 @@ def _ensure_language_pair(from_lang: str, to_lang: str) -> None:
|
||||
)
|
||||
|
||||
print(
|
||||
f" Downloading package (~50-100MB, this may take a minute)...",
|
||||
" Downloading package (~50-100MB, this may take a minute)...",
|
||||
file=sys.stderr,
|
||||
)
|
||||
download_path = pkg.download()
|
||||
@ -391,6 +396,7 @@ def translate_word(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
cache = get_translation_cache()
|
||||
cached = cache.get(word, from_lang, to_lang)
|
||||
if cached is not None:
|
||||
@ -415,6 +421,7 @@ def translate_word(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
get_translation_cache().set(word, from_lang, to_lang, translated)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -454,7 +461,9 @@ def translate_words(
|
||||
Returns:
|
||||
List of TranslationResult for each word.
|
||||
"""
|
||||
return [translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words]
|
||||
return [
|
||||
translate_word(word, from_lang, to_lang, use_cache=use_cache) for word in words
|
||||
]
|
||||
|
||||
|
||||
def _translate_batch_worker(
|
||||
@ -492,9 +501,7 @@ def _translate_batch_worker(
|
||||
else:
|
||||
# Fall back to individual translation for this batch
|
||||
for word in batch_words:
|
||||
translated = argostranslate.translate.translate(
|
||||
word, from_lang, to_lang
|
||||
)
|
||||
translated = argostranslate.translate.translate(word, from_lang, to_lang)
|
||||
translations[word.lower()] = translated
|
||||
|
||||
return batch_idx, translations
|
||||
@ -544,6 +551,7 @@ def translate_words_batch(
|
||||
if use_cache:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
cache = get_translation_cache()
|
||||
cached_results = cache.get_many(list(words), from_lang, to_lang)
|
||||
except ImportError:
|
||||
@ -574,7 +582,7 @@ def translate_words_batch(
|
||||
BATCH_SIZE = 100
|
||||
batches: list[list[str]] = []
|
||||
for i in range(0, num_to_translate, BATCH_SIZE):
|
||||
batches.append(words_to_translate[i:i + BATCH_SIZE])
|
||||
batches.append(words_to_translate[i : i + BATCH_SIZE])
|
||||
|
||||
total_batches = len(batches)
|
||||
|
||||
@ -597,8 +605,8 @@ def translate_words_batch(
|
||||
)
|
||||
new_translations.update(batch_translations)
|
||||
|
||||
print(f" Translation complete.", file=sys.stderr, flush=True)
|
||||
except Exception as e: # noqa: BLE001
|
||||
print(" Translation complete.", file=sys.stderr, flush=True)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Translation failed for {from_lang} -> {to_lang}: {e}"
|
||||
) from e
|
||||
@ -607,6 +615,7 @@ def translate_words_batch(
|
||||
if use_cache and new_translations:
|
||||
try:
|
||||
from python_pkg.word_frequency.cache import get_translation_cache
|
||||
|
||||
get_translation_cache().set_many(new_translations, from_lang, to_lang)
|
||||
except ImportError:
|
||||
pass
|
||||
@ -670,7 +679,9 @@ def format_translations(
|
||||
# Data
|
||||
for r in results:
|
||||
if r.success:
|
||||
lines.append(f"{r.source_word:<{max_source}} {r.translated_word:<{max_trans}}")
|
||||
lines.append(
|
||||
f"{r.source_word:<{max_source}} {r.translated_word:<{max_trans}}"
|
||||
)
|
||||
elif show_errors:
|
||||
error_msg = f"[Error: {r.error}]" if r.error else "[Failed]"
|
||||
lines.append(f"{r.source_word:<{max_source}} {error_msg}")
|
||||
@ -771,7 +782,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
# Check if argostranslate is available
|
||||
if not _check_argos():
|
||||
print( # noqa: T201
|
||||
print(
|
||||
"Error: argostranslate is not installed.\n"
|
||||
"Install it with: pip install argostranslate",
|
||||
file=sys.stderr,
|
||||
@ -782,30 +793,30 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
if args.list_languages:
|
||||
langs = get_installed_languages()
|
||||
if not langs:
|
||||
print("No languages installed.") # noqa: T201
|
||||
print("Download some with: --download en es pl de fr") # noqa: T201
|
||||
print("No languages installed.")
|
||||
print("Download some with: --download en es pl de fr")
|
||||
else:
|
||||
print("Installed languages:") # noqa: T201
|
||||
print("Installed languages:")
|
||||
for code, name in sorted(langs):
|
||||
print(f" {code}: {name}") # noqa: T201
|
||||
print(f" {code}: {name}")
|
||||
return 0
|
||||
|
||||
# Handle list-available
|
||||
if args.list_available:
|
||||
packages = get_available_packages()
|
||||
if not packages:
|
||||
print("No packages available (check internet connection).") # noqa: T201
|
||||
print("No packages available (check internet connection).")
|
||||
else:
|
||||
print("Available language packages:") # noqa: T201
|
||||
print("Available language packages:")
|
||||
for from_code, from_name, to_code, to_name in sorted(packages):
|
||||
print(f" {from_code} ({from_name}) -> {to_code} ({to_name})") # noqa: T201
|
||||
print(f" {from_code} ({from_name}) -> {to_code} ({to_name})")
|
||||
return 0
|
||||
|
||||
# Handle download
|
||||
if args.download:
|
||||
results = download_languages(args.download)
|
||||
success_count = sum(1 for v in results.values() if v)
|
||||
print(f"\nDownloaded {success_count}/{len(results)} language pairs.") # noqa: T201
|
||||
download_results = download_languages(args.download)
|
||||
success_count = sum(1 for v in download_results.values() if v)
|
||||
print(f"\nDownloaded {success_count}/{len(download_results)} language pairs.")
|
||||
return 0 if success_count > 0 else 1
|
||||
|
||||
# Handle translation
|
||||
@ -819,7 +830,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
content = read_file(args.words_file)
|
||||
words = [w.strip() for w in content.splitlines() if w.strip()]
|
||||
except FileNotFoundError:
|
||||
print(f"Error: File not found: {args.words_file}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found: {args.words_file}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if not words:
|
||||
@ -830,7 +841,7 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
try:
|
||||
results = translate_words_batch(words, args.from_lang, args.to_lang)
|
||||
except ImportError as e:
|
||||
print(f"Error: {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
output = format_translations(results)
|
||||
@ -838,9 +849,9 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
# Output
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Translations written to {args.output}") # noqa: T201
|
||||
print(f"Translations written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
# Return error if any translation failed
|
||||
if any(not r.success for r in results):
|
||||
|
||||
15
python_pkg/word_frequency/vocabulary_curve.py
Normal file → Executable file
15
python_pkg/word_frequency/vocabulary_curve.py
Normal file → Executable file
@ -14,8 +14,8 @@ Usage:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import TYPE_CHECKING, NamedTuple
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -112,6 +112,7 @@ def find_optimal_excerpts(
|
||||
|
||||
# Extract all words from text (preserving order)
|
||||
import re
|
||||
|
||||
all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
|
||||
if not case_sensitive:
|
||||
all_words = [w.lower() for w in all_words]
|
||||
@ -213,7 +214,9 @@ def format_results(
|
||||
if results:
|
||||
final = results[-1]
|
||||
lines.append(f"To understand a {final.excerpt_length}-word excerpt,")
|
||||
lines.append(f"you need to learn at minimum {final.min_vocab_needed} top words.")
|
||||
lines.append(
|
||||
f"you need to learn at minimum {final.min_vocab_needed} top words."
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
@ -301,15 +304,15 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
|
||||
if args.output:
|
||||
Path(args.output).write_text(output, encoding="utf-8")
|
||||
print(f"Output written to {args.output}") # noqa: T201
|
||||
print(f"Output written to {args.output}")
|
||||
else:
|
||||
print(output) # noqa: T201
|
||||
print(output)
|
||||
|
||||
except FileNotFoundError as e:
|
||||
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: File not found - {e}", file=sys.stderr)
|
||||
return 1
|
||||
except UnicodeDecodeError as e:
|
||||
print(f"Error: Could not decode file - {e}", file=sys.stderr) # noqa: T201
|
||||
print(f"Error: Could not decode file - {e}", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
Loading…
Reference in New Issue
Block a user