From 8546cddfda3c20723695ee79d3aa1b5ca8d0162a Mon Sep 17 00:00:00 2001 From: Krzysztof Rudnicki Date: Sun, 28 Dec 2025 15:55:43 +0100 Subject: [PATCH] feat: added translations --- python_pkg/word_frequency/__init__.py | 7 + python_pkg/word_frequency/learning_pipe.py | 100 ++- .../tests/test_learning_pipe.py | 87 +++ .../word_frequency/tests/test_translator.py | 619 ++++++++++++++++++ python_pkg/word_frequency/translator.py | 572 ++++++++++++++++ 5 files changed, 1382 insertions(+), 3 deletions(-) create mode 100644 python_pkg/word_frequency/tests/test_translator.py create mode 100644 python_pkg/word_frequency/translator.py diff --git a/python_pkg/word_frequency/__init__.py b/python_pkg/word_frequency/__init__.py index dc67439..cacc578 100644 --- a/python_pkg/word_frequency/__init__.py +++ b/python_pkg/word_frequency/__init__.py @@ -3,10 +3,13 @@ This package provides tools for: 1. Analyzing word frequency in text (analyzer module) 2. Finding text excerpts where target words are most prevalent (excerpt_finder module) +3. Combining analysis with excerpts for language learning (learning_pipe module) +4. Offline translation between languages (translator module) Example usage: from python_pkg.word_frequency.analyzer import analyze_text, analyze_and_format from python_pkg.word_frequency.excerpt_finder import find_best_excerpt + from python_pkg.word_frequency.translator import translate_words # Analyze word frequency counts = analyze_text("hello world hello") @@ -18,6 +21,10 @@ Example usage: target_words=["and", "the"], excerpt_length=3, ) + + # Translate words (requires argostranslate installed) + translations = translate_words(["hello", "world"], "en", "es") +``` print(results[0].excerpt) # "and she and" or similar """ diff --git a/python_pkg/word_frequency/learning_pipe.py b/python_pkg/word_frequency/learning_pipe.py index 041662c..18626b0 100644 --- a/python_pkg/word_frequency/learning_pipe.py +++ b/python_pkg/word_frequency/learning_pipe.py @@ -38,9 +38,19 @@ from typing import TYPE_CHECKING try: from python_pkg.word_frequency.analyzer import analyze_text, read_file from python_pkg.word_frequency.excerpt_finder import find_best_excerpt + from python_pkg.word_frequency.translator import ( + TranslationResult, + detect_language, + translate_words_batch, + ) except ModuleNotFoundError: from analyzer import analyze_text, read_file # type: ignore[import-not-found] from excerpt_finder import find_best_excerpt # type: ignore[import-not-found] + from translator import ( # type: ignore[import-not-found] + TranslationResult, + detect_language, + translate_words_batch, + ) if TYPE_CHECKING: from collections.abc import Sequence @@ -94,6 +104,8 @@ def generate_learning_lesson( skip_numbers: bool = True, case_sensitive: bool = False, context_words: int = 5, + translate_from: str | None = None, + translate_to: str | None = None, ) -> str: """Generate a learning lesson from text. @@ -108,6 +120,8 @@ def generate_learning_lesson( skip_numbers: If True, filter out numeric words (default: True). case_sensitive: If True, treat words case-sensitively. context_words: Words of context to include around excerpts. + translate_from: Source language code for translation (e.g., 'la', 'pl'). + translate_to: Target language code for translation (e.g., 'en'). Returns: Formatted learning lesson as a string. @@ -142,6 +156,30 @@ def generate_learning_lesson( lines.append(f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words") else: lines.append(f"Vocabulary words: {len(filtered_words):,}") + + # Handle translation setup + actual_translate_from = translate_from + actual_translate_to = translate_to or "en" # Default to English + + # Auto-detect language if translation is enabled but source not specified + if translate_from == "auto" or (translate_to and not translate_from): + detected = detect_language(text) + if detected: + actual_translate_from = detected + lines.append(f"Detected language: {detected}") + # Note: langdetect doesn't support Latin (often detected as Italian) + # If detection seems wrong, use --translate-from to override + else: + lines.append( + "Warning: Could not detect language " + "(install langdetect: pip install langdetect)" + ) + actual_translate_from = None + + do_translate = actual_translate_from is not None and actual_translate_to is not None + if do_translate: + lines.append(f"Translation: {actual_translate_from} -> {actual_translate_to}") + lines.append("") # Generate batches @@ -162,13 +200,37 @@ def generate_learning_lesson( lines.append("-" * 70) lines.append("") + # Get translations if requested + translations: dict[str, str] = {} + if do_translate: + words_to_translate = [word for word, _ in batch_words] + translation_results = translate_words_batch( + words_to_translate, + actual_translate_from, # type: ignore[arg-type] + actual_translate_to, # type: ignore[arg-type] + ) + translations = { + r.source_word: r.translated_word + for r in translation_results + if r.success + } + # Word list with frequencies lines.append("VOCABULARY TO LEARN:") lines.append("") - for i, (word, count) in enumerate(batch_words, start=start_idx + 1): - percentage = (count / total_words) * 100 - lines.append(f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)") + if do_translate and translations: + # Include translations in output + for i, (word, count) in enumerate(batch_words, start=start_idx + 1): + percentage = (count / total_words) * 100 + trans = translations.get(word, "?") + lines.append( + f" {i:3}. {word:<20} -> {trans:<20} ({count:,} occurrences, {percentage:.2f}%)" + ) + else: + for i, (word, count) in enumerate(batch_words, start=start_idx + 1): + percentage = (count / total_words) * 100 + lines.append(f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)") lines.append("") @@ -301,6 +363,27 @@ def main(argv: Sequence[str] | None = None) -> int: help="Include numeric words in vocabulary (filtered by default)", ) + # Translation options (enabled by default) + parser.add_argument( + "--no-translate", + "-T", + action="store_true", + help="Disable translation", + ) + parser.add_argument( + "--translate-from", + type=str, + metavar="LANG", + help="Source language code (e.g., 'la', 'pl', 'de'). If omitted, auto-detected.", + ) + parser.add_argument( + "--translate-to", + type=str, + metavar="LANG", + default="en", + help="Target language code (default: 'en')", + ) + # Output options parser.add_argument( "--output", @@ -321,6 +404,15 @@ def main(argv: Sequence[str] | None = None) -> int: # Load custom stopwords if provided custom_stopwords = load_stopwords(args.stopwords) + # Determine translation settings + # Translation enabled by default, --no-translate disables it + translate_from: str | None = None + translate_to: str | None = None + + if not args.no_translate: + translate_from = args.translate_from or "auto" # "auto" triggers detection + translate_to = args.translate_to + # Generate lesson lesson = generate_learning_lesson( text, @@ -332,6 +424,8 @@ def main(argv: Sequence[str] | None = None) -> int: skip_default_stopwords=args.no_default_stopwords, skip_numbers=not args.include_numbers, case_sensitive=args.case_sensitive, + translate_from=translate_from, + translate_to=translate_to, ) # Output diff --git a/python_pkg/word_frequency/tests/test_learning_pipe.py b/python_pkg/word_frequency/tests/test_learning_pipe.py index 53a0ee3..28e7245 100644 --- a/python_pkg/word_frequency/tests/test_learning_pipe.py +++ b/python_pkg/word_frequency/tests/test_learning_pipe.py @@ -309,3 +309,90 @@ class TestDefaultStopwords: """Test that all stopwords are lowercase.""" for word in DEFAULT_STOPWORDS_EN: assert word == word.lower() + + +class TestTranslationIntegration: + """Tests for translation integration in learning_pipe.""" + + def test_lesson_without_translation(self) -> None: + """Test that lesson works without translation.""" + text = "hello world hello world hello" + result = generate_learning_lesson( + text, + batch_size=5, + num_batches=1, + skip_default_stopwords=True, + ) + + assert "hello" in result + assert "world" in result + # Should not have translation arrows + assert " -> " not in result or "Translation" not in result + + def test_lesson_with_translation_params(self) -> None: + """Test that translation params are accepted.""" + text = "hello world hello world hello" + # This should not crash even without argostranslate installed + result = generate_learning_lesson( + text, + batch_size=5, + num_batches=1, + skip_default_stopwords=True, + translate_from="en", + translate_to="es", + ) + + # The lesson should still be generated + assert "VOCABULARY TO LEARN:" in result + assert "hello" in result + + def test_main_with_translate_flags(self, tmp_path: Path) -> None: + """Test that main accepts translation flags.""" + text_file = tmp_path / "test.txt" + text_file.write_text("hello world hello world hello", encoding="utf-8") + + # Should not crash even if translation fails + result = main([ + "--file", str(text_file), + "--translate-from", "en", + "--translate-to", "es", + "--no-default-stopwords", + ]) + + assert result == 0 + + def test_translate_to_defaults_to_english(self, capsys: pytest.CaptureFixture[str]) -> None: + """Test that translate_to defaults to 'en' when using auto-detection.""" + text = "hello world" + # When using --translate flag (translate_from="auto"), translate_to defaults to "en" + result = generate_learning_lesson( + text, + batch_size=5, + num_batches=1, + skip_default_stopwords=True, + translate_from="auto", # Auto-detect source language + translate_to=None, # Should default to English + ) + + # Should have translation output with auto-detected source -> en + assert "Detected language:" in result + assert " -> en" in result + + def test_no_translation_when_both_none( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test no translation happens when both translate_from and translate_to are None.""" + text = "hello world" + result = generate_learning_lesson( + text, + batch_size=5, + num_batches=1, + skip_default_stopwords=True, + translate_from=None, + translate_to=None, + ) + + # Should not have translation output + assert "Translation:" not in result + assert "Detected language:" not in result + diff --git a/python_pkg/word_frequency/tests/test_translator.py b/python_pkg/word_frequency/tests/test_translator.py new file mode 100644 index 0000000..2e80320 --- /dev/null +++ b/python_pkg/word_frequency/tests/test_translator.py @@ -0,0 +1,619 @@ +"""Tests for the offline translator module.""" + +from __future__ import annotations + +import sys +from pathlib import Path +from typing import TYPE_CHECKING +from unittest.mock import MagicMock, patch + +import pytest + +if TYPE_CHECKING: + from collections.abc import Generator + +# Import the module +try: + from python_pkg.word_frequency import translator + from python_pkg.word_frequency.translator import ( + TranslationResult, + download_languages, + format_translations, + get_available_packages, + get_installed_languages, + main, + read_file, + translate_word, + translate_words, + translate_words_batch, + ) +except ImportError: + # Direct execution support + sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) + from python_pkg.word_frequency import translator + from python_pkg.word_frequency.translator import ( + TranslationResult, + download_languages, + format_translations, + get_available_packages, + get_installed_languages, + main, + read_file, + translate_word, + translate_words, + translate_words_batch, + ) + + +# Helper context manager for mocking argostranslate +class ArgosAvailableMock: + """Context manager to mock argostranslate being available.""" + + def __init__(self, translate_returns: str | list[str] | Exception | None = None) -> None: + """Initialize with return values for translate().""" + self.translate_returns = translate_returns + self.mock_translate_module = MagicMock() + self.mock_package_module = MagicMock() + self.mock_parent = MagicMock() + self.original_available = translator._argos_available + + def __enter__(self) -> MagicMock: + """Set up the mocks.""" + translator._argos_available = True + + # Set up translate return value + if isinstance(self.translate_returns, Exception): + self.mock_translate_module.translate.side_effect = self.translate_returns + elif isinstance(self.translate_returns, list): + self.mock_translate_module.translate.side_effect = self.translate_returns + elif self.translate_returns is not None: + self.mock_translate_module.translate.return_value = self.translate_returns + + # Link parent module to submodules (critical for Python imports) + self.mock_parent.translate = self.mock_translate_module + self.mock_parent.package = self.mock_package_module + + # Patch sys.modules + self.patchers = [ + patch.dict( + "sys.modules", + { + "argostranslate": self.mock_parent, + "argostranslate.translate": self.mock_translate_module, + "argostranslate.package": self.mock_package_module, + }, + ), + ] + for p in self.patchers: + p.start() + + return self.mock_translate_module + + def __exit__(self, *args: object) -> None: + """Restore original state.""" + for p in self.patchers: + p.stop() + translator._argos_available = self.original_available + + +# Fixtures + + +@pytest.fixture +def mock_argos_unavailable() -> Generator[None, None, None]: + """Mock argostranslate being unavailable.""" + original_value = translator._argos_available + translator._argos_available = False + yield + translator._argos_available = original_value + + +@pytest.fixture +def mock_all_translators_unavailable() -> Generator[None, None, None]: + """Mock both argostranslate and deep-translator being unavailable.""" + original_argos = translator._argos_available + original_deep = translator._deep_translator_available + translator._argos_available = False + translator._deep_translator_available = False + yield + translator._argos_available = original_argos + translator._deep_translator_available = original_deep + + +@pytest.fixture +def temp_words_file(tmp_path: Path) -> Path: + """Create a temporary file with words.""" + words_file = tmp_path / "words.txt" + words_file.write_text("hello\nworld\ngoodbye\n", encoding="utf-8") + return words_file + + +# TranslationResult tests + + +class TestTranslationResult: + """Tests for TranslationResult namedtuple.""" + + def test_successful_result(self) -> None: + """Test creating a successful translation result.""" + result = TranslationResult( + source_word="hello", + translated_word="hola", + source_lang="en", + target_lang="es", + success=True, + ) + assert result.source_word == "hello" + assert result.translated_word == "hola" + assert result.source_lang == "en" + assert result.target_lang == "es" + assert result.success is True + assert result.error is None + + def test_failed_result(self) -> None: + """Test creating a failed translation result.""" + result = TranslationResult( + source_word="xyz", + translated_word="", + source_lang="en", + target_lang="xx", + success=False, + error="Language not supported", + ) + assert result.success is False + assert result.error == "Language not supported" + + def test_result_is_tuple(self) -> None: + """Test that TranslationResult is a namedtuple.""" + result = TranslationResult("a", "b", "en", "es", True) + assert isinstance(result, tuple) + assert len(result) == 6 + + +# translate_word tests + + +class TestTranslateWord: + """Tests for translate_word function.""" + + def test_translate_word_all_backends_unavailable( + self, mock_all_translators_unavailable: None + ) -> None: + """Test translation when no backends are available.""" + result = translate_word("hello", "en", "es") + assert result.success is False + assert "No translation backend" in str(result.error) + + def test_translate_word_argos_unavailable_uses_deep_translator( + self, mock_argos_unavailable: None + ) -> None: + """Test that deep-translator is used when argos is unavailable.""" + # deep-translator should work as fallback (it's installed) + result = translate_word("hello", "en", "es") + # This may succeed if deep-translator is installed + # Just verify we get a result without crashing + assert isinstance(result, TranslationResult) + + def test_translate_word_success(self) -> None: + """Test successful word translation.""" + with ArgosAvailableMock("hola"): + result = translate_word("hello", "en", "es") + + assert result.source_word == "hello" + assert result.translated_word == "hola" + assert result.success is True + + def test_translate_word_argos_exception_falls_back( + self, mock_argos_unavailable: None + ) -> None: + """Test that argos exception falls back to deep-translator.""" + # With argos unavailable, deep-translator should be used + result = translate_word("hello", "en", "es") + # Just verify it doesn't crash - may succeed or fail depending on network + assert isinstance(result, TranslationResult) + + +# translate_words tests + + +class TestTranslateWords: + """Tests for translate_words function.""" + + def test_translate_empty_list(self) -> None: + """Test translating empty list.""" + results = translate_words([], "en", "es") + assert results == [] + + def test_translate_multiple_words(self) -> None: + """Test translating multiple words.""" + with ArgosAvailableMock(["hola", "mundo"]): + results = translate_words(["hello", "world"], "en", "es") + + assert len(results) == 2 + assert results[0].translated_word == "hola" + assert results[1].translated_word == "mundo" + + +# translate_words_batch tests + + +class TestTranslateWordsBatch: + """Tests for translate_words_batch function.""" + + def test_batch_empty_list(self) -> None: + """Test batch translation of empty list.""" + results = translate_words_batch([], "en", "es") + assert results == [] + + def test_batch_small_list(self) -> None: + """Test batch translation of small list (3 or fewer).""" + with ArgosAvailableMock(["uno", "dos", "tres"]) as mock: + results = translate_words_batch(["one", "two", "three"], "en", "es") + + assert len(results) == 3 + # Small lists use individual translation + assert mock.translate.call_count == 3 + + def test_batch_large_list_success(self) -> None: + """Test batch translation of large list.""" + words = ["one", "two", "three", "four", "five"] + + with ArgosAvailableMock("uno\ndos\ntres\ncuatro\ncinco") as mock: + results = translate_words_batch(words, "en", "es") + + assert len(results) == 5 + # Batch translation called once + mock.translate.assert_called_once() + assert results[0].translated_word == "uno" + assert results[4].translated_word == "cinco" + + def test_batch_fallback_on_mismatch(self) -> None: + """Test batch translation falls back when result count mismatches.""" + words = ["one", "two", "three", "four"] + # First call (batch) returns wrong count, subsequent calls are individual + with ArgosAvailableMock( + ["wrong\ncount", "uno", "dos", "tres", "cuatro"] + ) as mock: + results = translate_words_batch(words, "en", "es") + + assert len(results) == 4 + # Fallback to individual + assert mock.translate.call_count == 5 + + def test_batch_fallback_on_exception(self) -> None: + """Test batch translation falls back on exception.""" + words = ["one", "two", "three", "four"] + + # Create mock that raises first then succeeds + original = translator._argos_available + translator._argos_available = True + + mock_translate_module = MagicMock() + mock_translate_module.translate.side_effect = [ + RuntimeError("Batch failed"), + "uno", + "dos", + "tres", + "cuatro", + ] + mock_package_module = MagicMock() + mock_parent = MagicMock() + mock_parent.translate = mock_translate_module + mock_parent.package = mock_package_module + + with patch.dict( + "sys.modules", + { + "argostranslate": mock_parent, + "argostranslate.translate": mock_translate_module, + "argostranslate.package": mock_package_module, + }, + ): + results = translate_words_batch(words, "en", "es") + + translator._argos_available = original + + assert len(results) == 4 + + +# format_translations tests + + +class TestFormatTranslations: + """Tests for format_translations function.""" + + def test_format_empty(self) -> None: + """Test formatting empty results.""" + output = format_translations([]) + assert output == "No translations." + + def test_format_single_translation(self) -> None: + """Test formatting single translation.""" + results = [ + TranslationResult("hello", "hola", "en", "es", True), + ] + output = format_translations(results) + + assert "en -> es" in output + assert "hello" in output + assert "hola" in output + + def test_format_multiple_translations(self) -> None: + """Test formatting multiple translations.""" + results = [ + TranslationResult("hello", "hola", "en", "es", True), + TranslationResult("world", "mundo", "en", "es", True), + ] + output = format_translations(results) + + assert "hello" in output + assert "hola" in output + assert "world" in output + assert "mundo" in output + + def test_format_with_errors(self) -> None: + """Test formatting with failed translations.""" + results = [ + TranslationResult("hello", "hola", "en", "es", True), + TranslationResult("xyz", "", "en", "es", False, "Unknown word"), + ] + output = format_translations(results, show_errors=True) + + assert "hello" in output + assert "Error: Unknown word" in output + + def test_format_hide_errors(self) -> None: + """Test formatting with errors hidden.""" + results = [ + TranslationResult("hello", "hola", "en", "es", True), + TranslationResult("xyz", "", "en", "es", False, "Unknown word"), + ] + output = format_translations(results, show_errors=False) + + assert "hello" in output + assert "Unknown word" not in output + + +# get_installed_languages tests + + +class TestGetInstalledLanguages: + """Tests for get_installed_languages function.""" + + def test_argos_unavailable(self, mock_argos_unavailable: None) -> None: + """Test when argos is unavailable.""" + result = get_installed_languages() + assert result == [] + + def test_returns_languages(self) -> None: + """Test returning installed languages.""" + mock_lang1 = MagicMock() + mock_lang1.code = "en" + mock_lang1.name = "English" + mock_lang2 = MagicMock() + mock_lang2.code = "es" + mock_lang2.name = "Spanish" + + with ArgosAvailableMock() as mock: + mock.get_installed_languages.return_value = [mock_lang1, mock_lang2] + result = get_installed_languages() + + assert ("en", "English") in result + assert ("es", "Spanish") in result + + +# get_available_packages tests + + +class TestGetAvailablePackages: + """Tests for get_available_packages function.""" + + def test_argos_unavailable(self, mock_argos_unavailable: None) -> None: + """Test when argos is unavailable.""" + result = get_available_packages() + assert result == [] + + +# download_languages tests + + +class TestDownloadLanguages: + """Tests for download_languages function.""" + + def test_argos_unavailable(self, mock_argos_unavailable: None) -> None: + """Test when argos is unavailable.""" + result = download_languages(["en", "es"]) + assert result == {} + + +# read_file tests + + +class TestReadFile: + """Tests for read_file function.""" + + def test_read_file(self, tmp_path: Path) -> None: + """Test reading a file.""" + test_file = tmp_path / "test.txt" + test_file.write_text("hello\nworld", encoding="utf-8") + + content = read_file(test_file) + + assert content == "hello\nworld" + + def test_read_file_not_found(self, tmp_path: Path) -> None: + """Test reading non-existent file.""" + with pytest.raises(FileNotFoundError): + read_file(tmp_path / "nonexistent.txt") + + +# main function tests + + +class TestMain: + """Tests for main CLI function.""" + + def test_argos_unavailable_error(self, mock_argos_unavailable: None) -> None: + """Test error when argos not installed.""" + result = main(["--text", "hello", "--from", "en", "--to", "es"]) + assert result == 1 + + def test_list_languages_empty( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test listing languages when none installed.""" + with ArgosAvailableMock() as mock: + mock.get_installed_languages.return_value = [] + result = main(["--list-languages"]) + + assert result == 0 + captured = capsys.readouterr() + assert "No languages installed" in captured.out + + def test_list_languages_with_results( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test listing installed languages.""" + mock_lang = MagicMock() + mock_lang.code = "en" + mock_lang.name = "English" + + with ArgosAvailableMock() as mock: + mock.get_installed_languages.return_value = [mock_lang] + result = main(["--list-languages"]) + + assert result == 0 + captured = capsys.readouterr() + assert "en" in captured.out + assert "English" in captured.out + + def test_translate_single_text( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test translating single text.""" + with ArgosAvailableMock("hola"): + result = main(["--text", "hello", "--from", "en", "--to", "es"]) + + assert result == 0 + captured = capsys.readouterr() + assert "hello" in captured.out + assert "hola" in captured.out + + def test_translate_multiple_words( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test translating multiple words.""" + with ArgosAvailableMock(["hola", "mundo"]): + result = main(["--words", "hello", "world", "--from", "en", "--to", "es"]) + + assert result == 0 + captured = capsys.readouterr() + assert "hello" in captured.out + assert "world" in captured.out + + def test_translate_from_file( + self, + temp_words_file: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Test translating words from file.""" + with ArgosAvailableMock(["hola", "mundo", "adios"]): + result = main( + ["--words-file", str(temp_words_file), "--from", "en", "--to", "es"] + ) + + assert result == 0 + captured = capsys.readouterr() + assert "hello" in captured.out + assert "world" in captured.out + assert "goodbye" in captured.out + + def test_translate_file_not_found( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test error when words file not found.""" + with ArgosAvailableMock(): + result = main( + ["--words-file", "/nonexistent/file.txt", "--from", "en", "--to", "es"] + ) + + assert result == 1 + captured = capsys.readouterr() + assert "File not found" in captured.err + + def test_translate_output_to_file( + self, + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + ) -> None: + """Test outputting translations to file.""" + output_file = tmp_path / "output.txt" + + with ArgosAvailableMock("hola"): + result = main( + [ + "--text", + "hello", + "--from", + "en", + "--to", + "es", + "--output", + str(output_file), + ] + ) + + assert result == 0 + assert output_file.exists() + content = output_file.read_text(encoding="utf-8") + assert "hello" in content + assert "hola" in content + + def test_no_input_shows_help( + self, capsys: pytest.CaptureFixture[str] + ) -> None: + """Test that no input shows help.""" + with ArgosAvailableMock(): + result = main([]) + + assert result == 1 + + def test_translation_failure_returns_error( + self, mock_all_translators_unavailable: None + ) -> None: + """Test that translation failure returns error code when no backends.""" + result = main(["--text", "hello", "--from", "en", "--to", "es"]) + assert result == 1 + + +# Integration-style tests (still mocked but testing more flow) + + +class TestIntegration: + """Integration-style tests for translator.""" + + def test_full_translation_flow(self) -> None: + """Test complete translation flow.""" + with ArgosAvailableMock(["uno", "dos", "tres"]): + words = ["one", "two", "three"] + results = translate_words(words, "en", "es") + + assert all(r.success for r in results) + assert [r.translated_word for r in results] == ["uno", "dos", "tres"] + + output = format_translations(results) + assert "en -> es" in output + assert "one" in output + assert "uno" in output + + def test_mixed_success_failure( + self, mock_all_translators_unavailable: None + ) -> None: + """Test handling when no translation backends are available.""" + results = translate_words(["hello", "xyz", "world"], "en", "es") + + # All should fail when no backends available + assert all(not r.success for r in results) + + output = format_translations(results) + assert "Error" in output diff --git a/python_pkg/word_frequency/translator.py b/python_pkg/word_frequency/translator.py new file mode 100644 index 0000000..89c3ed6 --- /dev/null +++ b/python_pkg/word_frequency/translator.py @@ -0,0 +1,572 @@ +#!/usr/bin/env python3 +"""Translator - translates words/text between languages. + +This module provides translation capabilities using either: +1. Argos Translate (offline, requires large downloads) - preferred if installed +2. deep-translator (online, uses Google Translate) - lightweight fallback + +Usage: + # Translate a single word + python -m python_pkg.word_frequency.translator --text "hello" --from en --to es + + # Translate multiple words + python -m python_pkg.word_frequency.translator --words hello world goodbye --from en --to pl + + # Translate words from a file (one word per line) + python -m python_pkg.word_frequency.translator --words-file words.txt --from la --to en + + # List available languages + python -m python_pkg.word_frequency.translator --list-languages + + # Output to file + python -m python_pkg.word_frequency.translator --words-file vocab.txt --from pl --to en --output translations.txt + +Dependencies (install one): + pip install deep-translator # Lightweight, uses Google Translate (online) + pip install argostranslate # Offline translation (requires ~3GB downloads) +""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path +from typing import TYPE_CHECKING, NamedTuple + +if TYPE_CHECKING: + from collections.abc import Sequence + +# Lazy imports for translation backends (may not be installed) +_argos_available: bool | None = None +_deep_translator_available: bool | None = None +_langdetect_available: bool | None = None + + +def _check_argos() -> bool: + """Check if argostranslate is available.""" + global _argos_available + if _argos_available is None: + try: + import argostranslate.package # noqa: F401 + import argostranslate.translate # noqa: F401 + + _argos_available = True + except ImportError: + _argos_available = False + return _argos_available + + +def _check_deep_translator() -> bool: + """Check if deep-translator is available.""" + global _deep_translator_available + if _deep_translator_available is None: + try: + from deep_translator import GoogleTranslator # noqa: F401 + + _deep_translator_available = True + except ImportError: + _deep_translator_available = False + return _deep_translator_available + + +def _check_langdetect() -> bool: + """Check if langdetect is available.""" + global _langdetect_available + if _langdetect_available is None: + try: + import langdetect # noqa: F401 + + _langdetect_available = True + except ImportError: + _langdetect_available = False + return _langdetect_available + + +def detect_language(text: str) -> str | None: + """Detect the language of a text. + + Args: + text: The text to analyze. + + Returns: + ISO 639-1 language code (e.g., 'en', 'la', 'pl') or None if detection fails. + """ + if not _check_langdetect(): + return None + + import langdetect + + try: + # Use a sample of the text for detection (faster and more reliable) + sample = text[:5000] if len(text) > 5000 else text + return langdetect.detect(sample) # type: ignore[no-any-return] + except langdetect.LangDetectException: # type: ignore[attr-defined] + return None + + +class TranslationResult(NamedTuple): + """Result of a translation.""" + + source_word: str + translated_word: str + source_lang: str + target_lang: str + success: bool + error: str | None = None + + +def get_installed_languages() -> list[tuple[str, str]]: + """Get list of installed languages. + + Returns: + List of (code, name) tuples for installed languages. + """ + if not _check_argos(): + return [] + + import argostranslate.translate + + languages = argostranslate.translate.get_installed_languages() + return [(lang.code, lang.name) for lang in languages] + + +def get_available_packages() -> list[tuple[str, str, str, str]]: + """Get list of available language packages for download. + + Returns: + List of (from_code, from_name, to_code, to_name) tuples. + """ + if not _check_argos(): + return [] + + import argostranslate.package + + argostranslate.package.update_package_index() + available = argostranslate.package.get_available_packages() + return [ + (pkg.from_code, pkg.from_name, pkg.to_code, pkg.to_name) for pkg in available + ] + + +def download_languages(lang_codes: Sequence[str]) -> dict[str, bool]: + """Download language packages for the specified languages. + + Downloads packages for translation between English and the specified languages, + and between each pair of specified languages if available. + + Args: + lang_codes: List of language codes to download (e.g., ['en', 'es', 'pl']). + + Returns: + Dict mapping "from->to" to success boolean. + """ + if not _check_argos(): + return {} + + import argostranslate.package + + results: dict[str, bool] = {} + + # Update package index + print("Updating package index...") # noqa: T201 + argostranslate.package.update_package_index() + available = argostranslate.package.get_available_packages() + + # Create a lookup for available packages + available_lookup: dict[tuple[str, str], object] = {} + for pkg in available: + available_lookup[(pkg.from_code, pkg.to_code)] = pkg + + # Download packages for all requested language pairs + lang_codes_set = set(lang_codes) + + for from_code in lang_codes_set: + for to_code in lang_codes_set: + if from_code == to_code: + continue + + key = f"{from_code}->{to_code}" + pkg_key = (from_code, to_code) + + if pkg_key in available_lookup: + pkg = available_lookup[pkg_key] + try: + print(f"Downloading {from_code} -> {to_code}...") # noqa: T201 + argostranslate.package.install_from_path(pkg.download()) + results[key] = True + print(f" ✓ Installed {from_code} -> {to_code}") # noqa: T201 + except Exception as e: # noqa: BLE001 + results[key] = False + print(f" ✗ Failed {from_code} -> {to_code}: {e}") # noqa: T201 + else: + # Package not available + results[key] = False + + return results + + +def translate_word( + word: str, + from_lang: str, + to_lang: str, +) -> TranslationResult: + """Translate a single word. + + Uses argostranslate if available (offline), otherwise falls back to + deep-translator (Google Translate, online). + + Args: + word: The word to translate. + from_lang: Source language code (e.g., 'en', 'pl', 'la'). + to_lang: Target language code. + + Returns: + TranslationResult with the translation. + """ + # Try argostranslate first (offline) + if _check_argos(): + import argostranslate.translate + + try: + translated = argostranslate.translate.translate(word, from_lang, to_lang) + return TranslationResult( + source_word=word, + translated_word=translated, + source_lang=from_lang, + target_lang=to_lang, + success=True, + ) + except Exception as e: # noqa: BLE001 + # Fall through to try deep-translator + argos_error = str(e) + else: + argos_error = None + + # Try deep-translator (online via Google Translate) + if _check_deep_translator(): + from deep_translator import GoogleTranslator + + try: + translator = GoogleTranslator(source=from_lang, target=to_lang) + translated = translator.translate(word) + return TranslationResult( + source_word=word, + translated_word=translated or "", + source_lang=from_lang, + target_lang=to_lang, + success=True, + ) + except Exception as e: # noqa: BLE001 + return TranslationResult( + source_word=word, + translated_word="", + source_lang=from_lang, + target_lang=to_lang, + success=False, + error=str(e), + ) + + # Neither backend available + error_msg = "No translation backend available. Install: pip install deep-translator" + if argos_error: + error_msg = f"argostranslate error: {argos_error}" + return TranslationResult( + source_word=word, + translated_word="", + source_lang=from_lang, + target_lang=to_lang, + success=False, + error=error_msg, + ) + + +def translate_words( + words: Sequence[str], + from_lang: str, + to_lang: str, +) -> list[TranslationResult]: + """Translate multiple words. + + Args: + words: List of words to translate. + from_lang: Source language code. + to_lang: Target language code. + + Returns: + List of TranslationResult for each word. + """ + return [translate_word(word, from_lang, to_lang) for word in words] + + +def translate_words_batch( + words: Sequence[str], + from_lang: str, + to_lang: str, +) -> list[TranslationResult]: + """Translate multiple words, attempting batch translation for efficiency. + + For better results with context, this joins words and translates together, + then splits. Falls back to word-by-word if batch fails. + + Args: + words: List of words to translate. + from_lang: Source language code. + to_lang: Target language code. + + Returns: + List of TranslationResult for each word. + """ + if not words: + return [] + + # For single words or small batches, just translate individually + if len(words) <= 3: + return translate_words(words, from_lang, to_lang) + + # Try batch translation by joining with newlines + if not _check_argos(): + return translate_words(words, from_lang, to_lang) + + import argostranslate.translate + + try: + # Join words with newlines for batch translation + batch_text = "\n".join(words) + translated_batch = argostranslate.translate.translate( + batch_text, from_lang, to_lang + ) + translated_words = translated_batch.split("\n") + + # If we got the same number of translations, use them + if len(translated_words) == len(words): + return [ + TranslationResult( + source_word=word, + translated_word=trans.strip(), + source_lang=from_lang, + target_lang=to_lang, + success=True, + ) + for word, trans in zip(words, translated_words, strict=True) + ] + except Exception: # noqa: BLE001, S110 + pass + + # Fall back to individual translation + return translate_words(words, from_lang, to_lang) + + +def format_translations( + results: list[TranslationResult], + *, + show_errors: bool = True, +) -> str: + """Format translation results as a table. + + Args: + results: List of TranslationResult to format. + show_errors: If True, show error messages for failed translations. + + Returns: + Formatted string with translations. + """ + if not results: + return "No translations." + + lines: list[str] = [] + + # Find max widths + max_source = max(len(r.source_word) for r in results) + max_source = max(max_source, 6) # "Source" header + + successful_lengths = [len(r.translated_word) for r in results if r.success] + max_trans = max(successful_lengths) if successful_lengths else 0 + max_trans = max(max_trans, 11) # "Translation" header minimum + + # Header + from_lang = results[0].source_lang + to_lang = results[0].target_lang + lines.append(f"Translation: {from_lang} -> {to_lang}") + lines.append("") + lines.append(f"{'Source':<{max_source}} {'Translation':<{max_trans}}") + lines.append("-" * (max_source + max_trans + 2)) + + # Data + for r in results: + if r.success: + lines.append(f"{r.source_word:<{max_source}} {r.translated_word:<{max_trans}}") + elif show_errors: + error_msg = f"[Error: {r.error}]" if r.error else "[Failed]" + lines.append(f"{r.source_word:<{max_source}} {error_msg}") + + return "\n".join(lines) + + +def read_file(filepath: str | Path) -> str: + """Read text content from a file.""" + return Path(filepath).read_text(encoding="utf-8") + + +def main(argv: Sequence[str] | None = None) -> int: + """Main entry point for the translator. + + Args: + argv: Command line arguments. + + Returns: + Exit code. + """ + parser = argparse.ArgumentParser( + description="Offline translator using Argos Translate.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + # Actions + action_group = parser.add_mutually_exclusive_group() + action_group.add_argument( + "--list-languages", + "-l", + action="store_true", + help="List installed languages", + ) + action_group.add_argument( + "--list-available", + "-L", + action="store_true", + help="List available language packages for download", + ) + action_group.add_argument( + "--download", + "-d", + nargs="+", + metavar="LANG", + help="Download language packs (e.g., --download en es pl)", + ) + + # Input + input_group = parser.add_mutually_exclusive_group() + input_group.add_argument( + "--text", + "-t", + type=str, + help="Single text/word to translate", + ) + input_group.add_argument( + "--words", + "-w", + nargs="+", + help="Words to translate", + ) + input_group.add_argument( + "--words-file", + "-W", + type=str, + help="File with words to translate (one per line)", + ) + + # Language options + parser.add_argument( + "--from", + "-f", + dest="from_lang", + type=str, + default="en", + help="Source language code (default: en)", + ) + parser.add_argument( + "--to", + "-T", + dest="to_lang", + type=str, + default="en", + help="Target language code (default: en)", + ) + + # Output + parser.add_argument( + "--output", + "-o", + type=str, + help="Output file path", + ) + + args = parser.parse_args(argv) + + # Check if argostranslate is available + if not _check_argos(): + print( # noqa: T201 + "Error: argostranslate is not installed.\n" + "Install it with: pip install argostranslate", + file=sys.stderr, + ) + return 1 + + # Handle list-languages + if args.list_languages: + langs = get_installed_languages() + if not langs: + print("No languages installed.") # noqa: T201 + print("Download some with: --download en es pl de fr") # noqa: T201 + else: + print("Installed languages:") # noqa: T201 + for code, name in sorted(langs): + print(f" {code}: {name}") # noqa: T201 + return 0 + + # Handle list-available + if args.list_available: + packages = get_available_packages() + if not packages: + print("No packages available (check internet connection).") # noqa: T201 + else: + print("Available language packages:") # noqa: T201 + for from_code, from_name, to_code, to_name in sorted(packages): + print(f" {from_code} ({from_name}) -> {to_code} ({to_name})") # noqa: T201 + return 0 + + # Handle download + if args.download: + results = download_languages(args.download) + success_count = sum(1 for v in results.values() if v) + print(f"\nDownloaded {success_count}/{len(results)} language pairs.") # noqa: T201 + return 0 if success_count > 0 else 1 + + # Handle translation + words: list[str] = [] + if args.text: + words = [args.text] + elif args.words: + words = args.words + elif args.words_file: + try: + content = read_file(args.words_file) + words = [w.strip() for w in content.splitlines() if w.strip()] + except FileNotFoundError: + print(f"Error: File not found: {args.words_file}", file=sys.stderr) # noqa: T201 + return 1 + + if not words: + parser.print_help() + return 1 + + # Translate + results = translate_words_batch(words, args.from_lang, args.to_lang) + output = format_translations(results) + + # Output + if args.output: + Path(args.output).write_text(output, encoding="utf-8") + print(f"Translations written to {args.output}") # noqa: T201 + else: + print(output) # noqa: T201 + + # Return error if any translation failed + if any(not r.success for r in results): + return 1 + + return 0 + + +if __name__ == "__main__": + sys.exit(main())