testsAndMisc/python_pkg/word_frequency/tests/test_learning_pipe.py
Krzysztof kuhy Rudnicki ee27d10fef Reduce per-file-ignores by fixing lint violations across codebase
Fix ruff violations in ~15 source files and ~60+ test files to minimize
per-file-ignores in pyproject.toml. Remaining ignores are justified with
comments explaining why each suppression is necessary.

Source fixes: FBT003 (keyword args), S310 (URL validation), SLF001
(private access), T201 (print→logging), C901 (complexity), E501 (line
length), E402 (import order).

Test fixes: SIM117 (combined with), FBT (boolean args), PERF203 (try in
loop), S310/S607 (URLs/executables), E402/E501 (imports/lines), S108
(tmp paths), PLR0913 (too many args), ARG (unused args), ANN (type
annotations), RUF059 (unused unpacked vars), PT019 (fixture naming).

Remaining per-file-ignores (with justifications):
- Tests: ARG, D, PLC0415, PLR2004, S101, SLF001
- music_gen sources: PLC0415 (heavy ML lazy imports)
- moviepy_showcase: PLC0415 (circular dependency)
- generate_images: PLR0913 (matplotlib helpers need many params)
- praca_magisterska_video: E501, E402 (long paths, mpl.use)
2026-03-25 18:58:05 +01:00

456 lines
15 KiB
Python

"""Tests for word_frequency.learning_pipe module."""
from __future__ import annotations
import logging
import time
from typing import TYPE_CHECKING
from unittest.mock import MagicMock, patch
import pytest
if TYPE_CHECKING:
from pathlib import Path
from python_pkg.word_frequency._learning_constants import (
DEFAULT_STOPWORDS_EN,
LessonConfig,
load_stopwords,
)
from python_pkg.word_frequency._translator_helpers import TranslationResult
from python_pkg.word_frequency.learning_pipe import (
generate_learning_lesson,
main,
)
import python_pkg.word_frequency.translator as _translator_module
if TYPE_CHECKING:
from collections.abc import Generator
@pytest.fixture
def mock_translation() -> Generator[MagicMock, None, None]:
    """Patch batch translation so tests do not require argostranslate.

    Yields:
        The ``MagicMock`` that replaces ``translate_words_batch`` on the
        translator module while the test runs, so a test may inspect how
        it was called.
    """

    def fake_batch_translate(
        words: list[str],
        from_lang: str,
        to_lang: str,
        *,
        _use_cache: bool = True,
    ) -> list[TranslationResult]:
        """Return one successful result per word, prefixed ``translated_``."""
        return [
            TranslationResult(
                source_word=word,
                translated_word=f"translated_{word}",
                source_lang=from_lang,
                target_lang=to_lang,
                success=True,
            )
            for word in words
        ]

    # Need to patch in translator module since _learning_batch looks it up there
    with patch.object(
        _translator_module, "translate_words_batch", side_effect=fake_batch_translate
    ) as mocked_batch:
        # Yield the mock itself: the declared yield type is MagicMock, and a
        # bare ``yield`` would hand tests ``None`` instead.
        yield mocked_batch
class TestLoadStopwords:
    """Exercise ``load_stopwords`` with a real file, a missing file and ``None``."""

    def test_load_from_file(self, tmp_path: Path) -> None:
        """Each line written to the stopwords file is present in the result."""
        source = tmp_path / "stopwords.txt"
        source.write_text("word1\nword2\nword3\n", encoding="utf-8")

        loaded = load_stopwords(source)

        for entry in ("word1", "word2", "word3"):
            assert entry in loaded

    def test_load_none_returns_empty(self) -> None:
        """Passing ``None`` compares equal to an empty frozenset."""
        assert load_stopwords(None) == frozenset()

    def test_load_nonexistent_returns_empty(self) -> None:
        """A path that does not exist compares equal to an empty frozenset."""
        assert load_stopwords("/nonexistent/file.txt") == frozenset()

    def test_lowercase_conversion(self, tmp_path: Path) -> None:
        """Mixed-case entries are found under their lowercase form."""
        source = tmp_path / "stopwords.txt"
        source.write_text("UPPER\nMixed\nlower\n", encoding="utf-8")

        loaded = load_stopwords(source)

        for entry in ("upper", "mixed", "lower"):
            assert entry in loaded
class TestGenerateLearningLesson:
    """Tests for generate_learning_lesson function."""

    def test_basic_generation(self) -> None:
        """Test basic lesson generation."""
        text = "hello world hello hello world test test test test"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
        )
        assert "LANGUAGE LEARNING LESSON" in result
        assert "VOCABULARY TO LEARN" in result
        assert "test" in result  # Most common word

    def test_multiple_batches(self) -> None:
        """Test generation with multiple batches."""
        # Repeat each word as separate space-delimited tokens so word{i}
        # really occurs (100 - i) times. ``f"word{i}" * n`` would glue the
        # copies into one long token and erase the frequency gradient.
        text = " ".join(" ".join([f"word{i}"] * (100 - i)) for i in range(20))
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=3, skip_default_stopwords=True)
        )
        assert "BATCH 1" in result
        assert "BATCH 2" in result
        assert "BATCH 3" in result

    def test_stopwords_filtering(self) -> None:
        """Test that default stopwords are filtered."""
        text = "the the the hello world"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1)
        )
        # "the" should be filtered, "hello" and "world" should appear
        lines = result.split("\n")
        vocab_section = False
        found_words = []
        for line in lines:
            if "VOCABULARY TO LEARN" in line:
                vocab_section = True
            elif vocab_section and ". " in line and "(" in line:
                # Extract word from line like "  1. hello (1 occurrences..."
                word = line.split(".")[1].split("(")[0].strip()
                found_words.append(word)
            elif vocab_section and "PRACTICE" in line:
                # The vocabulary listing ends where the practice section starts.
                break
        assert "the" not in found_words
        assert "hello" in found_words or "world" in found_words

    def test_skip_default_stopwords(self) -> None:
        """Test disabling default stopword filtering."""
        text = "the the the hello"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
        )
        assert "the" in result.lower()

    def test_numbers_filtered_by_default(self) -> None:
        """Test that numbers are filtered by default."""
        text = "123 123 123 hello world"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
        )
        # Check vocabulary section doesn't include "123"
        lines = result.split("\n")
        for line in lines:
            if ". 123" in line and "occurrences" in line:
                pytest.fail("Number '123' should be filtered from vocabulary")

    def test_numbers_included_when_requested(self) -> None:
        """Test including numbers in vocabulary."""
        text = "123 123 123 hello"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
                skip_numbers=False,
            ),
        )
        assert "123" in result

    def test_coverage_calculation(self) -> None:
        """Test that coverage percentage is calculated."""
        text = "hello hello hello world world test"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
        )
        assert "recognize" in result.lower()
        assert "%" in result

    def test_excerpts_included(self) -> None:
        """Test that practice excerpts are included."""
        text = "hello world hello world hello world test test test"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=2,
                num_batches=1,
                excerpt_length=3,
                excerpts_per_batch=2,
                skip_default_stopwords=True,
            ),
        )
        assert "PRACTICE EXCERPTS" in result
        assert "Excerpt 1" in result

    def test_more_batches_than_words(self) -> None:
        """Test with num_batches larger than available words (early break)."""
        # "ab" is the only word with len > 1
        text = "ab ab ab"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=1,
                num_batches=100,
                skip_default_stopwords=True,
            ),
        )
        assert "SUMMARY" in result

    def test_all_words_filtered_empty_cumulative(self) -> None:
        """Test when all words are filtered, cumulative_words is empty."""
        text = "a b c"  # All 1-char words -> filtered by len(word) > 1
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
            ),
        )
        assert "SUMMARY" in result
        # No batches generated, no vocabulary coverage stats
        assert "Text coverage" not in result

    def test_no_translation(self) -> None:
        """Test lesson without translation enabled (do_translate=False)."""
        text = "hello hello hello world world"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
                translate_from=None,
                translate_to=None,
            ),
        )
        assert "LANGUAGE LEARNING LESSON" in result

    def test_default_config(self) -> None:
        """Test calling generate_learning_lesson without an explicit config."""
        text = "hello hello hello world world"
        result = generate_learning_lesson(text)
        assert "LANGUAGE LEARNING LESSON" in result
class TestMain:
    """Tests for main CLI function."""

    def test_basic_text_input(
        self, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test with text input."""
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    "hello world hello world test test test",
                    "--batch-size",
                    "3",
                    "--no-default-stopwords",
                ]
            )
        assert exit_code == 0
        assert "LANGUAGE LEARNING LESSON" in caplog.text

    def test_file_input(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test with file input."""
        test_file = tmp_path / "test.txt"
        test_file.write_text("hello world hello world test", encoding="utf-8")
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--file",
                    str(test_file),
                    "--batch-size",
                    "3",
                    "--no-default-stopwords",
                ]
            )
        assert exit_code == 0
        assert "hello" in caplog.text.lower()

    def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
        """Test outputting to file."""
        output_file = tmp_path / "lesson.txt"
        exit_code = main(
            [
                "--text",
                "hello world hello world",
                "--output",
                str(output_file),
                "--no-default-stopwords",
            ]
        )
        assert exit_code == 0
        assert output_file.exists()
        content = output_file.read_text(encoding="utf-8")
        assert "LANGUAGE LEARNING LESSON" in content

    def test_custom_stopwords(self, tmp_path: Path, mock_translation: None) -> None:
        """Test with custom stopwords file."""
        stopwords_file = tmp_path / "stop.txt"
        stopwords_file.write_text("hello\n", encoding="utf-8")
        exit_code = main(
            [
                "--text",
                "hello hello hello world world",
                "--stopwords",
                str(stopwords_file),
                "--no-default-stopwords",
                "--batch-size",
                "5",
            ]
        )
        assert exit_code == 0
        # NOTE(review): "hello" should be filtered by the custom stopwords,
        # but only the exit code is asserted here; the output is not inspected.

    def test_multiple_batches_option(
        self, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test --batches option."""
        # Repeat each word as separate space-delimited tokens so word{i}
        # really occurs (50 - i) times. ``f"word{i}" * n`` would glue the
        # copies into one long token and erase the frequency gradient.
        text = " ".join(" ".join([f"word{i}"] * (50 - i)) for i in range(30))
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    text,
                    "--batch-size",
                    "5",
                    "--batches",
                    "3",
                    "--no-default-stopwords",
                ]
            )
        assert exit_code == 0
        assert "BATCH 1" in caplog.text
        assert "BATCH 2" in caplog.text
        assert "BATCH 3" in caplog.text

    def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
        """Test error handling for missing file."""
        with caplog.at_level(logging.ERROR):
            exit_code = main(["--file", "/nonexistent/file.txt"])
        assert exit_code == 1
        assert "Error" in caplog.text

    def test_unicode_decode_error(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Test UnicodeDecodeError handling."""
        # read_file is patched to raise, so the target file never has to exist.
        with (
            caplog.at_level(logging.ERROR),
            patch(
                "python_pkg.word_frequency.learning_pipe.read_file",
                side_effect=UnicodeDecodeError("utf-8", b"", 0, 1, "bad"),
            ),
        ):
            exit_code = main(["--file", str(tmp_path / "f.txt")])
        assert exit_code == 1

    def test_output_to_file_branch(
        self, tmp_path: Path, mock_translation: None
    ) -> None:
        """Test --output to verify the file writing path."""
        out = tmp_path / "out.txt"
        exit_code = main(
            [
                "--text",
                "hello world hello",
                "--output",
                str(out),
                "--no-default-stopwords",
            ]
        )
        assert exit_code == 0
        assert out.exists()

    def test_no_translate_flag(self, caplog: pytest.LogCaptureFixture) -> None:
        """Test --no-translate flag to cover branch 303->307."""
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    "hello world hello",
                    "--no-translate",
                    "--no-default-stopwords",
                ]
            )
        assert exit_code == 0
class TestPerformance:
    """Timing checks for lesson generation."""

    def test_large_text_performance(self) -> None:
        """Five batches from a 100,000-token text are generated in under 10s."""
        # 500 distinct words leave plenty of vocabulary for 5 batches of 50.
        vocabulary = [f"word{index}" for index in range(500)]
        corpus = " ".join(vocabulary * 200)

        started = time.perf_counter()
        lesson = generate_learning_lesson(
            corpus,
            LessonConfig(
                batch_size=50,
                num_batches=5,
                excerpt_length=30,
                skip_default_stopwords=True,
            ),
        )
        elapsed = time.perf_counter() - started

        assert elapsed < 10.0, f"Generation took {elapsed:.2f}s, expected < 10s"
        assert "BATCH 5" in lesson
class TestDefaultStopwords:
    """Sanity checks on the ``DEFAULT_STOPWORDS_EN`` constant."""

    def test_common_words_in_stopwords(self) -> None:
        """A sample of very common English words is present."""
        expected = ("the", "a", "an", "and", "or", "but", "in", "on", "is", "are")
        missing = [word for word in expected if word not in DEFAULT_STOPWORDS_EN]
        assert not missing

    def test_stopwords_are_lowercase(self) -> None:
        """Every default stopword equals its own lowercased form."""
        not_lowercase = [
            word for word in DEFAULT_STOPWORDS_EN if word != word.lower()
        ]
        assert not not_lowercase