# testsAndMisc-archive/python_pkg/word_frequency/tests/test_learning_pipe.py

"""Tests for word_frequency.learning_pipe module."""

from __future__ import annotations

import logging
import time
from typing import TYPE_CHECKING
from unittest.mock import MagicMock, patch

import pytest

if TYPE_CHECKING:
    from pathlib import Path

from python_pkg.word_frequency._learning_constants import (
    DEFAULT_STOPWORDS_EN,
    LessonConfig,
    load_stopwords,
)
from python_pkg.word_frequency._translator_helpers import TranslationResult
from python_pkg.word_frequency.learning_pipe import (
    generate_learning_lesson,
    main,
)
import python_pkg.word_frequency.translator as _translator_module

if TYPE_CHECKING:
    from collections.abc import Generator


@pytest.fixture
def mock_translation() -> Generator[None, None, None]:
    """Patch batch translation with a deterministic fake.

    Avoids requiring argostranslate. While the fixture is active,
    ``translate_words_batch`` on the translator module is replaced by a fake
    that reports every word as successfully translated to
    ``translated_<word>``.

    Yields:
        None. Tests request this fixture only for its patching side effect.
    """

    def fake_batch_translate(
        words: list[str],
        from_lang: str,
        to_lang: str,
        *,
        _use_cache: bool = True,
    ) -> list[TranslationResult]:
        """Return one successful, prefixed TranslationResult per input word."""
        # NOTE(review): the keyword is spelled `_use_cache` here; confirm that
        # callers never pass `use_cache=` by keyword, or this fake will reject it.
        return [
            TranslationResult(
                source_word=word,
                translated_word=f"translated_{word}",
                source_lang=from_lang,
                target_lang=to_lang,
                success=True,
            )
            for word in words
        ]

    # Need to patch in translator module since _learning_batch looks it up there
    with patch.object(
        _translator_module, "translate_words_batch", side_effect=fake_batch_translate
    ):
        # Nothing is handed to the test: the patch itself is the fixture's value.
        yield


class TestLoadStopwords:
    """Checks for load_stopwords with file, missing-file and None inputs."""

    def test_load_from_file(self, tmp_path: Path) -> None:
        """Each line written to the stopwords file is present in the result."""
        path = tmp_path / "stopwords.txt"
        path.write_text("word1\nword2\nword3\n", encoding="utf-8")

        loaded = load_stopwords(path)

        for expected in ("word1", "word2", "word3"):
            assert expected in loaded

    def test_load_none_returns_empty(self) -> None:
        """Passing None yields an empty frozenset."""
        assert load_stopwords(None) == frozenset()

    def test_load_nonexistent_returns_empty(self) -> None:
        """A path that does not exist yields an empty frozenset."""
        assert load_stopwords("/nonexistent/file.txt") == frozenset()

    def test_lowercase_conversion(self, tmp_path: Path) -> None:
        """Entries are found under their lowercase spelling."""
        path = tmp_path / "stopwords.txt"
        path.write_text("UPPER\nMixed\nlower\n", encoding="utf-8")

        loaded = load_stopwords(path)

        for expected in ("upper", "mixed", "lower"):
            assert expected in loaded


class TestGenerateLearningLesson:
    """Tests for generate_learning_lesson function.

    Each test feeds a small text and a LessonConfig to
    generate_learning_lesson and checks the returned lesson string for the
    expected section headings or vocabulary entries.
    """

    def test_basic_generation(self) -> None:
        """Test basic lesson generation."""
        text = "hello world hello hello world test test test test"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
        )

        assert "LANGUAGE LEARNING LESSON" in result
        assert "VOCABULARY TO LEARN" in result
        assert "test" in result  # Most common word

    def test_multiple_batches(self) -> None:
        """Test generation with multiple batches."""
        # Emit every repetition as its own space-separated token so that
        # word{i} really occurs (100 - i) times. Multiplying the string
        # (f"word{i}" * n) would glue the copies into one long token that
        # occurs only once, defeating the intended frequency gradient.
        text = " ".join(f"word{i}" for i in range(20) for _ in range(100 - i))
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=3, skip_default_stopwords=True)
        )

        assert "BATCH 1" in result
        assert "BATCH 2" in result
        assert "BATCH 3" in result

    def test_stopwords_filtering(self) -> None:
        """Test that default stopwords are filtered."""
        text = "the the the hello world"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1)
        )

        # "the" should be filtered, "hello" and "world" should appear
        lines = result.split("\n")
        vocab_section = False
        found_words = []
        for line in lines:
            if "VOCABULARY TO LEARN" in line:
                vocab_section = True
            elif vocab_section and ". " in line and "(" in line:
                # Extract word from line like "  1. hello    (1 occurrences..."
                word = line.split(".")[1].split("(")[0].strip()
                found_words.append(word)
            elif vocab_section and "PRACTICE" in line:
                # The vocabulary listing ends where the practice section begins.
                break

        assert "the" not in found_words
        assert "hello" in found_words or "world" in found_words

    def test_skip_default_stopwords(self) -> None:
        """Test disabling default stopword filtering."""
        text = "the the the hello"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
        )

        assert "the" in result.lower()

    def test_numbers_filtered_by_default(self) -> None:
        """Test that numbers are filtered by default."""
        text = "123 123 123 hello world"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=5, num_batches=1, skip_default_stopwords=True)
        )

        # Check vocabulary section doesn't include "123"
        lines = result.split("\n")
        for line in lines:
            if ". 123" in line and "occurrences" in line:
                pytest.fail("Number '123' should be filtered from vocabulary")

    def test_numbers_included_when_requested(self) -> None:
        """Test including numbers in vocabulary."""
        text = "123 123 123 hello"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
                skip_numbers=False,
            ),
        )

        assert "123" in result

    def test_coverage_calculation(self) -> None:
        """Test that coverage percentage is calculated."""
        text = "hello hello hello world world test"
        result = generate_learning_lesson(
            text, LessonConfig(batch_size=3, num_batches=1, skip_default_stopwords=True)
        )

        assert "recognize" in result.lower()
        assert "%" in result

    def test_excerpts_included(self) -> None:
        """Test that practice excerpts are included."""
        text = "hello world hello world hello world test test test"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=2,
                num_batches=1,
                excerpt_length=3,
                excerpts_per_batch=2,
                skip_default_stopwords=True,
            ),
        )

        assert "PRACTICE EXCERPTS" in result
        assert "Excerpt 1" in result

    def test_more_batches_than_words(self) -> None:
        """Test with num_batches larger than available words (early break)."""
        # "ab" is the only word with len > 1
        text = "ab ab ab"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=1,
                num_batches=100,
                skip_default_stopwords=True,
            ),
        )
        assert "SUMMARY" in result

    def test_all_words_filtered_empty_cumulative(self) -> None:
        """Test when all words are filtered, cumulative_words is empty."""
        text = "a b c"  # All 1-char words -> filtered by len(word) > 1
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
            ),
        )
        assert "SUMMARY" in result
        # No batches generated, no vocabulary coverage stats
        assert "Text coverage" not in result

    def test_no_translation(self) -> None:
        """Test lesson generation with both translation languages set to None."""
        text = "hello hello hello world world"
        result = generate_learning_lesson(
            text,
            LessonConfig(
                batch_size=5,
                num_batches=1,
                skip_default_stopwords=True,
                translate_from=None,
                translate_to=None,
            ),
        )
        assert "LANGUAGE LEARNING LESSON" in result

    def test_default_config(self) -> None:
        """Test calling generate_learning_lesson without passing a config."""
        text = "hello hello hello world world"
        result = generate_learning_lesson(text)
        assert "LANGUAGE LEARNING LESSON" in result


class TestMain:
    """Tests for main CLI function.

    Each test calls main with an explicit argument list and checks the
    returned exit code, plus captured log output or the written output file
    where relevant. Tests that request mock_translation run with batch
    translation patched by that fixture.
    """

    def test_basic_text_input(
        self, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test with text input."""
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    "hello world hello world test test test",
                    "--batch-size",
                    "3",
                    "--no-default-stopwords",
                ]
            )

        assert exit_code == 0
        assert "LANGUAGE LEARNING LESSON" in caplog.text

    def test_file_input(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test with file input."""
        test_file = tmp_path / "test.txt"
        test_file.write_text("hello world hello world test", encoding="utf-8")

        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--file",
                    str(test_file),
                    "--batch-size",
                    "3",
                    "--no-default-stopwords",
                ]
            )

        assert exit_code == 0
        assert "hello" in caplog.text.lower()

    def test_output_to_file(self, tmp_path: Path, mock_translation: None) -> None:
        """Test outputting to file."""
        output_file = tmp_path / "lesson.txt"

        exit_code = main(
            [
                "--text",
                "hello world hello world",
                "--output",
                str(output_file),
                "--no-default-stopwords",
            ]
        )

        assert exit_code == 0
        assert output_file.exists()
        content = output_file.read_text(encoding="utf-8")
        assert "LANGUAGE LEARNING LESSON" in content

    def test_custom_stopwords(self, tmp_path: Path, mock_translation: None) -> None:
        """Test with custom stopwords file."""
        stopwords_file = tmp_path / "stop.txt"
        stopwords_file.write_text("hello\n", encoding="utf-8")

        exit_code = main(
            [
                "--text",
                "hello hello hello world world",
                "--stopwords",
                str(stopwords_file),
                "--no-default-stopwords",
                "--batch-size",
                "5",
            ]
        )

        assert exit_code == 0
        # "hello" should be filtered by custom stopwords; note that only the
        # exit code is asserted here, the filtering itself is not checked.

    def test_multiple_batches_option(
        self, caplog: pytest.LogCaptureFixture, mock_translation: None
    ) -> None:
        """Test --batches option."""
        # Emit every repetition as its own space-separated token so that
        # word{i} really occurs (50 - i) times. Multiplying the string
        # (f"word{i}" * n) would glue the copies into one long token that
        # occurs only once.
        text = " ".join(f"word{i}" for i in range(30) for _ in range(50 - i))
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    text,
                    "--batch-size",
                    "5",
                    "--batches",
                    "3",
                    "--no-default-stopwords",
                ]
            )

        assert exit_code == 0
        assert "BATCH 1" in caplog.text
        assert "BATCH 2" in caplog.text
        assert "BATCH 3" in caplog.text

    def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
        """Test error handling for missing file."""
        with caplog.at_level(logging.ERROR):
            exit_code = main(["--file", "/nonexistent/file.txt"])

        assert exit_code == 1
        assert "Error" in caplog.text

    def test_unicode_decode_error(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Test UnicodeDecodeError handling."""
        # read_file is patched to raise, so the target file never has to exist.
        with (
            caplog.at_level(logging.ERROR),
            patch(
                "python_pkg.word_frequency.learning_pipe.read_file",
                side_effect=UnicodeDecodeError("utf-8", b"", 0, 1, "bad"),
            ),
        ):
            exit_code = main(["--file", str(tmp_path / "f.txt")])
        assert exit_code == 1

    def test_output_to_file_branch(
        self, tmp_path: Path, mock_translation: None
    ) -> None:
        """Test --output to verify the file writing path."""
        out = tmp_path / "out.txt"
        exit_code = main(
            [
                "--text",
                "hello world hello",
                "--output",
                str(out),
                "--no-default-stopwords",
            ]
        )
        assert exit_code == 0
        assert out.exists()

    def test_no_translate_flag(self, caplog: pytest.LogCaptureFixture) -> None:
        """Test that main succeeds when --no-translate is given."""
        with caplog.at_level(logging.INFO):
            exit_code = main(
                [
                    "--text",
                    "hello world hello",
                    "--no-translate",
                    "--no-default-stopwords",
                ]
            )
        assert exit_code == 0


class TestPerformance:
    """Timing checks for lesson generation."""

    def test_large_text_performance(self) -> None:
        """Five batches from a 100,000-token text must finish in under 10 s."""
        # 500 distinct words give enough vocabulary for 5 batches of 50.
        vocabulary = [f"word{i}" for i in range(500)]
        corpus = " ".join(vocabulary * 200)

        began = time.perf_counter()
        lesson = generate_learning_lesson(
            corpus,
            LessonConfig(
                batch_size=50,
                num_batches=5,
                excerpt_length=30,
                skip_default_stopwords=True,
            ),
        )
        duration = time.perf_counter() - began

        assert duration < 10.0, f"Generation took {duration:.2f}s, expected < 10s"
        assert "BATCH 5" in lesson


class TestDefaultStopwords:
    """Sanity checks on the DEFAULT_STOPWORDS_EN collection."""

    def test_common_words_in_stopwords(self) -> None:
        """A sample of very common English words is part of the defaults."""
        expected = ("the", "a", "an", "and", "or", "but", "in", "on", "is", "are")
        missing = [word for word in expected if word not in DEFAULT_STOPWORDS_EN]
        assert not missing

    def test_stopwords_are_lowercase(self) -> None:
        """No default stopword differs from its lowercase form."""
        not_lowercase = [
            word for word in DEFAULT_STOPWORDS_EN if word != word.lower()
        ]
        assert not not_lowercase