testsAndMisc-archive/python_pkg/word_frequency/tests/test_analyzer.py

"""Tests for word_frequency.analyzer module."""

from __future__ import annotations

from collections import Counter
import time
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pathlib import Path

import pytest

from python_pkg.word_frequency.analyzer import (
    analyze_and_format,
    analyze_text,
    extract_words,
    format_results,
    main,
    read_file,
    read_files,
)


class TestExtractWords:
    """Tests for extract_words function.

    The cases cover case folding, the ``case_sensitive`` flag, non-ASCII
    letters, punctuation, digits, empty input and hyphenated compounds.
    """

    def test_basic_extraction(self) -> None:
        """Two plain words are expected lowercased and in input order."""
        assert extract_words("Hello world") == ["hello", "world"]

    def test_case_insensitive_default(self) -> None:
        """Without extra arguments every word is expected in lowercase."""
        words = extract_words("Hello WORLD HeLLo")
        assert words == ["hello", "world", "hello"]

    def test_case_sensitive(self) -> None:
        """With ``case_sensitive=True`` the original casing is expected."""
        words = extract_words("Hello WORLD HeLLo", case_sensitive=True)
        assert words == ["Hello", "WORLD", "HeLLo"]

    def test_unicode_words(self) -> None:
        """Words with Polish diacritics are expected to stay intact."""
        expected = ["zażółć", "gęślą", "jaźń"]
        assert extract_words("zażółć gęślą jaźń") == expected

    def test_punctuation_removal(self) -> None:
        """Commas, exclamation and question marks must not reach the output."""
        words = extract_words("Hello, world! How are you?")
        assert words == ["hello", "world", "how", "are", "you"]

    def test_numbers_included(self) -> None:
        """Digit sequences are expected to be reported as words."""
        words = extract_words("There are 123 apples and 456 oranges")
        for number in ("123", "456"):
            assert number in words

    def test_empty_string(self) -> None:
        """An empty input is expected to yield an empty list."""
        assert extract_words("") == []

    def test_only_punctuation(self) -> None:
        """Input made only of punctuation is expected to yield no words."""
        assert extract_words("!@#$%^&*()") == []

    def test_hyphenated_words(self) -> None:
        """Hyphenated compounds are expected to be split into their parts."""
        words = extract_words("well-known self-aware")
        # The hyphen is expected to act as a separator, so every half of
        # BOTH compounds must show up on its own (the second compound was
        # previously left unchecked).
        for part in ("well", "known", "self", "aware"):
            assert part in words


class TestAnalyzeText:
    """Tests for analyze_text function."""

    def test_basic_counting(self) -> None:
        """A repeated word is counted twice, a single word once."""
        counts = analyze_text("hello world hello")
        assert counts["hello"] == 2
        assert counts["world"] == 1

    def test_case_insensitive_counting(self) -> None:
        """By default all casings are expected under one lowercase key."""
        counts = analyze_text("Hello HELLO hello")
        assert counts["hello"] == 3

    def test_case_sensitive_counting(self) -> None:
        """With ``case_sensitive=True`` each casing is counted separately."""
        counts = analyze_text("Hello HELLO hello", case_sensitive=True)
        for variant in ("Hello", "HELLO", "hello"):
            assert counts[variant] == 1

    def test_returns_counter(self) -> None:
        """The return value is expected to be a ``collections.Counter``."""
        assert isinstance(analyze_text("test"), Counter)

    def test_empty_text(self) -> None:
        """Empty text is expected to produce a result with no entries."""
        counts = analyze_text("")
        assert len(counts) == 0


class TestReadFile:
    """Tests for read_file function."""

    def test_read_existing_file(self, tmp_path: Path) -> None:
        """Content written to a temporary file is read back unchanged."""
        content = "Hello world"
        source = tmp_path / "test.txt"
        source.write_text(content, encoding="utf-8")
        assert read_file(source) == content

    def test_read_utf8_content(self, tmp_path: Path) -> None:
        """UTF-8 text with Polish diacritics is read back unchanged."""
        content = "zażółć gęślą jaźń"
        source = tmp_path / "test.txt"
        source.write_text(content, encoding="utf-8")
        assert read_file(source) == content

    def test_file_not_found(self) -> None:
        """A path that does not exist must raise ``FileNotFoundError``."""
        # Passed as a plain string, unlike the Path objects used above.
        missing = "/nonexistent/path/file.txt"
        with pytest.raises(FileNotFoundError):
            read_file(missing)


class TestReadFiles:
    """Tests for read_files function."""

    def test_read_multiple_files(self, tmp_path: Path) -> None:
        """The combined result contains the content of every file given."""
        contents = {"file1.txt": "Hello", "file2.txt": "World"}
        paths = []
        for name, body in contents.items():
            path = tmp_path / name
            path.write_text(body, encoding="utf-8")
            paths.append(path)

        combined = read_files(paths)

        # Only containment is checked; how the parts are joined is not pinned.
        for body in contents.values():
            assert body in combined

    def test_empty_list(self) -> None:
        """An empty list of files is expected to give an empty string."""
        assert read_files([]) == ""


class TestFormatResults:
    """Tests for format_results function."""

    def test_basic_formatting(self) -> None:
        """The report shows totals, both words and their percentages."""
        frequencies: Counter[str] = Counter({"hello": 3, "world": 2})
        report = format_results(frequencies)
        expected_fragments = (
            "Total words: 5",
            "Unique words: 2",
            "hello",
            "world",
            "60.00%",  # hello: 3 of 5
            "40.00%",  # world: 2 of 5
        )
        for fragment in expected_fragments:
            assert fragment in report

    def test_top_n_limit(self) -> None:
        """``top_n=2`` is expected to leave exactly two data rows."""
        frequencies: Counter[str] = Counter({"a": 10, "b": 5, "c": 3, "d": 1})
        report = format_results(frequencies, top_n=2)
        assert "a" in report
        assert "b" in report
        # A data row is identified as a non-blank line carrying a percent
        # sign; with the limit applied only two such rows may remain.
        rows = [
            row for row in report.split("\n") if row.strip() and "%" in row
        ]
        assert len(rows) == 2

    def test_empty_counter(self) -> None:
        """An empty counter is expected to produce a "No words found" note."""
        empty: Counter[str] = Counter()
        assert "No words found" in format_results(empty)


class TestAnalyzeAndFormat:
    """Tests for analyze_and_format function."""

    def test_full_pipeline(self) -> None:
        """Raw text goes in, a report with totals and percentages comes out."""
        report = analyze_and_format("hello world hello")
        expected_fragments = (
            "Total words: 3",
            "hello",
            "66.67%",  # hello is 2 of the 3 words
        )
        for fragment in expected_fragments:
            assert fragment in report


class TestMain:
    """Tests for main CLI function."""

    def test_text_input(self, capsys: pytest.CaptureFixture[str]) -> None:
        """``--text`` analyzes the given string and prints its words."""
        status = main(["--text", "hello world hello"])
        stdout = capsys.readouterr().out
        assert status == 0
        for word in ("hello", "world"):
            assert word in stdout

    def test_file_input(
        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
    ) -> None:
        """``--file`` analyzes the content of a single file."""
        source = tmp_path / "test.txt"
        source.write_text("hello world hello", encoding="utf-8")
        status = main(["--file", str(source)])
        stdout = capsys.readouterr().out
        assert status == 0
        assert "hello" in stdout

    def test_files_input(
        self, tmp_path: Path, capsys: pytest.CaptureFixture[str]
    ) -> None:
        """``--files`` analyzes several files in one run."""
        first = tmp_path / "file1.txt"
        second = tmp_path / "file2.txt"
        first.write_text("hello hello", encoding="utf-8")
        second.write_text("world world world", encoding="utf-8")
        status = main(["--files", *map(str, (first, second))])
        stdout = capsys.readouterr().out
        assert status == 0
        for word in ("hello", "world"):
            assert word in stdout

    def test_top_n_option(self, capsys: pytest.CaptureFixture[str]) -> None:
        """``--top 2`` is expected to print exactly two data rows."""
        status = main(["--text", "a a a b b c d e f g", "--top", "2"])
        stdout = capsys.readouterr().out
        assert status == 0
        # Data rows are recognized by the percent sign they carry.
        row_count = sum("%" in line for line in stdout.split("\n"))
        assert row_count == 2

    def test_case_sensitive_option(self, capsys: pytest.CaptureFixture[str]) -> None:
        """``--case-sensitive`` keeps differently cased words apart."""
        status = main(["--text", "Hello HELLO hello", "--case-sensitive"])
        stdout = capsys.readouterr().out
        assert status == 0
        assert "Unique words: 3" in stdout

    def test_file_not_found_error(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A missing input file gives exit code 1 and a logged error."""
        status = main(["--file", "/nonexistent/file.txt"])
        assert status == 1
        # The message is looked up in captured log records, not in stdout.
        assert "File not found" in caplog.text


class TestPerformance:
    """Performance tests for word frequency analyzer."""

    def test_large_text_performance(self) -> None:
        """Analyzing 10k distinct words with top_n=10000 must take < 10s."""
        vocabulary = [f"word{i}" for i in range(10000)]
        # The word at rank i is repeated max(1, (10000 - i) // 100) times:
        # word0 appears 100 times and the last words appear once each,
        # which adds up to 495,199 tokens in total.
        tokens = [
            word
            for rank, word in enumerate(vocabulary)
            for _ in range(max(1, (10000 - rank) // 100))
        ]
        large_text = " ".join(tokens)

        # Only the analysis itself is timed, not the text generation above.
        started = time.perf_counter()
        report = analyze_and_format(large_text, top_n=10000)
        elapsed = time.perf_counter() - started

        assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"
        assert "word0" in report  # the most frequent word must be listed

    def test_bible_sized_text_performance(self) -> None:
        """Analyzing a Bible-sized text (~800k words) must take < 10s."""
        base_words = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]
        # Ten words repeated 80,000 times give 800,000 tokens.
        large_text = " ".join(base_words * 80000)

        # Only the analysis itself is timed, not the text generation above.
        started = time.perf_counter()
        report = analyze_and_format(large_text, top_n=10000)
        elapsed = time.perf_counter() - started

        assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"
        assert "the" in report
feat: text learning pipe 2025-12-27 17:22:17 +01:00			`"""Tests for word_frequency.analyzer module."""`

			`from __future__ import annotations`

			`from collections import Counter`
Add pre-commit workflow and fix linting violations (#2) * Initial plan * Add pre-commit GitHub workflow and fix linting issues - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI - Fixed mypy type errors in translator.py - Fixed shellcheck warning in run_anki_generator.sh - Added per-file ignores for word_frequency module legacy code - Applied auto-fixes from ruff, ruff-format, autoflake, prettier - All pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Make Python scripts with shebangs executable - Set executable bit for word_frequency module scripts with shebangs - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Fix: Restore imports in check functions (autoflake-proof) - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect() - Used _ = module assignment to prevent autoflake from removing imports - These imports test module availability by triggering ImportError if missing - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> 2026-01-07 22:57:42 +01:00			`import time`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`from typing import TYPE_CHECKING`

			`if TYPE_CHECKING:`
			`from pathlib import Path`
feat: text learning pipe 2025-12-27 17:22:17 +01:00
			`import pytest`

			`from python_pkg.word_frequency.analyzer import (`
			`analyze_and_format,`
			`analyze_text,`
			`extract_words,`
			`format_results,`
			`main,`
			`read_file,`
			`read_files,`
			`)`


			`class TestExtractWords:`
			`"""Tests for extract_words function."""`

			`def test_basic_extraction(self) -> None:`
			`"""Test basic word extraction."""`
			`text = "Hello world"`
			`result = extract_words(text)`
			`assert result == ["hello", "world"]`

			`def test_case_insensitive_default(self) -> None:`
			`"""Test that extraction is case-insensitive by default."""`
			`text = "Hello WORLD HeLLo"`
			`result = extract_words(text)`
			`assert result == ["hello", "world", "hello"]`

			`def test_case_sensitive(self) -> None:`
			`"""Test case-sensitive extraction."""`
			`text = "Hello WORLD HeLLo"`
			`result = extract_words(text, case_sensitive=True)`
			`assert result == ["Hello", "WORLD", "HeLLo"]`

			`def test_unicode_words(self) -> None:`
			`"""Test extraction of unicode words (Polish, Latin accents)."""`
			`text = "zażółć gęślą jaźń"`
			`result = extract_words(text)`
			`assert result == ["zażółć", "gęślą", "jaźń"]`

			`def test_punctuation_removal(self) -> None:`
			`"""Test that punctuation is not included in words."""`
			`text = "Hello, world! How are you?"`
			`result = extract_words(text)`
			`assert result == ["hello", "world", "how", "are", "you"]`

			`def test_numbers_included(self) -> None:`
			`"""Test that numbers are included as words."""`
			`text = "There are 123 apples and 456 oranges"`
			`result = extract_words(text)`
			`assert "123" in result`
			`assert "456" in result`

			`def test_empty_string(self) -> None:`
			`"""Test extraction from empty string."""`
			`result = extract_words("")`
			`assert result == []`

			`def test_only_punctuation(self) -> None:`
			`"""Test extraction from string with only punctuation."""`
			`result = extract_words("!@#$%^&*()")`
			`assert result == []`

			`def test_hyphenated_words(self) -> None:`
			`"""Test handling of hyphenated words (split into parts)."""`
			`text = "well-known self-aware"`
			`result = extract_words(text)`
			`# Hyphens act as word boundaries with \b`
			`assert "well" in result`
			`assert "known" in result`


			`class TestAnalyzeText:`
			`"""Tests for analyze_text function."""`

			`def test_basic_counting(self) -> None:`
			`"""Test basic word counting."""`
			`text = "hello world hello"`
			`result = analyze_text(text)`
			`assert result["hello"] == 2`
			`assert result["world"] == 1`

			`def test_case_insensitive_counting(self) -> None:`
			`"""Test case-insensitive counting."""`
			`text = "Hello HELLO hello"`
			`result = analyze_text(text)`
			`assert result["hello"] == 3`

			`def test_case_sensitive_counting(self) -> None:`
			`"""Test case-sensitive counting."""`
			`text = "Hello HELLO hello"`
			`result = analyze_text(text, case_sensitive=True)`
			`assert result["Hello"] == 1`
			`assert result["HELLO"] == 1`
			`assert result["hello"] == 1`

			`def test_returns_counter(self) -> None:`
			`"""Test that result is a Counter object."""`
			`result = analyze_text("test")`
			`assert isinstance(result, Counter)`

			`def test_empty_text(self) -> None:`
			`"""Test analysis of empty text."""`
			`result = analyze_text("")`
			`assert len(result) == 0`


			`class TestReadFile:`
			`"""Tests for read_file function."""`

			`def test_read_existing_file(self, tmp_path: Path) -> None:`
			`"""Test reading an existing file."""`
			`test_file = tmp_path / "test.txt"`
			`test_file.write_text("Hello world", encoding="utf-8")`
			`result = read_file(test_file)`
			`assert result == "Hello world"`

			`def test_read_utf8_content(self, tmp_path: Path) -> None:`
			`"""Test reading UTF-8 content with special characters."""`
			`test_file = tmp_path / "test.txt"`
			`test_file.write_text("zażółć gęślą jaźń", encoding="utf-8")`
			`result = read_file(test_file)`
			`assert result == "zażółć gęślą jaźń"`

			`def test_file_not_found(self) -> None:`
			`"""Test that FileNotFoundError is raised for missing file."""`
			`with pytest.raises(FileNotFoundError):`
			`read_file("/nonexistent/path/file.txt")`


			`class TestReadFiles:`
			`"""Tests for read_files function."""`

			`def test_read_multiple_files(self, tmp_path: Path) -> None:`
			`"""Test reading multiple files."""`
			`file1 = tmp_path / "file1.txt"`
			`file2 = tmp_path / "file2.txt"`
			`file1.write_text("Hello", encoding="utf-8")`
			`file2.write_text("World", encoding="utf-8")`
			`result = read_files([file1, file2])`
			`assert "Hello" in result`
			`assert "World" in result`

			`def test_empty_list(self) -> None:`
			`"""Test reading empty list of files."""`
			`result = read_files([])`
			`assert result == ""`


			`class TestFormatResults:`
			`"""Tests for format_results function."""`

			`def test_basic_formatting(self) -> None:`
			`"""Test basic result formatting."""`
			`counter: Counter[str] = Counter({"hello": 3, "world": 2})`
			`result = format_results(counter)`
			`assert "Total words: 5" in result`
			`assert "Unique words: 2" in result`
			`assert "hello" in result`
			`assert "world" in result`
			`assert "60.00%" in result # hello percentage`
			`assert "40.00%" in result # world percentage`

			`def test_top_n_limit(self) -> None:`
			`"""Test limiting results to top N."""`
			`counter: Counter[str] = Counter({"a": 10, "b": 5, "c": 3, "d": 1})`
			`result = format_results(counter, top_n=2)`
			`assert "a" in result`
			`assert "b" in result`
			`# c and d should not appear in the data rows`
			`lines = result.split("\n")`
			`data_lines = [line for line in lines if line.strip() and "%" in line]`
			`assert len(data_lines) == 2`

			`def test_empty_counter(self) -> None:`
			`"""Test formatting empty counter."""`
			`counter: Counter[str] = Counter()`
			`result = format_results(counter)`
			`assert "No words found" in result`


			`class TestAnalyzeAndFormat:`
			`"""Tests for analyze_and_format function."""`

			`def test_full_pipeline(self) -> None:`
			`"""Test the full analyze and format pipeline."""`
			`text = "hello world hello"`
			`result = analyze_and_format(text)`
			`assert "Total words: 3" in result`
			`assert "hello" in result`
			`assert "66.67%" in result # hello appears 2/3 times`


			`class TestMain:`
			`"""Tests for main CLI function."""`

			`def test_text_input(self, capsys: pytest.CaptureFixture[str]) -> None:`
			`"""Test --text input option."""`
			`exit_code = main(["--text", "hello world hello"])`
			`captured = capsys.readouterr()`
			`assert exit_code == 0`
			`assert "hello" in captured.out`
			`assert "world" in captured.out`

			`def test_file_input(`
			`self, tmp_path: Path, capsys: pytest.CaptureFixture[str]`
			`) -> None:`
			`"""Test --file input option."""`
			`test_file = tmp_path / "test.txt"`
			`test_file.write_text("hello world hello", encoding="utf-8")`
			`exit_code = main(["--file", str(test_file)])`
			`captured = capsys.readouterr()`
			`assert exit_code == 0`
			`assert "hello" in captured.out`

			`def test_files_input(`
			`self, tmp_path: Path, capsys: pytest.CaptureFixture[str]`
			`) -> None:`
			`"""Test --files input option."""`
			`file1 = tmp_path / "file1.txt"`
			`file2 = tmp_path / "file2.txt"`
			`file1.write_text("hello hello", encoding="utf-8")`
			`file2.write_text("world world world", encoding="utf-8")`
			`exit_code = main(["--files", str(file1), str(file2)])`
			`captured = capsys.readouterr()`
			`assert exit_code == 0`
			`assert "hello" in captured.out`
			`assert "world" in captured.out`

			`def test_top_n_option(self, capsys: pytest.CaptureFixture[str]) -> None:`
			`"""Test --top option to limit results."""`
			`exit_code = main(["--text", "a a a b b c d e f g", "--top", "2"])`
			`captured = capsys.readouterr()`
			`assert exit_code == 0`
			`# Count data lines with percentages`
			`lines = [line for line in captured.out.split("\n") if "%" in line]`
			`assert len(lines) == 2`

			`def test_case_sensitive_option(self, capsys: pytest.CaptureFixture[str]) -> None:`
			`"""Test --case-sensitive option."""`
			`exit_code = main(["--text", "Hello HELLO hello", "--case-sensitive"])`
			`captured = capsys.readouterr()`
			`assert exit_code == 0`
			`assert "Unique words: 3" in captured.out`

refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`def test_file_not_found_error(`
			`self, caplog: pytest.LogCaptureFixture`
			`) -> None:`
feat: text learning pipe 2025-12-27 17:22:17 +01:00			`"""Test error handling for missing file."""`
			`exit_code = main(["--file", "/nonexistent/file.txt"])`
			`assert exit_code == 1`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`assert "File not found" in caplog.text`
feat: text learning pipe 2025-12-27 17:22:17 +01:00

			`class TestPerformance:`
			`"""Performance tests for word frequency analyzer."""`

			`def test_large_text_performance(self) -> None:`
			`"""Test that analyzing large text with 10k top words completes in < 10s."""`
			`# Generate a large text with many unique words`
			`# We'll create ~100k words to ensure a good stress test`
			`words = [f"word{i}" for i in range(10000)]`
			`# Repeat each word a varying number of times`
			`text_parts = []`
			`for i, word in enumerate(words):`
			`# More common words appear more often`
			`count = 10000 - i`
			`text_parts.extend([word] * max(1, count // 100))`

			`large_text = " ".join(text_parts)`

			`start_time = time.perf_counter()`
			`result = analyze_and_format(large_text, top_n=10000)`
			`elapsed = time.perf_counter() - start_time`

			`assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"`
			`assert "word0" in result # Most common word should be present`

refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`def test_bible_sized_text_performance(self) -> None:`
feat: text learning pipe 2025-12-27 17:22:17 +01:00			`"""Test with Bible-sized text (~800k words)."""`
			`# Generate text similar in size to the Bible`
			`base_words = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]`
			`text_parts = []`
			`for _ in range(80000): # ~800k words total`
			`text_parts.extend(base_words)`

			`large_text = " ".join(text_parts)`

			`start_time = time.perf_counter()`
			`result = analyze_and_format(large_text, top_n=10000)`
			`elapsed = time.perf_counter() - start_time`

			`assert elapsed < 10.0, f"Analysis took {elapsed:.2f}s, expected < 10s"`
			`assert "the" in result`