mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 14:23:04 +02:00
- Add comprehensive tests for all packages (3572 tests, 100% branch coverage) - Split oversized test files to stay under 500-line limit - Add per-file ruff ignores for test-appropriate suppressions - Fix _cache_decks.py to properly convert JSON lists to tuples - Add session-scoped conftest fixture for logging handler cleanup (Python 3.14) - Update ruff pre-commit hook to v0.15.2 - Add codespell ignore words for test data - Add generated output files to .gitignore
493 lines
16 KiB
Python
493 lines
16 KiB
Python
"""Tests for word_frequency.excerpt_finder module."""

from __future__ import annotations

import logging
import time
from typing import TYPE_CHECKING

import pytest

if TYPE_CHECKING:
    # Path appears only in fixture annotations; with
    # `from __future__ import annotations` those are never evaluated at runtime.
    from pathlib import Path

from python_pkg.word_frequency.excerpt_finder import (
    ExcerptResult,
    ExcerptSearchOptions,
    find_best_excerpt,
    find_best_excerpt_with_context,
    format_excerpt_results,
    main,
)


class TestFindBestExcerpt:
    """Behavioural checks for find_best_excerpt."""

    def test_basic_example(self) -> None:
        """Reproduce the sample sentence from the user request."""
        sentence = "they went somewhere he and she and the guy"

        found = find_best_excerpt(sentence, ["and", "the"], excerpt_length=3)

        assert len(found) == 1
        best = found[0]
        # The winning 3-word window holds two target words: 2/3 = 66.67%.
        assert best.match_count == 2
        assert best.match_percentage == pytest.approx(66.67, rel=0.01)

    def test_all_matching_words(self) -> None:
        """Every word of the best window is a target word."""
        sentence = "the and the and the"

        found = find_best_excerpt(sentence, ["the", "and"], excerpt_length=3)

        assert len(found) == 1
        best = found[0]
        assert best.match_count == 3
        assert best.match_percentage == 100.0

    def test_no_matching_words(self) -> None:
        """A result is still returned when nothing matches, with zero score."""
        sentence = "hello world foo bar"

        found = find_best_excerpt(sentence, ["xyz", "abc"], excerpt_length=2)

        assert len(found) == 1
        best = found[0]
        assert best.match_count == 0
        assert best.match_percentage == 0.0

    def test_top_n_results(self) -> None:
        """Requesting several results yields them ordered by match count."""
        sentence = "they went somewhere he and she and the guy"

        found = find_best_excerpt(sentence, ["and", "the"], excerpt_length=3, top_n=5)

        # More than a couple of windows must come back.
        assert len(found) >= 3
        # Match counts must never increase from one result to the next.
        counts = [hit.match_count for hit in found]
        assert counts == sorted(counts, reverse=True)

    def test_case_insensitive_default(self) -> None:
        """Letter case is ignored unless case sensitivity is requested."""
        sentence = "THE And THE and THE"

        found = find_best_excerpt(sentence, ["the", "AND"], excerpt_length=3)

        assert found[0].match_count == 3

    def test_case_sensitive(self) -> None:
        """With case_sensitive=True, differently-cased words do not match."""
        found = find_best_excerpt(
            "THE And THE and THE",
            ["the", "and"],
            excerpt_length=3,
            case_sensitive=True,
        )

        # "THE" must not match "the" and "And" must not match "and"; only the
        # lowercase "and" can count, so no window is a full 3-word match.
        assert found[0].match_count < 3

    def test_empty_text(self) -> None:
        """Empty input text produces no results."""
        assert find_best_excerpt("", ["the"], excerpt_length=3) == []

    def test_text_shorter_than_excerpt(self) -> None:
        """A text with fewer words than the excerpt length produces no results."""
        assert find_best_excerpt("hello world", ["hello"], excerpt_length=5) == []

    def test_zero_excerpt_length(self) -> None:
        """An excerpt length of zero produces no results."""
        assert find_best_excerpt("hello world", ["hello"], excerpt_length=0) == []

    def test_negative_excerpt_length(self) -> None:
        """A negative excerpt length produces no results."""
        assert find_best_excerpt("hello world", ["hello"], excerpt_length=-1) == []

    def test_excerpt_at_text_boundaries(self) -> None:
        """A window that starts at the very first word is among the results."""
        sentence = "the the the middle words here end end end"

        found = find_best_excerpt(sentence, ["the"], excerpt_length=3, top_n=10)

        # The leading "the the the" window must be reported.
        assert any(hit.excerpt == "the the the" for hit in found)

    def test_unicode_words(self) -> None:
        """Polish words with diacritics are matched like any other word."""
        sentence = "zażółć gęślą jaźń i w się nie"

        found = find_best_excerpt(sentence, ["zażółć", "jaźń"], excerpt_length=3)

        assert len(found) == 1
        # The window "zażółć gęślą jaźń" contains both targets.
        assert found[0].match_count == 2

    def test_result_structure(self) -> None:
        """Each result is an ExcerptResult whose fields have the expected types."""
        found = find_best_excerpt("hello world test", ["hello"], excerpt_length=2)

        assert len(found) == 1
        best = found[0]
        assert isinstance(best, ExcerptResult)
        field_types = (
            ("excerpt", str),
            ("words", list),
            ("start_index", int),
            ("end_index", int),
            ("match_count", int),
            ("match_percentage", float),
        )
        for field_name, expected_type in field_types:
            assert isinstance(getattr(best, field_name), expected_type)

    def test_word_indices(self) -> None:
        """start_index and end_index bracket the excerpt as a half-open range."""
        best = find_best_excerpt("a b c d e", ["c"], excerpt_length=1)[0]

        # "c" is the word at index 2, so the window is [2, 3).
        assert best.start_index == 2
        assert best.end_index == 3
        assert best.excerpt == "c"


class TestFindBestExcerptWithContext:
    """Behavioural checks for find_best_excerpt_with_context."""

    def test_no_context(self) -> None:
        """Zero context words should behave like find_best_excerpt."""
        no_context = ExcerptSearchOptions(context_words=0)

        found = find_best_excerpt_with_context(
            "a b c d e f g", ["c"], excerpt_length=1, options=no_context
        )

        assert found[0].excerpt == "c"

    def test_with_context(self) -> None:
        """Context words widen the excerpt on both sides of the match."""
        two_each_side = ExcerptSearchOptions(context_words=2)

        found = find_best_excerpt_with_context(
            "a b c d e f g", ["d"], excerpt_length=1, options=two_each_side
        )

        # "d" sits at index 3; two words before and two after give
        # "b c d e f", i.e. five words in total.
        widened = found[0]
        assert "d" in widened.excerpt
        assert len(widened.words) == 5

    def test_context_at_start(self) -> None:
        """Context is clamped so it never reaches before the first word."""
        wide = ExcerptSearchOptions(context_words=3)

        found = find_best_excerpt_with_context(
            "a b c d e", ["a"], excerpt_length=1, options=wide
        )

        # Nothing precedes "a", so the excerpt still starts with it.
        widened = found[0]
        assert widened.start_index == 0
        assert widened.words[0] == "a"

    def test_context_at_end(self) -> None:
        """Context is clamped so it never reaches past the last word."""
        wide = ExcerptSearchOptions(context_words=3)

        found = find_best_excerpt_with_context(
            "a b c d e", ["e"], excerpt_length=1, options=wide
        )

        # Nothing follows "e", so the excerpt still ends with it.
        assert found[0].words[-1] == "e"


class TestFormatExcerptResults:
    """Behavioural checks for format_excerpt_results."""

    def test_single_result(self) -> None:
        """One result is rendered with its text and a two-decimal percentage."""
        only_hit = ExcerptResult(
            excerpt="hello world",
            words=["hello", "world"],
            start_index=0,
            end_index=2,
            match_count=1,
            match_percentage=50.0,
        )

        rendered = format_excerpt_results([only_hit], ["hello"])

        for fragment in ("hello", "50.00%", "hello world"):
            assert fragment in rendered

    def test_multiple_results(self) -> None:
        """Several results are each rendered under a numbered heading."""
        full_match = ExcerptResult(
            excerpt="a b",
            words=["a", "b"],
            start_index=0,
            end_index=2,
            match_count=2,
            match_percentage=100.0,
        )
        half_match = ExcerptResult(
            excerpt="c d",
            words=["c", "d"],
            start_index=2,
            end_index=4,
            match_count=1,
            match_percentage=50.0,
        )

        rendered = format_excerpt_results([full_match, half_match], ["a", "b"])

        for heading in ("Result #1", "Result #2"):
            assert heading in rendered

    def test_empty_results(self) -> None:
        """An empty result list is rendered as a 'nothing found' message."""
        assert "No excerpts found" in format_excerpt_results([], ["hello"])


class TestMain:
    """Checks of the main() command-line entry point."""

    def test_text_and_words_input(self, caplog: pytest.LogCaptureFixture) -> None:
        """Inline text via --text combined with --words succeeds."""
        argv = ["--text", "hello world hello", "--words", "hello", "--length", "2"]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        assert status == 0
        assert "hello" in caplog.text

    def test_file_input(self, tmp_path: Path, caplog: pytest.LogCaptureFixture) -> None:
        """Text read from a file via --file succeeds."""
        source = tmp_path / "test.txt"
        source.write_text("hello world hello world", encoding="utf-8")
        argv = ["--file", str(source), "--words", "hello", "--length", "2"]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        assert status == 0
        assert "hello" in caplog.text

    def test_words_file_input(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Target words read from a file via --words-file are honoured."""
        source = tmp_path / "text.txt"
        targets = tmp_path / "words.txt"
        source.write_text("hello world hello world", encoding="utf-8")
        targets.write_text("hello\nworld\n", encoding="utf-8")
        argv = [
            "--file",
            str(source),
            "--words-file",
            str(targets),
            "--length",
            "2",
        ]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        assert status == 0
        # Both words of the 2-word window are targets.
        assert "100.00%" in caplog.text

    def test_top_option(self, caplog: pytest.LogCaptureFixture) -> None:
        """The --top option is accepted and numbered results are logged."""
        argv = [
            "--text",
            "a b c d e f",
            "--words",
            "a",
            "b",
            "--length",
            "2",
            "--top",
            "3",
        ]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        assert status == 0
        # At least the first numbered result must be present.
        assert "Result #1" in caplog.text

    def test_context_option(self, caplog: pytest.LogCaptureFixture) -> None:
        """The --context option is accepted."""
        argv = [
            "--text",
            "a b c d e f g",
            "--words",
            "d",
            "--length",
            "1",
            "--context",
            "2",
        ]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        # Only the exit status is checked; the excerpt is expected to
        # include the surrounding context words.
        assert status == 0

    def test_case_sensitive_option(self, caplog: pytest.LogCaptureFixture) -> None:
        """The --case-sensitive flag is accepted."""
        argv = [
            "--text",
            "Hello HELLO hello",
            "--words",
            "hello",
            "--length",
            "1",
            "--case-sensitive",
        ]

        with caplog.at_level(logging.INFO):
            status = main(argv)

        # Only the exit status is checked; just the lowercase "hello" is
        # expected to match.
        assert status == 0

    def test_file_not_found(self, caplog: pytest.LogCaptureFixture) -> None:
        """A missing input file yields exit code 1 and a logged error."""
        argv = ["--file", "/nonexistent/file.txt", "--words", "hello", "--length", "2"]

        with caplog.at_level(logging.ERROR):
            status = main(argv)

        assert status == 1
        assert "Error" in caplog.text

    def test_empty_words_file(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """An empty words file yields exit code 1 and a 'no target words' error."""
        source = tmp_path / "text.txt"
        targets = tmp_path / "words.txt"
        source.write_text("hello world", encoding="utf-8")
        targets.write_text("", encoding="utf-8")
        argv = [
            "--file",
            str(source),
            "--words-file",
            str(targets),
            "--length",
            "2",
        ]

        with caplog.at_level(logging.ERROR):
            status = main(argv)

        assert status == 1
        assert "No target words" in caplog.text

    def test_output_to_file(self, tmp_path: Path) -> None:
        """The --output option writes the report to the given file."""
        destination = tmp_path / "result.txt"
        argv = [
            "--text",
            "hello world hello",
            "--words",
            "hello",
            "--length",
            "2",
            "--output",
            str(destination),
        ]

        status = main(argv)

        assert status == 0
        assert destination.exists()
        assert "hello" in destination.read_text(encoding="utf-8")

    def test_unicode_decode_error(
        self, tmp_path: Path, caplog: pytest.LogCaptureFixture
    ) -> None:
        """A UnicodeDecodeError raised while reading yields exit code 1."""
        from unittest.mock import patch

        undecodable = tmp_path / "bad.txt"
        undecodable.write_bytes(b"\x80\x81")
        argv = ["--file", str(undecodable), "--words", "hello", "--length", "2"]
        # read_file is patched so the decode failure is raised regardless of
        # how the real reader would treat the bytes written above.
        decode_failure = UnicodeDecodeError("utf-8", b"", 0, 1, "bad")

        with (
            caplog.at_level(logging.ERROR),
            patch(
                "python_pkg.word_frequency.excerpt_finder.read_file",
                side_effect=decode_failure,
            ),
        ):
            status = main(argv)

        assert status == 1

    def test_duplicate_excerpt_skipped(self) -> None:
        """Identical excerpt texts are reported only once."""
        # Every 2-word window of this text reads "a a".
        found = find_best_excerpt("a a a a a", ["a"], excerpt_length=2, top_n=10)

        # No excerpt text may appear more than once in the results.
        texts = [hit.excerpt for hit in found]
        assert len(texts) == len(set(texts))


class TestPerformance:
    """Wall-clock budget checks for find_best_excerpt on large inputs."""

    def test_large_text_performance(self) -> None:
        """Search a roughly 100k-word text within five seconds."""
        vocabulary = ["the", "and", "of", "to", "in", "a", "that", "is", "was", "for"]
        # 10 words repeated 10,000 times -> 100,000 words.
        haystack = " ".join(vocabulary * 10000)
        targets = ["the", "and", "of"]

        began = time.perf_counter()
        found = find_best_excerpt(haystack, targets, excerpt_length=100, top_n=10)
        elapsed = time.perf_counter() - began

        assert elapsed < 5.0, f"Search took {elapsed:.2f}s, expected < 5s"
        assert len(found) > 0

    def test_many_target_words_performance(self) -> None:
        """Search with 500 distinct target words within ten seconds."""
        # 1,000 distinct words repeated 100 times.
        haystack = " ".join([f"word{i}" for i in range(1000)] * 100)
        targets = [f"word{i}" for i in range(500)]

        began = time.perf_counter()
        found = find_best_excerpt(haystack, targets, excerpt_length=50, top_n=5)
        elapsed = time.perf_counter() - began

        assert elapsed < 10.0, f"Search took {elapsed:.2f}s, expected < 10s"
        assert len(found) > 0

    def test_long_excerpt_performance(self) -> None:
        """Search with a 1,000-word excerpt window within five seconds."""
        haystack = " ".join(["a", "b", "c", "d", "e"] * 10000)

        began = time.perf_counter()
        found = find_best_excerpt(haystack, ["a", "b"], excerpt_length=1000, top_n=5)
        elapsed = time.perf_counter() - began

        assert elapsed < 5.0, f"Search took {elapsed:.2f}s, expected < 5s"
        assert len(found) > 0