testsAndMisc/python_pkg/word_frequency/excerpt_finder.py
Copilot aa5b566ac5 Add pre-commit workflow and fix linting violations (#2)
* Initial plan

* Add pre-commit GitHub workflow and fix linting issues

- Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI
- Fixed mypy type errors in translator.py
- Fixed shellcheck warning in run_anki_generator.sh
- Added per-file ignores for word_frequency module legacy code
- Applied auto-fixes from ruff, ruff-format, autoflake, prettier
- All pre-commit hooks now passing

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

* Make Python scripts with shebangs executable

- Set executable bit for word_frequency module scripts with shebangs
- All 30 pre-commit hooks now passing

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

* Fix: Restore imports in check functions (autoflake-proof)

- Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect()
- Used _ = module assignment to prevent autoflake from removing imports
- These imports test module availability by triggering ImportError if missing
- All 30 pre-commit hooks now passing

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
2026-01-07 22:57:42 +01:00

366 lines
10 KiB
Python
Executable File

#!/usr/bin/env python3
"""Excerpt finder - finds text excerpts where target words are most prevalent.
Given a text and a list of target words, this tool finds the excerpt of a
specified length (in words) where the target words appear most frequently.
Usage:
# From raw text with target words
python -m python_pkg.word_frequency.excerpt_finder --text "they went somewhere he and she and the guy" --words and the --length 3
# From a file
python -m python_pkg.word_frequency.excerpt_finder --file path/to/file.txt --words the and of --length 10
# Target words from a file (one word per line)
python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words-file targets.txt --length 20
# Show top N excerpts instead of just the best one
python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words the and --length 10 --top 5
"""
from __future__ import annotations
import argparse
from pathlib import Path
import sys
from typing import TYPE_CHECKING, NamedTuple
try:
from python_pkg.word_frequency.analyzer import extract_words, read_file
except ModuleNotFoundError:
from analyzer import extract_words, read_file # type: ignore[import-not-found]
if TYPE_CHECKING:
from collections.abc import Sequence
class ExcerptResult(NamedTuple):
    """One ranked excerpt together with its match statistics.

    Instances are plain tuples, so they can be unpacked, compared and
    copied with ``_replace`` like any other ``NamedTuple``.
    """

    # The excerpt's words joined with single spaces.
    excerpt: str
    # The individual words that make up the excerpt.
    words: list[str]
    # Index of the excerpt's first word within the extracted word list.
    start_index: int
    # Index one past the excerpt's last word (exclusive bound).
    end_index: int
    # Number of words in the scored window that are target words.
    match_count: int
    # ``match_count`` expressed as a percentage (0.0 - 100.0).
    match_percentage: float
def find_best_excerpt(
    text: str,
    target_words: Sequence[str],
    excerpt_length: int,
    *,
    case_sensitive: bool = False,
    top_n: int = 1,
) -> list[ExcerptResult]:
    """Return the word windows of ``text`` that contain the most target words.

    Every window of ``excerpt_length`` consecutive words is scored by how
    many of its words belong to ``target_words``. Windows are ranked by
    score (highest first); ties go to the window that starts earliest.
    Windows whose word sequence duplicates an already selected one are
    skipped.

    Args:
        text: The input text to search.
        target_words: Words to search for in the excerpt.
        excerpt_length: Length of the excerpt in words.
        case_sensitive: If False, target words are lowercased before
            matching.
        top_n: Number of top excerpts to return.

    Returns:
        Up to ``top_n`` ExcerptResult objects, best first. The list is empty
        when ``excerpt_length`` is not positive or the text has fewer words
        than ``excerpt_length``.
    """
    if excerpt_length <= 0:
        return []

    # NOTE(review): presumably extract_words lowercases tokens when
    # case_sensitive is False -- confirm against the analyzer module.
    tokens = extract_words(text, case_sensitive=case_sensitive)
    has_enough_words = bool(tokens) and len(tokens) >= excerpt_length
    if not has_enough_words:
        return []

    wanted = (
        set(target_words)
        if case_sensitive
        else {target.lower() for target in target_words}
    )

    # One membership flag per token, so the sliding window only needs
    # integer arithmetic.
    hit_flags = [token in wanted for token in tokens]

    # window_counts[s] is the number of target words in tokens[s:s + length].
    running = sum(hit_flags[:excerpt_length])
    window_counts = [running]
    for leaving, entering in zip(hit_flags, hit_flags[excerpt_length:]):
        running += entering - leaving
        window_counts.append(running)

    # Highest count first; among equal counts the earliest start wins.
    ranked_starts = sorted(
        range(len(window_counts)),
        key=lambda start: (-window_counts[start], start),
    )

    selected: list[ExcerptResult] = []
    already_emitted: set[tuple[str, ...]] = set()
    for start in ranked_starts:
        if len(selected) >= top_n:
            break

        stop = start + excerpt_length
        window = tokens[start:stop]
        fingerprint = tuple(window)
        if fingerprint in already_emitted:
            # Same word sequence as a better-or-equal ranked window.
            continue
        already_emitted.add(fingerprint)

        hits = window_counts[start]
        selected.append(
            ExcerptResult(
                excerpt=" ".join(window),
                words=list(window),
                start_index=start,
                end_index=stop,
                match_count=hits,
                match_percentage=(hits / excerpt_length) * 100,
            )
        )
    return selected
def find_best_excerpt_with_context(
    text: str,
    target_words: Sequence[str],
    excerpt_length: int,
    *,
    case_sensitive: bool = False,
    top_n: int = 1,
    context_words: int = 0,
) -> list[ExcerptResult]:
    """Find the excerpt(s) with optional surrounding context.

    The best windows are selected by ``find_best_excerpt``. When
    ``context_words`` is positive, each window is widened by up to that many
    words on both sides (clamped to the bounds of the text).

    For widened results, ``match_count`` and ``match_percentage`` are
    recomputed over the returned span, so that every field of the result
    describes the same words and
    ``match_percentage == match_count / len(words) * 100`` always holds.

    Args:
        text: The input text to search.
        target_words: Words to search for in the excerpt.
        excerpt_length: Length of the excerpt in words.
        case_sensitive: If False, match words case-insensitively.
        top_n: Number of top excerpts to return.
        context_words: Number of words to include before/after the excerpt.

    Returns:
        List of ExcerptResult with context included in the excerpt. When
        ``context_words`` is not positive the results of
        ``find_best_excerpt`` are returned unchanged.
    """
    base_results = find_best_excerpt(
        text,
        target_words,
        excerpt_length,
        case_sensitive=case_sensitive,
        top_n=top_n,
    )
    if context_words <= 0 or not base_results:
        return base_results

    # Re-extract all words to get context.
    all_words = extract_words(text, case_sensitive=case_sensitive)

    # Same target normalization as find_best_excerpt, so recounted matches
    # agree with the counts used for ranking.
    if case_sensitive:
        target_set = set(target_words)
    else:
        target_set = {w.lower() for w in target_words}

    expanded_results: list[ExcerptResult] = []
    for result in base_results:
        # Widen the window, clamping to the start and end of the text.
        ctx_start = max(0, result.start_index - context_words)
        ctx_end = min(len(all_words), result.end_index + context_words)
        span = list(all_words[ctx_start:ctx_end])

        # Recount over the widened span; keeping the core window's numbers
        # would report e.g. "2/7 (66.67%)" once formatted.
        hits = sum(1 for w in span if w in target_set)
        expanded_results.append(
            ExcerptResult(
                excerpt=" ".join(span),
                words=span,
                start_index=ctx_start,
                end_index=ctx_end,
                match_count=hits,
                # span always contains the non-empty core window.
                match_percentage=(hits / len(span)) * 100,
            )
        )
    return expanded_results
def format_excerpt_results(
    results: list[ExcerptResult],
    target_words: Sequence[str],
) -> str:
    """Render excerpt results as a human-readable, multi-line report.

    The report starts with the list of target words, followed by one
    section per result (excerpt text, inclusive word positions and match
    statistics). Sections get a numbered heading only when there is more
    than one result.

    Args:
        results: List of ExcerptResult to format.
        target_words: The target words that were searched for.

    Returns:
        The formatted report, or a fixed "no excerpts" message when
        ``results`` is empty.
    """
    if not results:
        return "No excerpts found."

    show_headings = len(results) > 1
    report: list[str] = [f"Target words: {', '.join(target_words)}", ""]

    for rank, item in enumerate(results, start=1):
        if show_headings:
            report.append(f"=== Result #{rank} ===")
        # end_index is exclusive; show the position of the last word instead.
        last_position = item.end_index - 1
        report.extend(
            [
                f'Excerpt: "{item.excerpt}"',
                f"Word position: {item.start_index} - {last_position}",
                f"Matches: {item.match_count}/{len(item.words)} ({item.match_percentage:.2f}%)",
                "",
            ]
        )

    return "\n".join(report)
def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for the excerpt finder."""
    parser = argparse.ArgumentParser(
        description="Find text excerpts where target words are most prevalent.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source: exactly one of --text / --file.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to search",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to search",
    )

    # Target words source: exactly one of --words / --words-file.
    words_group = parser.add_mutually_exclusive_group(required=True)
    words_group.add_argument(
        "--words",
        "-w",
        nargs="+",
        type=str,
        help="Target words to find",
    )
    words_group.add_argument(
        "--words-file",
        "-W",
        type=str,
        help="Path to file with target words (one per line)",
    )

    # Excerpt parameters.
    parser.add_argument(
        "--length",
        "-l",
        type=int,
        required=True,
        help="Length of excerpt in words",
    )
    parser.add_argument(
        "--top",
        "-n",
        type=int,
        default=1,
        help="Show top N excerpts (default: 1)",
    )
    parser.add_argument(
        "--context",
        "-c",
        type=int,
        default=0,
        help="Number of context words before/after excerpt",
    )
    parser.add_argument(
        "--case-sensitive",
        "-s",
        action="store_true",
        help="Match words case-sensitively",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )
    return parser


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the excerpt finder.

    Parses the command line, loads the text and the target words, searches
    for the best excerpt(s) and prints the formatted report or writes it to
    the ``--output`` file.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code: 0 for success; 1 when no target words were provided, a
        file was not found, or a file could not be decoded as UTF-8.
    """
    args = _build_parser().parse_args(argv)

    try:
        # Compare against None rather than relying on truthiness: an
        # explicitly supplied empty string (--text "") is still "provided"
        # and must not fall through to read_file(None).
        if args.text is not None:
            text = args.text
        else:
            text = read_file(args.file)

        if args.words is not None:
            target_words = args.words
        else:
            words_content = read_file(args.words_file)
            # One target word per line; blank lines are ignored.
            target_words = [w.strip() for w in words_content.splitlines() if w.strip()]

        if not target_words:
            print("Error: No target words provided", file=sys.stderr)
            return 1

        results = find_best_excerpt_with_context(
            text,
            target_words,
            args.length,
            case_sensitive=args.case_sensitive,
            top_n=args.top,
            context_words=args.context,
        )

        output = format_excerpt_results(results, target_words)
        if args.output:
            Path(args.output).write_text(output, encoding="utf-8")
            print(f"Output written to {args.output}")
        else:
            print(output)
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)
        return 1
    except UnicodeDecodeError as e:
        print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
        return 1

    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())