#!/usr/bin/env python3 """Excerpt finder - finds text excerpts where target words are most prevalent. Given a text and a list of target words, this tool finds the excerpt of a specified length (in words) where the target words appear most frequently. Usage: # From raw text with target words python -m python_pkg.word_frequency.excerpt_finder --text "they went somewhere he and she and the guy" --words and the --length 3 # From a file python -m python_pkg.word_frequency.excerpt_finder --file path/to/file.txt --words the and of --length 10 # Target words from a file (one word per line) python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words-file targets.txt --length 20 # Show top N excerpts instead of just the best one python -m python_pkg.word_frequency.excerpt_finder --file text.txt --words the and --length 10 --top 5 """ from __future__ import annotations import argparse import sys from pathlib import Path from typing import TYPE_CHECKING, NamedTuple try: from python_pkg.word_frequency.analyzer import extract_words, read_file except ModuleNotFoundError: from analyzer import extract_words, read_file # type: ignore[import-not-found] if TYPE_CHECKING: from collections.abc import Sequence class ExcerptResult(NamedTuple): """Result of an excerpt search.""" excerpt: str words: list[str] start_index: int end_index: int match_count: int match_percentage: float def find_best_excerpt( text: str, target_words: Sequence[str], excerpt_length: int, *, case_sensitive: bool = False, top_n: int = 1, ) -> list[ExcerptResult]: """Find the excerpt(s) where target words are most prevalent. Args: text: The input text to search. target_words: Words to search for in the excerpt. excerpt_length: Length of the excerpt in words. case_sensitive: If False, match words case-insensitively. top_n: Number of top excerpts to return. Returns: List of ExcerptResult with the best excerpt(s) found. """ if excerpt_length <= 0: return [] # Extract words with positions preserved words = extract_words(text, case_sensitive=case_sensitive) if not words or len(words) < excerpt_length: return [] # Normalize target words for matching if case_sensitive: target_set = set(target_words) else: target_set = {w.lower() for w in target_words} # Use sliding window to find the best excerpt results: list[tuple[int, int, float, int]] = [] # (match_count, -start, percentage, start) # Count matches in first window current_matches = sum(1 for w in words[:excerpt_length] if w in target_set) # Store first window result percentage = (current_matches / excerpt_length) * 100 results.append((current_matches, 0, percentage, 0)) # Slide the window for i in range(1, len(words) - excerpt_length + 1): # Remove the word leaving the window leaving_word = words[i - 1] if leaving_word in target_set: current_matches -= 1 # Add the word entering the window entering_word = words[i + excerpt_length - 1] if entering_word in target_set: current_matches += 1 percentage = (current_matches / excerpt_length) * 100 results.append((current_matches, -i, percentage, i)) # Sort by match count (desc), then by position (asc for tie-breaking) results.sort(key=lambda x: (x[0], x[1]), reverse=True) # Build ExcerptResult objects for top N output: list[ExcerptResult] = [] seen_excerpts: set[tuple[str, ...]] = set() for match_count, _, percentage, start_idx in results: if len(output) >= top_n: break end_idx = start_idx + excerpt_length excerpt_words = words[start_idx:end_idx] excerpt_tuple = tuple(excerpt_words) # Skip duplicate excerpts if excerpt_tuple in seen_excerpts: continue seen_excerpts.add(excerpt_tuple) output.append( ExcerptResult( excerpt=" ".join(excerpt_words), words=list(excerpt_words), start_index=start_idx, end_index=end_idx, match_count=match_count, match_percentage=percentage, ) ) return output def find_best_excerpt_with_context( text: str, target_words: Sequence[str], excerpt_length: int, *, case_sensitive: bool = False, top_n: int = 1, context_words: int = 0, ) -> list[ExcerptResult]: """Find the excerpt(s) with optional surrounding context. Args: text: The input text to search. target_words: Words to search for in the excerpt. excerpt_length: Length of the excerpt in words. case_sensitive: If False, match words case-insensitively. top_n: Number of top excerpts to return. context_words: Number of words to include before/after the excerpt. Returns: List of ExcerptResult with context included in the excerpt. """ base_results = find_best_excerpt( text, target_words, excerpt_length, case_sensitive=case_sensitive, top_n=top_n, ) if context_words <= 0: return base_results # Re-extract all words to get context all_words = extract_words(text, case_sensitive=case_sensitive) expanded_results: list[ExcerptResult] = [] for result in base_results: # Expand the excerpt with context ctx_start = max(0, result.start_index - context_words) ctx_end = min(len(all_words), result.end_index + context_words) context_excerpt_words = all_words[ctx_start:ctx_end] expanded_results.append( ExcerptResult( excerpt=" ".join(context_excerpt_words), words=context_excerpt_words, start_index=ctx_start, end_index=ctx_end, match_count=result.match_count, match_percentage=result.match_percentage, ) ) return expanded_results def format_excerpt_results( results: list[ExcerptResult], target_words: Sequence[str], ) -> str: """Format excerpt results for display. Args: results: List of ExcerptResult to format. target_words: The target words that were searched for. Returns: Formatted string with results. """ if not results: return "No excerpts found." lines: list[str] = [] lines.append(f"Target words: {', '.join(target_words)}") lines.append("") for i, result in enumerate(results, 1): if len(results) > 1: lines.append(f"=== Result #{i} ===") lines.append(f"Excerpt: \"{result.excerpt}\"") lines.append(f"Word position: {result.start_index} - {result.end_index - 1}") lines.append(f"Matches: {result.match_count}/{len(result.words)} ({result.match_percentage:.2f}%)") lines.append("") return "\n".join(lines) def main(argv: Sequence[str] | None = None) -> int: """Main entry point for the excerpt finder. Args: argv: Command line arguments (defaults to sys.argv[1:]). Returns: Exit code (0 for success, non-zero for errors). """ parser = argparse.ArgumentParser( description="Find text excerpts where target words are most prevalent.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) # Input source input_group = parser.add_mutually_exclusive_group(required=True) input_group.add_argument( "--text", "-t", type=str, help="Raw text to search", ) input_group.add_argument( "--file", "-f", type=str, help="Path to a file to search", ) # Target words source words_group = parser.add_mutually_exclusive_group(required=True) words_group.add_argument( "--words", "-w", nargs="+", type=str, help="Target words to find", ) words_group.add_argument( "--words-file", "-W", type=str, help="Path to file with target words (one per line)", ) # Excerpt parameters parser.add_argument( "--length", "-l", type=int, required=True, help="Length of excerpt in words", ) parser.add_argument( "--top", "-n", type=int, default=1, help="Show top N excerpts (default: 1)", ) parser.add_argument( "--context", "-c", type=int, default=0, help="Number of context words before/after excerpt", ) parser.add_argument( "--case-sensitive", "-s", action="store_true", help="Match words case-sensitively", ) parser.add_argument( "--output", "-o", type=str, help="Output file path (default: print to stdout)", ) args = parser.parse_args(argv) try: # Get input text if args.text: text = args.text else: text = read_file(args.file) # Get target words if args.words: target_words = args.words else: words_content = read_file(args.words_file) target_words = [w.strip() for w in words_content.splitlines() if w.strip()] if not target_words: print("Error: No target words provided", file=sys.stderr) # noqa: T201 return 1 # Find excerpts results = find_best_excerpt_with_context( text, target_words, args.length, case_sensitive=args.case_sensitive, top_n=args.top, context_words=args.context, ) # Format and print results output = format_excerpt_results(results, target_words) if args.output: Path(args.output).write_text(output, encoding="utf-8") print(f"Output written to {args.output}") # noqa: T201 else: print(output) # noqa: T201 except FileNotFoundError as e: print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201 return 1 except UnicodeDecodeError as e: print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201 return 1 return 0 if __name__ == "__main__": sys.exit(main())