testsAndMisc-archive/python_pkg/word_frequency/analyzer.py

261 lines
6.8 KiB
Python
Raw Normal View History

2025-12-27 17:22:17 +01:00
#!/usr/bin/env python3
"""Word frequency analyzer - analyzes text and produces word usage statistics.
Usage:
# From raw text
python -m python_pkg.word_frequency.analyzer --text "Hello world hello"
# From a single file
python -m python_pkg.word_frequency.analyzer --file path/to/file.txt
# From multiple files
python -m python_pkg.word_frequency.analyzer --files file1.txt file2.txt file3.txt
# Limit output to top N words
python -m python_pkg.word_frequency.analyzer --file text.txt --top 20
# Case-sensitive mode
python -m python_pkg.word_frequency.analyzer --file text.txt --case-sensitive
"""
from __future__ import annotations
import argparse
import re
import sys
from collections import Counter
from pathlib import Path
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from collections.abc import Sequence
def extract_words(text: str, *, case_sensitive: bool = False) -> list[str]:
"""Extract words from text.
Args:
text: The input text to extract words from.
case_sensitive: If False, convert all words to lowercase.
Returns:
List of words found in the text.
"""
# Match word characters including unicode letters (for Polish, Latin, etc.)
words = re.findall(r"\b[\w]+\b", text, re.UNICODE)
if not case_sensitive:
words = [word.lower() for word in words]
return words
def analyze_text(text: str, *, case_sensitive: bool = False) -> Counter[str]:
"""Analyze text and return word counts.
Args:
text: The input text to analyze.
case_sensitive: If False, treat words case-insensitively.
Returns:
Counter object with word frequencies.
"""
words = extract_words(text, case_sensitive=case_sensitive)
return Counter(words)
def read_file(filepath: str | Path) -> str:
"""Read text content from a file.
Args:
filepath: Path to the file to read.
Returns:
The text content of the file.
Raises:
FileNotFoundError: If the file doesn't exist.
UnicodeDecodeError: If the file can't be decoded as UTF-8.
"""
path = Path(filepath)
return path.read_text(encoding="utf-8")
def read_files(filepaths: Sequence[str | Path]) -> str:
"""Read and concatenate text content from multiple files.
Args:
filepaths: Sequence of paths to files to read.
Returns:
Combined text content of all files.
"""
texts = []
for filepath in filepaths:
texts.append(read_file(filepath))
return "\n".join(texts)
def format_results(
word_counts: Counter[str],
*,
top_n: int | None = None,
) -> str:
"""Format word frequency results as a table.
Args:
word_counts: Counter object with word frequencies.
top_n: If provided, only show the top N words.
Returns:
Formatted string table with results.
"""
total_words = sum(word_counts.values())
if total_words == 0:
return "No words found in input."
# Get items sorted by frequency
if top_n is not None:
items = word_counts.most_common(top_n)
else:
items = word_counts.most_common()
# Find the maximum width for the word column
max_word_len = max(len(word) for word, _ in items) if items else 4
max_word_len = max(max_word_len, 4) # Minimum width for "Word" header
# Find the maximum width for the count column
max_count = max(count for _, count in items) if items else 0
count_width = max(len(str(max_count)), 5) # Minimum width for "Count" header
# Build the table
lines = []
lines.append(f"Total words: {total_words}")
lines.append(f"Unique words: {len(word_counts)}")
lines.append("")
# Header
header = f"{'Word':<{max_word_len}} {'Count':>{count_width}} {'Percentage':>10}"
lines.append(header)
lines.append("-" * len(header))
# Data rows
for word, count in items:
percentage = (count / total_words) * 100
lines.append(f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%")
return "\n".join(lines)
def analyze_and_format(
text: str,
*,
case_sensitive: bool = False,
top_n: int | None = None,
) -> str:
"""Analyze text and return formatted results.
Args:
text: The input text to analyze.
case_sensitive: If False, treat words case-insensitively.
top_n: If provided, only show the top N words.
Returns:
Formatted string with word frequency analysis.
"""
word_counts = analyze_text(text, case_sensitive=case_sensitive)
return format_results(word_counts, top_n=top_n)
def main(argv: Sequence[str] | None = None) -> int:
"""Main entry point for the word frequency analyzer.
Args:
argv: Command line arguments (defaults to sys.argv[1:]).
Returns:
Exit code (0 for success, non-zero for errors).
"""
parser = argparse.ArgumentParser(
description="Analyze word frequency in text.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
input_group = parser.add_mutually_exclusive_group(required=True)
input_group.add_argument(
"--text",
"-t",
type=str,
help="Raw text to analyze",
)
input_group.add_argument(
"--file",
"-f",
type=str,
help="Path to a file to analyze",
)
input_group.add_argument(
"--files",
"-F",
nargs="+",
type=str,
help="Paths to multiple files to analyze",
)
parser.add_argument(
"--top",
"-n",
type=int,
default=None,
help="Show only the top N most frequent words",
)
parser.add_argument(
"--case-sensitive",
"-c",
action="store_true",
help="Treat words case-sensitively (default: case-insensitive)",
)
parser.add_argument(
"--output",
"-o",
type=str,
help="Output file path (default: print to stdout)",
)
args = parser.parse_args(argv)
try:
if args.text:
text = args.text
elif args.file:
text = read_file(args.file)
else: # args.files
text = read_files(args.files)
result = analyze_and_format(
text,
case_sensitive=args.case_sensitive,
top_n=args.top,
)
if args.output:
Path(args.output).write_text(result, encoding="utf-8")
print(f"Output written to {args.output}") # noqa: T201
else:
print(result) # noqa: T201
except FileNotFoundError as e:
print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201
return 1
except UnicodeDecodeError as e:
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201
return 1
return 0
if __name__ == "__main__":
sys.exit(main())