mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 21:03:15 +02:00
- Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters
264 lines
6.8 KiB
Python
Executable File
264 lines
6.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Word frequency analyzer - analyzes text and produces word usage statistics.
|
|
|
|
Usage:
|
|
# From raw text
|
|
python -m python_pkg.word_frequency.analyzer --text "Hello world hello"
|
|
|
|
# From a single file
|
|
python -m python_pkg.word_frequency.analyzer --file path/to/file.txt
|
|
|
|
# From multiple files
|
|
python -m python_pkg.word_frequency.analyzer --files file1.txt file2.txt file3.txt
|
|
|
|
# Limit output to top N words
|
|
python -m python_pkg.word_frequency.analyzer --file text.txt --top 20
|
|
|
|
# Case-sensitive mode
|
|
python -m python_pkg.word_frequency.analyzer --file text.txt --case-sensitive
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from collections import Counter
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
import sys
|
|
from typing import TYPE_CHECKING
|
|
|
|
# Module-level logger named after this module; main() reports through it.
logger = logging.getLogger(__name__)

# Sequence is referenced only in annotations, so it is imported for static
# type checkers alone and skipped at runtime.
if TYPE_CHECKING:
    from collections.abc import Sequence
|
|
|
|
|
|
def extract_words(text: str, *, case_sensitive: bool = False) -> list[str]:
    """Split text into the individual words it contains.

    A word is a run of Unicode word characters (letters, digits and
    underscores) delimited by word boundaries, so punctuation such as an
    apostrophe separates words.

    Args:
        text: The input text to scan.
        case_sensitive: When True, words keep their original casing;
            otherwise every word is lowercased.

    Returns:
        The words in the order they appear in the text.
    """
    # Unicode-aware matching keeps accented letters (Polish, Latin, etc.)
    # inside a single word.
    matches = re.finditer(r"\b[\w]+\b", text, re.UNICODE)
    tokens = [match.group() for match in matches]

    if case_sensitive:
        return tokens
    return [token.lower() for token in tokens]
|
|
|
|
|
|
def analyze_text(text: str, *, case_sensitive: bool = False) -> Counter[str]:
    """Count how often each word occurs in the text.

    Args:
        text: The input text to analyze.
        case_sensitive: When False, words differing only in case are
            counted together.

    Returns:
        Counter mapping each word to its number of occurrences.
    """
    frequencies: Counter[str] = Counter()
    frequencies.update(extract_words(text, case_sensitive=case_sensitive))
    return frequencies
|
|
|
|
|
|
def read_file(filepath: str | Path) -> str:
    """Load the full text of a UTF-8 encoded file.

    Args:
        filepath: Location of the file to read.

    Returns:
        Everything the file contains, as a single string.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        UnicodeDecodeError: If the file can't be decoded as UTF-8.
    """
    with Path(filepath).open(encoding="utf-8") as handle:
        return handle.read()
|
|
|
|
|
|
def read_files(filepaths: Sequence[str | Path]) -> str:
    """Load several files and merge their text into one string.

    Args:
        filepaths: Paths of the files to read, in the order their
            contents should appear.

    Returns:
        The contents of every file, separated by newline characters.
    """
    return "\n".join(map(read_file, filepaths))
|
|
|
|
|
|
def format_results(
    word_counts: Counter[str],
    *,
    top_n: int | None = None,
) -> str:
    """Render word counts as a plain-text frequency table.

    The output starts with the total and unique word counts, followed by a
    header, a dashed rule and one row per word showing its count and its
    share of all words as a percentage.

    Args:
        word_counts: Counter object with word frequencies.
        top_n: If provided, only the N most frequent words are listed.
            The totals and percentages still cover every word.

    Returns:
        The formatted table, or a short notice when there are no words.
    """
    grand_total = sum(word_counts.values())
    if not grand_total:
        return "No words found in input."

    # most_common(None) yields every entry, so one call covers both the
    # limited and the unlimited case.
    rows = word_counts.most_common(top_n)

    # Columns are never narrower than their "Word" / "Count" headings.
    word_width = max([4, *(len(word) for word, _ in rows)])
    largest_count = max((count for _, count in rows), default=0)
    count_width = max(len(str(largest_count)), 5)

    heading = f"{'Word':<{word_width}} {'Count':>{count_width}} {'Percentage':>10}"
    table = [
        f"Total words: {grand_total}",
        f"Unique words: {len(word_counts)}",
        "",
        heading,
        "-" * len(heading),
    ]
    table.extend(
        f"{word:<{word_width}} {count:>{count_width}} "
        f"{(count / grand_total) * 100:>9.2f}%"
        for word, count in rows
    )
    return "\n".join(table)
|
|
|
|
|
|
def analyze_and_format(
    text: str,
    *,
    case_sensitive: bool = False,
    top_n: int | None = None,
) -> str:
    """Count the words in the text and render the counts as a table.

    Args:
        text: The input text to analyze.
        case_sensitive: When False, words differing only in case are
            counted together.
        top_n: If provided, only the N most frequent words are listed.

    Returns:
        Formatted string with word frequency analysis.
    """
    return format_results(
        analyze_text(text, case_sensitive=case_sensitive),
        top_n=top_n,
    )
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser used by main().

    Returns:
        Parser that requires exactly one input source (--text, --file or
        --files) and accepts the optional --top, --case-sensitive and
        --output settings.
    """
    parser = argparse.ArgumentParser(
        description="Analyze word frequency in text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Exactly one of the input sources must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to analyze",
    )
    input_group.add_argument(
        "--files",
        "-F",
        nargs="+",
        type=str,
        help="Paths to multiple files to analyze",
    )

    parser.add_argument(
        "--top",
        "-n",
        type=int,
        default=None,
        help="Show only the top N most frequent words",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively (default: case-insensitive)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )
    return parser


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the word frequency analyzer.

    Parses the command line, gathers the input text from the selected
    source, and writes the formatted frequency table either to the file
    given by --output or to stdout.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code: 0 on success, 1 when a file is missing or cannot be
        decoded as UTF-8 (the error is logged).
    """
    args = _build_parser().parse_args(argv)

    try:
        # Compare against None instead of relying on truthiness: an empty
        # string (e.g. --text "") is a supplied value, and treating it as
        # absent would fall through to read_files(None) and raise TypeError.
        if args.text is not None:
            text = args.text
        elif args.file is not None:
            text = read_file(args.file)
        else:  # args.files
            text = read_files(args.files)

        result = analyze_and_format(
            text,
            case_sensitive=args.case_sensitive,
            top_n=args.top,
        )

        if args.output:
            Path(args.output).write_text(result, encoding="utf-8")
            logger.info("Output written to %s", args.output)
        else:
            sys.stdout.write(result + "\n")

    except FileNotFoundError:
        logger.exception("File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception("Could not decode file as UTF-8")
        return 1

    return 0
|
|
|
|
|
|
# Script entry point: run the CLI and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|