testsAndMisc/python_pkg/word_frequency/analyzer.py

#!/usr/bin/env python3
"""Word frequency analyzer - analyzes text and produces word usage statistics.

Usage:
    # From raw text
    python -m python_pkg.word_frequency.analyzer --text "Hello world hello"

    # From a single file
    python -m python_pkg.word_frequency.analyzer --file path/to/file.txt

    # From multiple files
    python -m python_pkg.word_frequency.analyzer --files file1.txt file2.txt file3.txt

    # Limit output to top N words
    python -m python_pkg.word_frequency.analyzer --file text.txt --top 20

    # Case-sensitive mode
    python -m python_pkg.word_frequency.analyzer --file text.txt --case-sensitive
"""

from __future__ import annotations

import argparse
from collections import Counter
import logging
from pathlib import Path
import re
import sys
from typing import TYPE_CHECKING

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from collections.abc import Sequence


def extract_words(text: str, *, case_sensitive: bool = False) -> list[str]:
    """Extract words from text.

    Args:
        text: The input text to extract words from.
        case_sensitive: If False, convert all words to lowercase.

    Returns:
        List of words found in the text.
    """
    # Match word characters including unicode letters (for Polish, Latin, etc.)
    words = re.findall(r"\b[\w]+\b", text, re.UNICODE)

    if not case_sensitive:
        words = [word.lower() for word in words]

    return words


def analyze_text(text: str, *, case_sensitive: bool = False) -> Counter[str]:
    """Analyze text and return word counts.

    Args:
        text: The input text to analyze.
        case_sensitive: If False, treat words case-insensitively.

    Returns:
        Counter object with word frequencies.
    """
    words = extract_words(text, case_sensitive=case_sensitive)
    return Counter(words)


def read_file(filepath: str | Path) -> str:
    """Read text content from a file.

    Args:
        filepath: Path to the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        UnicodeDecodeError: If the file can't be decoded as UTF-8.
    """
    path = Path(filepath)
    return path.read_text(encoding="utf-8")


def read_files(filepaths: Sequence[str | Path]) -> str:
    """Read and concatenate text content from multiple files.

    Args:
        filepaths: Sequence of paths to files to read.

    Returns:
        Combined text content of all files.
    """
    texts = [read_file(filepath) for filepath in filepaths]
    return "\n".join(texts)


def format_results(
    word_counts: Counter[str],
    *,
    top_n: int | None = None,
) -> str:
    """Format word frequency results as a table.

    Args:
        word_counts: Counter object with word frequencies.
        top_n: If provided, only show the top N words.

    Returns:
        Formatted string table with results.
    """
    total_words = sum(word_counts.values())

    if total_words == 0:
        return "No words found in input."

    # Get items sorted by frequency
    if top_n is not None:
        items = word_counts.most_common(top_n)
    else:
        items = word_counts.most_common()

    # Find the maximum width for the word column
    max_word_len = max(len(word) for word, _ in items) if items else 4
    max_word_len = max(max_word_len, 4)  # Minimum width for "Word" header

    # Find the maximum width for the count column
    max_count = max(count for _, count in items) if items else 0
    count_width = max(len(str(max_count)), 5)  # Minimum width for "Count" header

    # Build the table
    lines = []
    lines.append(f"Total words: {total_words}")
    lines.append(f"Unique words: {len(word_counts)}")
    lines.append("")

    # Header
    header = f"{'Word':<{max_word_len}}  {'Count':>{count_width}}  {'Percentage':>10}"
    lines.append(header)
    lines.append("-" * len(header))

    # Data rows
    for word, count in items:
        percentage = (count / total_words) * 100
        lines.append(
            f"{word:<{max_word_len}}  {count:>{count_width}}  {percentage:>9.2f}%"
        )

    return "\n".join(lines)


def analyze_and_format(
    text: str,
    *,
    case_sensitive: bool = False,
    top_n: int | None = None,
) -> str:
    """Analyze text and return formatted results.

    Args:
        text: The input text to analyze.
        case_sensitive: If False, treat words case-insensitively.
        top_n: If provided, only show the top N words.

    Returns:
        Formatted string with word frequency analysis.
    """
    word_counts = analyze_text(text, case_sensitive=case_sensitive)
    return format_results(word_counts, top_n=top_n)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the word frequency analyzer.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    parser = argparse.ArgumentParser(
        description="Analyze word frequency in text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to analyze",
    )
    input_group.add_argument(
        "--files",
        "-F",
        nargs="+",
        type=str,
        help="Paths to multiple files to analyze",
    )

    parser.add_argument(
        "--top",
        "-n",
        type=int,
        default=None,
        help="Show only the top N most frequent words",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively (default: case-insensitive)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        if args.text:
            text = args.text
        elif args.file:
            text = read_file(args.file)
        else:  # args.files
            text = read_files(args.files)

        result = analyze_and_format(
            text,
            case_sensitive=args.case_sensitive,
            top_n=args.top,
        )

        if args.output:
            Path(args.output).write_text(result, encoding="utf-8")
            logger.info("Output written to %s", args.output)
        else:
            sys.stdout.write(result + "\n")

    except FileNotFoundError:
        logger.exception("File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception("Could not decode file as UTF-8")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())