testsAndMisc-archive/python_pkg/word_frequency/analyzer.py

#!/usr/bin/env python3
"""Word frequency analyzer - analyzes text and produces word usage statistics.

Usage:
    # From raw text
    python -m python_pkg.word_frequency.analyzer --text "Hello world hello"

    # From a single file
    python -m python_pkg.word_frequency.analyzer --file path/to/file.txt

    # From multiple files
    python -m python_pkg.word_frequency.analyzer --files file1.txt file2.txt file3.txt

    # Limit output to top N words
    python -m python_pkg.word_frequency.analyzer --file text.txt --top 20

    # Case-sensitive mode
    python -m python_pkg.word_frequency.analyzer --file text.txt --case-sensitive
"""

from __future__ import annotations

import argparse
import re
import sys
from collections import Counter
from pathlib import Path
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from collections.abc import Sequence


def extract_words(text: str, *, case_sensitive: bool = False) -> list[str]:
    """Extract words from text.

    Args:
        text: The input text to extract words from.
        case_sensitive: If False, convert all words to lowercase.

    Returns:
        List of words found in the text.
    """
    # Match word characters including unicode letters (for Polish, Latin, etc.)
    words = re.findall(r"\b[\w]+\b", text, re.UNICODE)

    if not case_sensitive:
        words = [word.lower() for word in words]

    return words


def analyze_text(text: str, *, case_sensitive: bool = False) -> Counter[str]:
    """Analyze text and return word counts.

    Args:
        text: The input text to analyze.
        case_sensitive: If False, treat words case-insensitively.

    Returns:
        Counter object with word frequencies.
    """
    words = extract_words(text, case_sensitive=case_sensitive)
    return Counter(words)


def read_file(filepath: str | Path) -> str:
    """Read text content from a file.

    Args:
        filepath: Path to the file to read.

    Returns:
        The text content of the file.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        UnicodeDecodeError: If the file can't be decoded as UTF-8.
    """
    path = Path(filepath)
    return path.read_text(encoding="utf-8")


def read_files(filepaths: Sequence[str | Path]) -> str:
    """Read and concatenate text content from multiple files.

    Args:
        filepaths: Sequence of paths to files to read.

    Returns:
        Combined text content of all files.
    """
    texts = []
    for filepath in filepaths:
        texts.append(read_file(filepath))
    return "\n".join(texts)


def format_results(
    word_counts: Counter[str],
    *,
    top_n: int | None = None,
) -> str:
    """Format word frequency results as a table.

    Args:
        word_counts: Counter object with word frequencies.
        top_n: If provided, only show the top N words.

    Returns:
        Formatted string table with results.
    """
    total_words = sum(word_counts.values())

    if total_words == 0:
        return "No words found in input."

    # Get items sorted by frequency
    if top_n is not None:
        items = word_counts.most_common(top_n)
    else:
        items = word_counts.most_common()

    # Find the maximum width for the word column
    max_word_len = max(len(word) for word, _ in items) if items else 4
    max_word_len = max(max_word_len, 4)  # Minimum width for "Word" header

    # Find the maximum width for the count column
    max_count = max(count for _, count in items) if items else 0
    count_width = max(len(str(max_count)), 5)  # Minimum width for "Count" header

    # Build the table
    lines = []
    lines.append(f"Total words: {total_words}")
    lines.append(f"Unique words: {len(word_counts)}")
    lines.append("")

    # Header
    header = f"{'Word':<{max_word_len}}  {'Count':>{count_width}}  {'Percentage':>10}"
    lines.append(header)
    lines.append("-" * len(header))

    # Data rows
    for word, count in items:
        percentage = (count / total_words) * 100
        lines.append(f"{word:<{max_word_len}}  {count:>{count_width}}  {percentage:>9.2f}%")

    return "\n".join(lines)


def analyze_and_format(
    text: str,
    *,
    case_sensitive: bool = False,
    top_n: int | None = None,
) -> str:
    """Analyze text and return formatted results.

    Args:
        text: The input text to analyze.
        case_sensitive: If False, treat words case-insensitively.
        top_n: If provided, only show the top N words.

    Returns:
        Formatted string with word frequency analysis.
    """
    word_counts = analyze_text(text, case_sensitive=case_sensitive)
    return format_results(word_counts, top_n=top_n)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the word frequency analyzer.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    parser = argparse.ArgumentParser(
        description="Analyze word frequency in text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to analyze",
    )
    input_group.add_argument(
        "--files",
        "-F",
        nargs="+",
        type=str,
        help="Paths to multiple files to analyze",
    )

    parser.add_argument(
        "--top",
        "-n",
        type=int,
        default=None,
        help="Show only the top N most frequent words",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively (default: case-insensitive)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        if args.text:
            text = args.text
        elif args.file:
            text = read_file(args.file)
        else:  # args.files
            text = read_files(args.files)

        result = analyze_and_format(
            text,
            case_sensitive=args.case_sensitive,
            top_n=args.top,
        )

        if args.output:
            Path(args.output).write_text(result, encoding="utf-8")
            print(f"Output written to {args.output}")  # noqa: T201
        else:
            print(result)  # noqa: T201

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)  # noqa: T201
        return 1
    except UnicodeDecodeError as e:
        print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)  # noqa: T201
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())
feat: text learning pipe 2025-12-27 17:22:17 +01:00			`#!/usr/bin/env python3`
			`"""Word frequency analyzer - analyzes text and produces word usage statistics.`

			`Usage:`
			`# From raw text`
			`python -m python_pkg.word_frequency.analyzer --text "Hello world hello"`

			`# From a single file`
			`python -m python_pkg.word_frequency.analyzer --file path/to/file.txt`

			`# From multiple files`
			`python -m python_pkg.word_frequency.analyzer --files file1.txt file2.txt file3.txt`

			`# Limit output to top N words`
			`python -m python_pkg.word_frequency.analyzer --file text.txt --top 20`

			`# Case-sensitive mode`
			`python -m python_pkg.word_frequency.analyzer --file text.txt --case-sensitive`
			`"""`

			`from __future__ import annotations`

			`import argparse`
			`import re`
			`import sys`
			`from collections import Counter`
			`from pathlib import Path`
			`from typing import TYPE_CHECKING`

			`if TYPE_CHECKING:`
			`from collections.abc import Sequence`


			`def extract_words(text: str, *, case_sensitive: bool = False) -> list[str]:`
			`"""Extract words from text.`

			`Args:`
			`text: The input text to extract words from.`
			`case_sensitive: If False, convert all words to lowercase.`

			`Returns:`
			`List of words found in the text.`
			`"""`
			`# Match word characters including unicode letters (for Polish, Latin, etc.)`
			`words = re.findall(r"\b[\w]+\b", text, re.UNICODE)`

			`if not case_sensitive:`
			`words = [word.lower() for word in words]`

			`return words`


			`def analyze_text(text: str, *, case_sensitive: bool = False) -> Counter[str]:`
			`"""Analyze text and return word counts.`

			`Args:`
			`text: The input text to analyze.`
			`case_sensitive: If False, treat words case-insensitively.`

			`Returns:`
			`Counter object with word frequencies.`
			`"""`
			`words = extract_words(text, case_sensitive=case_sensitive)`
			`return Counter(words)`


			`def read_file(filepath: str \| Path) -> str:`
			`"""Read text content from a file.`

			`Args:`
			`filepath: Path to the file to read.`

			`Returns:`
			`The text content of the file.`

			`Raises:`
			`FileNotFoundError: If the file doesn't exist.`
			`UnicodeDecodeError: If the file can't be decoded as UTF-8.`
			`"""`
			`path = Path(filepath)`
			`return path.read_text(encoding="utf-8")`


			`def read_files(filepaths: Sequence[str \| Path]) -> str:`
			`"""Read and concatenate text content from multiple files.`

			`Args:`
			`filepaths: Sequence of paths to files to read.`

			`Returns:`
			`Combined text content of all files.`
			`"""`
			`texts = []`
			`for filepath in filepaths:`
			`texts.append(read_file(filepath))`
			`return "\n".join(texts)`


			`def format_results(`
			`word_counts: Counter[str],`
			`*,`
			`top_n: int \| None = None,`
			`) -> str:`
			`"""Format word frequency results as a table.`

			`Args:`
			`word_counts: Counter object with word frequencies.`
			`top_n: If provided, only show the top N words.`

			`Returns:`
			`Formatted string table with results.`
			`"""`
			`total_words = sum(word_counts.values())`

			`if total_words == 0:`
			`return "No words found in input."`

			`# Get items sorted by frequency`
			`if top_n is not None:`
			`items = word_counts.most_common(top_n)`
			`else:`
			`items = word_counts.most_common()`

			`# Find the maximum width for the word column`
			`max_word_len = max(len(word) for word, _ in items) if items else 4`
			`max_word_len = max(max_word_len, 4) # Minimum width for "Word" header`

			`# Find the maximum width for the count column`
			`max_count = max(count for _, count in items) if items else 0`
			`count_width = max(len(str(max_count)), 5) # Minimum width for "Count" header`

			`# Build the table`
			`lines = []`
			`lines.append(f"Total words: {total_words}")`
			`lines.append(f"Unique words: {len(word_counts)}")`
			`lines.append("")`

			`# Header`
			`header = f"{'Word':<{max_word_len}} {'Count':>{count_width}} {'Percentage':>10}"`
			`lines.append(header)`
			`lines.append("-" * len(header))`

			`# Data rows`
			`for word, count in items:`
			`percentage = (count / total_words) * 100`
			`lines.append(f"{word:<{max_word_len}} {count:>{count_width}} {percentage:>9.2f}%")`

			`return "\n".join(lines)`


			`def analyze_and_format(`
			`text: str,`
			`*,`
			`case_sensitive: bool = False,`
			`top_n: int \| None = None,`
			`) -> str:`
			`"""Analyze text and return formatted results.`

			`Args:`
			`text: The input text to analyze.`
			`case_sensitive: If False, treat words case-insensitively.`
			`top_n: If provided, only show the top N words.`

			`Returns:`
			`Formatted string with word frequency analysis.`
			`"""`
			`word_counts = analyze_text(text, case_sensitive=case_sensitive)`
			`return format_results(word_counts, top_n=top_n)`


			`def main(argv: Sequence[str] \| None = None) -> int:`
			`"""Main entry point for the word frequency analyzer.`

			`Args:`
			`argv: Command line arguments (defaults to sys.argv[1:]).`

			`Returns:`
			`Exit code (0 for success, non-zero for errors).`
			`"""`
			`parser = argparse.ArgumentParser(`
			`description="Analyze word frequency in text.",`
			`formatter_class=argparse.RawDescriptionHelpFormatter,`
			`epilog=__doc__,`
			`)`

			`input_group = parser.add_mutually_exclusive_group(required=True)`
			`input_group.add_argument(`
			`"--text",`
			`"-t",`
			`type=str,`
			`help="Raw text to analyze",`
			`)`
			`input_group.add_argument(`
			`"--file",`
			`"-f",`
			`type=str,`
			`help="Path to a file to analyze",`
			`)`
			`input_group.add_argument(`
			`"--files",`
			`"-F",`
			`nargs="+",`
			`type=str,`
			`help="Paths to multiple files to analyze",`
			`)`

			`parser.add_argument(`
			`"--top",`
			`"-n",`
			`type=int,`
			`default=None,`
			`help="Show only the top N most frequent words",`
			`)`
			`parser.add_argument(`
			`"--case-sensitive",`
			`"-c",`
			`action="store_true",`
			`help="Treat words case-sensitively (default: case-insensitive)",`
			`)`
			`parser.add_argument(`
			`"--output",`
			`"-o",`
			`type=str,`
			`help="Output file path (default: print to stdout)",`
			`)`

			`args = parser.parse_args(argv)`

			`try:`
			`if args.text:`
			`text = args.text`
			`elif args.file:`
			`text = read_file(args.file)`
			`else: # args.files`
			`text = read_files(args.files)`

			`result = analyze_and_format(`
			`text,`
			`case_sensitive=args.case_sensitive,`
			`top_n=args.top,`
			`)`

			`if args.output:`
			`Path(args.output).write_text(result, encoding="utf-8")`
			`print(f"Output written to {args.output}") # noqa: T201`
			`else:`
			`print(result) # noqa: T201`

			`except FileNotFoundError as e:`
			`print(f"Error: File not found - {e}", file=sys.stderr) # noqa: T201`
			`return 1`
			`except UnicodeDecodeError as e:`
			`print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr) # noqa: T201`
			`return 1`

			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`