testsAndMisc/python_pkg/word_frequency/learning_pipe.py

#!/usr/bin/env python3
r"""Learning pipe - combines word frequency analysis with excerpt finding.

Helps language learners by:

1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches

The idea is to:
- Learn the top N most frequent words first
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts

Usage::

    # Basic usage
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt

    # Custom batch size and excerpt length
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --batch-size 30 --excerpt-length 50

    # Multiple batches for progressive learning
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --batches 5 --batch-size 20

    # Output to file
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --output lesson.txt

    # Skip common words using a stopwords file
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --stopwords stopwords.txt
"""

from __future__ import annotations

import argparse
from dataclasses import replace as _replace_dc
import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING

from python_pkg.word_frequency._learning_batch import (
    _detect_translation_language,
    _generate_batch_section,
    _LessonContext,
)
from python_pkg.word_frequency._learning_constants import (
    LessonConfig,
    _resolve_stopwords,
    load_stopwords,
)
from python_pkg.word_frequency.analyzer import analyze_text, read_file

if TYPE_CHECKING:
    from collections.abc import Sequence

logger = logging.getLogger(__name__)


def generate_learning_lesson(
    text: str,
    config: LessonConfig | None = None,
) -> str:
    """Generate a learning lesson from text.

    Args:
        text: The source text to analyze.
        config: Lesson configuration. Uses defaults if None.

    Returns:
        Formatted learning lesson as a string.
    """
    if config is None:
        config = LessonConfig()

    all_stopwords = _resolve_stopwords(config)
    word_counts = analyze_text(
        text,
        case_sensitive=config.case_sensitive,
    )

    filtered_words = [
        (word, count)
        for word, count in word_counts.most_common()
        if word.lower() not in all_stopwords
        and len(word) > 1
        and not (config.skip_numbers and word.isdigit())
    ]

    total_words = sum(word_counts.values())
    lines: list[str] = []

    lines.append("=" * 70)
    lines.append("LANGUAGE LEARNING LESSON")
    lines.append("=" * 70)
    lines.append(
        f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
    )
    if all_stopwords:
        lines.append(
            f"After filtering {len(all_stopwords)} "
            f"stopwords: {len(filtered_words):,} "
            "vocabulary words"
        )
    else:
        lines.append(
            f"Vocabulary words: {len(filtered_words):,}",
        )

    actual_from, actual_to = _detect_translation_language(
        text,
        config,
        lines,
    )
    do_translate = actual_from is not None and actual_to is not None
    if do_translate:
        lines.append(
            f"Translation: {actual_from} -> {actual_to}",
        )
    lines.append("")

    # Create resolved config with detected translation
    resolved_config = _replace_dc(
        config,
        translate_from=actual_from,
        translate_to=actual_to,
    )
    ctx = _LessonContext(
        text=text,
        word_counts=word_counts,
        config=resolved_config,
    )

    cumulative_words: list[str] = []
    for batch_num in range(config.num_batches):
        start_idx = batch_num * config.batch_size
        end_idx = start_idx + config.batch_size
        if start_idx >= len(filtered_words):
            break

        batch_words = filtered_words[start_idx:end_idx]
        cumulative_words.extend(word for word, _ in batch_words)

        lines.extend(
            _generate_batch_section(
                ctx,
                batch_num,
                batch_words,
                cumulative_words,
            )
        )

    # Summary
    lines.append("=" * 70)
    lines.append("SUMMARY")
    lines.append("=" * 70)

    if cumulative_words:
        final_coverage = sum(
            word_counts[w] for w in cumulative_words if w in word_counts
        )
        final_pct = (final_coverage / total_words) * 100
        lines.append(f"Total vocabulary words learned: {len(cumulative_words)}")
        lines.append(f"Text coverage: {final_pct:.1f}%")
        lines.append("")
        lines.append("TIP: Focus on understanding the excerpts first, then read")
        lines.append("more of the original text as your vocabulary grows!")

    return "\n".join(lines)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the learning pipe.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    parser = argparse.ArgumentParser(
        description="Generate language learning lessons from text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a text file to analyze",
    )

    # Learning parameters
    parser.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=20,
        help="Number of words per learning batch (default: 20)",
    )
    parser.add_argument(
        "--batches",
        "-n",
        type=int,
        default=1,
        help="Number of batches to generate (default: 1)",
    )
    parser.add_argument(
        "--excerpt-length",
        "-l",
        type=int,
        default=30,
        help="Length of excerpts in words (default: 30)",
    )
    parser.add_argument(
        "--excerpts-per-batch",
        "-e",
        type=int,
        default=3,
        help="Number of excerpts per batch (default: 3)",
    )

    # Filtering options
    parser.add_argument(
        "--stopwords",
        "-s",
        type=str,
        help="Path to custom stopwords file (one word per line)",
    )
    parser.add_argument(
        "--no-default-stopwords",
        action="store_true",
        help="Don't filter out default English stopwords",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--include-numbers",
        action="store_true",
        help="Include numeric words in vocabulary (filtered by default)",
    )

    # Translation options (enabled by default)
    parser.add_argument(
        "--no-translate",
        "-T",
        action="store_true",
        help="Disable translation",
    )
    parser.add_argument(
        "--translate-from",
        type=str,
        metavar="LANG",
        help=("Source language code (e.g., 'la', 'pl'). If omitted, auto-detected."),
    )
    parser.add_argument(
        "--translate-to",
        type=str,
        metavar="LANG",
        default="en",
        help="Target language code (default: 'en')",
    )

    # Output options
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        text = args.text or read_file(args.file)

        # Load custom stopwords if provided
        custom_stopwords = load_stopwords(args.stopwords)

        # Determine translation settings
        translate_from: str | None = None
        translate_to: str | None = None

        if not args.no_translate:
            translate_from = args.translate_from or "auto"
            translate_to = args.translate_to

        config = LessonConfig(
            batch_size=args.batch_size,
            num_batches=args.batches,
            excerpt_length=args.excerpt_length,
            excerpts_per_batch=args.excerpts_per_batch,
            stopwords=custom_stopwords,
            skip_default_stopwords=args.no_default_stopwords,
            skip_numbers=not args.include_numbers,
            case_sensitive=args.case_sensitive,
            translate_from=translate_from,
            translate_to=translate_to,
        )
        lesson = generate_learning_lesson(text, config)

        # Output
        if args.output:
            Path(args.output).write_text(
                lesson,
                encoding="utf-8",
            )
            logger.info(
                "Lesson written to %s",
                args.output,
            )
        else:
            logger.info(lesson)

    except FileNotFoundError:
        logger.exception("Error: File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception(
            "Error: Could not decode file as UTF-8",
        )
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())