testsAndMisc/python_pkg/word_frequency/learning_pipe.py

#!/usr/bin/env python3
r"""Learning pipe - combines word frequency analysis with excerpt finding.

Helps language learners by:

1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches

The idea is to:
- Learn the top N most frequent words first
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts

Usage::

    # Basic usage
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt

    # Custom batch size and excerpt length
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --batch-size 30 --excerpt-length 50

    # Multiple batches for progressive learning
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --batches 5 --batch-size 20

    # Output to file
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --output lesson.txt

    # Skip common words using a stopwords file
    python -m python_pkg.word_frequency.learning_pipe \\
        --file text.txt --stopwords stopwords.txt
"""

from __future__ import annotations

import argparse
from dataclasses import dataclass
from dataclasses import replace as _replace_dc
import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING

try:
    from python_pkg.word_frequency.analyzer import analyze_text, read_file
    from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
    from python_pkg.word_frequency.translator import (
        detect_language,
        translate_words_batch,
    )
except ModuleNotFoundError:
    from analyzer import analyze_text, read_file  # type: ignore[import-not-found]
    from excerpt_finder import find_best_excerpt  # type: ignore[import-not-found]
    from translator import (  # type: ignore[import-not-found]
        detect_language,
        translate_words_batch,
    )

if TYPE_CHECKING:
    from collections.abc import Sequence

logger = logging.getLogger(__name__)


# Common stopwords for various languages (can be overridden with --stopwords)
DEFAULT_STOPWORDS_EN = frozenset(
    {
        "the",
        "a",
        "an",
        "and",
        "or",
        "but",
        "in",
        "on",
        "at",
        "to",
        "for",
        "of",
        "with",
        "by",
        "from",
        "is",
        "are",
        "was",
        "were",
        "be",
        "been",
        "being",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "could",
        "should",
        "may",
        "might",
        "must",
        "shall",
        "can",
        "this",
        "that",
        "these",
        "those",
        "i",
        "you",
        "he",
        "she",
        "it",
        "we",
        "they",
        "me",
        "him",
        "her",
        "us",
        "them",
        "my",
        "your",
        "his",
        "its",
        "our",
        "their",
        "what",
        "which",
        "who",
        "whom",
        "whose",
        "where",
        "when",
        "why",
        "how",
        "all",
        "each",
        "every",
        "both",
        "few",
        "more",
        "most",
        "other",
        "some",
        "such",
        "no",
        "nor",
        "not",
        "only",
        "own",
        "same",
        "so",
        "than",
        "too",
        "very",
        "just",
        "as",
        "if",
        "then",
        "because",
        "while",
        "although",
        "though",
        "after",
        "before",
    }
)


def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
    """Load stopwords from a file (one word per line).

    Args:
        filepath: Path to stopwords file, or None to use defaults.

    Returns:
        Frozenset of stopwords.
    """
    if filepath is None:
        return frozenset()

    path = Path(filepath)
    if not path.exists():
        return frozenset()

    content = path.read_text(encoding="utf-8")
    return frozenset(
        word.strip().lower() for word in content.splitlines() if word.strip()
    )


@dataclass(frozen=True)
class LessonConfig:
    """Configuration for learning lesson generation."""

    batch_size: int = 20
    num_batches: int = 1
    excerpt_length: int = 30
    excerpts_per_batch: int = 3
    stopwords: frozenset[str] | None = None
    skip_default_stopwords: bool = False
    skip_numbers: bool = True
    case_sensitive: bool = False
    translate_from: str | None = None
    translate_to: str | None = None


def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
    """Resolve combined stopwords from config."""
    if config.skip_default_stopwords:
        return config.stopwords or frozenset()
    return DEFAULT_STOPWORDS_EN | (config.stopwords or frozenset())


def _detect_translation_language(
    text: str,
    config: LessonConfig,
    lines: list[str],
) -> tuple[str | None, str | None]:
    """Detect translation settings and return (from, to) pair."""
    actual_from = config.translate_from
    actual_to = config.translate_to or "en"

    if actual_from == "auto" or (
        config.translate_to and not config.translate_from
    ):
        detected = detect_language(text)
        if detected:
            actual_from = detected
            lines.append(f"Detected language: {detected}")
        else:
            lines.append(
                "Warning: Could not detect language "
                "(install langdetect: "
                "pip install langdetect)"
            )
            actual_from = None

    return actual_from, actual_to


def _format_word_list(
    batch_words: list[tuple[str, int]],
    start_idx: int,
    total_words: int,
    translations: dict[str, str],
) -> list[str]:
    """Format the vocabulary word list for a batch."""
    lines: list[str] = []
    for i, (word, count) in enumerate(
        batch_words, start=start_idx + 1,
    ):
        percentage = (count / total_words) * 100
        if translations:
            trans = translations.get(word, "?")
            lines.append(
                f"  {i:3}. {word:<20} -> {trans:<20}"
                f" ({count:,} occurrences, "
                f"{percentage:.2f}%)"
            )
        else:
            lines.append(
                f"  {i:3}. {word:<20}"
                f" ({count:,} occurrences, "
                f"{percentage:.2f}%)"
            )
    return lines


@dataclass(frozen=True)
class _LessonContext:
    """Shared context for batch generation."""

    text: str
    word_counts: dict[str, int]
    config: LessonConfig


def _generate_batch_section(
    ctx: _LessonContext,
    batch_num: int,
    batch_words: list[tuple[str, int]],
    cumulative_words: list[str],
) -> list[str]:
    """Generate lines for a single batch section."""
    config = ctx.config
    total_words = sum(ctx.word_counts.values())
    start_idx = batch_num * config.batch_size
    end_idx = start_idx + config.batch_size

    lines: list[str] = []
    lines.append("-" * 70)
    lines.append(
        f"BATCH {batch_num + 1}: Words "
        f"{start_idx + 1} - "
        f"{min(end_idx, start_idx + len(batch_words))}"
    )
    lines.append("-" * 70)
    lines.append("")

    # Get translations if requested
    translations: dict[str, str] = {}
    do_translate = (
        config.translate_from is not None
        and config.translate_to is not None
    )
    if do_translate:
        words_to_translate = [word for word, _ in batch_words]
        translation_results = translate_words_batch(
            words_to_translate,
            config.translate_from,  # type: ignore[arg-type]
            config.translate_to,  # type: ignore[arg-type]
        )
        translations = {
            r.source_word: r.translated_word
            for r in translation_results
            if r.success
        }

    lines.append("VOCABULARY TO LEARN:")
    lines.append("")
    lines.extend(
        _format_word_list(
            batch_words, start_idx, total_words, translations,
        )
    )
    lines.append("")

    # Cumulative coverage
    cumulative_count = sum(
        ctx.word_counts[w]
        for w in cumulative_words
        if w in ctx.word_counts
    )
    coverage = (cumulative_count / total_words) * 100
    lines.append(
        "After learning these words, "
        f"you'll recognize ~{coverage:.1f}% of the text"
    )
    lines.append("")

    # Excerpts
    lines.append("PRACTICE EXCERPTS:")
    lines.append(
        "(Excerpts where your learned vocabulary "
        "is most concentrated)"
    )
    lines.append("")

    excerpts = find_best_excerpt(
        ctx.text,
        cumulative_words,
        config.excerpt_length,
        case_sensitive=config.case_sensitive,
        top_n=config.excerpts_per_batch,
    )

    for j, excerpt in enumerate(excerpts, 1):
        lines.append(
            f"  Excerpt {j} "
            f"({excerpt.match_percentage:.1f}% known words):"
        )
        lines.append(f'  "{excerpt.excerpt}"')
        lines.append("")

    return lines


def generate_learning_lesson(
    text: str,
    config: LessonConfig | None = None,
) -> str:
    """Generate a learning lesson from text.

    Args:
        text: The source text to analyze.
        config: Lesson configuration. Uses defaults if None.

    Returns:
        Formatted learning lesson as a string.
    """
    if config is None:
        config = LessonConfig()

    all_stopwords = _resolve_stopwords(config)
    word_counts = analyze_text(
        text, case_sensitive=config.case_sensitive,
    )

    filtered_words = [
        (word, count)
        for word, count in word_counts.most_common()
        if word.lower() not in all_stopwords
        and len(word) > 1
        and not (config.skip_numbers and word.isdigit())
    ]

    total_words = sum(word_counts.values())
    lines: list[str] = []

    lines.append("=" * 70)
    lines.append("LANGUAGE LEARNING LESSON")
    lines.append("=" * 70)
    lines.append(
        f"Source text: {total_words:,} total words, "
        f"{len(word_counts):,} unique words"
    )
    if all_stopwords:
        lines.append(
            f"After filtering {len(all_stopwords)} "
            f"stopwords: {len(filtered_words):,} "
            "vocabulary words"
        )
    else:
        lines.append(
            f"Vocabulary words: {len(filtered_words):,}",
        )

    actual_from, actual_to = _detect_translation_language(
        text, config, lines,
    )
    do_translate = (
        actual_from is not None and actual_to is not None
    )
    if do_translate:
        lines.append(
            f"Translation: {actual_from} -> {actual_to}",
        )
    lines.append("")

    # Create resolved config with detected translation
    resolved_config = _replace_dc(
        config,
        translate_from=actual_from,
        translate_to=actual_to,
    )
    ctx = _LessonContext(
        text=text,
        word_counts=word_counts,
        config=resolved_config,
    )

    cumulative_words: list[str] = []
    for batch_num in range(config.num_batches):
        start_idx = batch_num * config.batch_size
        end_idx = start_idx + config.batch_size
        if start_idx >= len(filtered_words):
            break

        batch_words = filtered_words[start_idx:end_idx]
        cumulative_words.extend(word for word, _ in batch_words)

        lines.extend(
            _generate_batch_section(
                ctx,
                batch_num,
                batch_words,
                cumulative_words,
            )
        )

    # Summary
    lines.append("=" * 70)
    lines.append("SUMMARY")
    lines.append("=" * 70)

    if cumulative_words:
        final_coverage = sum(
            word_counts[w]
            for w in cumulative_words
            if w in word_counts
        )
        final_pct = (final_coverage / total_words) * 100
        lines.append(
            "Total vocabulary words learned: "
            f"{len(cumulative_words)}"
        )
        lines.append(f"Text coverage: {final_pct:.1f}%")
        lines.append("")
        lines.append(
            "TIP: Focus on understanding the excerpts "
            "first, then read"
        )
        lines.append(
            "more of the original text as your "
            "vocabulary grows!"
        )

    return "\n".join(lines)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the learning pipe.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    parser = argparse.ArgumentParser(
        description="Generate language learning lessons from text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a text file to analyze",
    )

    # Learning parameters
    parser.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=20,
        help="Number of words per learning batch (default: 20)",
    )
    parser.add_argument(
        "--batches",
        "-n",
        type=int,
        default=1,
        help="Number of batches to generate (default: 1)",
    )
    parser.add_argument(
        "--excerpt-length",
        "-l",
        type=int,
        default=30,
        help="Length of excerpts in words (default: 30)",
    )
    parser.add_argument(
        "--excerpts-per-batch",
        "-e",
        type=int,
        default=3,
        help="Number of excerpts per batch (default: 3)",
    )

    # Filtering options
    parser.add_argument(
        "--stopwords",
        "-s",
        type=str,
        help="Path to custom stopwords file (one word per line)",
    )
    parser.add_argument(
        "--no-default-stopwords",
        action="store_true",
        help="Don't filter out default English stopwords",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--include-numbers",
        action="store_true",
        help="Include numeric words in vocabulary (filtered by default)",
    )

    # Translation options (enabled by default)
    parser.add_argument(
        "--no-translate",
        "-T",
        action="store_true",
        help="Disable translation",
    )
    parser.add_argument(
        "--translate-from",
        type=str,
        metavar="LANG",
        help=(
            "Source language code (e.g., 'la', 'pl'). "
            "If omitted, auto-detected."
        ),
    )
    parser.add_argument(
        "--translate-to",
        type=str,
        metavar="LANG",
        default="en",
        help="Target language code (default: 'en')",
    )

    # Output options
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        text = args.text or read_file(args.file)

        # Load custom stopwords if provided
        custom_stopwords = load_stopwords(args.stopwords)

        # Determine translation settings
        translate_from: str | None = None
        translate_to: str | None = None

        if not args.no_translate:
            translate_from = (
                args.translate_from or "auto"
            )
            translate_to = args.translate_to

        config = LessonConfig(
            batch_size=args.batch_size,
            num_batches=args.batches,
            excerpt_length=args.excerpt_length,
            excerpts_per_batch=args.excerpts_per_batch,
            stopwords=custom_stopwords,
            skip_default_stopwords=args.no_default_stopwords,
            skip_numbers=not args.include_numbers,
            case_sensitive=args.case_sensitive,
            translate_from=translate_from,
            translate_to=translate_to,
        )
        lesson = generate_learning_lesson(text, config)

        # Output
        if args.output:
            Path(args.output).write_text(
                lesson, encoding="utf-8",
            )
            logger.info(
                "Lesson written to %s", args.output,
            )
        else:
            logger.info(lesson)

    except FileNotFoundError:
        logger.exception("Error: File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception(
            "Error: Could not decode file as UTF-8",
        )
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())