testsAndMisc/python_pkg/word_frequency/learning_pipe.py

#!/usr/bin/env python3
"""Learning pipe - combines word frequency analysis with excerpt finding for language learning.

This script helps language learners by:
1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches

The idea is to:
- Learn the top N most frequent words first
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts

Usage:
    # Basic usage - get top 20 words and find excerpts with them
    python -m python_pkg.word_frequency.learning_pipe --file text.txt

    # Custom batch size and excerpt length
    python -m python_pkg.word_frequency.learning_pipe --file text.txt --batch-size 30 --excerpt-length 50

    # Multiple batches for progressive learning
    python -m python_pkg.word_frequency.learning_pipe --file text.txt --batches 5 --batch-size 20

    # Output to file
    python -m python_pkg.word_frequency.learning_pipe --file text.txt --output lesson.txt

    # Skip common words (like "the", "a", "is") using a stopwords file
    python -m python_pkg.word_frequency.learning_pipe --file text.txt --stopwords stopwords.txt
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path
from typing import TYPE_CHECKING

try:
    from python_pkg.word_frequency.analyzer import analyze_text, read_file
    from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
except ModuleNotFoundError:
    from analyzer import analyze_text, read_file  # type: ignore[import-not-found]
    from excerpt_finder import find_best_excerpt  # type: ignore[import-not-found]

if TYPE_CHECKING:
    from collections.abc import Sequence


# Default English stopwords (all lowercase). These are filtered out of the
# vocabulary unless the caller opts out; extra words can be supplied via
# --stopwords. Each word is listed exactly once.
DEFAULT_STOPWORDS_EN = frozenset({
    # Articles, conjunctions and prepositions
    "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from",
    # Auxiliary and modal verbs
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might",
    "must", "shall", "can",
    # Demonstratives and pronouns
    "this", "that", "these", "those", "i", "you", "he", "she", "it", "we",
    "they", "me", "him", "her", "us", "them", "my", "your", "his", "its",
    "our", "their",
    # Question words
    "what", "which", "who", "whom", "whose", "where", "when", "why", "how",
    # Quantifiers and determiners
    "all", "each", "every", "both", "few", "more", "most", "other", "some",
    "such", "no", "nor", "not", "only", "own", "same",
    # Adverbs and subordinators
    "so", "than", "too", "very", "just", "as", "if", "then", "because",
    "while", "although", "though", "after", "before",
})


def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
    """Load custom stopwords from a file containing one word per line.

    Every line is stripped of surrounding whitespace and lowercased; lines
    that are empty after stripping are ignored.

    Args:
        filepath: Path to the stopwords file, or None when no custom
            stopwords file was supplied.

    Returns:
        Frozenset of lowercased stopwords. The set is empty when
        ``filepath`` is None or does not refer to an existing regular file.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    if filepath is None:
        return frozenset()

    path = Path(filepath)
    # is_file() instead of exists(): a directory "exists" but cannot be read
    # as text, and a missing file is deliberately treated as "no stopwords".
    if not path.is_file():
        return frozenset()

    stripped_lines = (
        line.strip() for line in path.read_text(encoding="utf-8").splitlines()
    )
    return frozenset(word.lower() for word in stripped_lines if word)


def generate_learning_lesson(
    text: str,
    *,
    batch_size: int = 20,
    num_batches: int = 1,
    excerpt_length: int = 30,
    excerpts_per_batch: int = 3,
    stopwords: frozenset[str] | None = None,
    skip_default_stopwords: bool = False,
    skip_numbers: bool = True,
    case_sensitive: bool = False,
    context_words: int = 5,
) -> str:
    """Generate a progressive vocabulary lesson from text.

    The text is analyzed for word frequencies; stopwords, single-character
    words and (optionally) numeric words are removed; the remaining words are
    split, most frequent first, into consecutive batches. Each batch lists its
    vocabulary, the cumulative share of the text covered so far, and the
    excerpts found by ``find_best_excerpt`` for all words learned up to and
    including that batch.

    Args:
        text: The source text to analyze.
        batch_size: Number of words per learning batch.
        num_batches: Maximum number of batches to generate; generation stops
            early once the vocabulary is exhausted.
        excerpt_length: Length of each excerpt in words.
        excerpts_per_batch: Number of excerpts to find per batch.
        stopwords: Custom stopwords to skip (in addition to the defaults).
            Matching is case-insensitive.
        skip_default_stopwords: If True, don't filter out default English
            stopwords; only the custom ``stopwords`` are applied.
        skip_numbers: If True, filter out numeric words (default: True).
        case_sensitive: If True, treat words case-sensitively.
        context_words: Currently unused; accepted only for interface
            compatibility.

    Returns:
        Formatted learning lesson as a string.
    """
    # Candidate words are lowercased before the stopword test below, so the
    # custom stopwords must be lowercased too or mixed-case entries would
    # never match anything.
    custom_stopwords = frozenset(word.lower() for word in (stopwords or ()))
    all_stopwords: frozenset[str]
    if skip_default_stopwords:
        all_stopwords = custom_stopwords
    else:
        all_stopwords = DEFAULT_STOPWORDS_EN | custom_stopwords

    # Analyze text for word frequencies
    word_counts = analyze_text(text, case_sensitive=case_sensitive)

    # Vocabulary candidates, most frequent first
    filtered_words = [
        (word, count)
        for word, count in word_counts.most_common()
        if word.lower() not in all_stopwords
        and len(word) > 1
        and not (skip_numbers and word.isdigit())
    ]
    vocabulary_size = len(filtered_words)
    total_words = sum(word_counts.values())

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    lines: list[str] = [
        heavy_rule,
        "LANGUAGE LEARNING LESSON",
        heavy_rule,
        f"Source text: {total_words:,} total words, {len(word_counts):,} unique words",
    ]
    if all_stopwords:
        lines.append(f"After filtering {len(all_stopwords)} stopwords: {vocabulary_size:,} vocabulary words")
    else:
        lines.append(f"Vocabulary words: {vocabulary_size:,}")
    lines.append("")

    # Words learned so far and their combined occurrence count. The count is
    # kept as a running total so coverage is not re-summed for every batch.
    cumulative_words: list[str] = []
    cumulative_count = 0

    for batch_num in range(num_batches):
        start_idx = batch_num * batch_size
        if start_idx >= vocabulary_size:
            # Vocabulary exhausted before the requested number of batches.
            break
        end_idx = start_idx + batch_size

        batch_words = filtered_words[start_idx:end_idx]
        cumulative_words.extend(word for word, _ in batch_words)
        cumulative_count += sum(count for _, count in batch_words)

        lines.extend([
            light_rule,
            f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, vocabulary_size)}",
            light_rule,
            "",
            "VOCABULARY TO LEARN:",
            "",
        ])

        # Word list with frequencies, numbered by overall frequency rank
        for rank, (word, count) in enumerate(batch_words, start=start_idx + 1):
            percentage = (count / total_words) * 100
            lines.append(f"  {rank:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)")
        lines.append("")

        coverage = (cumulative_count / total_words) * 100
        lines.extend([
            f"After learning these words, you'll recognize ~{coverage:.1f}% of the text",
            "",
            "PRACTICE EXCERPTS:",
            "(Excerpts where your learned vocabulary is most concentrated)",
            "",
        ])

        # Excerpts are scored against everything learned so far, not only
        # the words of the current batch.
        excerpts = find_best_excerpt(
            text,
            cumulative_words,
            excerpt_length,
            case_sensitive=case_sensitive,
            top_n=excerpts_per_batch,
        )
        for number, excerpt in enumerate(excerpts, 1):
            lines.extend([
                f"  Excerpt {number} ({excerpt.match_percentage:.1f}% known words):",
                f'  "{excerpt.excerpt}"',
                "",
            ])

    # Summary
    lines.extend([heavy_rule, "SUMMARY", heavy_rule])

    if cumulative_words:
        final_percentage = (cumulative_count / total_words) * 100
        lines.extend([
            f"Total vocabulary words learned: {len(cumulative_words)}",
            f"Text coverage: {final_percentage:.1f}%",
            "",
            "TIP: Focus on understanding the excerpts first, then read",
            "more of the original text as your vocabulary grows!",
        ])

    return "\n".join(lines)


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the learning pipe.

    Parses the command line, obtains the input text (inline via --text or
    from --file), generates the lesson and either prints it or writes it to
    the --output file.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code (0 for success, 1 when a file cannot be read, decoded or
        written). Invalid command lines exit through argparse instead.
    """
    parser = argparse.ArgumentParser(
        description="Generate language learning lessons from text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a text file to analyze",
    )

    # Learning parameters
    parser.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=20,
        help="Number of words per learning batch (default: 20)",
    )
    parser.add_argument(
        "--batches",
        "-n",
        type=int,
        default=1,
        help="Number of batches to generate (default: 1)",
    )
    parser.add_argument(
        "--excerpt-length",
        "-l",
        type=int,
        default=30,
        help="Length of excerpts in words (default: 30)",
    )
    parser.add_argument(
        "--excerpts-per-batch",
        "-e",
        type=int,
        default=3,
        help="Number of excerpts per batch (default: 3)",
    )

    # Filtering options
    parser.add_argument(
        "--stopwords",
        "-s",
        type=str,
        help="Path to custom stopwords file (one word per line)",
    )
    parser.add_argument(
        "--no-default-stopwords",
        action="store_true",
        help="Don't filter out default English stopwords",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--include-numbers",
        action="store_true",
        help="Include numeric words in vocabulary (filtered by default)",
    )

    # Output options
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    # A mistyped stopwords path would otherwise go unnoticed, so say so on
    # stderr; the run itself is not aborted here.
    if args.stopwords is not None and not Path(args.stopwords).is_file():
        print(  # noqa: T201
            f"Warning: stopwords file not found or not a regular file: {args.stopwords}",
            file=sys.stderr,
        )

    try:
        # Compare against None rather than truthiness: `--text ""` is a
        # supplied (empty) text and must not fall through to read_file(None).
        if args.text is not None:
            text = args.text
        else:
            text = read_file(args.file)

        # Load custom stopwords if provided
        custom_stopwords = load_stopwords(args.stopwords)

        # Generate lesson
        lesson = generate_learning_lesson(
            text,
            batch_size=args.batch_size,
            num_batches=args.batches,
            excerpt_length=args.excerpt_length,
            excerpts_per_batch=args.excerpts_per_batch,
            stopwords=custom_stopwords,
            skip_default_stopwords=args.no_default_stopwords,
            skip_numbers=not args.include_numbers,
            case_sensitive=args.case_sensitive,
        )

        # Output
        if args.output:
            Path(args.output).write_text(lesson, encoding="utf-8")
            print(f"Lesson written to {args.output}")  # noqa: T201
        else:
            print(lesson)  # noqa: T201

    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)  # noqa: T201
        return 1
    except UnicodeDecodeError as e:
        print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)  # noqa: T201
        return 1
    except OSError as e:
        # Other I/O failures (permission denied, path is a directory, ...)
        # are reported like the cases above instead of as a traceback.
        print(f"Error: {e}", file=sys.stderr)  # noqa: T201
        return 1

    return 0


if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())