testsAndMisc-archive/python_pkg/word_frequency/vocabulary_curve.py

#!/usr/bin/env python3
"""Vocabulary learning curve analyzer.

Finds the minimum vocabulary needed to understand excerpts of increasing length.
For each excerpt length (1, 2, 3, ... N words), finds the excerpt that requires
the fewest top-frequency words to understand 100%.

Usage:
    python -m python_pkg.word_frequency.vocabulary_curve --file text.txt
    python -m python_pkg.word_frequency.vocabulary_curve --file text.txt --max-length 50
    python -m python_pkg.word_frequency.vocabulary_curve --text "some text here"
"""

from __future__ import annotations

import argparse
import logging
from pathlib import Path
import re
import sys
from typing import TYPE_CHECKING, NamedTuple

if TYPE_CHECKING:
    from collections.abc import Sequence

from python_pkg.word_frequency.analyzer import analyze_text, read_file

logger = logging.getLogger(__name__)


class ExcerptAnalysis(NamedTuple):
    """Cheapest-to-learn excerpt found for one excerpt length.

    Instances are built by ``find_optimal_excerpts``, one per excerpt length
    for which at least one window could be fully ranked.
    """

    # Number of words in the excerpt window.
    excerpt_length: int
    # Highest 1-indexed frequency rank among the excerpt's words, i.e. how many
    # top-frequency words must be known to understand the whole excerpt.
    min_vocab_needed: int
    # The excerpt's words joined with single spaces (lower-cased unless the
    # analysis was run case-sensitively).
    best_excerpt: str
    # Unique words of the excerpt, ordered from most to least frequent.
    words_needed: list[str]


def get_word_rank(word: str, ranked_words: list[str]) -> int | None:
    """Look up a word's 1-based position in a frequency-ordered word list.

    Args:
        word: Word whose position is wanted.
        ranked_words: Words ordered from most to least frequent.

    Returns:
        Position of the first entry equal to ``word`` (the first entry has
        rank 1), or None when no entry matches.
    """
    for position, candidate in enumerate(ranked_words, start=1):
        if candidate == word:
            return position
    return None


def analyze_excerpt(
    excerpt_words: list[str],
    ranked_words: list[str],
) -> tuple[float, list[str]]:
    """Work out how many top-frequency words an excerpt requires.

    The ranked list is walked once from the most frequent word downwards and
    the walk stops as soon as every distinct excerpt word has been seen, so
    the words are collected already ordered by rank.

    Args:
        excerpt_words: Words of the excerpt; duplicates are ignored.
        ranked_words: All known words, most frequent first.

    Returns:
        Tuple ``(max_rank, words)`` where ``max_rank`` is the 1-indexed rank
        of the rarest excerpt word and ``words`` holds the distinct excerpt
        words ordered by rank. An empty excerpt yields ``(0, [])``. If any
        excerpt word is absent from ``ranked_words`` the excerpt can never be
        fully understood and ``(float("inf"), [])`` is returned.
    """
    remaining = set(excerpt_words)
    if not remaining:
        return 0, []

    words_needed: list[str] = []
    for rank, candidate in enumerate(ranked_words, start=1):
        if candidate in remaining:
            # Discarding on first sight keeps the lowest rank if the ranked
            # list happens to contain the same word more than once.
            remaining.discard(candidate)
            words_needed.append(candidate)
            if not remaining:
                # Every excerpt word is accounted for; the current rank is
                # the largest one needed.
                return rank, words_needed

    # At least one excerpt word never appeared in the ranking.
    return float("inf"), []


def find_optimal_excerpts(
    text: str,
    *,
    max_length: int = 30,
    case_sensitive: bool = False,
) -> list[ExcerptAnalysis]:
    """Pick, for every excerpt length, the window that is cheapest to learn.

    Words are ranked by their frequency in ``text``. For each length a window
    of that many consecutive words is slid across the text, and the window
    whose rarest word has the best rank wins; the earliest window wins ties.

    Args:
        text: The source text to analyze.
        max_length: Longest excerpt length to try; lengths beyond the number
            of words in ``text`` are not attempted.
        case_sensitive: When False, words are lower-cased before matching.

    Returns:
        One ExcerptAnalysis per attempted length, in increasing length order.
        Lengths for which no window can be fully ranked are omitted, and an
        empty list is returned when ``text`` contains no words.
    """
    # Frequency ranking over the whole text, most common word first.
    frequency_table = analyze_text(text, case_sensitive=case_sensitive)
    ranked_words = [token for token, _count in frequency_table.most_common()]

    # The text as an ordered word sequence, normalized like the ranking.
    tokens = re.findall(r"\b[\w]+\b", text, re.UNICODE)
    if not case_sensitive:
        tokens = [token.lower() for token in tokens]
    if not tokens:
        return []

    token_count = len(tokens)
    longest = min(max_length, token_count)
    analyses: list[ExcerptAnalysis] = []

    for length in range(1, longest + 1):
        # Score every window of this length; min() keeps the first window
        # among those sharing the lowest cost.
        scored_windows = (
            (analyze_excerpt(window, ranked_words), window)
            for window in (
                tokens[offset : offset + length]
                for offset in range(token_count - length + 1)
            )
        )
        (cost, needed), window = min(
            scored_windows,
            key=lambda scored: scored[0][0],
        )

        if cost != float("inf"):  # pragma: no branch
            analyses.append(
                ExcerptAnalysis(
                    excerpt_length=length,
                    min_vocab_needed=int(cost),
                    best_excerpt=" ".join(window),
                    words_needed=needed,
                )
            )

    return analyses


_MAX_EXCERPT_DISPLAY_LEN = 50


def format_results(
    results: list[ExcerptAnalysis],
    *,
    show_excerpts: bool = False,
    show_words: bool = False,
) -> str:
    """Render analysis results as a plain-text table.

    Args:
        results: Analyses to display, one table row each, in the given order.
        show_excerpts: If True, show each excerpt's text (truncated to
            ``_MAX_EXCERPT_DISPLAY_LEN`` characters) instead of the
            "(+N)" growth marker.
        show_words: If True, add a line under each row listing the words
            that excerpt requires.

    Returns:
        The table followed by a summary for the last result, as a single
        newline-joined string, or "No excerpts found." when ``results`` is
        empty.
    """
    if not results:
        return "No excerpts found."

    heavy_rule = "=" * 70
    light_rule = "-" * 70
    ellipsis = "..."

    lines: list[str] = [
        heavy_rule,
        "VOCABULARY LEARNING CURVE",
        heavy_rule,
        "",
        "For each excerpt length, the minimum number of top-frequency",
        "words you need to learn to understand 100% of some excerpt.",
        "",
        light_rule,
    ]

    # Header
    if show_excerpts:
        lines.append(f"{'Length':>6}  {'Vocab':>5}  Excerpt")
        lines.append(f"{'------':>6}  {'-----':>5}  {'-------'}")
    else:
        lines.append(f"{'Length':>6}  {'Vocab Needed':>12}")
        lines.append(f"{'------':>6}  {'------------':>12}")

    prev_vocab = 0
    for r in results:
        if show_excerpts:
            excerpt = r.best_excerpt
            if len(excerpt) > _MAX_EXCERPT_DISPLAY_LEN:
                # Derive the cut from the limit so the truncated text plus
                # the ellipsis is exactly the maximum display length.
                keep = _MAX_EXCERPT_DISPLAY_LEN - len(ellipsis)
                excerpt = excerpt[:keep] + ellipsis
            lines.append(f"{r.excerpt_length:>6}  {r.min_vocab_needed:>5}  {excerpt}")
        else:
            # Flag rows where the required vocabulary grew versus the
            # previous row.
            growth = r.min_vocab_needed - prev_vocab
            marker = f" (+{growth})" if growth > 0 else ""
            lines.append(f"{r.excerpt_length:>6}  {r.min_vocab_needed:>12}{marker}")
        prev_vocab = r.min_vocab_needed

        if show_words and r.words_needed:
            lines.append(f"        Words: {', '.join(r.words_needed)}")

    lines.append(light_rule)
    lines.append("")

    # Summary for the last row; results is non-empty past the early return.
    final = results[-1]
    lines.append(f"To understand a {final.excerpt_length}-word excerpt,")
    lines.append(f"you need to learn at minimum {final.min_vocab_needed} top words.")

    return "\n".join(lines)


def main(argv: Sequence[str] | None = None) -> int:
    """Run the vocabulary-curve command line interface.

    Parses the arguments, analyzes either the ``--text`` value or the
    contents of ``--file``, and emits the formatted table: written to the
    ``--output`` path when one is given, otherwise sent to this module's
    logger at INFO level.

    Args:
        argv: Command line arguments without the program name. ``None`` is
            handed to argparse unchanged, which then reads ``sys.argv``.

    Returns:
        Exit code: 0 on success, 1 when a file is not found or cannot be
        decoded.
    """
    parser = argparse.ArgumentParser(
        description="Analyze minimum vocabulary needed for excerpt lengths.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Exactly one input source must be supplied.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a file to analyze",
    )

    parser.add_argument(
        "--max-length",
        "-m",
        type=int,
        default=30,
        help="Maximum excerpt length to analyze (default: 30)",
    )
    parser.add_argument(
        "--show-excerpts",
        "-e",
        action="store_true",
        help="Show the actual excerpt text for each length",
    )
    parser.add_argument(
        "--show-words",
        "-w",
        action="store_true",
        help="Show which words are needed for each excerpt",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        # Test against None rather than truthiness: `--text ""` is a valid
        # (empty) input and must not fall through to reading the absent
        # --file argument.
        text = args.text if args.text is not None else read_file(args.file)

        results = find_optimal_excerpts(
            text,
            max_length=args.max_length,
            case_sensitive=args.case_sensitive,
        )

        output = format_results(
            results,
            show_excerpts=args.show_excerpts,
            show_words=args.show_words,
        )

        if args.output:
            Path(args.output).write_text(output, encoding="utf-8")
            logger.info("Output written to %s", args.output)
        else:
            logger.info("%s", output)

    except FileNotFoundError:
        logger.exception("File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception("Could not decode file")
        return 1

    return 0


if __name__ == "__main__":
    # main() reports its table through logger.info, and Python's default
    # logging setup discards INFO records, so a plain script run would show
    # nothing. basicConfig is a no-op when the root logger already has
    # handlers, so an existing configuration is left untouched.
    logging.basicConfig(level=logging.INFO, format="%(message)s", stream=sys.stdout)
    sys.exit(main())
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`#!/usr/bin/env python3`
			`"""Vocabulary learning curve analyzer.`

			`Finds the minimum vocabulary needed to understand excerpts of increasing length.`
			`For each excerpt length (1, 2, 3, ... N words), finds the excerpt that requires`
			`the fewest top-frequency words to understand 100%.`

			`Usage:`
			`python -m python_pkg.word_frequency.vocabulary_curve --file text.txt`
			`python -m python_pkg.word_frequency.vocabulary_curve --file text.txt --max-length 50`
			`python -m python_pkg.word_frequency.vocabulary_curve --text "some text here"`
			`"""`

			`from __future__ import annotations`

			`import argparse`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`import logging`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`from pathlib import Path`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`import re`
Add pre-commit workflow and fix linting violations (#2) * Initial plan * Add pre-commit GitHub workflow and fix linting issues - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI - Fixed mypy type errors in translator.py - Fixed shellcheck warning in run_anki_generator.sh - Added per-file ignores for word_frequency module legacy code - Applied auto-fixes from ruff, ruff-format, autoflake, prettier - All pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Make Python scripts with shebangs executable - Set executable bit for word_frequency module scripts with shebangs - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Fix: Restore imports in check functions (autoflake-proof) - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect() - Used _ = module assignment to prevent autoflake from removing imports - These imports test module availability by triggering ImportError if missing - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> 2026-01-07 22:57:42 +01:00			`import sys`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`from typing import TYPE_CHECKING, NamedTuple`

			`if TYPE_CHECKING:`
			`from collections.abc import Sequence`

refactor: enforce 500-line limit on all Python source files Split 18+ Python files that exceeded 500 lines into smaller modules with helper files (prefixed with _). All functions are re-exported from the original modules to maintain backward compatibility with test patches and external imports. Files split: - moviepy_showcase.py (1212 -> 302 + 3 helpers) - anki_generator.py (1174 -> 473 + 4 helpers) - test_analyze_chess_game.py (1152 -> 361 + 2 parts) - poker_modifier_app.py (1024 -> 263 + 2 helpers) - transcribe_fw.py (1007 -> 342 + 3 helpers) - music_generator.py (1002 -> 319 + 2 helpers) - translator.py (951 -> 442 + 2 helpers) - cinema_planner.py (893 -> 369 + 2 helpers) - lichess_bot/main.py (757 -> 495 + _game_logic.py) - test_translator.py (725 -> 289 + part2 + conftest) - test_lichess_api.py (680 -> 475 + part2) - learning_pipe.py (668 -> 375 + 2 helpers) - cache.py (655 -> 360 + _cache_decks.py) - analyze_chess_game.py (632 -> 463 + _move_analysis.py) - visualize_q02.py (609 -> 371 + helper) - repo_explorer.py (602 -> 347 + 2 helpers) - keyboard_coop/main.py (515 -> 416 + _dictionary.py) - scanning.py (501 -> 314 + _enforce_loop.py) All tests pass: 144 lichess_bot (100% branch coverage), 243 others. No new lint errors introduced. 2026-03-17 22:47:42 +01:00			`from python_pkg.word_frequency.analyzer import analyze_text, read_file`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`logger = logging.getLogger(__name__)`


feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`class ExcerptAnalysis(NamedTuple):`
			`"""Analysis result for an excerpt length."""`

			`excerpt_length: int`
			`min_vocab_needed: int`
			`best_excerpt: str`
			`words_needed: list[str]`


			`def get_word_rank(word: str, ranked_words: list[str]) -> int \| None:`
			`"""Get the rank (1-indexed) of a word in the frequency list.`

			`Args:`
			`word: The word to look up.`
			`ranked_words: List of words sorted by frequency (most common first).`

			`Returns:`
			`1-indexed rank, or None if word not in list.`
			`"""`
			`try:`
			`return ranked_words.index(word) + 1`
			`except ValueError:`
			`return None`


			`def analyze_excerpt(`
			`excerpt_words: list[str],`
			`ranked_words: list[str],`
			`) -> tuple[int, list[str]]:`
			`"""Analyze how many top words are needed to understand an excerpt 100%.`

			`Args:`
			`excerpt_words: List of words in the excerpt.`
			`ranked_words: List of all words sorted by frequency (most common first).`

			`Returns:`
			`Tuple of (max_rank_needed, list_of_words_needed_sorted_by_rank).`
			`"""`
			`unique_words = set(excerpt_words)`
			`ranks: list[tuple[int, str]] = []`

			`for word in unique_words:`
			`rank = get_word_rank(word, ranked_words)`
			`if rank is not None:`
			`ranks.append((rank, word))`
			`else:`
			`# Word not in vocabulary - would need infinite learning`
fix: resolve all pre-commit hook failures after file splits - Remove all # type: ignore and # noqa comments (banned by no-noqa hook) - Add mypy --disable-error-code flags to pre-commit config for error codes previously suppressed by inline comments - Fix broken imports after ruff auto-removed re-exports: steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot - Re-add re-exports with __all__ in translator.py, screen_lock.py - Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py - Fix test failures: keyboard_coop, stockfish_analysis, tag_divider - Add per-file-ignores for PLC0415 (deferred imports) in 7 files - Mark shebang scripts as executable - Add __init__.py for generate_images and repo_explorer packages - Fix codespell, eslint, ruff-format, prettier issues - Update copilot-instructions.md with --no-verify ban 2026-03-18 22:20:05 +01:00			`return float("inf"), []`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00
			`if not ranks:`
			`return 0, []`

			`# Sort by rank`
			`ranks.sort()`
			`max_rank = ranks[-1][0]`
			`words_needed = [word for _, word in ranks]`

			`return max_rank, words_needed`


			`def find_optimal_excerpts(`
			`text: str,`
			`*,`
			`max_length: int = 30,`
			`case_sensitive: bool = False,`
			`) -> list[ExcerptAnalysis]:`
			`"""Find optimal excerpts for each length.`

			`For each excerpt length from 1 to max_length, finds the excerpt`
			`that requires the minimum number of top-frequency words to understand.`

			`Args:`
			`text: The source text to analyze.`
			`max_length: Maximum excerpt length to analyze.`
			`case_sensitive: Whether to treat words case-sensitively.`

			`Returns:`
			`List of ExcerptAnalysis for each length from 1 to max_length.`
			`"""`
			`# Get word frequencies and create ranked list`
			`word_counts = analyze_text(text, case_sensitive=case_sensitive)`
			`ranked_words = [word for word, _ in word_counts.most_common()]`

			`# Extract all words from text (preserving order)`
			`all_words = re.findall(r"\b[\w]+\b", text, re.UNICODE)`
			`if not case_sensitive:`
			`all_words = [w.lower() for w in all_words]`

			`if not all_words:`
			`return []`

			`results: list[ExcerptAnalysis] = []`

			`for length in range(1, min(max_length + 1, len(all_words) + 1)):`
			`best_vocab_needed = float("inf")`
			`best_excerpt_words: list[str] = []`
			`best_words_needed: list[str] = []`

			`# Slide window through text`
			`for start in range(len(all_words) - length + 1):`
			`excerpt_words = all_words[start : start + length]`
			`vocab_needed, words_needed = analyze_excerpt(excerpt_words, ranked_words)`

			`if vocab_needed < best_vocab_needed:`
			`best_vocab_needed = vocab_needed`
			`best_excerpt_words = excerpt_words`
			`best_words_needed = words_needed`

test: achieve 100% branch coverage across all python_pkg packages - Add comprehensive tests for all packages (3572 tests, 100% branch coverage) - Split oversized test files to stay under 500-line limit - Add per-file ruff ignores for test-appropriate suppressions - Fix _cache_decks.py to properly convert JSON lists to tuples - Add session-scoped conftest fixture for logging handler cleanup (Python 3.14) - Update ruff pre-commit hook to v0.15.2 - Add codespell ignore words for test data - Add generated output files to .gitignore 2026-03-21 17:51:36 +01:00			`if best_vocab_needed != float("inf"): # pragma: no branch`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`results.append(`
			`ExcerptAnalysis(`
			`excerpt_length=length,`
			`min_vocab_needed=int(best_vocab_needed),`
			`best_excerpt=" ".join(best_excerpt_words),`
			`words_needed=best_words_needed,`
			`)`
			`)`

			`return results`


refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`_MAX_EXCERPT_DISPLAY_LEN = 50`


feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`def format_results(`
			`results: list[ExcerptAnalysis],`
			`*,`
			`show_excerpts: bool = False,`
			`show_words: bool = False,`
			`) -> str:`
			`"""Format analysis results as a table.`

			`Args:`
			`results: List of ExcerptAnalysis results.`
			`show_excerpts: If True, show the actual excerpt text.`
			`show_words: If True, show which words are needed.`

			`Returns:`
			`Formatted string with results.`
			`"""`
			`if not results:`
			`return "No excerpts found."`

			`lines: list[str] = []`
			`lines.append("=" * 70)`
			`lines.append("VOCABULARY LEARNING CURVE")`
			`lines.append("=" * 70)`
			`lines.append("")`
			`lines.append("For each excerpt length, the minimum number of top-frequency")`
			`lines.append("words you need to learn to understand 100% of some excerpt.")`
			`lines.append("")`
			`lines.append("-" * 70)`

			`# Header`
			`if show_excerpts:`
			`lines.append(f"{'Length':>6} {'Vocab':>5} Excerpt")`
			`lines.append(f"{'------':>6} {'-----':>5} {'-------'}")`
			`else:`
			`lines.append(f"{'Length':>6} {'Vocab Needed':>12}")`
			`lines.append(f"{'------':>6} {'------------':>12}")`

			`prev_vocab = 0`
			`for r in results:`
			`# Mark increases`
			`marker = ""`
			`if r.min_vocab_needed > prev_vocab:`
			`marker = f" (+{r.min_vocab_needed - prev_vocab})"`
			`prev_vocab = r.min_vocab_needed`

			`if show_excerpts:`
			`# Truncate long excerpts`
			`excerpt = r.best_excerpt`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`if len(excerpt) > _MAX_EXCERPT_DISPLAY_LEN:`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`excerpt = excerpt[:47] + "..."`
			`lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>5} {excerpt}")`
			`else:`
			`lines.append(f"{r.excerpt_length:>6} {r.min_vocab_needed:>12}{marker}")`

			`if show_words and r.words_needed:`
			`lines.append(f" Words: {', '.join(r.words_needed)}")`

			`lines.append("-" * 70)`
			`lines.append("")`

			`# Summary statistics`
test: achieve 100% branch coverage across all python_pkg packages - Add comprehensive tests for all packages (3572 tests, 100% branch coverage) - Split oversized test files to stay under 500-line limit - Add per-file ruff ignores for test-appropriate suppressions - Fix _cache_decks.py to properly convert JSON lists to tuples - Add session-scoped conftest fixture for logging handler cleanup (Python 3.14) - Update ruff pre-commit hook to v0.15.2 - Add codespell ignore words for test data - Add generated output files to .gitignore 2026-03-21 17:51:36 +01:00			`if results: # pragma: no branch`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`final = results[-1]`
			`lines.append(f"To understand a {final.excerpt_length}-word excerpt,")`
Add pre-commit workflow and fix linting violations (#2) * Initial plan * Add pre-commit GitHub workflow and fix linting issues - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI - Fixed mypy type errors in translator.py - Fixed shellcheck warning in run_anki_generator.sh - Added per-file ignores for word_frequency module legacy code - Applied auto-fixes from ruff, ruff-format, autoflake, prettier - All pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Make Python scripts with shebangs executable - Set executable bit for word_frequency module scripts with shebangs - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Fix: Restore imports in check functions (autoflake-proof) - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect() - Used _ = module assignment to prevent autoflake from removing imports - These imports test module availability by triggering ImportError if missing - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> 2026-01-07 22:57:42 +01:00			`lines.append(`
			`f"you need to learn at minimum {final.min_vocab_needed} top words."`
			`)`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00
			`return "\n".join(lines)`


			`def main(argv: Sequence[str] \| None = None) -> int:`
			`"""Main entry point.`

			`Args:`
			`argv: Command line arguments.`

			`Returns:`
			`Exit code.`
			`"""`
			`parser = argparse.ArgumentParser(`
			`description="Analyze minimum vocabulary needed for excerpt lengths.",`
			`formatter_class=argparse.RawDescriptionHelpFormatter,`
			`epilog=__doc__,`
			`)`

			`input_group = parser.add_mutually_exclusive_group(required=True)`
			`input_group.add_argument(`
			`"--text",`
			`"-t",`
			`type=str,`
			`help="Raw text to analyze",`
			`)`
			`input_group.add_argument(`
			`"--file",`
			`"-f",`
			`type=str,`
			`help="Path to a file to analyze",`
			`)`

			`parser.add_argument(`
			`"--max-length",`
			`"-m",`
			`type=int,`
			`default=30,`
			`help="Maximum excerpt length to analyze (default: 30)",`
			`)`
			`parser.add_argument(`
			`"--show-excerpts",`
			`"-e",`
			`action="store_true",`
			`help="Show the actual excerpt text for each length",`
			`)`
			`parser.add_argument(`
			`"--show-words",`
			`"-w",`
			`action="store_true",`
			`help="Show which words are needed for each excerpt",`
			`)`
			`parser.add_argument(`
			`"--case-sensitive",`
			`"-c",`
			`action="store_true",`
			`help="Treat words case-sensitively",`
			`)`
			`parser.add_argument(`
			`"--output",`
			`"-o",`
			`type=str,`
			`help="Output file path (default: print to stdout)",`
			`)`

			`args = parser.parse_args(argv)`

			`try:`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`text = args.text or read_file(args.file)`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00
			`results = find_optimal_excerpts(`
			`text,`
			`max_length=args.max_length,`
			`case_sensitive=args.case_sensitive,`
			`)`

			`output = format_results(`
			`results,`
			`show_excerpts=args.show_excerpts,`
			`show_words=args.show_words,`
			`)`

			`if args.output:`
			`Path(args.output).write_text(output, encoding="utf-8")`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`logger.info("Output written to %s", args.output)`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`else:`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`logger.info("%s", output)`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`except FileNotFoundError:`
			`logger.exception("File not found")`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`return 1`
refactor(word_frequency): fix all ruff violations and remove noqa comments - Replace print() with logging module throughout - Add type annotations and Google docstrings to all functions - Introduce DeckInput and LessonConfig dataclasses to reduce function parameters - Use specific exception types instead of bare except (BLE001) - Remove all noqa suppression comments - Fix test fixtures: remove unused _capsys/_tmp_path parameters 2026-03-13 20:41:31 +01:00			`except UnicodeDecodeError:`
			`logger.exception("Could not decode file")`
feat: vocabulary curbe in C 2025-12-28 16:15:38 +01:00			`return 1`

			`return 0`


			`if __name__ == "__main__":`
			`sys.exit(main())`