testsAndMisc/python_pkg/word_frequency/learning_pipe.py
Krzysztof kuhy Rudnicki 0460f3fac6 refactor(word_frequency): fix all ruff violations and remove noqa comments
- Replace print() with logging module throughout
- Add type annotations and Google docstrings to all functions
- Introduce DeckInput and LessonConfig dataclasses to reduce function parameters
- Use specific exception types instead of bare except (BLE001)
- Remove all noqa suppression comments
- Fix test fixtures: remove unused _capsys/_tmp_path parameters
2026-03-13 20:41:31 +01:00

669 lines
17 KiB
Python
Executable File

#!/usr/bin/env python3
r"""Learning pipe - combines word frequency analysis with excerpt finding.

Helps language learners by:

1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches

The idea is to:

- Learn the top N most frequent words first
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts

Usage::

    # Basic usage
    python -m python_pkg.word_frequency.learning_pipe \
        --file text.txt

    # Custom batch size and excerpt length
    python -m python_pkg.word_frequency.learning_pipe \
        --file text.txt --batch-size 30 --excerpt-length 50

    # Multiple batches for progressive learning
    python -m python_pkg.word_frequency.learning_pipe \
        --file text.txt --batches 5 --batch-size 20

    # Output to file
    python -m python_pkg.word_frequency.learning_pipe \
        --file text.txt --output lesson.txt

    # Add extra stopwords from a file (one word per line)
    python -m python_pkg.word_frequency.learning_pipe \
        --file text.txt --stopwords stopwords.txt
"""
from __future__ import annotations
import argparse
from dataclasses import dataclass
from dataclasses import replace as _replace_dc
import logging
from pathlib import Path
import sys
from typing import TYPE_CHECKING
try:
from python_pkg.word_frequency.analyzer import analyze_text, read_file
from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
from python_pkg.word_frequency.translator import (
detect_language,
translate_words_batch,
)
except ModuleNotFoundError:
from analyzer import analyze_text, read_file # type: ignore[import-not-found]
from excerpt_finder import find_best_excerpt # type: ignore[import-not-found]
from translator import ( # type: ignore[import-not-found]
detect_language,
translate_words_batch,
)
if TYPE_CHECKING:
from collections.abc import Sequence
logger = logging.getLogger(__name__)
# Built-in English stopwords. Custom stopwords (--stopwords) are added on top
# of these unless the defaults are switched off.
DEFAULT_STOPWORDS_EN = frozenset(
    (
        # articles, conjunctions, prepositions
        "the a an and or but in on at to for of with by from "
        # auxiliary verbs
        "is are was were be been being have has had do does did "
        "will would could should may might must shall can "
        # demonstratives and pronouns
        "this that these those "
        "i you he she it we they me him her us them "
        "my your his its our their "
        # question words
        "what which who whom whose where when why how "
        # quantifiers and modifiers
        "all each every both few more most other some such "
        "no nor not only own same so than too very just "
        # subordinators
        "as if then because while although though after before"
    ).split()
)
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
    """Load custom stopwords from a file (one word per line).

    Every line is stripped of surrounding whitespace and lower-cased; blank
    lines are ignored. A path that is missing is treated as "no custom
    stopwords" rather than as an error.

    Args:
        filepath: Path to a UTF-8 stopwords file, or None when no custom
            stopwords file was given.

    Returns:
        Frozenset of the stopwords read, or an empty frozenset when
        ``filepath`` is None or does not refer to a regular file.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    if filepath is None:
        return frozenset()

    path = Path(filepath)
    # is_file() instead of exists(): a directory passes exists() and would
    # then make read_text() raise instead of yielding "no stopwords".
    if not path.is_file():
        return frozenset()

    # utf-8-sig decodes plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise become part of the first stopword.
    content = path.read_text(encoding="utf-8-sig")
    return frozenset(
        entry.lower() for entry in map(str.strip, content.splitlines()) if entry
    )
@dataclass(frozen=True)
class LessonConfig:
    """Immutable settings that control how a learning lesson is built.

    Attributes:
        batch_size: Number of vocabulary words taken for each batch.
        num_batches: Upper bound on the number of batches; fewer are
            produced when the vocabulary runs out first.
        excerpt_length: Excerpt length handed to the excerpt finder.
        excerpts_per_batch: Number of excerpts requested for each batch.
        stopwords: Extra stopwords to exclude, or None for no extras.
        skip_default_stopwords: When True, only ``stopwords`` is applied and
            the built-in English list is left out.
        skip_numbers: When True, words made up solely of digits are dropped
            from the vocabulary.
        case_sensitive: Forwarded to text analysis and excerpt search.
        translate_from: Source language code; ``"auto"`` requests language
            detection. None together with ``translate_to`` being None means
            no translation.
        translate_to: Target language code; ``"en"`` is assumed when this is
            unset but a source language is given.
    """

    batch_size: int = 20
    num_batches: int = 1
    excerpt_length: int = 30
    excerpts_per_batch: int = 3
    stopwords: frozenset[str] | None = None
    skip_default_stopwords: bool = False
    skip_numbers: bool = True
    case_sensitive: bool = False
    translate_from: str | None = None
    translate_to: str | None = None
def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
    """Return the effective stopword set for a lesson.

    Args:
        config: Lesson configuration supplying the custom stopwords and the
            flag that switches the built-in list off.

    Returns:
        The custom stopwords alone when the defaults are skipped, otherwise
        the union of the built-in English list and the custom stopwords.
    """
    custom = config.stopwords or frozenset()
    if config.skip_default_stopwords:
        return custom
    return DEFAULT_STOPWORDS_EN | custom
def _detect_translation_language(
    text: str,
    config: LessonConfig,
    lines: list[str],
) -> tuple[str | None, str | None]:
    """Work out the (source, target) language pair for translation.

    Detection runs when the source is ``"auto"``, or when a target is set
    without any source. The outcome of detection is reported by appending a
    message to ``lines``.

    Args:
        text: Text handed to the language detector.
        config: Lesson configuration holding the requested languages.
        lines: Output lines; a detection notice or warning is appended here.

    Returns:
        Tuple of source and target language. The source is None when it was
        neither given nor detected; the target falls back to ``"en"``.
    """
    source_lang = config.translate_from
    target_lang = config.translate_to if config.translate_to else "en"

    target_without_source = bool(config.translate_to) and not config.translate_from
    if source_lang != "auto" and not target_without_source:
        # Explicit source (or no translation requested): nothing to detect.
        return source_lang, target_lang

    detected = detect_language(text)
    if not detected:
        lines.append(
            "Warning: Could not detect language "
            "(install langdetect: pip install langdetect)"
        )
        return None, target_lang

    lines.append(f"Detected language: {detected}")
    return detected, target_lang
def _format_word_list(
    batch_words: list[tuple[str, int]],
    start_idx: int,
    total_words: int,
    translations: dict[str, str],
) -> list[str]:
    """Render one numbered line per vocabulary word of a batch.

    Args:
        batch_words: ``(word, count)`` pairs to list.
        start_idx: Zero-based rank of the first word; numbering starts at
            ``start_idx + 1``.
        total_words: Total word count used as the percentage denominator.
        translations: Word-to-translation mapping. When non-empty, a
            translation column is shown and missing entries appear as ``?``.

    Returns:
        The formatted lines, in the order of ``batch_words``.
    """
    formatted: list[str] = []
    rank = start_idx
    for word, count in batch_words:
        rank += 1
        share = (count / total_words) * 100
        entry = f" {rank:3}. {word:<20}"
        if translations:
            meaning = translations.get(word, "?")
            entry += f" -> {meaning:<20}"
        formatted.append(f"{entry} ({count:,} occurrences, {share:.2f}%)")
    return formatted
@dataclass(frozen=True)
class _LessonContext:
    """Read-only bundle of the inputs every batch section needs.

    Attributes:
        text: The full source text that excerpts are searched in.
        word_counts: Occurrence count for each word of the text.
        config: Lesson configuration used for the batches.
    """

    text: str
    word_counts: dict[str, int]
    config: LessonConfig
def _generate_batch_section(
    ctx: _LessonContext,
    batch_num: int,
    batch_words: list[tuple[str, int]],
    cumulative_words: list[str],
) -> list[str]:
    """Build the output lines for one batch of the lesson.

    The section consists of a header, the vocabulary list (with translations
    when both languages are set on the config), the text coverage reached
    with all words learned so far, and the practice excerpts.

    Args:
        ctx: Shared lesson context (text, word counts, configuration).
        batch_num: Zero-based index of the batch.
        batch_words: ``(word, count)`` pairs introduced by this batch.
        cumulative_words: Every word learned up to and including this batch.

    Returns:
        The lines of the batch section.
    """
    cfg = ctx.config
    counts = ctx.word_counts
    grand_total = sum(counts.values())
    first = batch_num * cfg.batch_size
    # The last batch may be shorter than batch_size.
    last = min(first + cfg.batch_size, first + len(batch_words))
    rule = "-" * 70

    section: list[str] = [
        rule,
        f"BATCH {batch_num + 1}: Words {first + 1} - {last}",
        rule,
        "",
    ]

    # Translations are only fetched when both languages are known.
    glosses: dict[str, str] = {}
    if cfg.translate_from is not None and cfg.translate_to is not None:
        results = translate_words_batch(
            [word for word, _ in batch_words],
            cfg.translate_from,
            cfg.translate_to,
        )
        glosses = {
            result.source_word: result.translated_word
            for result in results
            if result.success
        }

    section += ["VOCABULARY TO LEARN:", ""]
    section += _format_word_list(batch_words, first, grand_total, glosses)
    section.append("")

    # Share of the whole text covered by all words learned so far.
    known_occurrences = sum(
        counts[word] for word in cumulative_words if word in counts
    )
    coverage = (known_occurrences / grand_total) * 100
    section += [
        f"After learning these words, you'll recognize ~{coverage:.1f}% of the text",
        "",
        "PRACTICE EXCERPTS:",
        "(Excerpts where your learned vocabulary is most concentrated)",
        "",
    ]

    best_excerpts = find_best_excerpt(
        ctx.text,
        cumulative_words,
        cfg.excerpt_length,
        case_sensitive=cfg.case_sensitive,
        top_n=cfg.excerpts_per_batch,
    )
    for number, found in enumerate(best_excerpts, start=1):
        section += [
            f" Excerpt {number} ({found.match_percentage:.1f}% known words):",
            f' "{found.excerpt}"',
            "",
        ]
    return section
def generate_learning_lesson(
    text: str,
    config: LessonConfig | None = None,
) -> str:
    """Generate a learning lesson from text.

    The text is analyzed for word frequencies, stopwords and other unwanted
    words are filtered out, and the remaining vocabulary is presented in
    frequency-ordered batches followed by a summary.

    Args:
        text: The source text to analyze.
        config: Lesson configuration. Uses defaults if None.

    Returns:
        Formatted learning lesson as a string.
    """
    cfg = LessonConfig() if config is None else config
    stopwords = _resolve_stopwords(cfg)
    counts = analyze_text(text, case_sensitive=cfg.case_sensitive)

    def _is_vocabulary(word: str) -> bool:
        """Tell whether a word survives stopword, length and number filters."""
        if word.lower() in stopwords or len(word) <= 1:
            return False
        return not (cfg.skip_numbers and word.isdigit())

    vocabulary = [
        (word, count) for word, count in counts.most_common() if _is_vocabulary(word)
    ]
    grand_total = sum(counts.values())
    banner = "=" * 70

    report: list[str] = [
        banner,
        "LANGUAGE LEARNING LESSON",
        banner,
        f"Source text: {grand_total:,} total words, {len(counts):,} unique words",
    ]
    if stopwords:
        report.append(
            f"After filtering {len(stopwords)} stopwords: "
            f"{len(vocabulary):,} vocabulary words"
        )
    else:
        report.append(f"Vocabulary words: {len(vocabulary):,}")

    source_lang, target_lang = _detect_translation_language(text, cfg, report)
    if source_lang is not None and target_lang is not None:
        report.append(f"Translation: {source_lang} -> {target_lang}")
    report.append("")

    # Batches see the resolved languages, not the raw "auto"/None request.
    ctx = _LessonContext(
        text=text,
        word_counts=counts,
        config=_replace_dc(
            cfg,
            translate_from=source_lang,
            translate_to=target_lang,
        ),
    )

    learned: list[str] = []
    size = cfg.batch_size
    for batch_num in range(cfg.num_batches):
        offset = batch_num * size
        if offset >= len(vocabulary):
            # Vocabulary exhausted before the requested number of batches.
            break
        batch = vocabulary[offset : offset + size]
        learned.extend(word for word, _ in batch)
        report.extend(_generate_batch_section(ctx, batch_num, batch, learned))

    report += [banner, "SUMMARY", banner]
    if learned:
        covered = sum(counts[word] for word in learned if word in counts)
        covered_pct = (covered / grand_total) * 100
        report.append(f"Total vocabulary words learned: {len(learned)}")
        report.append(f"Text coverage: {covered_pct:.1f}%")
    report += [
        "",
        "TIP: Focus on understanding the excerpts first, then read",
        "more of the original text as your vocabulary grows!",
    ]
    return "\n".join(report)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the learning pipe."""
    parser = argparse.ArgumentParser(
        description="Generate language learning lessons from text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source: exactly one of --text / --file is required.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a text file to analyze",
    )

    # Learning parameters
    parser.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=20,
        help="Number of words per learning batch (default: 20)",
    )
    parser.add_argument(
        "--batches",
        "-n",
        type=int,
        default=1,
        help="Number of batches to generate (default: 1)",
    )
    parser.add_argument(
        "--excerpt-length",
        "-l",
        type=int,
        default=30,
        help="Length of excerpts in words (default: 30)",
    )
    parser.add_argument(
        "--excerpts-per-batch",
        "-e",
        type=int,
        default=3,
        help="Number of excerpts per batch (default: 3)",
    )

    # Filtering options
    parser.add_argument(
        "--stopwords",
        "-s",
        type=str,
        help="Path to custom stopwords file (one word per line)",
    )
    parser.add_argument(
        "--no-default-stopwords",
        action="store_true",
        help="Don't filter out default English stopwords",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--include-numbers",
        action="store_true",
        help="Include numeric words in vocabulary (filtered by default)",
    )

    # Translation options (enabled by default)
    parser.add_argument(
        "--no-translate",
        "-T",
        action="store_true",
        help="Disable translation",
    )
    parser.add_argument(
        "--translate-from",
        type=str,
        metavar="LANG",
        help=(
            "Source language code (e.g., 'la', 'pl'). "
            "If omitted, auto-detected."
        ),
    )
    parser.add_argument(
        "--translate-to",
        type=str,
        metavar="LANG",
        default="en",
        help="Target language code (default: 'en')",
    )

    # Output options
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )
    return parser


def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the learning pipe.

    Parses the command line, builds the lesson and either writes it to the
    ``--output`` file or to standard output.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code: 0 for success, 1 when an input file is missing or cannot
        be decoded as UTF-8.
    """
    args = _build_arg_parser().parse_args(argv)

    try:
        # Compare against None: an explicitly empty --text "" must be
        # analyzed as given instead of falling through to read_file(None).
        text = args.text if args.text is not None else read_file(args.file)

        # Load custom stopwords if provided
        custom_stopwords = load_stopwords(args.stopwords)

        # Translation is on by default; without an explicit source language
        # the lesson generator is asked to auto-detect it.
        translate_from: str | None = None
        translate_to: str | None = None
        if not args.no_translate:
            translate_from = args.translate_from or "auto"
            translate_to = args.translate_to

        config = LessonConfig(
            batch_size=args.batch_size,
            num_batches=args.batches,
            excerpt_length=args.excerpt_length,
            excerpts_per_batch=args.excerpts_per_batch,
            stopwords=custom_stopwords,
            skip_default_stopwords=args.no_default_stopwords,
            skip_numbers=not args.include_numbers,
            case_sensitive=args.case_sensitive,
            translate_from=translate_from,
            translate_to=translate_to,
        )
        lesson = generate_learning_lesson(text, config)

        if args.output:
            Path(args.output).write_text(lesson, encoding="utf-8")
            logger.info("Lesson written to %s", args.output)
        else:
            # The lesson is the program's result, not a diagnostic. An INFO
            # log record is dropped unless the caller configured logging, so
            # the lesson is written to stdout directly, as --help promises.
            sys.stdout.write(f"{lesson}\n")
    except FileNotFoundError:
        logger.exception("Error: File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception("Error: Could not decode file as UTF-8")
        return 1
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())