testsAndMisc/python_pkg/word_frequency/learning_pipe.py
copilot-swe-agent[bot] 840a712bb4 Add pre-commit GitHub workflow and fix linting issues
- Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI
- Fixed mypy type errors in translator.py
- Fixed shellcheck warning in run_anki_generator.sh
- Added per-file ignores for word_frequency module legacy code
- Applied auto-fixes from ruff, ruff-format, autoflake, prettier
- All pre-commit hooks now passing

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
2026-01-07 21:32:45 +00:00

551 lines
16 KiB
Python

#!/usr/bin/env python3
"""Learning pipe - combines word frequency analysis with excerpt finding for language learning.
This script helps language learners by:
1. Analyzing a text to find the most common words
2. Finding excerpts where those common words are most prevalent
3. Creating a progressive learning experience in batches
The idea is to:
- Learn the top N most frequent words first
- Then read excerpts that are dense with those words
- Progressively learn more words and more complex excerpts
Usage:
# Basic usage - get top 20 words and find excerpts with them
python -m python_pkg.word_frequency.learning_pipe --file text.txt
# Custom batch size and excerpt length
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batch-size 30 --excerpt-length 50
# Multiple batches for progressive learning
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batches 5 --batch-size 20
# Output to file
python -m python_pkg.word_frequency.learning_pipe --file text.txt --output lesson.txt
# Skip common words (like "the", "a", "is") using a stopwords file
python -m python_pkg.word_frequency.learning_pipe --file text.txt --stopwords stopwords.txt
"""
from __future__ import annotations
import argparse
from pathlib import Path
import sys
from typing import TYPE_CHECKING
try:
from python_pkg.word_frequency.analyzer import analyze_text, read_file
from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
from python_pkg.word_frequency.translator import (
detect_language,
translate_words_batch,
)
except ModuleNotFoundError:
from analyzer import analyze_text, read_file # type: ignore[import-not-found]
from excerpt_finder import find_best_excerpt # type: ignore[import-not-found]
from translator import ( # type: ignore[import-not-found]
detect_language,
translate_words_batch,
)
if TYPE_CHECKING:
from collections.abc import Sequence
# Common stopwords for various languages (can be overridden with --stopwords)
DEFAULT_STOPWORDS_EN = frozenset(
{
"the",
"a",
"an",
"and",
"or",
"but",
"in",
"on",
"at",
"to",
"for",
"of",
"with",
"by",
"from",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"must",
"shall",
"can",
"this",
"that",
"these",
"those",
"i",
"you",
"he",
"she",
"it",
"we",
"they",
"me",
"him",
"her",
"us",
"them",
"my",
"your",
"his",
"its",
"our",
"their",
"what",
"which",
"who",
"whom",
"whose",
"where",
"when",
"why",
"how",
"all",
"each",
"every",
"both",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"just",
"as",
"if",
"then",
"because",
"while",
"although",
"though",
"after",
"before",
}
)
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
"""Load stopwords from a file (one word per line).
Args:
filepath: Path to stopwords file, or None to use defaults.
Returns:
Frozenset of stopwords.
"""
if filepath is None:
return frozenset()
path = Path(filepath)
if not path.exists():
return frozenset()
content = path.read_text(encoding="utf-8")
return frozenset(
word.strip().lower() for word in content.splitlines() if word.strip()
)
def generate_learning_lesson(
text: str,
*,
batch_size: int = 20,
num_batches: int = 1,
excerpt_length: int = 30,
excerpts_per_batch: int = 3,
stopwords: frozenset[str] | None = None,
skip_default_stopwords: bool = False,
skip_numbers: bool = True,
case_sensitive: bool = False,
context_words: int = 5,
translate_from: str | None = None,
translate_to: str | None = None,
) -> str:
"""Generate a learning lesson from text.
Args:
text: The source text to analyze.
batch_size: Number of words per learning batch.
num_batches: Number of batches to generate.
excerpt_length: Length of each excerpt in words.
excerpts_per_batch: Number of excerpts to find per batch.
stopwords: Custom stopwords to skip (in addition to defaults).
skip_default_stopwords: If True, don't filter out default English stopwords.
skip_numbers: If True, filter out numeric words (default: True).
case_sensitive: If True, treat words case-sensitively.
context_words: Words of context to include around excerpts.
translate_from: Source language code for translation (e.g., 'la', 'pl').
translate_to: Target language code for translation (e.g., 'en').
Returns:
Formatted learning lesson as a string.
"""
# Combine stopwords
all_stopwords: frozenset[str]
if skip_default_stopwords:
all_stopwords = stopwords or frozenset()
else:
all_stopwords = DEFAULT_STOPWORDS_EN | (stopwords or frozenset())
# Analyze text for word frequencies
word_counts = analyze_text(text, case_sensitive=case_sensitive)
# Filter out stopwords and get sorted words
filtered_words = [
(word, count)
for word, count in word_counts.most_common()
if word.lower() not in all_stopwords
and len(word) > 1
and not (skip_numbers and word.isdigit())
]
total_words = sum(word_counts.values())
lines: list[str] = []
lines.append("=" * 70)
lines.append("LANGUAGE LEARNING LESSON")
lines.append("=" * 70)
lines.append(
f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
)
if all_stopwords:
lines.append(
f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
)
else:
lines.append(f"Vocabulary words: {len(filtered_words):,}")
# Handle translation setup
actual_translate_from = translate_from
actual_translate_to = translate_to or "en" # Default to English
# Auto-detect language if translation is enabled but source not specified
if translate_from == "auto" or (translate_to and not translate_from):
detected = detect_language(text)
if detected:
actual_translate_from = detected
lines.append(f"Detected language: {detected}")
# Note: langdetect doesn't support Latin (often detected as Italian)
# If detection seems wrong, use --translate-from to override
else:
lines.append(
"Warning: Could not detect language "
"(install langdetect: pip install langdetect)"
)
actual_translate_from = None
do_translate = actual_translate_from is not None and actual_translate_to is not None
if do_translate:
lines.append(f"Translation: {actual_translate_from} -> {actual_translate_to}")
lines.append("")
# Generate batches
cumulative_words: list[str] = []
for batch_num in range(num_batches):
start_idx = batch_num * batch_size
end_idx = start_idx + batch_size
if start_idx >= len(filtered_words):
break
batch_words = filtered_words[start_idx:end_idx]
cumulative_words.extend(word for word, _ in batch_words)
lines.append("-" * 70)
lines.append(
f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
)
lines.append("-" * 70)
lines.append("")
# Get translations if requested
translations: dict[str, str] = {}
if do_translate:
words_to_translate = [word for word, _ in batch_words]
translation_results = translate_words_batch(
words_to_translate,
actual_translate_from, # type: ignore[arg-type]
actual_translate_to, # type: ignore[arg-type]
)
translations = {
r.source_word: r.translated_word
for r in translation_results
if r.success
}
# Word list with frequencies
lines.append("VOCABULARY TO LEARN:")
lines.append("")
if do_translate and translations:
# Include translations in output
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
percentage = (count / total_words) * 100
trans = translations.get(word, "?")
lines.append(
f" {i:3}. {word:<20} -> {trans:<20} ({count:,} occurrences, {percentage:.2f}%)"
)
else:
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
percentage = (count / total_words) * 100
lines.append(
f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
)
lines.append("")
# Calculate cumulative coverage
cumulative_count = sum(
word_counts[word] for word in cumulative_words if word in word_counts
)
coverage = (cumulative_count / total_words) * 100
lines.append(
f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
)
lines.append("")
# Find excerpts using cumulative words
lines.append("PRACTICE EXCERPTS:")
lines.append("(Excerpts where your learned vocabulary is most concentrated)")
lines.append("")
excerpts = find_best_excerpt(
text,
cumulative_words,
excerpt_length,
case_sensitive=case_sensitive,
top_n=excerpts_per_batch,
)
for j, excerpt in enumerate(excerpts, 1):
lines.append(
f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
)
lines.append(f' "{excerpt.excerpt}"')
lines.append("")
# Summary
lines.append("=" * 70)
lines.append("SUMMARY")
lines.append("=" * 70)
if cumulative_words:
final_coverage = sum(
word_counts[word] for word in cumulative_words if word in word_counts
)
final_percentage = (final_coverage / total_words) * 100
lines.append(f"Total vocabulary words learned: {len(cumulative_words)}")
lines.append(f"Text coverage: {final_percentage:.1f}%")
lines.append("")
lines.append("TIP: Focus on understanding the excerpts first, then read")
lines.append("more of the original text as your vocabulary grows!")
return "\n".join(lines)
def main(argv: Sequence[str] | None = None) -> int:
"""Main entry point for the learning pipe.
Args:
argv: Command line arguments (defaults to sys.argv[1:]).
Returns:
Exit code (0 for success, non-zero for errors).
"""
parser = argparse.ArgumentParser(
description="Generate language learning lessons from text.",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
# Input source
input_group = parser.add_mutually_exclusive_group(required=True)
input_group.add_argument(
"--text",
"-t",
type=str,
help="Raw text to analyze",
)
input_group.add_argument(
"--file",
"-f",
type=str,
help="Path to a text file to analyze",
)
# Learning parameters
parser.add_argument(
"--batch-size",
"-b",
type=int,
default=20,
help="Number of words per learning batch (default: 20)",
)
parser.add_argument(
"--batches",
"-n",
type=int,
default=1,
help="Number of batches to generate (default: 1)",
)
parser.add_argument(
"--excerpt-length",
"-l",
type=int,
default=30,
help="Length of excerpts in words (default: 30)",
)
parser.add_argument(
"--excerpts-per-batch",
"-e",
type=int,
default=3,
help="Number of excerpts per batch (default: 3)",
)
# Filtering options
parser.add_argument(
"--stopwords",
"-s",
type=str,
help="Path to custom stopwords file (one word per line)",
)
parser.add_argument(
"--no-default-stopwords",
action="store_true",
help="Don't filter out default English stopwords",
)
parser.add_argument(
"--case-sensitive",
"-c",
action="store_true",
help="Treat words case-sensitively",
)
parser.add_argument(
"--include-numbers",
action="store_true",
help="Include numeric words in vocabulary (filtered by default)",
)
# Translation options (enabled by default)
parser.add_argument(
"--no-translate",
"-T",
action="store_true",
help="Disable translation",
)
parser.add_argument(
"--translate-from",
type=str,
metavar="LANG",
help="Source language code (e.g., 'la', 'pl', 'de'). If omitted, auto-detected.",
)
parser.add_argument(
"--translate-to",
type=str,
metavar="LANG",
default="en",
help="Target language code (default: 'en')",
)
# Output options
parser.add_argument(
"--output",
"-o",
type=str,
help="Output file path (default: print to stdout)",
)
args = parser.parse_args(argv)
try:
# Get input text
if args.text:
text = args.text
else:
text = read_file(args.file)
# Load custom stopwords if provided
custom_stopwords = load_stopwords(args.stopwords)
# Determine translation settings
# Translation enabled by default, --no-translate disables it
translate_from: str | None = None
translate_to: str | None = None
if not args.no_translate:
translate_from = args.translate_from or "auto" # "auto" triggers detection
translate_to = args.translate_to
# Generate lesson
lesson = generate_learning_lesson(
text,
batch_size=args.batch_size,
num_batches=args.batches,
excerpt_length=args.excerpt_length,
excerpts_per_batch=args.excerpts_per_batch,
stopwords=custom_stopwords,
skip_default_stopwords=args.no_default_stopwords,
skip_numbers=not args.include_numbers,
case_sensitive=args.case_sensitive,
translate_from=translate_from,
translate_to=translate_to,
)
# Output
if args.output:
Path(args.output).write_text(lesson, encoding="utf-8")
print(f"Lesson written to {args.output}")
else:
print(lesson)
except FileNotFoundError as e:
print(f"Error: File not found - {e}", file=sys.stderr)
return 1
except UnicodeDecodeError as e:
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
return 1
return 0
if __name__ == "__main__":
sys.exit(main())