mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 21:23:16 +02:00
- Add comprehensive tests for all packages (3572 tests, 100% branch coverage) - Split oversized test files to stay under 500-line limit - Add per-file ruff ignores for test-appropriate suppressions - Fix _cache_decks.py to properly convert JSON lists to tuples - Add session-scoped conftest fixture for logging handler cleanup (Python 3.14) - Update ruff pre-commit hook to v0.15.2 - Add codespell ignore words for test data - Add generated output files to .gitignore
347 lines
9.2 KiB
Python
Executable File
347 lines
9.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
r"""Learning pipe - combines word frequency analysis with excerpt finding.
|
|
|
|
Helps language learners by:
|
|
|
|
1. Analyzing a text to find the most common words
|
|
2. Finding excerpts where those common words are most prevalent
|
|
3. Creating a progressive learning experience in batches
|
|
|
|
The idea is to:
|
|
- Learn the top N most frequent words first
|
|
- Then read excerpts that are dense with those words
|
|
- Progressively learn more words and more complex excerpts
|
|
|
|
Usage::
|
|
|
|
# Basic usage
|
|
python -m python_pkg.word_frequency.learning_pipe \\
|
|
--file text.txt
|
|
|
|
# Custom batch size and excerpt length
|
|
python -m python_pkg.word_frequency.learning_pipe \\
|
|
--file text.txt --batch-size 30 --excerpt-length 50
|
|
|
|
# Multiple batches for progressive learning
|
|
python -m python_pkg.word_frequency.learning_pipe \\
|
|
--file text.txt --batches 5 --batch-size 20
|
|
|
|
# Output to file
|
|
python -m python_pkg.word_frequency.learning_pipe \\
|
|
--file text.txt --output lesson.txt
|
|
|
|
# Skip common words using a stopwords file
|
|
python -m python_pkg.word_frequency.learning_pipe \\
|
|
--file text.txt --stopwords stopwords.txt
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from dataclasses import replace as _replace_dc
|
|
import logging
|
|
from pathlib import Path
|
|
import sys
|
|
from typing import TYPE_CHECKING
|
|
|
|
from python_pkg.word_frequency._learning_batch import (
|
|
_detect_translation_language,
|
|
_generate_batch_section,
|
|
_LessonContext,
|
|
)
|
|
from python_pkg.word_frequency._learning_constants import (
|
|
LessonConfig,
|
|
_resolve_stopwords,
|
|
load_stopwords,
|
|
)
|
|
from python_pkg.word_frequency.analyzer import analyze_text, read_file
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Sequence
|
|
|
|
# Module-level logger; main() reports the lesson, status and errors through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def generate_learning_lesson(
    text: str,
    config: LessonConfig | None = None,
) -> str:
    """Build a progressive vocabulary lesson for the given text.

    The lesson consists of a header with corpus statistics, one section per
    vocabulary batch (most frequent words first), and a closing summary that
    reports how much of the text the learned words cover.

    Args:
        text: The source text to analyze.
        config: Lesson configuration. A default ``LessonConfig`` is used
            when None.

    Returns:
        The complete lesson as a single newline-joined string.
    """
    if config is None:
        config = LessonConfig()

    rule = "=" * 70

    all_stopwords = _resolve_stopwords(config)
    word_counts = analyze_text(text, case_sensitive=config.case_sensitive)

    # Keep frequency order while dropping stopwords, one-character tokens
    # and (optionally) purely numeric tokens.
    vocabulary: list = []
    for token, freq in word_counts.most_common():
        if token.lower() in all_stopwords or len(token) <= 1:
            continue
        if config.skip_numbers and token.isdigit():
            continue
        vocabulary.append((token, freq))

    total_words = sum(word_counts.values())

    if all_stopwords:
        vocabulary_line = (
            f"After filtering {len(all_stopwords)} "
            f"stopwords: {len(vocabulary):,} "
            "vocabulary words"
        )
    else:
        vocabulary_line = f"Vocabulary words: {len(vocabulary):,}"

    lines: list[str] = [
        rule,
        "LANGUAGE LEARNING LESSON",
        rule,
        f"Source text: {total_words:,} total words, {len(word_counts):,} unique words",
        vocabulary_line,
    ]

    # The helper receives the output list itself, so the header built so far
    # must already be in place when it is called.
    actual_from, actual_to = _detect_translation_language(text, config, lines)
    if not (actual_from is None or actual_to is None):
        lines.append(f"Translation: {actual_from} -> {actual_to}")
    lines.append("")

    # Batch sections work from a config copy carrying the resolved languages.
    ctx = _LessonContext(
        text=text,
        word_counts=word_counts,
        config=_replace_dc(
            config,
            translate_from=actual_from,
            translate_to=actual_to,
        ),
    )

    learned: list[str] = []
    for batch_num in range(config.num_batches):
        offset = batch_num * config.batch_size
        if offset >= len(vocabulary):
            # Vocabulary exhausted before the requested number of batches.
            break

        batch = vocabulary[offset : offset + config.batch_size]
        learned.extend(token for token, _ in batch)

        section = _generate_batch_section(ctx, batch_num, batch, learned)
        lines.extend(section)

    # Summary
    lines.extend([rule, "SUMMARY", rule])

    if learned:
        covered = 0
        for token in learned:
            if token in word_counts:
                covered += word_counts[token]
        coverage_pct = (covered / total_words) * 100
        lines.extend(
            [
                f"Total vocabulary words learned: {len(learned)}",
                f"Text coverage: {coverage_pct:.1f}%",
                "",
                "TIP: Focus on understanding the excerpts first, then read",
                "more of the original text as your vocabulary grows!",
            ]
        )

    return "\n".join(lines)
|
|
|
|
|
|
def main(argv: Sequence[str] | None = None) -> int:
    """Main entry point for the learning pipe.

    Parses the command line, builds a ``LessonConfig`` from the options,
    generates the lesson and either writes it to ``--output`` or emits it
    through the module logger.

    Args:
        argv: Command line arguments (defaults to sys.argv[1:]).

    Returns:
        Exit code: 0 on success, 1 when the input file is missing or cannot
        be decoded as UTF-8.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate language learning lessons from text.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input source: exactly one of --text / --file must be given.
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "--text",
        "-t",
        type=str,
        help="Raw text to analyze",
    )
    input_group.add_argument(
        "--file",
        "-f",
        type=str,
        help="Path to a text file to analyze",
    )

    # Learning parameters
    parser.add_argument(
        "--batch-size",
        "-b",
        type=int,
        default=20,
        help="Number of words per learning batch (default: 20)",
    )
    parser.add_argument(
        "--batches",
        "-n",
        type=int,
        default=1,
        help="Number of batches to generate (default: 1)",
    )
    parser.add_argument(
        "--excerpt-length",
        "-l",
        type=int,
        default=30,
        help="Length of excerpts in words (default: 30)",
    )
    parser.add_argument(
        "--excerpts-per-batch",
        "-e",
        type=int,
        default=3,
        help="Number of excerpts per batch (default: 3)",
    )

    # Filtering options
    parser.add_argument(
        "--stopwords",
        "-s",
        type=str,
        help="Path to custom stopwords file (one word per line)",
    )
    parser.add_argument(
        "--no-default-stopwords",
        action="store_true",
        help="Don't filter out default English stopwords",
    )
    parser.add_argument(
        "--case-sensitive",
        "-c",
        action="store_true",
        help="Treat words case-sensitively",
    )
    parser.add_argument(
        "--include-numbers",
        action="store_true",
        help="Include numeric words in vocabulary (filtered by default)",
    )

    # Translation options (enabled by default)
    parser.add_argument(
        "--no-translate",
        "-T",
        action="store_true",
        help="Disable translation",
    )
    parser.add_argument(
        "--translate-from",
        type=str,
        metavar="LANG",
        help=("Source language code (e.g., 'la', 'pl'). If omitted, auto-detected."),
    )
    parser.add_argument(
        "--translate-to",
        type=str,
        metavar="LANG",
        default="en",
        help="Target language code (default: 'en')",
    )

    # Output options
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="Output file path (default: print to stdout)",
    )

    args = parser.parse_args(argv)

    try:
        # Test for None rather than truthiness: an explicitly empty
        # `--text ""` is a valid (if empty) input and must not fall through
        # to read_file(None), since --file is unset in that case.
        text = args.text if args.text is not None else read_file(args.file)

        # Load custom stopwords if provided
        custom_stopwords = load_stopwords(args.stopwords)

        # Determine translation settings; both stay None when translation
        # is disabled, "auto" requests source-language detection.
        translate_from: str | None = None
        translate_to: str | None = None

        if not args.no_translate:
            translate_from = args.translate_from or "auto"
            translate_to = args.translate_to

        config = LessonConfig(
            batch_size=args.batch_size,
            num_batches=args.batches,
            excerpt_length=args.excerpt_length,
            excerpts_per_batch=args.excerpts_per_batch,
            stopwords=custom_stopwords,
            skip_default_stopwords=args.no_default_stopwords,
            # The CLI flag is opt-in ("include"), the config field opt-out.
            skip_numbers=not args.include_numbers,
            case_sensitive=args.case_sensitive,
            translate_from=translate_from,
            translate_to=translate_to,
        )
        lesson = generate_learning_lesson(text, config)

        # Output
        if args.output:
            Path(args.output).write_text(
                lesson,
                encoding="utf-8",
            )
            logger.info(
                "Lesson written to %s",
                args.output,
            )
        else:
            # NOTE(review): the lesson is emitted via logging, so it is only
            # visible if logging is configured at INFO level elsewhere —
            # confirm, since the --output help text promises stdout.
            logger.info(lesson)

    except FileNotFoundError:
        logger.exception("Error: File not found")
        return 1
    except UnicodeDecodeError:
        logger.exception(
            "Error: Could not decode file as UTF-8",
        )
        return 1

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI and hand its return value to the OS as the exit status.
    exit_code = main()
    sys.exit(exit_code)
|