mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 17:23:09 +02:00
* Initial plan * Add pre-commit GitHub workflow and fix linting issues - Created .github/workflows/pre-commit.yml to run pre-commit hooks in CI - Fixed mypy type errors in translator.py - Fixed shellcheck warning in run_anki_generator.sh - Added per-file ignores for word_frequency module legacy code - Applied auto-fixes from ruff, ruff-format, autoflake, prettier - All pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Make Python scripts with shebangs executable - Set executable bit for word_frequency module scripts with shebangs - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> * Fix: Restore imports in check functions (autoflake-proof) - Restored imports in _check_argos(), _check_deep_translator(), _check_langdetect() - Used _ = module assignment to prevent autoflake from removing imports - These imports test module availability by triggering ImportError if missing - All 30 pre-commit hooks now passing Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
551 lines
16 KiB
Python
Executable File
551 lines
16 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Learning pipe - combines word frequency analysis with excerpt finding for language learning.
|
|
|
|
This script helps language learners by:
|
|
1. Analyzing a text to find the most common words
|
|
2. Finding excerpts where those common words are most prevalent
|
|
3. Creating a progressive learning experience in batches
|
|
|
|
The idea is to:
|
|
- Learn the top N most frequent words first
|
|
- Then read excerpts that are dense with those words
|
|
- Progressively learn more words and more complex excerpts
|
|
|
|
Usage:
|
|
# Basic usage - get top 20 words and find excerpts with them
|
|
python -m python_pkg.word_frequency.learning_pipe --file text.txt
|
|
|
|
# Custom batch size and excerpt length
|
|
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batch-size 30 --excerpt-length 50
|
|
|
|
# Multiple batches for progressive learning
|
|
python -m python_pkg.word_frequency.learning_pipe --file text.txt --batches 5 --batch-size 20
|
|
|
|
# Output to file
|
|
python -m python_pkg.word_frequency.learning_pipe --file text.txt --output lesson.txt
|
|
|
|
# Skip common words (like "the", "a", "is") using a stopwords file
|
|
python -m python_pkg.word_frequency.learning_pipe --file text.txt --stopwords stopwords.txt
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
import sys
|
|
from typing import TYPE_CHECKING
|
|
|
|
try:
|
|
from python_pkg.word_frequency.analyzer import analyze_text, read_file
|
|
from python_pkg.word_frequency.excerpt_finder import find_best_excerpt
|
|
from python_pkg.word_frequency.translator import (
|
|
detect_language,
|
|
translate_words_batch,
|
|
)
|
|
except ModuleNotFoundError:
|
|
from analyzer import analyze_text, read_file # type: ignore[import-not-found]
|
|
from excerpt_finder import find_best_excerpt # type: ignore[import-not-found]
|
|
from translator import ( # type: ignore[import-not-found]
|
|
detect_language,
|
|
translate_words_batch,
|
|
)
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Sequence
|
|
|
|
|
|
# Common stopwords for various languages (can be overridden with --stopwords)
|
|
DEFAULT_STOPWORDS_EN = frozenset(
|
|
{
|
|
"the",
|
|
"a",
|
|
"an",
|
|
"and",
|
|
"or",
|
|
"but",
|
|
"in",
|
|
"on",
|
|
"at",
|
|
"to",
|
|
"for",
|
|
"of",
|
|
"with",
|
|
"by",
|
|
"from",
|
|
"is",
|
|
"are",
|
|
"was",
|
|
"were",
|
|
"be",
|
|
"been",
|
|
"being",
|
|
"have",
|
|
"has",
|
|
"had",
|
|
"do",
|
|
"does",
|
|
"did",
|
|
"will",
|
|
"would",
|
|
"could",
|
|
"should",
|
|
"may",
|
|
"might",
|
|
"must",
|
|
"shall",
|
|
"can",
|
|
"this",
|
|
"that",
|
|
"these",
|
|
"those",
|
|
"i",
|
|
"you",
|
|
"he",
|
|
"she",
|
|
"it",
|
|
"we",
|
|
"they",
|
|
"me",
|
|
"him",
|
|
"her",
|
|
"us",
|
|
"them",
|
|
"my",
|
|
"your",
|
|
"his",
|
|
"its",
|
|
"our",
|
|
"their",
|
|
"what",
|
|
"which",
|
|
"who",
|
|
"whom",
|
|
"whose",
|
|
"where",
|
|
"when",
|
|
"why",
|
|
"how",
|
|
"all",
|
|
"each",
|
|
"every",
|
|
"both",
|
|
"few",
|
|
"more",
|
|
"most",
|
|
"other",
|
|
"some",
|
|
"such",
|
|
"no",
|
|
"nor",
|
|
"not",
|
|
"only",
|
|
"own",
|
|
"same",
|
|
"so",
|
|
"than",
|
|
"too",
|
|
"very",
|
|
"just",
|
|
"as",
|
|
"if",
|
|
"then",
|
|
"because",
|
|
"while",
|
|
"although",
|
|
"though",
|
|
"after",
|
|
"before",
|
|
}
|
|
)
|
|
|
|
|
|
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
|
|
"""Load stopwords from a file (one word per line).
|
|
|
|
Args:
|
|
filepath: Path to stopwords file, or None to use defaults.
|
|
|
|
Returns:
|
|
Frozenset of stopwords.
|
|
"""
|
|
if filepath is None:
|
|
return frozenset()
|
|
|
|
path = Path(filepath)
|
|
if not path.exists():
|
|
return frozenset()
|
|
|
|
content = path.read_text(encoding="utf-8")
|
|
return frozenset(
|
|
word.strip().lower() for word in content.splitlines() if word.strip()
|
|
)
|
|
|
|
|
|
def generate_learning_lesson(
|
|
text: str,
|
|
*,
|
|
batch_size: int = 20,
|
|
num_batches: int = 1,
|
|
excerpt_length: int = 30,
|
|
excerpts_per_batch: int = 3,
|
|
stopwords: frozenset[str] | None = None,
|
|
skip_default_stopwords: bool = False,
|
|
skip_numbers: bool = True,
|
|
case_sensitive: bool = False,
|
|
context_words: int = 5,
|
|
translate_from: str | None = None,
|
|
translate_to: str | None = None,
|
|
) -> str:
|
|
"""Generate a learning lesson from text.
|
|
|
|
Args:
|
|
text: The source text to analyze.
|
|
batch_size: Number of words per learning batch.
|
|
num_batches: Number of batches to generate.
|
|
excerpt_length: Length of each excerpt in words.
|
|
excerpts_per_batch: Number of excerpts to find per batch.
|
|
stopwords: Custom stopwords to skip (in addition to defaults).
|
|
skip_default_stopwords: If True, don't filter out default English stopwords.
|
|
skip_numbers: If True, filter out numeric words (default: True).
|
|
case_sensitive: If True, treat words case-sensitively.
|
|
context_words: Words of context to include around excerpts.
|
|
translate_from: Source language code for translation (e.g., 'la', 'pl').
|
|
translate_to: Target language code for translation (e.g., 'en').
|
|
|
|
Returns:
|
|
Formatted learning lesson as a string.
|
|
"""
|
|
# Combine stopwords
|
|
all_stopwords: frozenset[str]
|
|
if skip_default_stopwords:
|
|
all_stopwords = stopwords or frozenset()
|
|
else:
|
|
all_stopwords = DEFAULT_STOPWORDS_EN | (stopwords or frozenset())
|
|
|
|
# Analyze text for word frequencies
|
|
word_counts = analyze_text(text, case_sensitive=case_sensitive)
|
|
|
|
# Filter out stopwords and get sorted words
|
|
filtered_words = [
|
|
(word, count)
|
|
for word, count in word_counts.most_common()
|
|
if word.lower() not in all_stopwords
|
|
and len(word) > 1
|
|
and not (skip_numbers and word.isdigit())
|
|
]
|
|
|
|
total_words = sum(word_counts.values())
|
|
lines: list[str] = []
|
|
|
|
lines.append("=" * 70)
|
|
lines.append("LANGUAGE LEARNING LESSON")
|
|
lines.append("=" * 70)
|
|
lines.append(
|
|
f"Source text: {total_words:,} total words, {len(word_counts):,} unique words"
|
|
)
|
|
if all_stopwords:
|
|
lines.append(
|
|
f"After filtering {len(all_stopwords)} stopwords: {len(filtered_words):,} vocabulary words"
|
|
)
|
|
else:
|
|
lines.append(f"Vocabulary words: {len(filtered_words):,}")
|
|
|
|
# Handle translation setup
|
|
actual_translate_from = translate_from
|
|
actual_translate_to = translate_to or "en" # Default to English
|
|
|
|
# Auto-detect language if translation is enabled but source not specified
|
|
if translate_from == "auto" or (translate_to and not translate_from):
|
|
detected = detect_language(text)
|
|
if detected:
|
|
actual_translate_from = detected
|
|
lines.append(f"Detected language: {detected}")
|
|
# Note: langdetect doesn't support Latin (often detected as Italian)
|
|
# If detection seems wrong, use --translate-from to override
|
|
else:
|
|
lines.append(
|
|
"Warning: Could not detect language "
|
|
"(install langdetect: pip install langdetect)"
|
|
)
|
|
actual_translate_from = None
|
|
|
|
do_translate = actual_translate_from is not None and actual_translate_to is not None
|
|
if do_translate:
|
|
lines.append(f"Translation: {actual_translate_from} -> {actual_translate_to}")
|
|
|
|
lines.append("")
|
|
|
|
# Generate batches
|
|
cumulative_words: list[str] = []
|
|
|
|
for batch_num in range(num_batches):
|
|
start_idx = batch_num * batch_size
|
|
end_idx = start_idx + batch_size
|
|
|
|
if start_idx >= len(filtered_words):
|
|
break
|
|
|
|
batch_words = filtered_words[start_idx:end_idx]
|
|
cumulative_words.extend(word for word, _ in batch_words)
|
|
|
|
lines.append("-" * 70)
|
|
lines.append(
|
|
f"BATCH {batch_num + 1}: Words {start_idx + 1} - {min(end_idx, len(filtered_words))}"
|
|
)
|
|
lines.append("-" * 70)
|
|
lines.append("")
|
|
|
|
# Get translations if requested
|
|
translations: dict[str, str] = {}
|
|
if do_translate:
|
|
words_to_translate = [word for word, _ in batch_words]
|
|
translation_results = translate_words_batch(
|
|
words_to_translate,
|
|
actual_translate_from, # type: ignore[arg-type]
|
|
actual_translate_to, # type: ignore[arg-type]
|
|
)
|
|
translations = {
|
|
r.source_word: r.translated_word
|
|
for r in translation_results
|
|
if r.success
|
|
}
|
|
|
|
# Word list with frequencies
|
|
lines.append("VOCABULARY TO LEARN:")
|
|
lines.append("")
|
|
|
|
if do_translate and translations:
|
|
# Include translations in output
|
|
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
|
percentage = (count / total_words) * 100
|
|
trans = translations.get(word, "?")
|
|
lines.append(
|
|
f" {i:3}. {word:<20} -> {trans:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
|
)
|
|
else:
|
|
for i, (word, count) in enumerate(batch_words, start=start_idx + 1):
|
|
percentage = (count / total_words) * 100
|
|
lines.append(
|
|
f" {i:3}. {word:<20} ({count:,} occurrences, {percentage:.2f}%)"
|
|
)
|
|
|
|
lines.append("")
|
|
|
|
# Calculate cumulative coverage
|
|
cumulative_count = sum(
|
|
word_counts[word] for word in cumulative_words if word in word_counts
|
|
)
|
|
coverage = (cumulative_count / total_words) * 100
|
|
lines.append(
|
|
f"After learning these words, you'll recognize ~{coverage:.1f}% of the text"
|
|
)
|
|
lines.append("")
|
|
|
|
# Find excerpts using cumulative words
|
|
lines.append("PRACTICE EXCERPTS:")
|
|
lines.append("(Excerpts where your learned vocabulary is most concentrated)")
|
|
lines.append("")
|
|
|
|
excerpts = find_best_excerpt(
|
|
text,
|
|
cumulative_words,
|
|
excerpt_length,
|
|
case_sensitive=case_sensitive,
|
|
top_n=excerpts_per_batch,
|
|
)
|
|
|
|
for j, excerpt in enumerate(excerpts, 1):
|
|
lines.append(
|
|
f" Excerpt {j} ({excerpt.match_percentage:.1f}% known words):"
|
|
)
|
|
lines.append(f' "{excerpt.excerpt}"')
|
|
lines.append("")
|
|
|
|
# Summary
|
|
lines.append("=" * 70)
|
|
lines.append("SUMMARY")
|
|
lines.append("=" * 70)
|
|
|
|
if cumulative_words:
|
|
final_coverage = sum(
|
|
word_counts[word] for word in cumulative_words if word in word_counts
|
|
)
|
|
final_percentage = (final_coverage / total_words) * 100
|
|
lines.append(f"Total vocabulary words learned: {len(cumulative_words)}")
|
|
lines.append(f"Text coverage: {final_percentage:.1f}%")
|
|
lines.append("")
|
|
lines.append("TIP: Focus on understanding the excerpts first, then read")
|
|
lines.append("more of the original text as your vocabulary grows!")
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
"""Main entry point for the learning pipe.
|
|
|
|
Args:
|
|
argv: Command line arguments (defaults to sys.argv[1:]).
|
|
|
|
Returns:
|
|
Exit code (0 for success, non-zero for errors).
|
|
"""
|
|
parser = argparse.ArgumentParser(
|
|
description="Generate language learning lessons from text.",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog=__doc__,
|
|
)
|
|
|
|
# Input source
|
|
input_group = parser.add_mutually_exclusive_group(required=True)
|
|
input_group.add_argument(
|
|
"--text",
|
|
"-t",
|
|
type=str,
|
|
help="Raw text to analyze",
|
|
)
|
|
input_group.add_argument(
|
|
"--file",
|
|
"-f",
|
|
type=str,
|
|
help="Path to a text file to analyze",
|
|
)
|
|
|
|
# Learning parameters
|
|
parser.add_argument(
|
|
"--batch-size",
|
|
"-b",
|
|
type=int,
|
|
default=20,
|
|
help="Number of words per learning batch (default: 20)",
|
|
)
|
|
parser.add_argument(
|
|
"--batches",
|
|
"-n",
|
|
type=int,
|
|
default=1,
|
|
help="Number of batches to generate (default: 1)",
|
|
)
|
|
parser.add_argument(
|
|
"--excerpt-length",
|
|
"-l",
|
|
type=int,
|
|
default=30,
|
|
help="Length of excerpts in words (default: 30)",
|
|
)
|
|
parser.add_argument(
|
|
"--excerpts-per-batch",
|
|
"-e",
|
|
type=int,
|
|
default=3,
|
|
help="Number of excerpts per batch (default: 3)",
|
|
)
|
|
|
|
# Filtering options
|
|
parser.add_argument(
|
|
"--stopwords",
|
|
"-s",
|
|
type=str,
|
|
help="Path to custom stopwords file (one word per line)",
|
|
)
|
|
parser.add_argument(
|
|
"--no-default-stopwords",
|
|
action="store_true",
|
|
help="Don't filter out default English stopwords",
|
|
)
|
|
parser.add_argument(
|
|
"--case-sensitive",
|
|
"-c",
|
|
action="store_true",
|
|
help="Treat words case-sensitively",
|
|
)
|
|
parser.add_argument(
|
|
"--include-numbers",
|
|
action="store_true",
|
|
help="Include numeric words in vocabulary (filtered by default)",
|
|
)
|
|
|
|
# Translation options (enabled by default)
|
|
parser.add_argument(
|
|
"--no-translate",
|
|
"-T",
|
|
action="store_true",
|
|
help="Disable translation",
|
|
)
|
|
parser.add_argument(
|
|
"--translate-from",
|
|
type=str,
|
|
metavar="LANG",
|
|
help="Source language code (e.g., 'la', 'pl', 'de'). If omitted, auto-detected.",
|
|
)
|
|
parser.add_argument(
|
|
"--translate-to",
|
|
type=str,
|
|
metavar="LANG",
|
|
default="en",
|
|
help="Target language code (default: 'en')",
|
|
)
|
|
|
|
# Output options
|
|
parser.add_argument(
|
|
"--output",
|
|
"-o",
|
|
type=str,
|
|
help="Output file path (default: print to stdout)",
|
|
)
|
|
|
|
args = parser.parse_args(argv)
|
|
|
|
try:
|
|
# Get input text
|
|
if args.text:
|
|
text = args.text
|
|
else:
|
|
text = read_file(args.file)
|
|
|
|
# Load custom stopwords if provided
|
|
custom_stopwords = load_stopwords(args.stopwords)
|
|
|
|
# Determine translation settings
|
|
# Translation enabled by default, --no-translate disables it
|
|
translate_from: str | None = None
|
|
translate_to: str | None = None
|
|
|
|
if not args.no_translate:
|
|
translate_from = args.translate_from or "auto" # "auto" triggers detection
|
|
translate_to = args.translate_to
|
|
|
|
# Generate lesson
|
|
lesson = generate_learning_lesson(
|
|
text,
|
|
batch_size=args.batch_size,
|
|
num_batches=args.batches,
|
|
excerpt_length=args.excerpt_length,
|
|
excerpts_per_batch=args.excerpts_per_batch,
|
|
stopwords=custom_stopwords,
|
|
skip_default_stopwords=args.no_default_stopwords,
|
|
skip_numbers=not args.include_numbers,
|
|
case_sensitive=args.case_sensitive,
|
|
translate_from=translate_from,
|
|
translate_to=translate_to,
|
|
)
|
|
|
|
# Output
|
|
if args.output:
|
|
Path(args.output).write_text(lesson, encoding="utf-8")
|
|
print(f"Lesson written to {args.output}")
|
|
else:
|
|
print(lesson)
|
|
|
|
except FileNotFoundError as e:
|
|
print(f"Error: File not found - {e}", file=sys.stderr)
|
|
return 1
|
|
except UnicodeDecodeError as e:
|
|
print(f"Error: Could not decode file as UTF-8 - {e}", file=sys.stderr)
|
|
return 1
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|