mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 18:03:11 +02:00
Split 18+ Python files that exceeded 500 lines into smaller modules with helper files (prefixed with _). All functions are re-exported from the original modules to maintain backward compatibility with test patches and external imports. Files split: - moviepy_showcase.py (1212 -> 302 + 3 helpers) - anki_generator.py (1174 -> 473 + 4 helpers) - test_analyze_chess_game.py (1152 -> 361 + 2 parts) - poker_modifier_app.py (1024 -> 263 + 2 helpers) - transcribe_fw.py (1007 -> 342 + 3 helpers) - music_generator.py (1002 -> 319 + 2 helpers) - translator.py (951 -> 442 + 2 helpers) - cinema_planner.py (893 -> 369 + 2 helpers) - lichess_bot/main.py (757 -> 495 + _game_logic.py) - test_translator.py (725 -> 289 + part2 + conftest) - test_lichess_api.py (680 -> 475 + part2) - learning_pipe.py (668 -> 375 + 2 helpers) - cache.py (655 -> 360 + _cache_decks.py) - analyze_chess_game.py (632 -> 463 + _move_analysis.py) - visualize_q02.py (609 -> 371 + helper) - repo_explorer.py (602 -> 347 + 2 helpers) - keyboard_coop/main.py (515 -> 416 + _dictionary.py) - scanning.py (501 -> 314 + _enforce_loop.py) All tests pass: 144 lichess_bot (100% branch coverage), 243 others. No new lint errors introduced.
156 lines
3.0 KiB
Python
156 lines
3.0 KiB
Python
"""Constants and configuration for the learning pipe module."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
# Built-in English stopword list (can be overridden with --stopwords).
# The words are kept as whitespace-separated groups and split once at import.
DEFAULT_STOPWORDS_EN = frozenset(
    (
        # Articles, conjunctions, and prepositions.
        "the a an and or but in on at to for of with by from "
        # Forms of "be", "have", and "do".
        "is are was were be been being have has had do does did "
        # Modal verbs.
        "will would could should may might must shall can "
        # Demonstratives.
        "this that these those "
        # Personal pronouns.
        "i you he she it we they me him her us them "
        # Possessives.
        "my your his its our their "
        # Question words.
        "what which who whom whose where when why how "
        # Quantifiers.
        "all each every both few more most other some such "
        # Negations and degree words.
        "no nor not only own same so than too very just "
        # Subordinating words.
        "as if then because while although though after before"
    ).split()
)
|
|
|
|
|
|
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
    """Load stopwords from a file (one word per line).

    Every line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are ignored.

    Args:
        filepath: Path to a UTF-8 encoded stopwords file, or None when no
            custom stopwords file was supplied.

    Returns:
        Frozenset of the stopwords found in the file. An empty frozenset is
        returned when ``filepath`` is None or does not refer to a regular
        file (missing path or a directory). The built-in defaults are not
        added here; ``_resolve_stopwords`` merges them in separately.
    """
    if filepath is None:
        return frozenset()

    path = Path(filepath)
    # is_file() instead of exists(): a directory "exists" but cannot be read
    # as text, so it is treated like a missing file rather than raising.
    if not path.is_file():
        return frozenset()

    # "utf-8-sig" decodes plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise stay glued to the first stopword and stop it matching.
    content = path.read_text(encoding="utf-8-sig")
    stripped_lines = (line.strip() for line in content.splitlines())
    return frozenset(word.lower() for word in stripped_lines if word)
|
|
|
|
|
|
@dataclass(frozen=True)
class LessonConfig:
    """Immutable settings bundle for learning lesson generation.

    The dataclass is frozen, so fields cannot be reassigned once an instance
    has been created. Within this module only the two stopword fields are
    interpreted (by ``_resolve_stopwords``); the remaining options are
    presumably consumed by the lesson-generation code elsewhere.
    """

    # Batch and excerpt sizing.
    batch_size: int = 20
    num_batches: int = 1
    excerpt_length: int = 30
    excerpts_per_batch: int = 3

    # Custom stopwords; None means no custom set was provided.
    stopwords: frozenset[str] | None = None
    # When True, DEFAULT_STOPWORDS_EN is left out of the resolved set.
    skip_default_stopwords: bool = False

    # Token filtering switches.
    skip_numbers: bool = True
    case_sensitive: bool = False

    # Optional translation endpoints; both unset by default.
    translate_from: str | None = None
    translate_to: str | None = None
|
|
|
|
|
|
def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
    """Return the effective stopword set described by ``config``.

    The custom set is ``config.stopwords`` (an empty frozenset when that is
    None or empty). It is returned alone when
    ``config.skip_default_stopwords`` is set; otherwise it is unioned with
    ``DEFAULT_STOPWORDS_EN``.
    """
    custom = config.stopwords or frozenset()
    if not config.skip_default_stopwords:
        return DEFAULT_STOPWORDS_EN | custom
    return custom
|