testsAndMisc-archive/python_pkg/word_frequency/_learning_constants.py
Krzysztof kuhy Rudnicki 8f2fbd2311 refactor: enforce 500-line limit on all Python source files
Split 18+ Python files that exceeded 500 lines into smaller modules
with helper files (prefixed with _). All functions are re-exported
from the original modules to maintain backward compatibility with
test patches and external imports.

Files split:
- moviepy_showcase.py (1212 -> 302 + 3 helpers)
- anki_generator.py (1174 -> 473 + 4 helpers)
- test_analyze_chess_game.py (1152 -> 361 + 2 parts)
- poker_modifier_app.py (1024 -> 263 + 2 helpers)
- transcribe_fw.py (1007 -> 342 + 3 helpers)
- music_generator.py (1002 -> 319 + 2 helpers)
- translator.py (951 -> 442 + 2 helpers)
- cinema_planner.py (893 -> 369 + 2 helpers)
- lichess_bot/main.py (757 -> 495 + _game_logic.py)
- test_translator.py (725 -> 289 + part2 + conftest)
- test_lichess_api.py (680 -> 475 + part2)
- learning_pipe.py (668 -> 375 + 2 helpers)
- cache.py (655 -> 360 + _cache_decks.py)
- analyze_chess_game.py (632 -> 463 + _move_analysis.py)
- visualize_q02.py (609 -> 371 + helper)
- repo_explorer.py (602 -> 347 + 2 helpers)
- keyboard_coop/main.py (515 -> 416 + _dictionary.py)
- scanning.py (501 -> 314 + _enforce_loop.py)

All tests pass: 144 lichess_bot (100% branch coverage), 243 others.
No new lint errors introduced.
2026-03-17 22:47:42 +01:00

156 lines
3.0 KiB
Python

"""Constants and configuration for the learning pipe module."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
# Built-in English stopword list (can be overridden with --stopwords).
# The words are kept as space-separated groups purely for readability;
# ``str.split()`` turns them into individual entries.
DEFAULT_STOPWORDS_EN = frozenset(
    (
        # articles, conjunctions, prepositions
        "the a an and or but in on at to for of with by from "
        # forms of "be", "have" and "do"
        "is are was were be been being have has had do does did "
        # modal verbs
        "will would could should may might must shall can "
        # demonstratives and personal pronouns
        "this that these those i you he she it we they me him her us them "
        # possessives
        "my your his its our their "
        # interrogatives / relatives
        "what which who whom whose where when why how "
        # quantifiers and determiners
        "all each every both few more most other some such "
        # negation and degree words
        "no nor not only own same so than too very just "
        # subordinating words
        "as if then because while although though after before"
    ).split()
)
def load_stopwords(filepath: str | Path | None) -> frozenset[str]:
    """Read a custom stopword list from a text file.

    The file is decoded as UTF-8 and interpreted as one stopword per line.
    Each line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are ignored.

    Args:
        filepath: Location of the stopwords file, or None when no custom
            file was supplied.

    Returns:
        Frozenset of the normalised stopwords. An empty frozenset is
        returned when ``filepath`` is None or the path does not exist;
        this function itself never adds the built-in defaults.
    """
    source = None if filepath is None else Path(filepath)
    # No file given, or nothing on disk at that path: no custom stopwords.
    if source is None or not source.exists():
        return frozenset()

    raw_lines = source.read_text(encoding="utf-8").splitlines()
    # Strip once, then drop blanks before lower-casing.
    trimmed = (line.strip() for line in raw_lines)
    return frozenset(entry.lower() for entry in trimmed if entry)
@dataclass(frozen=True)
class LessonConfig:
    """Immutable set of options controlling learning lesson generation.

    Instances are frozen, so fields cannot be reassigned after creation.
    Field order and default values are part of the interface because the
    dataclass can be constructed positionally.

    In the code visible in this module only ``stopwords`` and
    ``skip_default_stopwords`` are consumed (when resolving the effective
    stopword set); the other fields are presumably read by the lesson
    generation code elsewhere -- confirm their exact meaning there.
    """

    # Batch and excerpt sizing (units are defined by the consuming code).
    batch_size: int = 20
    num_batches: int = 1
    excerpt_length: int = 30
    excerpts_per_batch: int = 3

    # Stopword handling: optional extra stopwords, and whether the built-in
    # English defaults should be left out.
    stopwords: frozenset[str] | None = None
    skip_default_stopwords: bool = False

    # Token filtering switches.
    skip_numbers: bool = True
    case_sensitive: bool = False

    # Optional translation language pair; None means unset.
    translate_from: str | None = None
    translate_to: str | None = None
def _resolve_stopwords(config: LessonConfig) -> frozenset[str]:
    """Compute the effective stopword set for a lesson configuration.

    Args:
        config: Lesson configuration supplying the optional custom
            stopwords and the ``skip_default_stopwords`` flag.

    Returns:
        The custom stopwords alone when the defaults are skipped, otherwise
        the union of ``DEFAULT_STOPWORDS_EN`` and the custom stopwords. A
        missing (None) or empty custom set counts as no extra words.
    """
    extra = config.stopwords or frozenset()
    if not config.skip_default_stopwords:
        return DEFAULT_STOPWORDS_EN | extra
    return extra