mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 19:03:08 +02:00
- Remove all # type: ignore and # noqa comments (banned by no-noqa hook) - Add mypy --disable-error-code flags to pre-commit config for error codes previously suppressed by inline comments - Fix broken imports after ruff auto-removed re-exports: steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot - Re-add re-exports with __all__ in translator.py, screen_lock.py - Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py - Fix test failures: keyboard_coop, stockfish_analysis, tag_divider - Add per-file-ignores for PLC0415 (deferred imports) in 7 files - Mark shebang scripts as executable - Add __init__.py for generate_images and repo_explorer packages - Fix codespell, eslint, ruff-format, prettier issues - Update copilot-instructions.md with --no-verify ban
386 lines
10 KiB
Python
386 lines
10 KiB
Python
"""Core flashcard generation logic."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
import subprocess
|
|
|
|
from python_pkg.word_frequency._deck_builder import (
|
|
find_word_contexts,
|
|
generate_anki_deck,
|
|
)
|
|
from python_pkg.word_frequency._parsing import (
|
|
parse_inverse_mode_output,
|
|
parse_vocabulary_curve_output,
|
|
)
|
|
from python_pkg.word_frequency._translator_helpers import detect_language
|
|
from python_pkg.word_frequency._types import (
|
|
C_EXECUTABLE,
|
|
DeckInput,
|
|
FlashcardOptions,
|
|
)
|
|
from python_pkg.word_frequency.analyzer import read_file
|
|
from python_pkg.word_frequency.cache import (
|
|
AnkiDeckKey,
|
|
get_anki_deck_cache,
|
|
get_vocab_curve_cache,
|
|
)
|
|
|
|
|
|
def run_vocabulary_curve(
    filepath: Path, max_length: int, *, dump_vocab: bool = False
) -> str:
    """Invoke the compiled vocabulary_curve binary and return its stdout.

    Args:
        filepath: Text file passed to the executable as its first argument.
        max_length: Maximum excerpt length, passed as the second argument.
        dump_vocab: When True, ``--dump-vocab`` is appended to the command.

    Returns:
        Captured standard output of the process, decoded as text.

    Raises:
        FileNotFoundError: If ``C_EXECUTABLE`` does not exist.
        subprocess.CalledProcessError: If the process exits non-zero.
        subprocess.TimeoutExpired: If the process exceeds the 120 s timeout.
    """
    if not C_EXECUTABLE.exists():
        msg = (
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )
        raise FileNotFoundError(msg)

    # Optional flags go after the two positional arguments.
    optional_flags = ["--dump-vocab"] if dump_vocab else []
    command = [str(C_EXECUTABLE), str(filepath), str(max_length), *optional_flags]

    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return completed.stdout
|
|
|
|
|
|
def run_vocabulary_curve_inverse(
    filepath: Path, max_vocab: int, *, dump_vocab: bool = False
) -> str:
    """Invoke the vocabulary_curve binary in inverse (``--max-vocab``) mode.

    Args:
        filepath: Text file passed to the executable as its first argument.
        max_vocab: Vocabulary size limit, passed after ``--max-vocab``.
        dump_vocab: When True, ``--dump-vocab`` is appended to the command.

    Returns:
        Captured standard output of the process, decoded as text.

    Raises:
        FileNotFoundError: If ``C_EXECUTABLE`` does not exist.
        subprocess.CalledProcessError: If the process exits non-zero.
        subprocess.TimeoutExpired: If the process exceeds the 120 s timeout.
    """
    if not C_EXECUTABLE.exists():
        msg = (
            f"C executable not found at {C_EXECUTABLE}. "
            "Please compile it first: cd C/vocabulary_curve && make"
        )
        raise FileNotFoundError(msg)

    command = [str(C_EXECUTABLE), str(filepath), "--max-vocab", str(max_vocab)]
    if dump_vocab:
        command += ["--dump-vocab"]

    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        timeout=120,
        check=True,
    )
    return completed.stdout
|
|
|
|
|
|
def get_cached_excerpt(
    filepath: Path, length: int, *, force: bool = False
) -> tuple[str, list[tuple[str, int]]] | None:
    """Look up a previously stored excerpt in the vocab-curve cache.

    Args:
        filepath: Path to the source file used as part of the cache key.
        length: Excerpt length used as part of the cache key.
        force: When True the cache is bypassed and not consulted at all.

    Returns:
        Whatever the cache returns for ``(filepath, length)`` (annotated as an
        ``(excerpt, words)`` tuple or None), or None when ``force`` is set.
    """
    # Short-circuit before touching the cache so ``force`` never triggers I/O.
    return None if force else get_vocab_curve_cache().get(filepath, length)
|
|
|
|
|
|
def cache_excerpt(
    filepath: Path, length: int, excerpt: str, words: list[tuple[str, int]]
) -> None:
    """Persist an excerpt and its words in the vocab-curve cache.

    Args:
        filepath: Path to the source file used as part of the cache key.
        length: Excerpt length used as part of the cache key.
        excerpt: Excerpt text to store.
        words: ``(word, rank)`` tuples to store alongside the excerpt.
    """
    vocab_cache = get_vocab_curve_cache()
    vocab_cache.set(filepath, length, excerpt, words)
|
|
|
|
|
|
def get_cached_deck(
    key: AnkiDeckKey,
    *,
    force: bool = False,
) -> tuple[str, str, int, int] | None:
    """Look up a previously generated Anki deck in the deck cache.

    Args:
        key: Cache key parameters identifying the deck.
        force: When True the cache is bypassed and not consulted at all.

    Returns:
        Whatever the cache returns for ``key`` (annotated as a
        ``(content, excerpt, num_words, max_rank)`` tuple or None), or None
        when ``force`` is set.
    """
    # Short-circuit before touching the cache so ``force`` never triggers I/O.
    return None if force else get_anki_deck_cache().get(key)
|
|
|
|
|
|
def cache_deck(
    key: AnkiDeckKey,
    anki_content: str,
    excerpt: str,
    num_words: int,
    max_rank: int,
) -> None:
    """Persist a generated Anki deck in the deck cache.

    Args:
        key: Cache key parameters identifying the deck.
        anki_content: Deck content to store.
        excerpt: Excerpt text the deck was built for.
        num_words: Number of words in the deck.
        max_rank: Maximum word rank associated with the deck.
    """
    deck_cache = get_anki_deck_cache()
    deck_cache.set(key, anki_content, excerpt, num_words, max_rank)
|
|
|
|
|
|
def _detect_source_language(
    filepath: Path,
    text: str,
) -> str:
    """Detect the source language from a sample of the file's content.

    Args:
        filepath: Path to the source file; read only when ``text`` is empty.
        text: Already-read file content, or an empty string.

    Returns:
        The language code reported by ``detect_language``.

    Raises:
        ValueError: If ``detect_language`` returns None for the sample.
    """
    # Reuse the caller's text when available; otherwise read the file here.
    content = text if text else read_file(filepath)
    # Only the first 1000 characters are handed to the detector.
    language = detect_language(content[:1000])
    if language is not None:
        return language

    msg = (
        "Could not auto-detect source language. "
        "Please specify with --from (e.g., --from pl for Polish). "
        "Install langdetect for auto-detection: "
        "pip install langdetect"
    )
    raise ValueError(msg)
|
|
|
|
|
|
def generate_flashcards(
    filepath: str | Path,
    excerpt_length: int,
    options: FlashcardOptions | None = None,
    *,
    all_vocab: bool = True,
) -> tuple[str, str, int, int]:
    """Build an Anki deck with the vocabulary needed to read an excerpt.

    A cached deck is returned when one exists, unless ``options.no_translate``
    or ``options.force`` is set. A freshly generated deck is written back to
    the cache unless ``options.no_translate`` is set.

    Args:
        filepath: Path to the source text file.
        excerpt_length: Target excerpt length.
        options: Flashcard generation options; defaults to
            ``FlashcardOptions()`` when None.
        all_vocab: If True, request the full vocabulary dump and use it for
            the deck when the dump is non-empty; otherwise only the excerpt's
            words are used.

    Returns:
        Tuple of (anki_content, excerpt, num_words, max_rank).

    Raises:
        ValueError: If no words are found for the requested excerpt length,
            or if the source language is not given and cannot be detected.
    """
    options = FlashcardOptions() if options is None else options
    filepath = Path(filepath)

    cache_key = AnkiDeckKey(
        filepath=filepath,
        length=excerpt_length,
        target_lang=options.target_lang,
        include_context=options.include_context,
        all_vocab=all_vocab,
    )

    # Only translated decks are cached, and ``force`` skips the lookup.
    if not (options.no_translate or options.force):
        cached_deck = get_cached_deck(cache_key)
        if cached_deck is not None:
            return cached_deck

    # The full text is only needed up front when contexts are requested.
    text = read_file(filepath) if options.include_context else ""

    language = options.source_lang
    if language is None:
        language = _detect_source_language(filepath, text)

    raw_output = run_vocabulary_curve(
        filepath, excerpt_length, dump_vocab=all_vocab
    )
    excerpt, excerpt_words, dumped_vocab = parse_vocabulary_curve_output(
        raw_output, excerpt_length
    )

    if not excerpt_words:
        msg = f"No words found for excerpt length {excerpt_length}"
        raise ValueError(msg)

    max_rank = max(rank for _word, rank in excerpt_words)

    # Fall back to the excerpt's own words when no vocabulary dump is usable.
    use_full_vocab = all_vocab and bool(dumped_vocab)
    deck_words = dumped_vocab if use_full_vocab else excerpt_words

    contexts = None
    if options.include_context:
        if not text:
            text = read_file(filepath)
        contexts = find_word_contexts(text, [word for word, _rank in deck_words])

    deck_name = options.deck_name or f"{filepath.stem}_vocab_{excerpt_length}"

    deck_input = DeckInput(
        words_with_ranks=deck_words,
        source_lang=language,
        target_lang=options.target_lang,
        contexts=contexts,
        deck_name=deck_name,
    )
    anki_content = generate_anki_deck(
        deck_input,
        include_context=options.include_context,
        no_translate=options.no_translate,
        excerpt=excerpt,
        excerpt_words=excerpt_words,
    )

    word_count = len(deck_words)
    if not options.no_translate:
        cache_deck(cache_key, anki_content, excerpt, word_count, max_rank)

    return anki_content, excerpt, word_count, max_rank
|
|
|
|
|
|
def _excerpt_word_forms(excerpt: str) -> set[str]:
    """Return the lowercase word forms that occur in ``excerpt``.

    Every whitespace-separated token is kept verbatim and, additionally, with
    leading/trailing non-alphanumeric characters removed, so ``"kot,"`` also
    yields ``"kot"``.
    """
    forms: set[str] = set()
    for token in excerpt.lower().split():
        forms.add(token)
        start, end = 0, len(token)
        while start < end and not token[start].isalnum():
            start += 1
        while end > start and not token[end - 1].isalnum():
            end -= 1
        if start < end:
            forms.add(token[start:end])
    return forms


def generate_flashcards_inverse(
    filepath: str | Path,
    max_vocab: int,
    options: FlashcardOptions | None = None,
) -> tuple[str, str, int, int, int]:
    """Generate Anki flashcards for the longest excerpt using top N words.

    This is the inverse mode: given a vocabulary size, find the longest
    excerpt that can be understood with only those words.

    Args:
        filepath: Path to the source text file.
        max_vocab: Maximum vocabulary size (top N words to learn).
        options: Flashcard generation options; defaults to
            ``FlashcardOptions()`` when None.

    Returns:
        Tuple of (anki_content, excerpt, excerpt_length,
        num_words, max_rank_used).

    Raises:
        ValueError: If the parsed excerpt length is 0, if no vocabulary is
            returned, or if the source language is not given and cannot be
            detected.
    """
    if options is None:
        options = FlashcardOptions()
    filepath = Path(filepath)

    # The full text is only needed up front when contexts are requested.
    text = read_file(filepath) if options.include_context else ""

    source_lang = options.source_lang
    if source_lang is None:
        source_lang = _detect_source_language(filepath, text)

    output = run_vocabulary_curve_inverse(filepath, max_vocab, dump_vocab=True)
    excerpt, excerpt_length, max_rank_used, all_vocab_words = parse_inverse_mode_output(
        output
    )

    if excerpt_length == 0:
        msg = (
            f"No valid excerpt found using only top {max_vocab} "
            "words. Try increasing the vocabulary limit."
        )
        raise ValueError(msg)

    if not all_vocab_words:
        msg = f"No vocabulary returned for max_vocab={max_vocab}"
        raise ValueError(msg)

    words_with_ranks = all_vocab_words

    # Match vocabulary against the excerpt with surrounding punctuation
    # ignored; a bare ``split()`` would leave e.g. "kot," unequal to "kot".
    # NOTE(review): if parse_inverse_mode_output already returns a normalised
    # excerpt, the extra stripping is a no-op.
    excerpt_word_set = _excerpt_word_forms(excerpt)
    excerpt_words = [
        (w, r) for w, r in all_vocab_words if w.lower() in excerpt_word_set
    ]

    contexts = None
    if options.include_context:
        if not text:
            text = read_file(filepath)
        words = [w for w, _ in words_with_ranks]
        contexts = find_word_contexts(text, words)

    deck_name = options.deck_name or f"{filepath.stem}_top{max_vocab}"

    anki_content = generate_anki_deck(
        DeckInput(
            words_with_ranks=words_with_ranks,
            source_lang=source_lang,
            target_lang=options.target_lang,
            contexts=contexts,
            deck_name=deck_name,
        ),
        include_context=options.include_context,
        no_translate=options.no_translate,
        excerpt=excerpt,
        # An empty match list is passed as None rather than [].
        excerpt_words=excerpt_words or None,
    )

    return (
        anki_content,
        excerpt,
        excerpt_length,
        len(words_with_ranks),
        max_rank_used,
    )
|