mirror of
https://github.com/kuhyx/testsAndMisc-archive.git
synced 2026-07-04 13:23:01 +02:00
Split 18+ Python files that exceeded 500 lines into smaller modules with helper files (prefixed with _). All functions are re-exported from the original modules to maintain backward compatibility with test patches and external imports. Files split: - moviepy_showcase.py (1212 -> 302 + 3 helpers) - anki_generator.py (1174 -> 473 + 4 helpers) - test_analyze_chess_game.py (1152 -> 361 + 2 parts) - poker_modifier_app.py (1024 -> 263 + 2 helpers) - transcribe_fw.py (1007 -> 342 + 3 helpers) - music_generator.py (1002 -> 319 + 2 helpers) - translator.py (951 -> 442 + 2 helpers) - cinema_planner.py (893 -> 369 + 2 helpers) - lichess_bot/main.py (757 -> 495 + _game_logic.py) - test_translator.py (725 -> 289 + part2 + conftest) - test_lichess_api.py (680 -> 475 + part2) - learning_pipe.py (668 -> 375 + 2 helpers) - cache.py (655 -> 360 + _cache_decks.py) - analyze_chess_game.py (632 -> 463 + _move_analysis.py) - visualize_q02.py (609 -> 371 + helper) - repo_explorer.py (602 -> 347 + 2 helpers) - keyboard_coop/main.py (515 -> 416 + _dictionary.py) - scanning.py (501 -> 314 + _enforce_loop.py) All tests pass: 144 lichess_bot (100% branch coverage), 243 others. No new lint errors introduced.
174 lines
5.1 KiB
Python
174 lines
5.1 KiB
Python
"""Parsing functions for vocabulary curve output."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import contextlib
|
|
import re
|
|
|
|
from python_pkg.word_frequency._types import (
|
|
_MIN_EXCERPT_PARTS,
|
|
_MIN_VOCAB_DUMP_PARTS,
|
|
)
|
|
|
|
|
|
def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
|
|
"""Parse VOCAB_DUMP section from output lines.
|
|
|
|
Args:
|
|
lines: Output lines from vocabulary_curve.
|
|
|
|
Returns:
|
|
List of (word, rank) tuples.
|
|
"""
|
|
all_vocab: list[tuple[str, int]] = []
|
|
in_vocab_dump = False
|
|
for line in lines:
|
|
stripped = line.strip()
|
|
if stripped == "VOCAB_DUMP_START":
|
|
in_vocab_dump = True
|
|
continue
|
|
if stripped == "VOCAB_DUMP_END":
|
|
break
|
|
if in_vocab_dump and ";" in stripped:
|
|
parts = stripped.split(";")
|
|
if len(parts) == _MIN_VOCAB_DUMP_PARTS:
|
|
word, rank_str = parts
|
|
with contextlib.suppress(ValueError):
|
|
all_vocab.append((word, int(rank_str)))
|
|
return all_vocab
|
|
|
|
|
|
def _parse_excerpt_lines(lines: list[str], start: int) -> str:
|
|
"""Parse excerpt text from output lines starting after 'Excerpt:'.
|
|
|
|
Args:
|
|
lines: Output lines.
|
|
start: Index of the line after 'Excerpt:'.
|
|
|
|
Returns:
|
|
Joined excerpt text.
|
|
"""
|
|
excerpt_parts: list[str] = []
|
|
idx = start
|
|
while idx < len(lines):
|
|
next_line = lines[idx].strip()
|
|
next_line = next_line.removeprefix('"')
|
|
if next_line.endswith('"'):
|
|
next_line = next_line[:-1]
|
|
excerpt_parts.append(next_line)
|
|
break
|
|
excerpt_parts.append(next_line)
|
|
idx += 1
|
|
return " ".join(excerpt_parts)
|
|
|
|
|
|
def parse_inverse_mode_output(
    output: str,
) -> tuple[str, int, int, list[tuple[str, int]]]:
    """Extract the summary fields from vocabulary_curve inverse-mode output.

    Three kinds of lines are recognised by their prefix:
    ``LONGEST EXCERPT:`` (third whitespace-separated token is the length),
    ``Excerpt:`` (the quoted text starts on the following line) and
    ``Rarest word used:`` (rank taken from a ``(#N)`` marker). When a
    prefix occurs more than once, the last occurrence wins.

    Args:
        output: Raw output from vocabulary_curve --max-vocab.

    Returns:
        Tuple of (excerpt_text, excerpt_length, max_rank_used, all_vocab_words).

    Raises:
        ValueError: If the length token on a ``LONGEST EXCERPT:`` line is
            not an integer.
    """
    lines = output.split("\n")
    excerpt_text = ""
    longest_length = 0
    rarest_rank = 0

    for position, raw in enumerate(lines):
        text = raw.strip()

        if text.startswith("LONGEST EXCERPT:"):
            tokens = text.split()
            if len(tokens) >= _MIN_EXCERPT_PARTS:
                longest_length = int(tokens[2])
            continue

        if text.startswith("Excerpt:"):
            # The excerpt body lives on the lines after the label.
            excerpt_text = _parse_excerpt_lines(lines, position + 1)
            continue

        if text.startswith("Rarest word used:"):
            rank_match = re.search(r"\(#(\d+)\)", text)
            if rank_match is not None:
                rarest_rank = int(rank_match.group(1))

    vocab_words = _parse_vocab_dump(lines)
    return excerpt_text, longest_length, rarest_rank, vocab_words
|
|
|
|
|
|
def _parse_target_length_block(
|
|
lines: list[str],
|
|
target_length: int,
|
|
) -> tuple[str, list[tuple[str, int]]]:
|
|
"""Parse the [Length N] block from vocabulary curve output.
|
|
|
|
Args:
|
|
lines: Output lines.
|
|
target_length: Target excerpt length to find.
|
|
|
|
Returns:
|
|
Tuple of (excerpt, excerpt_words).
|
|
"""
|
|
excerpt = ""
|
|
excerpt_words: list[tuple[str, int]] = []
|
|
i = 0
|
|
while i < len(lines):
|
|
if lines[i].strip().startswith(f"[Length {target_length}]"):
|
|
i += 1
|
|
# Find excerpt line
|
|
while i < len(lines) and not lines[i].strip().startswith(
|
|
"Excerpt:"
|
|
):
|
|
i += 1
|
|
if i < len(lines):
|
|
excerpt_line = lines[i].strip()
|
|
if '"' in excerpt_line:
|
|
start = excerpt_line.index('"') + 1
|
|
end = excerpt_line.rindex('"')
|
|
excerpt = excerpt_line[start:end]
|
|
# Find words line
|
|
i += 1
|
|
while i < len(lines) and not lines[i].strip().startswith(
|
|
"Words:"
|
|
):
|
|
i += 1
|
|
if i < len(lines):
|
|
words_line = lines[i].strip()
|
|
if words_line.startswith("Words:"):
|
|
words_part = words_line[6:].strip()
|
|
pattern = r"(\S+)\(#(\d+)\)"
|
|
matches = re.findall(pattern, words_part)
|
|
excerpt_words = [
|
|
(w, int(r)) for w, r in matches
|
|
]
|
|
break
|
|
i += 1
|
|
return excerpt, excerpt_words
|
|
|
|
|
|
def parse_vocabulary_curve_output(
    output: str, target_length: int
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
    """Parse vocabulary_curve output into the excerpt and its vocabulary.

    Args:
        output: Raw output from vocabulary_curve.
        target_length: The target excerpt length.

    Returns:
        Tuple of (excerpt_text, excerpt_words, all_vocab_words).
            excerpt_words: words in the excerpt with their ranks,
                taken from the ``[Length N]`` block.
            all_vocab_words: (word, rank) entries from the VOCAB_DUMP
                section; empty when that section is absent.
    """
    output_lines = output.split("\n")
    excerpt_text, words_in_excerpt = _parse_target_length_block(
        output_lines, target_length
    )
    vocab_words = _parse_vocab_dump(output_lines)
    return excerpt_text, words_in_excerpt, vocab_words
|