testsAndMisc-archive/python_pkg/word_frequency/_parsing.py
Krzysztof kuhy Rudnicki 78c1d77144 fix: resolve all pre-commit hook failures after file splits
- Remove all # type: ignore and # noqa comments (banned by no-noqa hook)
- Add mypy --disable-error-code flags to pre-commit config for error
  codes previously suppressed by inline comments
- Fix broken imports after ruff auto-removed re-exports:
  steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot
- Re-add re-exports with __all__ in translator.py, screen_lock.py
- Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py
- Fix test failures: keyboard_coop, stockfish_analysis, tag_divider
- Add per-file-ignores for PLC0415 (deferred imports) in 7 files
- Mark shebang scripts as executable
- Add __init__.py for generate_images and repo_explorer packages
- Fix codespell, eslint, ruff-format, prettier issues
- Update copilot-instructions.md with --no-verify ban
2026-03-18 22:20:05 +01:00

166 lines
4.9 KiB
Python

"""Parsing functions for vocabulary curve output."""
from __future__ import annotations
import contextlib
import re
from python_pkg.word_frequency._types import (
_MIN_EXCERPT_PARTS,
_MIN_VOCAB_DUMP_PARTS,
)
def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
    """Extract the ``word;rank`` entries of the VOCAB_DUMP section.

    Entries are collected from the lines following ``VOCAB_DUMP_START``.
    Scanning stops at the first ``VOCAB_DUMP_END`` line, whether or not a
    start marker has been seen.

    Args:
        lines: Output lines from vocabulary_curve.

    Returns:
        List of (word, rank) tuples in the order they appear. Lines with
        the wrong number of ``;``-separated fields, or with a rank that
        is not an integer, are skipped.
    """
    entries: list[tuple[str, int]] = []
    collecting = False
    for raw in lines:
        text = raw.strip()
        if text == "VOCAB_DUMP_END":
            break
        if text == "VOCAB_DUMP_START":
            collecting = True
            continue
        # Outside the dump, or no separator at all: nothing to parse.
        if not collecting or ";" not in text:
            continue
        fields = text.split(";")
        if len(fields) != _MIN_VOCAB_DUMP_PARTS:
            continue
        word, rank_text = fields
        # A non-numeric rank makes int() raise; such entries are dropped.
        with contextlib.suppress(ValueError):
            entries.append((word, int(rank_text)))
    return entries
def _parse_excerpt_lines(lines: list[str], start: int) -> str:
"""Parse excerpt text from output lines starting after 'Excerpt:'.
Args:
lines: Output lines.
start: Index of the line after 'Excerpt:'.
Returns:
Joined excerpt text.
"""
excerpt_parts: list[str] = []
idx = start
while idx < len(lines):
next_line = lines[idx].strip()
next_line = next_line.removeprefix('"')
if next_line.endswith('"'):
next_line = next_line[:-1]
excerpt_parts.append(next_line)
break
excerpt_parts.append(next_line)
idx += 1
return " ".join(excerpt_parts)
def parse_inverse_mode_output(
    output: str,
) -> tuple[str, int, int, list[tuple[str, int]]]:
    """Parse the report printed by vocabulary_curve in inverse mode.

    Every line is inspected; when a marker occurs more than once, the
    last occurrence wins.

    Args:
        output: Raw output from vocabulary_curve --max-vocab.

    Returns:
        Tuple of (excerpt_text, excerpt_length, max_rank_used,
        all_vocab_words). Fields whose marker line is absent keep their
        defaults: an empty string, 0 and 0.

    Raises:
        ValueError: If the third token of a sufficiently long
            'LONGEST EXCERPT:' line is not an integer.
    """
    lines = output.split("\n")
    excerpt_text = ""
    length = 0
    rarest_rank = 0

    for index, raw in enumerate(lines):
        text = raw.strip()

        if text.startswith("LONGEST EXCERPT:"):
            tokens = text.split()
            if len(tokens) >= _MIN_EXCERPT_PARTS:
                # The length is the third whitespace-separated token.
                length = int(tokens[2])
            continue

        if text.startswith("Excerpt:"):
            # The excerpt body starts on the line after the marker.
            excerpt_text = _parse_excerpt_lines(lines, index + 1)
            continue

        if text.startswith("Rarest word used:"):
            found = re.search(r"\(#(\d+)\)", text)
            if found is not None:
                rarest_rank = int(found.group(1))

    return excerpt_text, length, rarest_rank, _parse_vocab_dump(lines)
def _parse_target_length_block(
lines: list[str],
target_length: int,
) -> tuple[str, list[tuple[str, int]]]:
"""Parse the [Length N] block from vocabulary curve output.
Args:
lines: Output lines.
target_length: Target excerpt length to find.
Returns:
Tuple of (excerpt, excerpt_words).
"""
excerpt = ""
excerpt_words: list[tuple[str, int]] = []
i = 0
while i < len(lines):
if lines[i].strip().startswith(f"[Length {target_length}]"):
i += 1
# Find excerpt line
while i < len(lines) and not lines[i].strip().startswith("Excerpt:"):
i += 1
if i < len(lines):
excerpt_line = lines[i].strip()
if '"' in excerpt_line:
start = excerpt_line.index('"') + 1
end = excerpt_line.rindex('"')
excerpt = excerpt_line[start:end]
# Find words line
i += 1
while i < len(lines) and not lines[i].strip().startswith("Words:"):
i += 1
if i < len(lines):
words_line = lines[i].strip()
if words_line.startswith("Words:"):
words_part = words_line[6:].strip()
pattern = r"(\S+)\(#(\d+)\)"
matches = re.findall(pattern, words_part)
excerpt_words = [(w, int(r)) for w, r in matches]
break
i += 1
return excerpt, excerpt_words
def parse_vocabulary_curve_output(
    output: str, target_length: int
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
    """Extract the excerpt and vocabulary lists from vocabulary_curve output.

    Args:
        output: Raw output from vocabulary_curve.
        target_length: The target excerpt length.

    Returns:
        Tuple of (excerpt_text, excerpt_words, all_vocab_words).
        excerpt_words: (word, rank) pairs parsed from the
            [Length target_length] block.
        all_vocab_words: (word, rank) pairs parsed from the
            VOCAB_DUMP section; empty if that section is absent.
    """
    output_lines = output.split("\n")
    excerpt_text, words_in_excerpt = _parse_target_length_block(
        output_lines,
        target_length,
    )
    vocab_words = _parse_vocab_dump(output_lines)
    return excerpt_text, words_in_excerpt, vocab_words