# testsAndMisc/python_pkg/word_frequency/_parsing.py

"""Parsing functions for vocabulary curve output."""

from __future__ import annotations

import contextlib
import re

from python_pkg.word_frequency._types import (
    _MIN_EXCERPT_PARTS,
    _MIN_VOCAB_DUMP_PARTS,
)


def _parse_vocab_dump(lines: list[str]) -> list[tuple[str, int]]:
    """Parse VOCAB_DUMP section from output lines.

    Args:
        lines: Output lines from vocabulary_curve.

    Returns:
        List of (word, rank) tuples.
    """
    all_vocab: list[tuple[str, int]] = []
    in_vocab_dump = False
    for line in lines:
        stripped = line.strip()
        if stripped == "VOCAB_DUMP_START":
            in_vocab_dump = True
            continue
        if stripped == "VOCAB_DUMP_END":
            break
        if in_vocab_dump and ";" in stripped:
            parts = stripped.split(";")
            if len(parts) == _MIN_VOCAB_DUMP_PARTS:
                word, rank_str = parts
                with contextlib.suppress(ValueError):
                    all_vocab.append((word, int(rank_str)))
    return all_vocab


def _parse_excerpt_lines(lines: list[str], start: int) -> str:
    """Parse excerpt text from output lines starting after 'Excerpt:'.

    Args:
        lines: Output lines.
        start: Index of the line after 'Excerpt:'.

    Returns:
        Joined excerpt text.
    """
    excerpt_parts: list[str] = []
    idx = start
    while idx < len(lines):
        next_line = lines[idx].strip()
        next_line = next_line.removeprefix('"')
        if next_line.endswith('"'):
            next_line = next_line[:-1]
            excerpt_parts.append(next_line)
            break
        excerpt_parts.append(next_line)
        idx += 1
    return " ".join(excerpt_parts)


def parse_inverse_mode_output(
    output: str,
) -> tuple[str, int, int, list[tuple[str, int]]]:
    """Extract excerpt, length, rarest rank and vocabulary from output.

    Every line is stripped and matched against three prefixes:
    ``LONGEST EXCERPT:`` (the third whitespace-separated token is read
    as the excerpt length), ``Excerpt:`` (the quoted text on the
    following lines is collected) and ``Rarest word used:`` (the number
    inside ``(#N)`` is read as the rank). Values that never appear keep
    their defaults of ``""`` / ``0``; when a prefix occurs more than
    once the last occurrence wins.

    Args:
        output: Raw output from vocabulary_curve --max-vocab.

    Returns:
        Tuple of (excerpt_text, excerpt_length, max_rank_used,
        all_vocab_words).

    Raises:
        ValueError: If the length token on a ``LONGEST EXCERPT:`` line
            is not an integer.
    """
    output_lines = output.split("\n")
    excerpt_text = ""
    length = 0
    rarest_rank = 0

    for position, raw in enumerate(output_lines):
        text = raw.strip()

        if text.startswith("LONGEST EXCERPT:"):
            tokens = text.split()
            if len(tokens) >= _MIN_EXCERPT_PARTS:
                length = int(tokens[2])
            continue

        if text.startswith("Excerpt:"):
            # The excerpt body begins on the line after the label.
            excerpt_text = _parse_excerpt_lines(
                output_lines, position + 1
            )
            continue

        if text.startswith("Rarest word used:"):
            rank_match = re.search(r"\(#(\d+)\)", text)
            if rank_match is not None:
                rarest_rank = int(rank_match.group(1))

    vocab = _parse_vocab_dump(output_lines)
    return excerpt_text, length, rarest_rank, vocab


def _parse_target_length_block(
    lines: list[str],
    target_length: int,
) -> tuple[str, list[tuple[str, int]]]:
    """Parse the [Length N] block from vocabulary curve output.

    Args:
        lines: Output lines.
        target_length: Target excerpt length to find.

    Returns:
        Tuple of (excerpt, excerpt_words).
    """
    excerpt = ""
    excerpt_words: list[tuple[str, int]] = []
    i = 0
    while i < len(lines):
        if lines[i].strip().startswith(f"[Length {target_length}]"):
            i += 1
            # Find excerpt line
            while i < len(lines) and not lines[i].strip().startswith(
                "Excerpt:"
            ):
                i += 1
            if i < len(lines):
                excerpt_line = lines[i].strip()
                if '"' in excerpt_line:
                    start = excerpt_line.index('"') + 1
                    end = excerpt_line.rindex('"')
                    excerpt = excerpt_line[start:end]
            # Find words line
            i += 1
            while i < len(lines) and not lines[i].strip().startswith(
                "Words:"
            ):
                i += 1
            if i < len(lines):
                words_line = lines[i].strip()
                if words_line.startswith("Words:"):
                    words_part = words_line[6:].strip()
                    pattern = r"(\S+)\(#(\d+)\)"
                    matches = re.findall(pattern, words_part)
                    excerpt_words = [
                        (w, int(r)) for w, r in matches
                    ]
            break
        i += 1
    return excerpt, excerpt_words


def parse_vocabulary_curve_output(
    output: str, target_length: int
) -> tuple[str, list[tuple[str, int]], list[tuple[str, int]]]:
    """Parse vocabulary_curve output for one target excerpt length.

    The output is split on newlines; the ``[Length N]`` block for
    ``target_length`` provides the excerpt and its ranked words, and the
    VOCAB_DUMP section (if present) provides the full vocabulary list.

    Args:
        output: Raw output from vocabulary_curve.
        target_length: The target excerpt length.

    Returns:
        Tuple of (excerpt_text, excerpt_words, all_vocab_words).
        excerpt_words: words in the excerpt with their ranks.
        all_vocab_words: (word, rank) pairs from VOCAB_DUMP; empty
            when no dump entries are found.
    """
    output_lines = output.split("\n")
    excerpt_text, ranked_words = _parse_target_length_block(
        output_lines, target_length
    )
    vocab = _parse_vocab_dump(output_lines)
    return excerpt_text, ranked_words, vocab