testsAndMisc-archive/python_pkg/cinema_planner/_cinema_parsing.py

"""Parsing functions for Cinema City schedules and manual input."""

from __future__ import annotations

from dataclasses import dataclass, field
import importlib
import logging
from pathlib import Path
import re
import shutil
import subprocess
import sys
from typing import TYPE_CHECKING, TextIO

if TYPE_CHECKING:
    import types

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Constants for validation and parsing
# Fewest comma-separated fields a manual input line may have
# (movie name, start times, duration).
_MIN_MANUAL_LINE_PARTS = 3
# A text line must be strictly longer than this many characters to be
# treated as a movie title by the text parser.
_MIN_TITLE_LENGTH = 3
# Duration in minutes used by the text parser when no duration is found.
_DEFAULT_MOVIE_DURATION = 120
# Bounds the duration search after a title at line index i: lines i + 1 up to
# (but not including) i + this value are examined.
_TITLE_LOOKAHEAD_LINES = 5


def _try_import(name: str) -> types.ModuleType | None:
    """Attempt to import a module, returning None if unavailable."""
    try:
        return importlib.import_module(name)
    except ImportError:
        return None


# Optional PDF back-ends, resolved once at import time. Each name holds the
# imported module, or None when the import fails. "fitz" is the module name
# under which PyMuPDF is imported here.
_pdfplumber = _try_import("pdfplumber")
_fitz = _try_import("fitz")


@dataclass
class Movie:
    """A movie with screening times and metadata.

    Attributes:
        name: Movie title.
        start_times: Screening start times; the parsers in this module fill
            it with minutes from midnight as returned by ``parse_time``.
        duration: Running time in minutes.
        genres: Genre names; defaults to a new empty list per instance.
    """

    name: str
    start_times: list[int]
    duration: int
    genres: list[str] = field(default_factory=list)


def parse_time(time_str: str) -> int:
    """Parse a clock time such as '18:20' into minutes from midnight.

    Surrounding whitespace is ignored and '.' is accepted in place of ':'
    (for example '18.20'). Only the start of the string has to look like a
    time; any text after the minutes is ignored.

    Hours are not range-checked, so inputs such as '24:30' keep parsing as
    they did before. Minutes must be in the range 00-59.

    Args:
        time_str: Text beginning with ``H:MM`` or ``HH:MM``.

    Returns:
        ``hours * 60 + minutes``.

    Raises:
        ValueError: If the text does not start with a time, or the minutes
            part is 60 or greater.
    """
    time_str = time_str.strip().replace(".", ":")
    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if not match:
        msg = f"Invalid time format: {time_str}"
        raise ValueError(msg)
    hours, minutes = int(match.group(1)), int(match.group(2))
    # Reject impossible minute values instead of silently rolling them over
    # into the next hour (e.g. '12:75' used to become 13:15).
    if minutes >= 60:  # noqa: PLR2004
        msg = f"Invalid time (minutes must be 00-59): {time_str}"
        raise ValueError(msg)
    return hours * 60 + minutes


def parse_duration(duration_str: str) -> int:
    """Parse a running time into minutes.

    Matching is case-insensitive. The formats are tried in this order:

    1. ``'110 min'`` (also ``'1h 46min'`` / ``'1 hour 46 minutes'``): the
       number before ``min``, plus 60 times an hours value when an ``h``
       token appears earlier in the string.
    2. ``'1h 46m'``, ``'2h'``, ``'106m'``: hours and/or minutes suffixes.
    3. ``'1:46'``: ``H:MM``.
    4. ``'106'``: a bare number of minutes.

    Args:
        duration_str: Text describing the duration.

    Returns:
        The duration in minutes.

    Raises:
        ValueError: If none of the formats match.
    """
    duration_str = duration_str.strip().lower()

    h_match = re.search(r"(\d+)\s*h", duration_str)

    # "X min" format (from Cinema City), optionally preceded by hours.
    min_match = re.search(r"(\d+)\s*min", duration_str)
    if min_match:
        minutes = int(min_match.group(1))
        # Previously the hours part of "1h 46min" was dropped. Only an hours
        # token that comes *before* the minutes is combined, so a string such
        # as "90 min (1h 30m)" still yields 90.
        if h_match and h_match.start() < min_match.start():
            return int(h_match.group(1)) * 60 + minutes
        return minutes

    # "Xh", "Ym" or "Xh Ym" with the short minute suffix.
    m_match = re.search(r"(\d+)\s*m(?!in)", duration_str)
    if h_match or m_match:
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    # Try "H:MM" format
    match = re.match(r"(\d+):(\d{2})", duration_str)
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))

    # Try pure minutes
    match = re.match(r"(\d+)", duration_str)
    if match:
        return int(match.group(1))

    msg = f"Invalid duration format: {duration_str}"
    raise ValueError(msg)


def parse_manual_line(line: str) -> Movie | None:
    """Turn one manually written schedule line into a ``Movie``.

    Expected shape: ``'Movie A, 18:20 or 20:50, 1h 46m'`` - a name, one or
    more start times separated by the word "or" (any case), and a duration.
    Commas after the second one are kept as part of the duration text.

    Args:
        line: One line of input.

    Returns:
        The parsed movie, or ``None`` for a blank line or a ``#`` comment.

    Raises:
        ValueError: If the line has too few comma-separated fields, or a
            time or the duration cannot be parsed.
    """
    text = line.strip()
    if text == "" or text[0] == "#":
        return None

    pieces = text.split(",")
    if len(pieces) < _MIN_MANUAL_LINE_PARTS:
        msg = f"Invalid line format: {text}"
        raise ValueError(msg)

    title, times_field, *duration_fields = pieces

    alternatives = re.split(
        r"\s+or\s+",
        times_field.strip(),
        flags=re.IGNORECASE,
    )
    # Times are parsed before the duration, so a bad time is reported first.
    start_times = list(map(parse_time, alternatives))
    minutes = parse_duration(",".join(duration_fields).strip())

    return Movie(title.strip(), start_times, minutes)


def _try_parse_time(time_str: str) -> int | None:
    """Parse ``time_str`` with ``parse_time`` without raising.

    Returns:
        The value from ``parse_time``, or ``None`` if it raises ``ValueError``.
    """
    try:
        minutes = parse_time(time_str)
    except ValueError:
        return None
    return minutes


def _try_parse_manual_line(
    line: str,
    error_stream: TextIO | None = None,
) -> Movie | None:
    """Parse a manual line, reporting a failure instead of raising.

    Args:
        line: One line in the manual input format.
        error_stream: Where to write a ``Warning: ...`` line when parsing
            raises ``ValueError``; nothing is written when it is ``None``.

    Returns:
        The result of ``parse_manual_line``, or ``None`` on ``ValueError``.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError as exc:
        if error_stream is None:
            return None
        error_stream.write(f"Warning: {exc}\n")
        return None
    return movie


def _try_parse_interactive_line(line: str) -> Movie | None:
    """Parse a line entered interactively, logging the outcome.

    A ``ValueError`` from ``parse_manual_line`` is logged with
    ``logger.exception`` and turned into ``None``. A successfully parsed
    movie is announced with an info-level log message.

    Returns:
        The result of ``parse_manual_line``, or ``None`` on ``ValueError``.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError:
        logger.exception("  Error parsing input")
        return None
    else:
        if movie:
            logger.info("  Added: %s", movie.name)
        return movie


def extract_date_from_html(content: str) -> str | None:
    """Find the first ``YYYY-MM-DD`` date in Cinema City HTML.

    Only years of the form ``202x`` are recognised.

    Args:
        content: Raw HTML text.

    Returns:
        The first matching date string, or ``None`` if there is none.
    """
    found = re.search(r"(202\d-\d{2}-\d{2})", content)
    return found.group(1) if found else None


def parse_cinema_city_html(
    filepath: str,
) -> tuple[list[Movie], str | None]:
    """Read a saved Cinema City schedule page and extract its movies.

    The file is read as UTF-8 and split on the ``class="row movie-row``
    marker. A section contributes a movie only when it has a name, a
    ``N min`` duration, and at least one screening time. When the same name
    occurs more than once, only the first such section is kept.

    Args:
        filepath: Path to the HTML file.

    Returns:
        Tuple of (movies, date), where ``date`` comes from
        ``extract_date_from_html`` and may be ``None``.
    """
    html = Path(filepath).read_text(encoding="utf-8")
    schedule_date = extract_date_from_html(html)

    # Keyed by movie name. Dicts preserve insertion order and setdefault never
    # overwrites, so this keeps the first accepted section per name, in order.
    by_name: dict[str, Movie] = {}

    # The chunk before the first marker precedes any movie, hence [1:].
    for chunk in re.split(r'class="row movie-row', html)[1:]:
        name_hit = re.search(r'qb-movie-name">([^<]+)<', chunk)
        length_hit = re.search(r"(\d+)\s*min", chunk)
        if name_hit is None or length_hit is None:
            continue

        genre_hit = re.search(r'class="mr-sm"[^>]*>([^<]+)<\s*span', chunk)
        genres: list[str] = []
        if genre_hit:
            raw_genres = genre_hit.group(1).strip().split(",")
            genres = [item.strip() for item in raw_genres if item.strip()]

        # Screening-time buttons first, then the alternate pattern.
        shown = re.findall(
            r'btn btn-primary btn-lg">\s*(\d{2}:\d{2})\s*<', chunk
        ) or re.findall(r">\s*(\d{2}:\d{2})\s*\(HTTPS://", chunk)
        if not shown:
            continue

        title = name_hit.group(1).strip()
        # dict.fromkeys drops repeated times while keeping their order.
        start_times = list(dict.fromkeys(parse_time(t) for t in shown))
        by_name.setdefault(
            title,
            Movie(title, start_times, int(length_hit.group(1)), genres),
        )

    return list(by_name.values()), schedule_date


def parse_cinema_city_pdf(filepath: str) -> list[Movie]:
    """Extract the text of a Cinema City PDF schedule and parse it.

    Uses pdfplumber when it was importable; otherwise delegates to
    ``_parse_cinema_city_pdf_basic``.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Movies found by ``parse_cinema_city_text`` in the extracted text.
    """
    if _pdfplumber is None:
        return _parse_cinema_city_pdf_basic(filepath)

    with _pdfplumber.open(filepath) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Pages that yield no text are skipped; each kept page ends with a newline.
    full_text = "".join(chunk + "\n" for chunk in page_texts if chunk)
    return parse_cinema_city_text(full_text)


def _parse_cinema_city_pdf_basic(filepath: str) -> list[Movie]:
    """Extract PDF text with PyMuPDF or the ``pdftotext`` tool and parse it.

    PyMuPDF is used when it was importable. Otherwise the ``pdftotext``
    executable is looked up on ``PATH`` and run with ``-layout``, writing the
    text to stdout.

    If ``pdftotext`` is missing or exits with a non-zero status, the process
    is terminated through ``_exit_no_pdf_support``.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Movies found by ``parse_cinema_city_text`` in the extracted text.
    """
    if _fitz is not None:
        doc = _fitz.open(filepath)
        try:
            full_text = "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Release the document even when extraction fails part-way.
            doc.close()
        return parse_cinema_city_text(full_text)

    pdftotext_path = shutil.which("pdftotext")
    if pdftotext_path is None:
        # Calls sys.exit, so execution does not continue past this point.
        _exit_no_pdf_support()

    try:
        result = subprocess.run(
            [pdftotext_path, "-layout", filepath, "-"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        # Surface the real failure; the install hints alone are misleading
        # when pdftotext is present but could not process this file.
        logger.error(  # noqa: TRY400
            "pdftotext failed (exit code %s): %s",
            exc.returncode,
            (exc.stderr or "").strip(),
        )
        _exit_no_pdf_support()

    return parse_cinema_city_text(result.stdout)


def _exit_no_pdf_support() -> None:
    """Log installation hints for PDF support and terminate the process.

    Each hint is logged at ERROR level, then ``sys.exit(1)`` is called, so
    this function does not return normally.
    """
    hints = (
        "Install pdfplumber, PyMuPDF, or poppler-utils for PDF support",
        "  pip install pdfplumber",
        "  pip install pymupdf",
        "  pacman -S poppler",
    )
    for hint in hints:
        logger.error(hint)
    sys.exit(1)


def parse_cinema_city_text(text: str) -> list[Movie]:
    """Parse movies out of plain text extracted from a Cinema City schedule.

    The text is scanned line by line:

    * A stripped line longer than ``_MIN_TITLE_LENGTH`` characters that
      starts with an upper-case letter and contains only upper-case letters
      (including Polish ones), digits, whitespace and ``: , . - ! ? ( )``
      starts a new movie. Its name is stored in title case.
    * The duration is the first ``N min`` found in the following lines,
      within the window bounded by ``_TITLE_LOOKAHEAD_LINES``. If none is
      found, ``_DEFAULT_MOVIE_DURATION`` is used.
    * Every ``H:MM`` / ``HH:MM`` token on the title line and on later lines,
      up to the next title, is recorded as a start time.

    Movies without any start time are dropped. Repeated start times are
    removed, keeping first-seen order.

    Args:
        text: Schedule text, with lines separated by ``"\\n"``.

    Returns:
        The movies in the order their titles appear.
    """
    title_re = re.compile(
        r"^([A-ZĄĆĘŁŃÓŚŹŻ][A-ZĄĆĘŁŃÓŚŹŻ0-9\s:,\.\-\!\?\(\)]+)$"
    )
    minutes_re = re.compile(r"(\d+)\s*min")
    clock_re = re.compile(r"\b(\d{1,2}:\d{2})\b")

    rows = text.split("\n")
    found: list[Movie] = []

    def _store(
        title: str | None,
        times: list[int],
        minutes: int | None,
    ) -> None:
        """Append the accumulated movie if it has a title and start times."""
        if title and times:
            found.append(
                Movie(
                    title,
                    list(dict.fromkeys(times)),
                    minutes or _DEFAULT_MOVIE_DURATION,
                )
            )

    title: str | None = None
    minutes: int | None = None
    times: list[int] = []

    for idx, raw in enumerate(rows):
        stripped = raw.strip()

        if len(stripped) > _MIN_TITLE_LENGTH and title_re.match(stripped):
            # A new title closes the movie collected so far.
            _store(title, times, minutes)

            title = stripped.title()
            times = []

            # First "N min" in the look-ahead window, or None if absent.
            stop = min(idx + _TITLE_LOOKAHEAD_LINES, len(rows))
            hits = (minutes_re.search(rows[j]) for j in range(idx + 1, stop))
            minutes = next((int(hit.group(1)) for hit in hits if hit), None)

        if title:
            # The title line itself is scanned for times as well.
            parsed = (_try_parse_time(tok) for tok in clock_re.findall(stripped))
            times.extend(value for value in parsed if value is not None)

    # The last movie has no following title to trigger the save.
    _store(title, times, minutes)

    return found