testsAndMisc-archive/python_pkg/cinema_planner/_cinema_parsing.py
Krzysztof kuhy Rudnicki 78c1d77144 fix: resolve all pre-commit hook failures after file splits
- Remove all # type: ignore and # noqa comments (banned by no-noqa hook)
- Add mypy --disable-error-code flags to pre-commit config for error
  codes previously suppressed by inline comments
- Fix broken imports after ruff auto-removed re-exports:
  steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot
- Re-add re-exports with __all__ in translator.py, screen_lock.py
- Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py
- Fix test failures: keyboard_coop, stockfish_analysis, tag_divider
- Add per-file-ignores for PLC0415 (deferred imports) in 7 files
- Mark shebang scripts as executable
- Add __init__.py for generate_images and repo_explorer packages
- Fix codespell, eslint, ruff-format, prettier issues
- Update copilot-instructions.md with --no-verify ban
2026-03-18 22:20:05 +01:00

332 lines
9.6 KiB
Python

"""Parsing functions for Cinema City schedules and manual input."""
from __future__ import annotations
from dataclasses import dataclass, field
import importlib
import logging
from pathlib import Path
import re
import shutil
import subprocess
import sys
from typing import TYPE_CHECKING, TextIO
if TYPE_CHECKING:
import types
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Constants for validation and parsing
# Minimum number of comma-separated fields a manual line must contain
# (name, times, duration).
_MIN_MANUAL_LINE_PARTS = 3
# A line of extracted text must be strictly longer than this to count as a title.
_MIN_TITLE_LENGTH = 3
# Duration in minutes used when no "<N> min" is found near a title in
# extracted text.
_DEFAULT_MOVIE_DURATION = 120
# Look-ahead bound after a title at line i: the duration scan covers lines
# i+1 .. i+_TITLE_LOOKAHEAD_LINES-1 (clipped to the end of the text).
_TITLE_LOOKAHEAD_LINES = 5
def _try_import(name: str) -> types.ModuleType | None:
"""Attempt to import a module, returning None if unavailable."""
try:
return importlib.import_module(name)
except ImportError:
return None
# Optional PDF backends, resolved once at import time. Each handle is None
# when the corresponding package cannot be imported.
_pdfplumber = _try_import("pdfplumber")
# "fitz" is the module name under which PyMuPDF is imported here.
_fitz = _try_import("fitz")
@dataclass
class Movie:
    """One film together with its screenings.

    Times produced by the parsers in this module are minutes from midnight,
    and durations are in minutes.
    """

    # Display title of the film.
    name: str
    # Start of each screening.
    start_times: list[int]
    # Running time.
    duration: int
    # Genre labels; every instance gets its own fresh empty list by default.
    genres: list[str] = field(default_factory=list)
def parse_time(time_str: str) -> int:
    """Parse a clock time such as '18:20' into minutes from midnight.

    Surrounding whitespace is ignored, a dot is accepted in place of the
    colon ('18.20'), and any text following the minutes is disregarded.

    Args:
        time_str: Text that starts with an ``H:MM`` or ``HH:MM`` time.

    Returns:
        Minutes elapsed since midnight, in the range 0-1439.

    Raises:
        ValueError: If the text does not start with a time, or if the hour
            is above 23 or the minute above 59.
    """
    time_str = time_str.strip().replace(".", ":")
    # The pattern itself enforces the valid ranges (hours 0-23, minutes 0-59),
    # so impossible values such as "18:99" or "25:00" are rejected instead of
    # being converted into a meaningless minute count.
    match = re.match(r"([01]?\d|2[0-3]):([0-5]\d)", time_str)
    if not match:
        msg = f"Invalid time format: {time_str}"
        raise ValueError(msg)
    hours, minutes = int(match.group(1)), int(match.group(2))
    return hours * 60 + minutes
def parse_duration(duration_str: str) -> int:
    """Parse a running time into minutes.

    Supported forms, checked in this order:

    * unit-suffixed: '1h 46m', '106m', '110 min', '2h', '1h 30min'
    * 'H:MM', e.g. '1:46'
    * a bare number of minutes, e.g. '106'

    Args:
        duration_str: The duration text; case and surrounding whitespace
            are ignored.

    Returns:
        The duration in minutes.

    Raises:
        ValueError: If none of the supported forms matches.
    """
    duration_str = duration_str.strip().lower()

    # Hours and minutes are looked up independently and then combined, so
    # that mixed input such as "1h 30min" gives 90 rather than only the
    # minutes part. A bare "m" after the number also covers "min"/"minutes".
    h_match = re.search(r"(\d+)\s*h", duration_str)
    m_match = re.search(r"(\d+)\s*m", duration_str)
    if h_match or m_match:
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    # Try "H:MM" format
    match = re.match(r"(\d+):(\d{2})", duration_str)
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))

    # Try pure minutes
    match = re.match(r"(\d+)", duration_str)
    if match:
        return int(match.group(1))

    msg = f"Invalid duration format: {duration_str}"
    raise ValueError(msg)
def parse_manual_line(line: str) -> Movie | None:
    """Turn one manually written schedule line into a ``Movie``.

    Expected layout: ``name, time [or time ...], duration``, for example
    ``Movie A, 18:20 or 20:50, 1h 46m``. Everything after the second comma
    is treated as the duration text.

    Args:
        line: A single line of input.

    Returns:
        The parsed movie, or ``None`` for blank lines and ``#`` comments.

    Raises:
        ValueError: If the line has fewer than three comma-separated fields,
            or a time or the duration cannot be parsed.
    """
    stripped = line.strip()
    if stripped == "" or stripped.startswith("#"):
        return None

    fields = stripped.split(",")
    if len(fields) < _MIN_MANUAL_LINE_PARTS:
        msg = f"Invalid line format: {stripped}"
        raise ValueError(msg)

    title, showings, *duration_fields = fields
    # Alternative screenings are separated by the word "or" (any case).
    alternatives = re.split(r"\s+or\s+", showings.strip(), flags=re.IGNORECASE)
    start_times = list(map(parse_time, alternatives))
    duration = parse_duration(",".join(duration_fields).strip())
    return Movie(title.strip(), start_times, duration)
def _try_parse_time(time_str: str) -> int | None:
    """Parse ``time_str`` with ``parse_time``, mapping failure to ``None``.

    Args:
        time_str: Candidate time text.

    Returns:
        Whatever ``parse_time`` returns, or ``None`` if it raises ``ValueError``.
    """
    try:
        minutes = parse_time(time_str)
    except ValueError:
        minutes = None
    return minutes
def _try_parse_manual_line(
    line: str,
    error_stream: TextIO | None = None,
) -> Movie | None:
    """Parse a manual line without raising on malformed input.

    Args:
        line: A single line in the manual schedule format.
        error_stream: Where to write a ``Warning: ...`` line when parsing
            fails; nothing is written when it is ``None``.

    Returns:
        The result of ``parse_manual_line``, or ``None`` if it raised
        ``ValueError``.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError as exc:
        if error_stream is not None:
            error_stream.write(f"Warning: {exc}\n")
        movie = None
    return movie
def _try_parse_interactive_line(line: str) -> Movie | None:
    """Parse a line typed in interactive mode, reporting through the logger.

    Args:
        line: A single line in the manual schedule format.

    Returns:
        The result of ``parse_manual_line``, or ``None`` if it raised
        ``ValueError`` (the exception is logged with its traceback).
    """
    try:
        movie = parse_manual_line(line)
    except ValueError:
        logger.exception(" Error parsing input")
        return None
    if not movie:
        # Blank and comment lines yield nothing to report.
        return movie
    logger.info(" Added: %s", movie.name)
    return movie
def extract_date_from_html(content: str) -> str | None:
    """Extract the schedule date from Cinema City HTML.

    The first ``YYYY-MM-DD`` occurrence with a year from 2020 to 2099 is
    taken as the schedule date.

    Args:
        content: Raw HTML text of the schedule page.

    Returns:
        The date string as it appears in the HTML, or ``None`` if no such
        date is present.
    """
    # Years 2020-2099: the lower bound is kept so that older dates elsewhere
    # in the page are still ignored, while the match no longer stops working
    # once the calendar leaves the 2020s.
    match = re.search(r"(20[2-9]\d-\d{2}-\d{2})", content)
    if match:
        return match.group(1)
    return None
def parse_cinema_city_html(
    filepath: str,
) -> tuple[list[Movie], str | None]:
    """Parse a saved Cinema City HTML schedule page.

    The page is cut into per-movie sections at each ``class="row movie-row``
    marker. A section yields a movie only when it has a name, a
    ``<N> min`` duration and at least one screening time. When the same name
    occurs in several sections, only the first such movie is kept.

    Args:
        filepath: Path to the UTF-8 encoded HTML file.

    Returns:
        Tuple of (movies, date), where date is whatever
        ``extract_date_from_html`` finds in the page.
    """
    with Path(filepath).open(encoding="utf-8") as handle:
        html = handle.read()
    schedule_date = extract_date_from_html(html)

    # Keyed by title; dict insertion order keeps the first occurrence of
    # each movie in page order.
    by_name: dict[str, Movie] = {}

    # The chunk before the first marker is page preamble, not a movie.
    for block in re.split(r'class="row movie-row', html)[1:]:
        name_match = re.search(r'qb-movie-name">([^<]+)<', block)
        if name_match is None:
            continue
        duration_match = re.search(r"(\d+)\s*min", block)
        if duration_match is None:
            continue

        # Time buttons first; fall back to the alternate markup only when
        # the primary pattern finds nothing.
        showtimes = re.findall(
            r'btn btn-primary btn-lg">\s*(\d{2}:\d{2})\s*<', block
        ) or re.findall(r">\s*(\d{2}:\d{2})\s*\(HTTPS://", block)
        if not showtimes:
            continue

        genre_match = re.search(r'class="mr-sm"[^>]*>([^<]+)<\s*span', block)
        genres: list[str] = []
        if genre_match:
            raw_genres = genre_match.group(1).strip().split(",")
            genres = [label.strip() for label in raw_genres if label.strip()]

        title = name_match.group(1).strip()
        # dict.fromkeys drops repeated times while keeping their order.
        start_times = list(dict.fromkeys(parse_time(t) for t in showtimes))
        movie = Movie(title, start_times, int(duration_match.group(1)), genres)
        by_name.setdefault(title, movie)

    return list(by_name.values()), schedule_date
def parse_cinema_city_pdf(filepath: str) -> list[Movie]:
    """Parse a Cinema City PDF schedule by extracting its text.

    Uses pdfplumber when it was importable; otherwise defers to
    ``_parse_cinema_city_pdf_basic``.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Movies found by ``parse_cinema_city_text`` in the extracted text.
    """
    if _pdfplumber is None:
        return _parse_cinema_city_pdf_basic(filepath)

    with _pdfplumber.open(filepath) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages with no extractable text are skipped; every other page
        # contributes its text followed by a newline.
        full_text = "".join(text + "\n" for text in page_texts if text)
    return parse_cinema_city_text(full_text)
def _parse_cinema_city_pdf_basic(filepath: str) -> list[Movie]:
    """Extract PDF text with PyMuPDF, or else the ``pdftotext`` executable.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Movies found by ``parse_cinema_city_text`` in the extracted text.

    When neither PyMuPDF nor a ``pdftotext`` executable is available, or
    ``pdftotext`` exits with a non-zero status, ``_exit_no_pdf_support`` is
    called.
    """
    if _fitz is not None:
        doc = _fitz.open(filepath)
        # Close the document even if text extraction fails part-way, so the
        # underlying file handle is not leaked.
        try:
            full_text = "".join(page.get_text() + "\n" for page in doc)
        finally:
            doc.close()
        return parse_cinema_city_text(full_text)

    pdftotext_path = shutil.which("pdftotext")
    if pdftotext_path is None:
        _exit_no_pdf_support()

    try:
        # Argument list with no shell; "-" sends the extracted text to stdout.
        result = subprocess.run(
            [pdftotext_path, "-layout", filepath, "-"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        # NOTE(review): a failing pdftotext run is reported with the same
        # "install a PDF backend" message as a missing tool — confirm that
        # this is intended.
        _exit_no_pdf_support()
    return parse_cinema_city_text(result.stdout)
def _exit_no_pdf_support() -> None:
    """Log how to install a PDF backend, then exit with status 1."""
    install_hints = (
        "Install pdfplumber, PyMuPDF, or poppler-utils for PDF support",
        " pip install pdfplumber",
        " pip install pymupdf",
        " pacman -S poppler",
    )
    for hint in install_hints:
        logger.error(hint)
    sys.exit(1)
def parse_cinema_city_text(text: str) -> list[Movie]:
    """Parse a Cinema City schedule from text extracted out of a PDF.

    A stripped line made only of capital letters (including Polish ones),
    digits, whitespace and basic punctuation, and longer than
    ``_MIN_TITLE_LENGTH``, starts a new movie. Its duration is the first
    ``<N> min`` found in the lines directly after it (bounded by
    ``_TITLE_LOOKAHEAD_LINES``), and every ``H:MM``/``HH:MM`` on the title
    line and on the following lines, up to the next title, is collected as
    a start time.

    Args:
        text: Plain text of the schedule.

    Returns:
        One ``Movie`` per title that has at least one start time, in order
        of appearance. Titles are title-cased, repeated times are dropped,
        and ``_DEFAULT_MOVIE_DURATION`` is used when no duration was found.
    """
    # Patterns for movie titles (all caps, usually)
    title_re = re.compile(
        r"^([A-ZĄĆĘŁŃÓŚŹŻ][A-ZĄĆĘŁŃÓŚŹŻ0-9\s:,\.\-\!\?\(\)]+)$"
    )
    duration_re = re.compile(r"(\d+)\s*min")
    clock_re = re.compile(r"\b(\d{1,2}:\d{2})\b")

    lines = text.split("\n")
    # One entry per title seen: (title, duration or None, start times).
    entries: list[tuple[str, int | None, list[int]]] = []

    for index, raw in enumerate(lines):
        stripped = raw.strip()

        if len(stripped) > _MIN_TITLE_LENGTH and title_re.match(stripped):
            # Slicing clips at the end of the text on its own.
            window = lines[index + 1 : index + _TITLE_LOOKAHEAD_LINES]
            found = next(
                (hit for hit in map(duration_re.search, window) if hit),
                None,
            )
            duration = int(found.group(1)) if found else None
            entries.append((stripped.title(), duration, []))

        if entries:
            # Times always belong to the most recent title, which includes
            # any times printed on the title line itself.
            parsed = map(_try_parse_time, clock_re.findall(stripped))
            entries[-1][2].extend(t for t in parsed if t is not None)

    return [
        Movie(
            title,
            list(dict.fromkeys(times)),
            duration or _DEFAULT_MOVIE_DURATION,
        )
        for title, duration, times in entries
        if times
    ]