mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 18:03:07 +02:00
- Remove all # type: ignore and # noqa comments (banned by no-noqa hook) - Add mypy --disable-error-code flags to pre-commit config for error codes previously suppressed by inline comments - Fix broken imports after ruff auto-removed re-exports: steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot - Re-add re-exports with __all__ in translator.py, screen_lock.py - Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py - Fix test failures: keyboard_coop, stockfish_analysis, tag_divider - Add per-file-ignores for PLC0415 (deferred imports) in 7 files - Mark shebang scripts as executable - Add __init__.py for generate_images and repo_explorer packages - Fix codespell, eslint, ruff-format, prettier issues - Update copilot-instructions.md with --no-verify ban
332 lines
9.6 KiB
Python
332 lines
9.6 KiB
Python
"""Parsing functions for Cinema City schedules and manual input."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
import importlib
|
|
import logging
|
|
from pathlib import Path
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
from typing import TYPE_CHECKING, TextIO
|
|
|
|
if TYPE_CHECKING:
|
|
import types
|
|
|
|
# Logger shared by every parser in this module.
logger = logging.getLogger(__name__)

# --- Thresholds used while validating and parsing input ---

# A manual line must split into at least this many comma-separated parts.
_MIN_MANUAL_LINE_PARTS = 3

# A text line must be longer than this many characters to count as a title.
_MIN_TITLE_LENGTH = 3

# Duration (minutes) assumed when the schedule text gives none for a movie.
_DEFAULT_MOVIE_DURATION = 120

# Offset bounding how far below a title line the duration search may look.
_TITLE_LOOKAHEAD_LINES = 5
|
|
|
|
|
|
def _try_import(name: str) -> types.ModuleType | None:
    """Import the module called ``name`` if it can be imported.

    Returns:
        The imported module, or ``None`` when the import raises
        ``ImportError``.
    """
    try:
        module = importlib.import_module(name)
    except ImportError:
        return None
    else:
        return module


# Optional PDF backends: each name is ``None`` when its package is unavailable.
_pdfplumber = _try_import("pdfplumber")
_fitz = _try_import("fitz")
|
|
|
|
|
|
@dataclass
class Movie:
    """One film entry produced by the schedule parsers.

    Attributes:
        name: Title of the film.
        start_times: Screening start times, in minutes from midnight.
        duration: Running time in minutes.
        genres: Genre labels; an empty list when none were supplied.
    """

    name: str
    start_times: list[int]
    duration: int
    genres: list[str] = field(default_factory=list)
|
|
|
|
|
|
def parse_time(time_str: str) -> int:
    """Parse a clock time like '18:20' into minutes from midnight.

    Surrounding whitespace is ignored and ``.`` is accepted in place of
    ``:`` (so '18.20' works too). Only the start of the string has to look
    like a time; any text after the minutes is ignored.

    Args:
        time_str: Text beginning with ``H:MM`` or ``HH:MM``.

    Returns:
        Minutes elapsed since midnight, in the range 0..1439.

    Raises:
        ValueError: If the text does not start with a time, or if the hour
            is above 23 or the minutes are above 59.
    """
    time_str = time_str.strip().replace(".", ":")
    # The pattern itself enforces 0-23 hours and 00-59 minutes, so values
    # such as "25:00" or "18:75" are rejected like any other malformed input.
    match = re.match(r"([01]?\d|2[0-3]):([0-5]\d)", time_str)
    if match is None:
        msg = f"Invalid time format: {time_str}"
        raise ValueError(msg)
    hours, minutes = int(match.group(1)), int(match.group(2))
    return hours * 60 + minutes
|
|
|
|
|
|
def parse_duration(duration_str: str) -> int:
    """Parse a running time into minutes.

    Accepted spellings include '1h 46m', '1h 46min', '1:46', '106m',
    '110 min' and a bare number of minutes. Matching is case-insensitive
    and surrounding whitespace is ignored.

    Args:
        duration_str: Human-written duration text.

    Returns:
        The duration in whole minutes.

    Raises:
        ValueError: If no supported duration format is found.
    """
    duration_str = duration_str.strip().lower()

    # Hours immediately followed by minutes ("1h 46min", "1 h 5 m"). This has
    # to run before the "X min" shortcut below, which would otherwise return
    # only the minutes part and silently drop the hours.
    match = re.search(r"(\d+)\s*h[a-z]*\s*(\d+)\s*m", duration_str)
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))

    # Try "X min" format (from Cinema City)
    match = re.search(r"(\d+)\s*min", duration_str)
    if match:
        return int(match.group(1))

    # Stand-alone hour and/or minute markers ("2h", "106m").
    h_match = re.search(r"(\d+)\s*h", duration_str)
    m_match = re.search(r"(\d+)\s*m(?!in)", duration_str)
    if h_match or m_match:
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    # Try "H:MM" format
    match = re.match(r"(\d+):(\d{2})", duration_str)
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))

    # Try pure minutes
    match = re.match(r"(\d+)", duration_str)
    if match:
        return int(match.group(1))

    msg = f"Invalid duration format: {duration_str}"
    raise ValueError(msg)
|
|
|
|
|
|
def parse_manual_line(line: str) -> Movie | None:
    """Turn one manually written schedule line into a ``Movie``.

    The expected shape is ``name, time [or time ...], duration``, for
    example 'Movie A, 18:20 or 20:50, 1h 46m'. Everything after the second
    comma is treated as the duration text.

    Returns:
        The parsed movie, or ``None`` for blank lines and lines that start
        with ``#``.

    Raises:
        ValueError: If the line has too few comma-separated parts, or a
            time or the duration cannot be parsed.
    """
    stripped = line.strip()
    if stripped == "" or stripped.startswith("#"):
        return None

    pieces = stripped.split(",")
    if len(pieces) < _MIN_MANUAL_LINE_PARTS:
        msg = f"Invalid line format: {stripped}"
        raise ValueError(msg)

    title, times_field, *duration_pieces = pieces

    # Alternative screenings are separated by the word "or" (any letter case).
    alternatives = re.split(r"\s+or\s+", times_field.strip(), flags=re.IGNORECASE)
    start_times = [parse_time(alternative) for alternative in alternatives]

    # Re-join so any commas inside the duration text survive the split above.
    duration = parse_duration(",".join(duration_pieces).strip())

    return Movie(title.strip(), start_times, duration)
|
|
|
|
|
|
def _try_parse_time(time_str: str) -> int | None:
    """Parse ``time_str`` with ``parse_time``; give ``None`` if it raises ``ValueError``."""
    try:
        minutes = parse_time(time_str)
    except ValueError:
        return None
    else:
        return minutes
|
|
|
|
|
|
def _try_parse_manual_line(
    line: str,
    error_stream: TextIO | None = None,
) -> Movie | None:
    """Parse a manual line without raising on malformed input.

    Args:
        line: The raw line to hand to ``parse_manual_line``.
        error_stream: Optional stream that receives a ``Warning: ...`` line
            when parsing fails; nothing is written when it is ``None``.

    Returns:
        The parsed movie, or ``None`` when the line is skipped or invalid.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError as exc:
        movie = None
        if error_stream is not None:
            error_stream.write(f"Warning: {exc}\n")
    return movie
|
|
|
|
|
|
def _try_parse_interactive_line(line: str) -> Movie | None:
    """Parse one line typed in interactive mode, reporting through the logger.

    A parse failure is logged with its traceback and yields ``None``; a
    successfully parsed movie is announced at info level before being
    returned.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError:
        logger.exception(" Error parsing input")
        movie = None
    else:
        if movie is not None:
            logger.info(" Added: %s", movie.name)
    return movie
|
|
|
|
|
|
def extract_date_from_html(content: str) -> str | None:
    """Return the first ``202x-MM-DD`` style date found in ``content``.

    Only the digit layout is checked (a year starting with 202); the month
    and day values are not validated. Returns ``None`` when nothing matches.
    """
    found = re.search(r"(202\d-\d{2}-\d{2})", content)
    return found.group(1) if found else None
|
|
|
|
|
|
def parse_cinema_city_html(
    filepath: str,
) -> tuple[list[Movie], str | None]:
    """Read a saved Cinema City schedule page and pull out its movies.

    A section is kept only when it has a movie name, an ``N min`` duration
    and at least one screening time. When the same title appears more than
    once, the first kept occurrence wins.

    Args:
        filepath: Path to the HTML file; it is read as UTF-8.

    Returns:
        Tuple of (movies, date), where ``date`` is the result of
        ``extract_date_from_html`` for the whole page.
    """
    html = Path(filepath).read_text(encoding="utf-8")
    schedule_date = extract_date_from_html(html)

    # Keyed by title: insertion order is kept and later duplicates are ignored.
    movies_by_name: dict[str, Movie] = {}

    # Whatever precedes the first movie-row marker is not a movie section.
    _, *movie_sections = re.split(r'class="row movie-row', html)

    for section in movie_sections:
        title_match = re.search(r'qb-movie-name">([^<]+)<', section)
        if title_match is None:
            continue

        length_match = re.search(r"(\d+)\s*min", section)
        if length_match is None:
            continue

        genre_names: list[str] = []
        genre_match = re.search(r'class="mr-sm"[^>]*>([^<]+)<\s*span', section)
        if genre_match is not None:
            stripped_pieces = (piece.strip() for piece in genre_match.group(1).split(","))
            genre_names = [piece for piece in stripped_pieces if piece]

        # Time buttons first; fall back to the alternate link-style pattern.
        raw_times = re.findall(
            r'btn btn-primary btn-lg">\s*(\d{2}:\d{2})\s*<', section
        ) or re.findall(r">\s*(\d{2}:\d{2})\s*\(HTTPS://", section)
        if not raw_times:
            continue

        # dict.fromkeys drops repeated times while keeping first-seen order.
        unique_starts = list(dict.fromkeys(parse_time(raw) for raw in raw_times))
        title = title_match.group(1).strip()
        movies_by_name.setdefault(
            title,
            Movie(title, unique_starts, int(length_match.group(1)), genre_names),
        )

    return list(movies_by_name.values()), schedule_date
|
|
|
|
|
|
def parse_cinema_city_pdf(filepath: str) -> list[Movie]:
    """Extract the text of a Cinema City PDF schedule and parse it.

    Uses pdfplumber when it was importable; otherwise defers to
    ``_parse_cinema_city_pdf_basic``.
    """
    if _pdfplumber is None:
        return _parse_cinema_city_pdf_basic(filepath)

    with _pdfplumber.open(filepath) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages with no extractable text contribute nothing to the result.
        combined = "".join(page_text + "\n" for page_text in page_texts if page_text)
        return parse_cinema_city_text(combined)
|
|
|
|
|
|
def _parse_cinema_city_pdf_basic(filepath: str) -> list[Movie]:
    """Parse a PDF schedule without pdfplumber.

    Tries PyMuPDF (``fitz``) first when it was importable, otherwise runs
    the ``pdftotext`` command-line tool. If neither is usable, or
    ``pdftotext`` exits with an error, the process is terminated through
    ``_exit_no_pdf_support``.

    Args:
        filepath: Path to the PDF file.

    Returns:
        Movies parsed from the extracted text.
    """
    if _fitz is not None:
        doc = _fitz.open(filepath)
        try:
            full_text = "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Close the document even when text extraction fails part-way.
            doc.close()
        return parse_cinema_city_text(full_text)

    pdftotext_path = shutil.which("pdftotext")
    if pdftotext_path is None:
        _exit_no_pdf_support()

    try:
        result = subprocess.run(
            [pdftotext_path, "-layout", filepath, "-"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        # Record the real failure first: the generic install hint alone would
        # hide that pdftotext is present but could not process this file.
        logger.exception("pdftotext failed for %s", filepath)
        _exit_no_pdf_support()

    return parse_cinema_city_text(result.stdout)
|
|
|
|
|
|
def _exit_no_pdf_support() -> None:
    """Log how to enable PDF parsing, then exit the process with status 1."""
    install_hints = (
        "Install pdfplumber, PyMuPDF, or poppler-utils for PDF support",
        " pip install pdfplumber",
        " pip install pymupdf",
        " pacman -S poppler",
    )
    for hint in install_hints:
        logger.error(hint)
    sys.exit(1)
|
|
|
|
|
|
def _build_text_movie(
    name: str | None,
    times: list[int],
    duration: int | None,
) -> Movie | None:
    """Build a Movie from accumulated parser state, or None if it is incomplete."""
    if not name or not times:
        return None
    # dict.fromkeys removes repeated times while keeping first-seen order.
    unique_times = list(dict.fromkeys(times))
    return Movie(name, unique_times, duration or _DEFAULT_MOVIE_DURATION)


def _find_duration_after(
    lines: list[str],
    title_index: int,
    pattern: re.Pattern[str],
) -> int | None:
    """Return the first duration matched in the lines just below a title line."""
    stop = min(title_index + _TITLE_LOOKAHEAD_LINES, len(lines))
    for candidate in lines[title_index + 1 : stop]:
        found = pattern.search(candidate)
        if found:
            return int(found.group(1))
    return None


def parse_cinema_city_text(text: str) -> list[Movie]:
    """Parse Cinema City schedule from extracted text.

    A line made up only of capital letters (including Polish ones), digits,
    whitespace and basic punctuation, and longer than ``_MIN_TITLE_LENGTH``
    characters, starts a new movie. Its duration is taken from the first
    ``N min`` match in the lines that follow, within the window bounded by
    ``_TITLE_LOOKAHEAD_LINES``. Every ``H:MM``-looking token seen from the
    title line onwards is collected as a screening time until the next
    title.

    Args:
        text: Plain text extracted from a schedule.

    Returns:
        Movies that collected at least one screening time, in order of
        appearance. Titles are title-cased, repeated times are dropped, and
        ``_DEFAULT_MOVIE_DURATION`` is used when no duration was found.
    """
    movies: list[Movie] = []
    lines = text.split("\n")

    current_movie: str | None = None
    current_duration: int | None = None
    current_times: list[int] = []

    # Patterns for movie titles (all caps, usually)
    movie_title_pattern = re.compile(
        r"^([A-ZĄĆĘŁŃÓŚŹŻ][A-ZĄĆĘŁŃÓŚŹŻ0-9\s:,\.\-\!\?\(\)]+)$"
    )
    duration_pattern = re.compile(r"(\d+)\s*min")
    time_pattern = re.compile(r"\b(\d{1,2}:\d{2})\b")

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()

        if movie_title_pattern.match(line) and len(line) > _MIN_TITLE_LENGTH:
            # A new title closes the movie collected so far.
            finished = _build_text_movie(current_movie, current_times, current_duration)
            if finished is not None:
                movies.append(finished)

            current_movie = line.title()
            current_times = []
            current_duration = _find_duration_after(lines, i, duration_pattern)

        if current_movie:
            for candidate in time_pattern.findall(line):
                parsed = _try_parse_time(candidate)
                if parsed is not None:
                    current_times.append(parsed)

    # Save last movie
    finished = _build_text_movie(current_movie, current_times, current_duration)
    if finished is not None:
        movies.append(finished)

    return movies
|