testsAndMisc-archive/python_pkg/cinema_planner/cinema_planner.py

#!/usr/bin/env python3
"""Cinema Day Planner - Maximize movies watched in a day.

Supports:
- Cinema City HTML/PDF schedules (auto-parsed)
- Manual input format

Usage:
    ./cinema_planner.py schedule.html          # Parse Cinema City HTML
    ./cinema_planner.py schedule.pdf           # Parse Cinema City PDF
    ./cinema_planner.py -i                     # Interactive manual input
    ./cinema_planner.py movies.txt             # Manual format file
"""

from __future__ import annotations

import argparse
from contextlib import suppress
from dataclasses import dataclass, field
import importlib
from io import StringIO
import logging
from pathlib import Path
import re
import shutil
import subprocess
import sys
from typing import TYPE_CHECKING, TextIO

if TYPE_CHECKING:
    import types

# Module-level logger; main() configures the root logger's format and level.
logger = logging.getLogger(__name__)

# Default genres to exclude (can be overridden with --all-genres).
# Stored in lowercase because genre names are lowercased before comparison.
DEFAULT_EXCLUDED_GENRES = {"horror"}

# Ads duration before movie starts (Cinema City shows ~15 min of ads)
ADS_DURATION = 15

# Constants for validation and parsing
# Minimum number of comma-separated fields in a manual-format line.
_MIN_MANUAL_LINE_PARTS = 3
# A text line must be strictly longer than this to count as a movie title.
_MIN_TITLE_LENGTH = 3
# Fallback duration (minutes) when the text parser finds no "N min" marker.
_DEFAULT_MOVIE_DURATION = 120
# Exclusive upper offset of the window scanned for a duration after a title.
_TITLE_LOOKAHEAD_LINES = 5
# Width, in characters, of the separator rules in formatted output.
_SEPARATOR_WIDTH = 60


def _try_import(name: str) -> types.ModuleType | None:
    """Import ``name`` dynamically, yielding ``None`` if it cannot be imported.

    Args:
        name: Dotted module name to import.

    Returns:
        The imported module, or ``None`` when the import raises ImportError.
    """
    module: types.ModuleType | None = None
    with suppress(ImportError):
        module = importlib.import_module(name)
    return module


# Optional PDF backends, resolved once at import time. Each name is None when
# the corresponding package cannot be imported.
_pdfplumber = _try_import("pdfplumber")
_fitz = _try_import("fitz")


@dataclass
class Movie:
    """A movie with screening times and metadata."""

    name: str  # title as parsed from the schedule or manual input
    start_times: list[int]  # screening starts, in minutes from midnight
    duration: int  # running time in minutes
    # Genre labels; stays empty when the source provides none.
    genres: list[str] = field(default_factory=list)


@dataclass
class Screening:
    """One concrete showing of a movie, bounded by a start and an end time."""

    movie: str
    start: int  # minutes from midnight
    end: int  # minutes from midnight

    def overlaps(self, other: Screening, buffer: int = 0) -> bool:
        """Report whether this screening clashes with ``other``.

        Two screenings are compatible when one of them ends (plus ``buffer``
        minutes) no later than ADS_DURATION minutes after the other's listed
        start, i.e. the ads block is treated as a grace period.

        Args:
            other: The screening to compare against.
            buffer: Extra minutes required after a screening ends.

        Returns:
            True when neither ordering of the two screenings fits.
        """
        self_fits_first = self.end + buffer <= other.start + ADS_DURATION
        other_fits_first = other.end + buffer <= self.start + ADS_DURATION
        return not self_fits_first and not other_fits_first

    def start_str(self) -> str:
        """Render the start time as HH:MM."""
        hours, minutes = divmod(self.start, 60)
        return f"{hours:02d}:{minutes:02d}"

    def end_str(self) -> str:
        """Render the end time as HH:MM."""
        hours, minutes = divmod(self.end, 60)
        return f"{hours:02d}:{minutes:02d}"


def parse_time(time_str: str) -> int:
    """Parse a time string like '18:20' into minutes from midnight.

    Surrounding whitespace is ignored and '.' is accepted as the separator
    (e.g. '18.20'). Only the leading ``H:MM`` / ``HH:MM`` part is examined.

    Args:
        time_str: Text that starts with a clock time.

    Returns:
        Minutes elapsed since midnight.

    Raises:
        ValueError: If the text does not start with a clock time, or the
            hour/minute values are outside 00:00-23:59.
    """
    time_str = time_str.strip().replace(".", ":")
    match = re.match(r"(\d{1,2}):(\d{2})", time_str)
    if not match:
        msg = f"Invalid time format: {time_str}"
        raise ValueError(msg)
    hours, minutes = int(match.group(1)), int(match.group(2))
    # Reject digit pairs that look like a time but are not one (e.g. "25:99"),
    # instead of silently converting them into a bogus minute count.
    if hours >= 24 or minutes >= 60:
        msg = f"Invalid time format: {time_str}"
        raise ValueError(msg)
    return hours * 60 + minutes


def parse_duration(duration_str: str) -> int:
    """Parse a duration like '1h 46m', '2h 28min', '1:46', '106m' or '110 min'.

    Args:
        duration_str: Human-written duration; case and surrounding
            whitespace are ignored.

    Returns:
        The duration in minutes.

    Raises:
        ValueError: If no supported duration format is recognised.
    """
    duration_str = duration_str.strip().lower()

    # Unit-suffixed forms. Hours and minutes are looked up together so that
    # combined inputs such as "2h 28min" keep their hour part; the minute
    # pattern covers "m", "min" and "minutes" alike.
    h_match = re.search(r"(\d+)\s*h", duration_str)
    m_match = re.search(r"(\d+)\s*m", duration_str)
    if h_match or m_match:
        hours = int(h_match.group(1)) if h_match else 0
        minutes = int(m_match.group(1)) if m_match else 0
        return hours * 60 + minutes

    # Try "H:MM" format
    match = re.match(r"(\d+):(\d{2})", duration_str)
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))

    # Try pure minutes
    match = re.match(r"(\d+)", duration_str)
    if match:
        return int(match.group(1))

    msg = f"Invalid duration format: {duration_str}"
    raise ValueError(msg)


def parse_manual_line(line: str) -> Movie | None:
    """Turn one manual-format line into a Movie.

    The expected shape is ``Title, time [or time ...], duration``, for
    example ``Movie A, 18:20 or 20:50, 1h 46m``.

    Args:
        line: A single line of manual input.

    Returns:
        The parsed Movie, or None for blank lines and ``#`` comments.

    Raises:
        ValueError: If the line has too few comma-separated fields, or a
            time or the duration cannot be parsed.
    """
    stripped = line.strip()
    if stripped == "" or stripped[0] == "#":
        return None

    fields = stripped.split(",")
    if len(fields) < _MIN_MANUAL_LINE_PARTS:
        msg = f"Invalid line format: {stripped}"
        raise ValueError(msg)

    # Everything after the second comma belongs to the duration field.
    title_field, times_field, *duration_fields = fields

    alternatives = re.split(
        r"\s+or\s+", times_field.strip(), flags=re.IGNORECASE
    )
    start_times = list(map(parse_time, alternatives))
    duration = parse_duration(",".join(duration_fields).strip())

    return Movie(title_field.strip(), start_times, duration)


def _try_parse_time(time_str: str) -> int | None:
    """Parse ``time_str`` with parse_time, mapping ValueError to None."""
    with suppress(ValueError):
        return parse_time(time_str)
    return None


def _try_parse_manual_line(
    line: str,
    error_stream: TextIO | None = None,
) -> Movie | None:
    """Parse a manual line without raising on malformed input.

    Args:
        line: A single line of manual input.
        error_stream: Where to write a ``Warning: ...`` message when the
            line is invalid; nothing is written when it is None.

    Returns:
        The parsed Movie, or None when the line is blank, a comment, or
        invalid.
    """
    try:
        movie = parse_manual_line(line)
    except ValueError as exc:
        if error_stream is None:
            return None
        error_stream.write(f"Warning: {exc}\n")
        return None
    else:
        return movie


def _try_parse_interactive_line(line: str) -> Movie | None:
    """Parse one interactively entered line, reporting the outcome via logger.

    Returns:
        The parsed Movie, or None when the line is skipped or invalid.
    """
    movie: Movie | None
    try:
        movie = parse_manual_line(line)
    except ValueError:
        logger.exception("  Error parsing input")
        movie = None
    else:
        # Blank/comment lines parse to None and are not announced.
        if movie is not None:
            logger.info("  Added: %s", movie.name)
    return movie


def extract_date_from_html(content: str) -> str | None:
    """Return the first ``202Y-MM-DD`` date found in the HTML, if any.

    Args:
        content: Raw HTML text of a Cinema City schedule page.

    Returns:
        The matched date string, or None when no such date is present.
    """
    found = re.search(r"(202\d-\d{2}-\d{2})", content)
    return found.group(1) if found else None


def parse_cinema_city_html(
    filepath: str,
) -> tuple[list[Movie], str | None]:
    """Extract movies and the schedule date from a Cinema City HTML page.

    The page is cut at every ``class="row movie-row`` marker and each piece
    is scanned with regular expressions for a title, genres, a duration and
    screening times. Pieces without a title, a duration or any time are
    ignored, and only the first usable piece per title is kept.

    Args:
        filepath: Path of the UTF-8 encoded HTML file.

    Returns:
        Tuple of (movies, date); date is None when the page has no
        recognisable date.
    """
    html_text = Path(filepath).read_text(encoding="utf-8")
    schedule_date = extract_date_from_html(html_text)

    unique_movies: list[Movie] = []
    known_titles: set[str] = set()

    # The text before the first marker precedes any movie, so drop it.
    _, *movie_sections = re.split(r'class="row movie-row', html_text)

    for chunk in movie_sections:
        title_hit = re.search(r'qb-movie-name">([^<]+)<', chunk)
        minutes_hit = re.search(r"(\d+)\s*min", chunk)
        if title_hit is None or minutes_hit is None:
            continue

        # Time buttons first; fall back to the alternate link pattern.
        shown_at = re.findall(
            r'btn btn-primary btn-lg">\s*(\d{2}:\d{2})\s*<', chunk
        ) or re.findall(r">\s*(\d{2}:\d{2})\s*\(HTTPS://", chunk)
        if not shown_at:
            continue

        genre_hit = re.search(
            r'class="mr-sm"[^>]*>([^<]+)<\s*span', chunk
        )
        genre_labels: list[str] = []
        if genre_hit is not None:
            pieces = (
                piece.strip()
                for piece in genre_hit.group(1).strip().split(",")
            )
            genre_labels = [piece for piece in pieces if piece]

        title = title_hit.group(1).strip()
        runtime = int(minutes_hit.group(1))
        # dict.fromkeys removes repeated times while keeping their order.
        start_times = list(dict.fromkeys(map(parse_time, shown_at)))

        # The same movie might appear multiple times; keep the first one.
        if title not in known_titles:
            known_titles.add(title)
            unique_movies.append(
                Movie(title, start_times, runtime, genre_labels),
            )

    return unique_movies, schedule_date


def parse_cinema_city_pdf(filepath: str) -> list[Movie]:
    """Extract the text of a Cinema City PDF schedule and parse it.

    pdfplumber is used when it could be imported; otherwise the work is
    delegated to _parse_cinema_city_pdf_basic.

    Args:
        filepath: Path of the PDF file.

    Returns:
        The movies recognised in the extracted text.
    """
    if _pdfplumber is None:
        return _parse_cinema_city_pdf_basic(filepath)

    with _pdfplumber.open(filepath) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Pages that yield no text are skipped; the rest are newline-terminated.
    extracted = "".join(f"{chunk}\n" for chunk in page_texts if chunk)
    return parse_cinema_city_text(extracted)


def _parse_cinema_city_pdf_basic(filepath: str) -> list[Movie]:
    """Parse a PDF using PyMuPDF, or the ``pdftotext`` executable as fallback.

    Args:
        filepath: Path of the PDF file.

    Returns:
        The movies recognised in the extracted text.

    Raises:
        SystemExit: Via _exit_no_pdf_support when PyMuPDF is unavailable and
            ``pdftotext`` is either missing or exits with an error.
    """
    if _fitz is not None:
        doc = _fitz.open(filepath)
        try:
            full_text = "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Release the document even if text extraction fails part-way.
            doc.close()
        return parse_cinema_city_text(full_text)

    pdftotext_path = shutil.which("pdftotext")
    if pdftotext_path is None:
        # Does not return: the helper terminates the process.
        _exit_no_pdf_support()

    try:
        # "-" as the output argument makes pdftotext write to stdout.
        result = subprocess.run(
            [pdftotext_path, "-layout", filepath, "-"],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError:
        _exit_no_pdf_support()

    return parse_cinema_city_text(result.stdout)


def _exit_no_pdf_support() -> None:
    """Log how to enable PDF support, then terminate with exit status 1."""
    hints = (
        "Install pdfplumber, PyMuPDF, or poppler-utils for PDF support",
        "  pip install pdfplumber",
        "  pip install pymupdf",
        "  pacman -S poppler",
    )
    for hint in hints:
        logger.error(hint)
    sys.exit(1)


def parse_cinema_city_text(text: str) -> list[Movie]:
    """Recover movies from plain text extracted out of a schedule.

    A line written entirely in capitals (Polish letters included) and longer
    than _MIN_TITLE_LENGTH starts a new movie. Its duration is taken from
    the first "N min" marker in the lines right after the title, and every
    clock time seen from the title line onwards is recorded as a screening.
    Movies without any screening time are discarded.

    Args:
        text: Text content of the schedule.

    Returns:
        Movies in the order their titles appear.
    """
    title_re = re.compile(
        r"^([A-ZĄĆĘŁŃÓŚŹŻ][A-ZĄĆĘŁŃÓŚŹŻ0-9\s:,\.\-\!\?\(\)]+)$"
    )
    minutes_re = re.compile(r"(\d+)\s*min")
    clock_re = re.compile(r"\b(\d{1,2}:\d{2})\b")

    rows = text.split("\n")
    found: list[Movie] = []

    title: str | None = None
    runtime: int | None = None
    showtimes: list[int] = []

    def _flush() -> None:
        """Store the movie collected so far, provided it has showtimes."""
        if title and showtimes:
            found.append(Movie(
                title,
                list(dict.fromkeys(showtimes)),
                runtime or _DEFAULT_MOVIE_DURATION,
            ))

    for idx, raw_row in enumerate(rows):
        row = raw_row.strip()

        starts_movie = (
            bool(title_re.match(row)) and len(row) > _MIN_TITLE_LENGTH
        )
        if starts_movie:
            _flush()
            title = row.title()
            showtimes = []

            # The duration is expected within the next few lines.
            window = rows[idx + 1 : idx + _TITLE_LOOKAHEAD_LINES]
            runtime = next(
                (
                    int(hit.group(1))
                    for hit in map(minutes_re.search, window)
                    if hit
                ),
                None,
            )

        if title:
            for token in clock_re.findall(row):
                minutes = _try_parse_time(token)
                if minutes is not None:
                    showtimes.append(minutes)

    # The final movie has no following title line to trigger its flush.
    _flush()
    return found


def find_best_schedule(
    movies: list[Movie],
    buffer: int,
) -> list[list[Screening]]:
    """Enumerate every conflict-free plan that watches the most movies.

    Movies are considered in the given order; for each one the search picks
    one of its screenings or skips the movie, backtracking over all choices.

    Args:
        movies: Candidate movies with their start times and durations.
        buffer: Minutes passed to Screening.overlaps as the required gap.

    Returns:
        All plans of maximal size, each sorted by start time. Empty when no
        movie can be scheduled.
    """
    options_per_movie: list[list[Screening]] = [
        [
            Screening(film.name, begin, begin + film.duration)
            for begin in film.start_times
        ]
        for film in movies
    ]
    total = len(options_per_movie)

    top_size = 0
    winners: list[list[Screening]] = []

    def _explore(position: int, chosen: list[Screening]) -> None:
        nonlocal top_size, winners
        picked = len(chosen)

        if position == total:
            if picked > top_size:
                # Strictly better: earlier winners are obsolete.
                top_size = picked
                winners = [list(chosen)]
            elif picked == top_size and top_size > 0:
                winners.append(list(chosen))
            return

        # Even taking every remaining movie could not reach the best size.
        if picked + (total - position) < top_size:
            return

        for candidate in options_per_movie[position]:
            if all(
                not candidate.overlaps(taken, buffer) for taken in chosen
            ):
                chosen.append(candidate)
                _explore(position + 1, chosen)
                chosen.pop()

        # Branch in which the current movie is left out.
        _explore(position + 1, chosen)

    _explore(0, [])

    ordered_plans: list[list[Screening]] = []
    for plan in winners:
        ordered = list(plan)
        ordered.sort(key=lambda show: show.start)
        ordered_plans.append(ordered)
    return ordered_plans


def _format_single_schedule(
    schedule: list[Screening],
    output: TextIO,
) -> None:
    """Write a numbered listing of one schedule to ``output``.

    Each entry shows the listed start/end, the title, the duration and the
    approximate time the film itself begins (listed start plus
    ADS_DURATION). A positive gap before the next entry is shown as a break.

    Args:
        schedule: Screenings in viewing order.
        output: Stream receiving the text.
    """
    entry_count = len(schedule)
    for position, show in enumerate(schedule, start=1):
        hours, mins = divmod(show.end - show.start, 60)
        film_hour, film_minute = divmod(show.start + ADS_DURATION, 60)
        film_begins = f"{film_hour:02d}:{film_minute:02d}"

        output.write(
            f"  {position}. {show.start_str()} - "
            f"{show.end_str()}  {show.movie}\n"
        )
        output.write(
            f"     Duration: {hours}h {mins}m "
            f"(movie starts ~{film_begins})\n"
        )

        # ``position`` is 1-based, so it indexes the following screening.
        if position < entry_count:
            pause = schedule[position].start - show.end
            if pause > 0:
                output.write(f"     [{pause} min break]\n")
        output.write("\n")


def _format_schedules(
    schedules: list[list[Screening]],
    all_movies: list[str],
    date: str | None = None,
    max_display: int = 5,
    *,
    output: TextIO | None = None,
) -> None:
    """Write a report of the optimal schedules.

    Args:
        schedules: Equally sized optimal schedules.
        all_movies: Names of every movie that was considered; used to list
            the skipped ones when there is exactly one schedule.
        date: Schedule date shown in the heading, if known.
        max_display: Upper bound on the number of schedules printed in full.
        output: Target stream; defaults to sys.stdout.
    """
    stream = sys.stdout if output is None else output
    emit = stream.write

    heavy_rule = "=" * _SEPARATOR_WIDTH
    light_rule = "\u2500" * _SEPARATOR_WIDTH

    if not (schedules and schedules[0]):
        emit("No movies can be scheduled!\n")
        return

    reference = schedules[0]
    total_options = len(schedules)

    if date:
        heading = f"  OPTIMAL CINEMA SCHEDULES - {date}\n"
    else:
        heading = "  OPTIMAL CINEMA SCHEDULES\n"

    emit(f"\n{heavy_rule}\n")
    emit(heading)
    emit(
        f"  {len(reference)} movies, "
        f"{total_options} possible combination(s)\n"
    )
    emit(f"{heavy_rule}\n\n")

    shown = min(total_options, max_display)
    several = total_options > 1
    for number, plan in enumerate(schedules[:shown], 1):
        # Option banners only make sense when there is a choice.
        if several:
            emit(f"{light_rule}\n")
            emit(f"  OPTION {number}:\n")
            emit(f"{light_rule}\n\n")
        _format_single_schedule(plan, stream)

    hidden = total_options - shown
    if hidden > 0:
        emit(f"{light_rule}\n")
        emit(f"  ... and {hidden} more combinations\n")
        emit("  (use -n to show more, e.g., -n 10)\n")
        emit("\n")

    # Skipped movies are reported relative to the first schedule, and only
    # when that schedule is the sole option.
    watched = {show.movie for show in reference}
    skipped = [title for title in all_movies if title not in watched]
    if total_options == 1 and skipped:
        emit(f"{light_rule}\n")
        emit(f"  Skipped movies ({len(skipped)}):\n")
        for title in skipped:
            emit(f"    - {title}\n")
        emit("\n")


def _format_all_movies(
    movies: list[Movie],
    date: str | None = None,
    *,
    output: TextIO | None = None,
) -> None:
    """Write a listing of every parsed movie with its screening times.

    Args:
        movies: Movies to list.
        date: Schedule date shown in the heading, if known.
        output: Target stream; defaults to sys.stdout.
    """
    stream = sys.stdout if output is None else output
    emit = stream.write
    rule = "\u2500" * _SEPARATOR_WIDTH

    if date:
        heading = f"  Parsed {len(movies)} movies for {date}:\n"
    else:
        heading = f"  Parsed {len(movies)} movies:\n"

    emit(f"\n{rule}\n")
    emit(heading)
    emit(f"{rule}\n")

    for film in movies:
        # Times are shown in chronological order as HH:MM.
        clock_labels = [
            "{:02d}:{:02d}".format(*divmod(minute, 60))
            for minute in sorted(film.start_times)
        ]
        times_str = ", ".join(clock_labels)
        genre_str = ""
        if film.genres:
            genre_str = f" [{', '.join(film.genres)}]"
        emit(f"  {film.name} ({film.duration} min){genre_str}\n")
        emit(f"    Times: {times_str}\n")
    emit("\n")


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with every planner option registered."""
    usage_notes = """
Supports Cinema City HTML/PDF schedules (auto-detected).

Manual input format (one movie per line):
    Movie Title, start_time1 [or start_time2 ...], duration

Example:
    Inception, 10:30 or 14:00 or 18:30, 2h 28m
    The Matrix, 12:00 or 16:45, 2h 16m
        """
    parser = argparse.ArgumentParser(
        description="Plan your cinema day to watch as many movies as possible.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )

    # (names, add_argument keyword settings), in the order shown by --help.
    option_specs = [
        (
            ("input_file",),
            {"nargs": "?", "help": "Input file (HTML/PDF/TXT)"},
        ),
        (
            ("-b", "--buffer"),
            {
                "type": int,
                "default": 0,
                "help": "Buffer time between movies in minutes (default: 0)",
            },
        ),
        (
            ("-i", "--interactive"),
            {
                "action": "store_true",
                "help": "Interactive mode - enter movies one by one",
            },
        ),
        (
            ("-l", "--list"),
            {
                "action": "store_true",
                "help": "List all parsed movies without scheduling",
            },
        ),
        (
            ("-s", "--select"),
            {
                "type": str,
                "help": "Comma-separated movie names to include (partial match)",
            },
        ),
        (
            ("-x", "--exclude"),
            {
                "type": str,
                "help": "Comma-separated movie names to exclude (partial match)",
            },
        ),
        (
            ("-g", "--exclude-genre"),
            {
                "type": str,
                "help": "Comma-separated genres to exclude (e.g., 'Horror')",
            },
        ),
        (
            ("--all-genres",),
            {
                "action": "store_true",
                "help": "Include all genres (disable default Horror exclusion)",
            },
        ),
        (
            ("-o", "--output"),
            {
                "type": str,
                "help": "Save schedule to file (default: cinema_plan_DATE.txt)",
            },
        ),
        (
            ("-n", "--max-schedules"),
            {
                "type": int,
                "default": 5,
                "help": "Max schedule options to display (default: 5)",
            },
        ),
        (
            ("-m", "--must-watch"),
            {
                "type": str,
                "help": "Only show schedules containing this movie (partial match)",
            },
        ),
    ]
    for names, settings in option_specs:
        parser.add_argument(*names, **settings)
    return parser


def _load_movies_interactive() -> list[Movie]:
    """Prompt for movies line by line until a blank line or end of input.

    Returns:
        The movies that parsed successfully, in entry order.
    """
    banner = (
        "Enter movies (empty line to finish):",
        "Format: Title, start1 [or start2 ...], duration",
        "Example: Inception, 10:30 or 14:00, 2h 28m",
        "",
    )
    for banner_line in banner:
        logger.info(banner_line)

    collected: list[Movie] = []
    try:
        entry = input("> ")
        while entry.strip():
            movie = _try_parse_interactive_line(entry)
            if movie:
                collected.append(movie)
            entry = input("> ")
    except EOFError:
        # End of input finishes entry just like a blank line does.
        pass
    return collected


def _load_movies_from_file(
    filepath: Path,
) -> tuple[list[Movie], str | None]:
    """Load movies from a file (HTML, PDF, or manual format).

    The parser is chosen by file extension: ``.html``/``.htm`` and ``.pdf``
    use the Cinema City parsers; anything else is read as manual-format
    text, with invalid lines reported on stderr and skipped.

    Args:
        filepath: File to read.

    Returns:
        Tuple of (movies, date); the date is only ever set for HTML input.
    """
    suffix = filepath.suffix.lower()
    logger.info("Parsing: %s", filepath)

    if suffix in {".html", ".htm"}:
        return parse_cinema_city_html(str(filepath))

    if suffix == ".pdf":
        return parse_cinema_city_pdf(str(filepath)), None

    # Read as UTF-8, matching the HTML parser, so that non-ASCII titles do
    # not depend on the platform's locale encoding.
    with filepath.open(encoding="utf-8") as f:
        parsed = (_try_parse_manual_line(line, sys.stderr) for line in f)
        movies = [movie for movie in parsed if movie]
    return movies, None


def _load_movies_from_stdin() -> list[Movie]:
    """Read manual-format lines from standard input until it is exhausted.

    Invalid lines are reported on stderr and skipped.

    Returns:
        The movies that parsed successfully, in input order.
    """
    logger.info("Enter movies (Ctrl+D when done):")
    attempts = (
        _try_parse_manual_line(entry, sys.stderr) for entry in sys.stdin
    )
    # Skipped or invalid lines come back as None and are filtered out.
    return list(filter(None, attempts))


def _split_terms(raw: str) -> list[str]:
    """Split a comma-separated option into lowercase, non-empty terms."""
    stripped = (piece.strip().lower() for piece in raw.split(","))
    return [term for term in stripped if term]


def _filter_movies(
    movies: list[Movie],
    args: argparse.Namespace,
) -> tuple[list[Movie], set[str]]:
    """Apply name and genre filters to movies.

    Name filters (``args.select`` / ``args.exclude``) are case-insensitive
    substring matches against the title. Genre filters compare lowercased
    genre names against the default exclusions (unless ``args.all_genres``)
    plus ``args.exclude_genre``. Empty terms, e.g. from a trailing comma,
    are ignored.

    Args:
        movies: Movies to filter.
        args: Parsed command-line options.

    Returns:
        Tuple of (remaining movies, lowercase genres that were excluded).
    """
    if args.select:
        select_terms = _split_terms(args.select)
        # Without any real term there is nothing to select by; an empty
        # term would otherwise match every title.
        if select_terms:
            movies = [
                m
                for m in movies
                if any(t in m.name.lower() for t in select_terms)
            ]
            logger.info(
                "Selected %d movies matching: %s",
                len(movies),
                args.select,
            )

    if args.exclude:
        exclude_terms = _split_terms(args.exclude)
        movies = [
            m
            for m in movies
            if not any(t in m.name.lower() for t in exclude_terms)
        ]
        logger.info("After name exclusion: %d movies", len(movies))

    excluded_genres: set[str] = set()
    if not args.all_genres:
        excluded_genres.update(DEFAULT_EXCLUDED_GENRES)
    if args.exclude_genre:
        excluded_genres.update(_split_terms(args.exclude_genre))

    if excluded_genres:
        before_count = len(movies)
        movies = [
            m
            for m in movies
            if not any(
                g.lower() in excluded_genres for g in m.genres
            )
        ]
        filtered_count = before_count - len(movies)
        if filtered_count > 0:
            logger.info(
                "Excluded %d movies by genre: %s",
                filtered_count,
                ", ".join(sorted(excluded_genres)),
            )

    return movies, excluded_genres


def _apply_must_watch_filter(
    schedules: list[list[Screening]],
    must_watch: str,
) -> list[list[Screening]]:
    """Keep only the schedules that include the must-watch movie.

    Matching is a case-insensitive substring test on the movie title.

    Args:
        schedules: Candidate schedules.
        must_watch: Text that must occur in at least one scheduled title.

    Returns:
        The matching schedules, or the unchanged input (with a warning
        logged) when none match.
    """
    needle = must_watch.lower()

    def _features_needle(plan: list[Screening]) -> bool:
        return any(needle in show.movie.lower() for show in plan)

    matching = list(filter(_features_needle, schedules))
    if not matching:
        logger.warning(
            "No optimal schedules contain '%s'", must_watch
        )
        logger.warning("Showing all schedules instead.")
        return schedules

    logger.info(
        "Filtered to %d schedules containing '%s'",
        len(matching),
        must_watch,
    )
    return matching


def _output_schedules(
    schedules: list[list[Screening]],
    all_movie_names: list[str],
    schedule_date: str | None,
    args: argparse.Namespace,
    excluded_genres: set[str],
) -> None:
    """Print the schedule report and, when applicable, save it to a file.

    The report is always written to stdout. It is also saved when
    ``args.output`` is given or a schedule date is known; in the latter
    case the file name defaults to ``cinema_plan_<date>.txt``.

    Args:
        schedules: Optimal schedules to report.
        all_movie_names: Names of all movies that were considered.
        schedule_date: Date of the schedule, if known.
        args: Parsed command-line options (``output``, ``buffer``,
            ``max_schedules`` are read).
        excluded_genres: Genres that were filtered out, listed in the file.
    """
    output_buffer = StringIO()
    _format_schedules(
        schedules,
        all_movie_names,
        schedule_date,
        args.max_schedules,
        output=output_buffer,
    )
    schedule_output = output_buffer.getvalue()
    sys.stdout.write(schedule_output)

    if args.output or schedule_date:
        output_file = (
            Path(args.output)
            if args.output
            else Path(f"cinema_plan_{schedule_date}.txt")
        )
        # The report contains non-ASCII separator characters, so the file
        # encoding is fixed instead of relying on the locale default.
        with output_file.open("w", encoding="utf-8") as f:
            f.write(
                f"Generated: {schedule_date or 'unknown date'}\n"
            )
            f.write(f"Movies considered: {len(all_movie_names)}\n")
            f.write(f"Buffer time: {args.buffer} minutes\n")
            if excluded_genres:
                f.write(
                    "Excluded genres: "
                    f"{', '.join(sorted(excluded_genres))}\n"
                )
            f.write(schedule_output)
        logger.info("Schedule saved to: %s", output_file)


def main() -> None:
    """Entry point: load movies, filter them, then list or schedule them."""
    logging.basicConfig(format="%(message)s", level=logging.INFO)
    args = _build_parser().parse_args()

    # Source priority: interactive prompt, then input file, then stdin.
    schedule_date: str | None = None
    movies: list[Movie]
    if args.interactive:
        movies = _load_movies_interactive()
    elif args.input_file:
        source = Path(args.input_file)
        movies, schedule_date = _load_movies_from_file(source)
    else:
        movies = _load_movies_from_stdin()

    if not movies:
        logger.error("No movies found!")
        sys.exit(1)

    movies, excluded_genres = _filter_movies(movies, args)

    if args.list:
        _format_all_movies(movies, schedule_date)
        return

    logger.info(
        "\nOptimizing schedule for %d movies...", len(movies)
    )
    logger.info(
        "Buffer time between movies: %d minutes", args.buffer
    )

    schedules = find_best_schedule(movies, args.buffer)
    if args.must_watch:
        schedules = _apply_must_watch_filter(
            schedules, args.must_watch
        )

    _output_schedules(
        schedules,
        [film.name for film in movies],
        schedule_date,
        args,
        excluded_genres,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()