steam-backlog-enforcer/steam_backlog_enforcer/_stats.py

"""Backlog completion-time statistics for Steam Backlog Enforcer."""

from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
import logging
import secrets
from typing import TYPE_CHECKING
from urllib.parse import quote_plus

from steam_backlog_enforcer._hltb_types import (
    HLTB_BASE_URL,
    _read_raw_cache,
    load_hltb_cache,
    load_hltb_game_id_cache,
    load_hltb_leisure_100h_cache,
    load_hltb_rush_cache,
)
from steam_backlog_enforcer._scanning_confidence import (
    _apply_cached_confidence_to_candidates,
    _confidence_fail_reasons,
    _refresh_candidate_confidence_batch,
)
from steam_backlog_enforcer._web_dataset import (
    PaceVsHLTB,
    compute_pace_vs_hltb,
    count_complete_since_start,
)
from steam_backlog_enforcer.config import SNAPSHOT_FILE, load_snapshot
from steam_backlog_enforcer.game_install import _echo
from steam_backlog_enforcer.hltb import fetch_hltb_detail_missing
from steam_backlog_enforcer.protondb import (
    ProtonDBRating,
    fetch_protondb_ratings,
)
from steam_backlog_enforcer.steam_api import GameInfo, SteamAPIClient

if TYPE_CHECKING:
    from steam_backlog_enforcer.config import Config, State

logger = logging.getLogger(__name__)

# Daily play-time budgets (hours per day) for which each scenario prints a
# projected completion date.
_HOURS_PER_DAY_PRESETS = (2.0, 4.0, 6.0, 8.0)

# Horizontal rule echoed between sections of the stats report.
_LINE = "─" * 70

# HowLongToBeat search URL prefix; the URL-quoted game name is appended when
# no HLTB game ID is known for the example game.
_HLTB_SEARCH_BASE = "https://howlongtobeat.com/?q="


@dataclass
class _GameTimes:
    """Per-game time estimates for stats display.

    Hour fields use a non-positive value (``-1`` as filled in by
    ``_filter_qualifying_games``) to mean "no data"; consumers test ``> 0``
    before using a value.
    """

    # The library entry these estimates belong to.
    game: GameInfo
    # Largest positive value among the snapshot completionist hours, the HLTB
    # hours cache and ``leisure_100h``; -1 when none of them is positive.
    worst_hours: float
    # Value from the HLTB rush cache; -1 when the game has no entry.
    rush_hours: float
    # Value from the HLTB leisure-100h cache; -1 when the game has no entry.
    leisure_100h: float
    # HLTB game ID from the game-ID cache; 0 when unknown.
    hltb_game_id: int = field(default=0)


def _filter_qualifying_games(
    games: list[GameInfo],
    state: State,
) -> tuple[list[_GameTimes], int, int, int]:
    """Select the incomplete games that count towards the backlog totals.

    Games that are complete, listed in ``state.finished_app_ids`` or equal to
    ``state.current_app_id`` are never candidates.  The remaining candidates
    pass through three gates, in this order:

    1. HLTB confidence (``_confidence_fail_reasons`` must report nothing).
    2. ProtonDB playability (``rating.is_playable``).
    3. At least one positive time estimate.

    Returns:
        (qualified_list, hltb_skipped, linux_skipped, no_data_skipped)
    """
    rush_by_app = load_hltb_rush_cache()
    leisure_by_app = load_hltb_leisure_100h_cache()
    hltb_id_by_app = load_hltb_game_id_cache()
    hours_by_app = load_hltb_cache()

    skip_ids = set(state.finished_app_ids)
    if state.current_app_id is not None:
        skip_ids.add(state.current_app_id)

    pool = [g for g in games if not g.is_complete and g.app_id not in skip_ids]
    _apply_cached_confidence_to_candidates(pool)
    _refresh_candidate_confidence_batch(pool)

    # Gate 1: drop games whose HLTB match is not trusted.
    confident = [g for g in pool if not _confidence_fail_reasons(g)]
    hltb_skipped = len(pool) - len(confident)

    # Only hit ProtonDB when there is something to look up.
    ratings: dict[int, ProtonDBRating] = {}
    if confident:
        ratings = fetch_protondb_ratings([g.app_id for g in confident])

    qualified: list[_GameTimes] = []
    linux_skipped = 0
    no_data_skipped = 0
    for game in confident:
        app_id = game.app_id

        # Gate 2: a game without a rating falls back to a default rating.
        if app_id in ratings:
            rating = ratings[app_id]
        else:
            rating = ProtonDBRating(app_id=app_id)
        if not rating.is_playable:
            linux_skipped += 1
            continue

        rush_h = rush_by_app.get(app_id, -1)
        leisure_h = leisure_by_app.get(app_id, -1)

        # Worst case is the largest positive value among the snapshot
        # completionist hours, the HLTB hours cache (fallback when the
        # snapshot is stale/missing) and the leisure 100% time.
        positive = [
            h
            for h in (
                game.completionist_hours,
                hours_by_app.get(app_id, -1),
                leisure_h,
            )
            if h > 0
        ]
        worst_h = max(positive, default=-1)

        # Gate 3: nothing usable at all for this game.
        if worst_h <= 0 and rush_h <= 0 and leisure_h <= 0:
            no_data_skipped += 1
            continue

        qualified.append(
            _GameTimes(
                game=game,
                worst_hours=worst_h,
                rush_hours=rush_h,
                leisure_100h=leisure_h,
                hltb_game_id=hltb_id_by_app.get(app_id, 0),
            )
        )

    return qualified, hltb_skipped, linux_skipped, no_data_skipped


def _ensure_rush_data(qualified: list[_GameTimes]) -> bool:
    """Trigger an HLTB detail fetch when any qualifying game lacks rush hours.

    Every qualifying game is handed to ``fetch_hltb_detail_missing``, not
    only the ones counted as missing.

    Returns True when a fetch was triggered; the caller should then re-run
    ``_filter_qualifying_games`` to pick up the updated caches.
    """
    lacking = [entry for entry in qualified if entry.rush_hours <= 0]
    if not lacking:
        # Covers both "nothing qualified" and "everything already has data".
        return False

    _echo(
        f"Fetching HLTB detail for {len(lacking)}/{len(qualified)} games missing rush/leisure..."
    )
    fetch_hltb_detail_missing(
        [(entry.game.app_id, entry.game.name) for entry in qualified]
    )
    return True


def _ensure_completed_rush_data(games: list[GameInfo]) -> bool:
    """Fetch rush/leisure detail for completed games used for pace calibration.

    ``_ensure_rush_data`` only sees incomplete qualifying games, so completed
    games with recorded playtime get their own pass here; their data feeds
    ``compute_pace_vs_hltb``.

    Returns True when ``fetch_hltb_detail_missing`` reports at least one
    fetch, False when there was nothing to ask for or nothing was fetched.
    """
    completed_pairs = []
    for game in games:
        if game.is_complete and game.playtime_minutes > 0:
            completed_pairs.append((game.app_id, game.name))

    if not completed_pairs:
        return False

    _echo(
        f"Fetching HLTB detail for {len(completed_pairs)} completed games (pace calibration)..."
    )
    return fetch_hltb_detail_missing(completed_pairs) > 0


def _print_worst_example(entries: list[_GameTimes]) -> None:
    """Echo one randomly chosen game that has a positive worst-case estimate.

    Prints nothing when no entry has ``worst_hours > 0``.  The example ends
    with an HLTB link: the game page when a game ID is known (fetched on
    demand if the entry has none), otherwise a search URL for the game name.
    """
    with_worst = [entry for entry in entries if entry.worst_hours > 0]
    if not with_worst:
        return

    picked = secrets.choice(with_worst)
    game = picked.game
    _echo(f"\n  Example game: {game.name!r}")
    _echo(f"    Worst case: {picked.worst_hours:.1f} h")

    # Optional rows are only shown when the value is known (> 0).
    optional_rows = (
        ("Rush:       ", picked.rush_hours),
        ("Leisure:    ", picked.leisure_100h),
    )
    for prefix, hours in optional_rows:
        if hours > 0:
            _echo(f"    {prefix}{hours:.1f} h")

    game_id = picked.hltb_game_id
    if game_id == 0:
        # On-demand backfill: one search to get the HLTB game ID for this game.
        fetch_hltb_detail_missing([(game.app_id, game.name)])
        game_id = load_hltb_game_id_cache().get(game.app_id, 0)

    if game_id > 0:
        link = f"{HLTB_BASE_URL}/game/{game_id}"
    else:
        link = f"{_HLTB_SEARCH_BASE}{quote_plus(game.name)}"
    _echo(f"    HLTB:       {link}")


def _sum_hours(entries: list[_GameTimes], attr: str) -> tuple[float, int]:
    """Total one time attribute over all entries.

    Returns ``(total_hours, missing_count)``.  Only values greater than zero
    are added; every other entry is tallied in ``missing_count`` so the
    caller can warn that the total is an undercount.  The total is rounded
    to one decimal place.
    """
    values = [getattr(entry, attr) for entry in entries]
    known = [value for value in values if value > 0]

    # Plain left-to-right accumulation starting from a float zero.
    running = 0.0
    for value in known:
        running += value

    return round(running, 1), len(values) - len(known)


def _format_completion_date(hours: float, daily_hours: float) -> str:
    """Format how long ``hours`` of play takes at ``daily_hours`` per day.

    Returns ``"N days (YYYY-MM-DD)"`` where N is the quotient truncated to
    whole days and the date is N days from the current UTC time, or
    ``"N/A"`` when either argument is zero or negative.
    """
    if hours <= 0:
        return "N/A"
    if daily_hours <= 0:
        return "N/A"

    whole_days = int(hours / daily_hours)
    finish = datetime.now(timezone.utc) + timedelta(days=whole_days)
    return f"{whole_days} days ({finish:%Y-%m-%d})"


def _print_scenario(
    label: str,
    total_hours: float,
    missing: int,
    total_games: int,
) -> None:
    """Echo one time scenario: heading, total hours and per-preset dates.

    When ``total_hours`` is not positive only a "no data" line follows the
    heading.  A non-zero ``missing`` adds a note that the total is an
    underestimate.
    """
    _echo(f"\n  {label}")
    if total_hours <= 0:
        _echo("    No data available.")
        return

    note = ""
    if missing:
        note = f"  ({missing}/{total_games} games had no data, hours underestimated)"
    _echo(f"    Total: {total_hours:,.1f} h{note}")

    # One projection line per daily play-time preset.
    for per_day in _HOURS_PER_DAY_PRESETS:
        _echo(
            f"    @ {per_day:.0f} h/day → {_format_completion_date(total_hours, per_day)}"
        )


def _print_pace_scenario(state: State, remaining: int, games_done: int) -> None:
    """Print the pace-based completion estimate.

    Args:
        state: Supplies ``enforcement_started_at``, an ISO-8601 timestamp.
            A timestamp without a UTC offset is interpreted as UTC.
        remaining: Number of games still to finish.
        games_done: Count of games completed ON OR AFTER
            ``state.enforcement_started_at`` (use
            ``count_complete_since_start``).  Pre-enforcement completions
            inflate the rate and are excluded.

    Prints an explanatory message instead of an estimate when the start date
    is missing or unparsable, or when no game has been finished yet.
    """
    _echo("\n  1. AT YOUR CURRENT PACE")
    if not state.enforcement_started_at:
        _echo("    No start date recorded.")
        _echo("    Set enforcement_started_at in state.json (ISO-8601 UTC)")
        _echo("    to enable this estimate.")
        return

    try:
        started = datetime.fromisoformat(state.enforcement_started_at)
    except ValueError:
        _echo(f"    Invalid enforcement_started_at: {state.enforcement_started_at!r}")
        return

    # A timestamp stored without an offset parses as a naive datetime, and
    # subtracting a naive value from the aware ``now`` below raises TypeError.
    # The field is documented to the user as UTC, so treat it as such.
    if started.tzinfo is None or started.utcoffset() is None:
        started = started.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    # Clamp to one day so a same-day (or future) start never divides by zero.
    days_elapsed = max(1, (now - started).days)

    if games_done == 0:
        _echo(f"    Started: {started.strftime('%Y-%m-%d')}")
        _echo("    No games finished yet — pace cannot be estimated.")
        return

    rate = games_done / days_elapsed
    _echo(f"    Started:        {started.strftime('%Y-%m-%d')}")
    _echo(
        f"    Finished:       {games_done} games in {days_elapsed} days (since enforcement start)"
    )
    _echo(
        f"    Pace:           {rate:.4f} games/day  (1 game every {1 / rate:.1f} days)"
    )
    _echo(f"    Remaining:      {remaining} games")

    days_to_go = int(remaining / rate)
    finish = now + timedelta(days=days_to_go)
    _echo(f"    Est. complete:  {days_to_go} days ({finish.strftime('%Y-%m-%d')})")


def _print_player_speed_scenario(
    pace: PaceVsHLTB | None,
    rush_total: float,
    leisure_total: float,
) -> None:
    """Echo the player's calibrated pace and a backlog total scaled to it.

    Without calibration data (``pace`` is None or has no calibration games)
    only a hint is printed.  Otherwise the known ratios and play style are
    listed, followed by a projected backlog total: interpolated between
    ``rush_total`` and ``leisure_total`` when an interpolation factor and
    both totals are available, else ``rush_total`` scaled by the rush ratio.
    """
    _echo(f"\n{_LINE}")
    _echo("\n  5. YOUR PLAY STYLE vs HLTB AVERAGES")

    calibrated = pace is not None and pace.calibration_count != 0
    if not calibrated:
        _echo("    No calibration data available.")
        _echo(
            "    Finish some games (100 % achievements) and re-run 'stats'"
            " to enable this estimate."
        )
        return

    # -1.0 is the "no interpolation factor" sentinel.
    has_t = pace.interpolation_t != -1.0

    _echo(f"\n    Calibration games: {pace.calibration_count}")
    if pace.ratio_vs_rush > 0:
        _echo(f"    vs Rush:           {pace.ratio_vs_rush:.2f}x rush pace")
    if pace.ratio_vs_leisure > 0:
        _echo(f"    vs Leisure:        {pace.ratio_vs_leisure:.2f}x leisure pace")
    if has_t:
        _echo(
            f"    Interpolation t:   {pace.interpolation_t:.3f}"
            "  (0 = rush speed, 1 = leisure speed)"
        )

    readable_styles = {
        "faster_than_rush": "Faster than rush",
        "rush_to_leisure": "Between rush and leisure",
        "slower_than_leisure": "Slower than leisure",
        "unknown": "Unknown",
    }
    # Unrecognised style keys are shown verbatim.
    shown_style = readable_styles.get(pace.player_style, pace.player_style)
    _echo(f"    Play style:        {shown_style}")

    if has_t and rush_total > 0 and leisure_total > 0:
        projected = rush_total + pace.interpolation_t * (leisure_total - rush_total)
    elif pace.ratio_vs_rush > 0 and rush_total > 0:
        projected = rush_total * pace.ratio_vs_rush
    else:
        projected = -1.0

    if not projected > 0:
        return

    _echo(f"\n    Estimated backlog total at your pace: {projected:,.1f} h")
    for per_day in _HOURS_PER_DAY_PRESETS:
        _echo(
            f"    @ {per_day:.0f} h/day → {_format_completion_date(projected, per_day)}"
        )


def _refresh_recently_played_completions(
    games: list[GameInfo],
    config: Config,
) -> list[GameInfo]:
    """Refresh achievement data for incomplete games played since last scan.

    Makes 1 ``GetOwnedGames`` request + 1 ``GetPlayerAchievements`` per
    recently-played incomplete game.  Finds games newly completed since the
    last ``scan`` without re-scanning the whole library.

    The refresh is best-effort: if the snapshot file cannot be stat'ed or the
    owned-games request fails, ``games`` is returned unchanged, and a game
    whose achievement request fails keeps its snapshot data.

    Args:
        games: Games loaded from the snapshot.
        config: Supplies ``steam_api_key`` and ``steam_id`` for the client.

    Returns:
        ``games`` itself when nothing needed refreshing; otherwise a new list
        in which every successfully refreshed game is replaced by an updated
        ``GameInfo`` and all other games are unchanged.
    """
    try:
        snapshot_mtime = SNAPSHOT_FILE.stat().st_mtime
    except OSError:
        return games

    from steam_backlog_enforcer.steam_api import SteamAPIError

    try:
        client = SteamAPIClient(config.steam_api_key, config.steam_id)
        owned_raw = client.get_owned_games()
    except SteamAPIError:
        logger.debug("Steam API unavailable; skipping completion refresh.")
        return games
    last_played_map = {g["appid"]: g.get("rtime_last_played", 0) for g in owned_raw}

    # Only incomplete games played after the snapshot was written can have
    # changed completion status.
    to_refresh = [
        g
        for g in games
        if not g.is_complete and last_played_map.get(g.app_id, 0) > snapshot_mtime
    ]

    if not to_refresh:
        return games

    _echo(
        f"Refreshing {len(to_refresh)} recently-played game(s)"
        " for up-to-date completion status..."
    )

    game_map = {g.app_id: g for g in games}

    def _refresh_one(game: GameInfo) -> GameInfo:
        """Return ``game`` rebuilt from freshly fetched achievements."""
        achievements = client.get_achievement_details(game.app_id)
        if not achievements:
            return game
        unlocked = sum(1 for a in achievements if a.achieved)
        return GameInfo(
            app_id=game.app_id,
            name=game.name,
            total_achievements=len(achievements),
            unlocked_achievements=unlocked,
            playtime_minutes=game.playtime_minutes,
            achievements=achievements,
            completionist_hours=game.completionist_hours,
            comp_100_count=game.comp_100_count,
            count_comp=game.count_comp,
        )

    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = {pool.submit(_refresh_one, g): g for g in to_refresh}
        for future in as_completed(futures):
            try:
                refreshed = future.result()
            except SteamAPIError:
                # ``result()`` re-raises the worker's exception.  One failed
                # request must not abort the whole command; the snapshot
                # entry already in ``game_map`` is kept for this game.
                logger.debug(
                    "Achievement refresh failed for app %s; keeping snapshot data.",
                    futures[future].app_id,
                )
                continue
            game_map[refreshed.app_id] = refreshed

    return list(game_map.values())


def cmd_stats(_config: Config, state: State) -> None:
    """Print the backlog completion-time report.

    Loads the snapshot (bailing out with a hint when there is none),
    refreshes recently played games, makes sure rush/leisure detail is
    fetched for completed and qualifying games, then echoes a header with
    the filter counts followed by five numbered scenarios:

    1. At your current pace (games finished per day since enforcement start).
    2. Rush    — avg comp_100 + DLC.
    3. Leisure — comp_100_h + DLC.
    4. Worst   — max recorded time in any category, with one example game.
    5. Your play style — extrapolated from completed-game calibration.

    Qualifying games are chosen by ``_filter_qualifying_games``.
    """
    snapshot = load_snapshot()
    if snapshot is None:
        _echo("No snapshot found. Run 'scan' first.")
        return

    games = _refresh_recently_played_completions(
        [GameInfo.from_snapshot(row) for row in snapshot], _config
    )
    # Every 100%-achievement game in the library counts as finished here,
    # which is broader than the enforcer-assigned finished_app_ids.
    completed_total = len([g for g in games if g.is_complete])
    # Pace only considers completions on/after the enforcement start date.
    completed_since_start = count_complete_since_start(
        games, state.enforcement_started_at
    )

    # Completed games need rush/leisure data for pace calibration.
    _ensure_completed_rush_data(games)

    filtered = _filter_qualifying_games(games, state)
    if _ensure_rush_data(filtered[0]):
        # Filter again so the freshly fetched rush/leisure values are used.
        filtered = _filter_qualifying_games(games, state)
    qualified, hltb_skip, linux_skip, no_data_skip = filtered
    total_q = len(qualified)

    banner = "═" * 70
    _echo(f"\n{banner}")
    _echo("  BACKLOG COMPLETION ESTIMATES")
    _echo(banner)
    _echo(f"\n  Qualifying games:  {total_q}")
    for count, line in (
        (hltb_skip, f"  HLTB-skipped:      {hltb_skip} (confidence too low)"),
        (linux_skip, f"  Linux-skipped:     {linux_skip} (poor ProtonDB rating)"),
        (no_data_skip, f"  No-data-skipped:   {no_data_skip} (no HLTB hours at all)"),
    ):
        if count:
            _echo(line)

    still_missing_rush = len([e for e in qualified if e.rush_hours <= 0])
    if still_missing_rush:
        _echo(
            f"\n  Note: {still_missing_rush}/{total_q} games still missing"
            " rush/leisure data (HLTB search may not have matched them)."
        )
    elif total_q:
        _echo(
            f"\n  Detail data: rush + leisure available for all {total_q}"
            " qualifying games."
        )

    if state.current_app_id:
        _echo(
            f"\n  Current game:      {state.current_game_name} (excluded from totals)"
        )
    _echo(f"  Finished games:    {completed_total} (excluded from totals)")

    _echo(f"\n{_LINE}")
    _print_pace_scenario(state, total_q, completed_since_start)

    worst_total, worst_missing = _sum_hours(qualified, "worst_hours")
    rush_total, rush_missing = _sum_hours(qualified, "rush_hours")
    leisure_total, leisure_missing = _sum_hours(qualified, "leisure_100h")

    # Scenarios 2-4 share one layout: separator, then the scenario block.
    hour_scenarios = (
        (
            "2. RUSH (avg comp_100 + DLC — typical fast completionist)",
            rush_total,
            rush_missing,
        ),
        (
            "3. LEISURE (comp_100_h + DLC — slow/comfortable 100 %)",
            leisure_total,
            leisure_missing,
        ),
        (
            "4. WORST CASE (max recorded time, any category, + DLC)",
            worst_total,
            worst_missing,
        ),
    )
    for heading, hours, missing in hour_scenarios:
        _echo(f"\n{_LINE}")
        _print_scenario(heading, hours, missing, total_q)
    _print_worst_example(qualified)

    # Read the raw cache only now, after every fetch above has had its chance
    # to update it.
    pace_vs_hltb = compute_pace_vs_hltb(games, _read_raw_cache())
    _print_player_speed_scenario(pace_vs_hltb, rush_total, leisure_total)

    _echo(f"\n{_LINE}\n")