steam-backlog-enforcer/steam_backlog_enforcer/_web_dataset.py

437 lines
15 KiB
Python
Raw Permalink Normal View History

"""Read-only projection of cached data for the interactive web UI.
Builds a compact, secrets-free dataset from the on-disk caches (snapshot,
HLTB, ProtonDB, state) so a browser UI can filter games and estimate backlog
completion times entirely client-side. This module performs **no network
calls** — it only reads caches that previous ``scan``/``stats`` runs populated.
The projection deliberately emits *every* incomplete, non-current,
non-finished game with its raw HLTB-confidence counters and ProtonDB tiers, so
the client can move its filter thresholds *below* the CLI defaults. The CLI
default thresholds and a parity summary are included so the UI can show
"matches the CLI" and so changes that break parity are easy to spot.
"""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from datetime import datetime, timezone
from typing import Any
from steam_backlog_enforcer._hltb_types import _read_raw_cache
from steam_backlog_enforcer._scanning_confidence import (
_MIN_COMP_100_POLLS,
_MIN_CONFIDENCE_SUM,
_MIN_COUNT_COMP,
)
from steam_backlog_enforcer.config import State, load_snapshot
from steam_backlog_enforcer.protondb import (
MIN_PLAYABLE_TIER,
ProtonDBRating,
_load_cache,
_rating_from_cache,
)
from steam_backlog_enforcer.steam_api import GameInfo
# Hours-per-day presets exposed to the UI; ``build_web_dataset`` copies this
# list into ``WebDefaults.hours_per_day_presets``.
# Mirrors ``_stats._HOURS_PER_DAY_PRESETS`` but mutable/JSON-friendly.
HOURS_PER_DAY_PRESETS = [2.0, 4.0, 6.0, 8.0]
@dataclass
class WebGame:
    """One incomplete candidate game, with raw filterable fields.

    Hour fields use ``-1`` to mean "no data" (matching the cache convention),
    so the client can choose to include or exclude unknown-length games.
    Rows are produced by ``_build_games``.
    """

    # Identity and progress, copied from the snapshot's game record.
    app_id: int
    name: str
    completion_pct: float  # rounded to one decimal by ``_build_games``
    playtime_minutes: int
    # Time estimates in hours; ``-1`` when the HLTB cache has no value.
    rush_hours: float  # cache key "rush_hours"
    leisure_hours: float  # cache key "leisure_100h"
    worst_hours: float  # largest positive estimate, see ``_worst_hours``
    # Raw HLTB-confidence counters; ``0`` when missing from the cache.
    count_comp: int  # cache key "count_comp"
    comp_100_count: int  # cache key "polls"
    hltb_game_id: int  # cache key "hltb_game_id"; ``0`` when missing
    # ProtonDB rating fields, taken from the cached ``ProtonDBRating``.
    protondb_tier: str
    protondb_trending_tier: str
    protondb_score: float
@dataclass
class WebStateInfo:
    """Pace inputs and current-assignment metadata for the UI.

    Instances are built by ``_state_info`` from the enforcer state.
    """

    # Currently assigned game, copied from the state object.
    current_app_id: int | None
    current_game_name: str
    # Number of complete games in the snapshot.
    games_done: int
    # Complete games whose last achievement unlock is on/after the start date.
    games_done_since_start: int
    # Whole days since enforcement started (at least 1 once a start date
    # parses); ``0`` when the start date is missing or unparseable.
    days_elapsed: int
    # Raw start timestamp string, passed through unchanged from the state.
    enforcement_started_at: str
    # ``games_done_since_start / days_elapsed`` rounded to 4 decimals;
    # ``0.0`` when it cannot be computed.
    pace_games_per_day: float
@dataclass
class WebDefaults:
    """The CLI's hardcoded filter thresholds, surfaced as editable defaults.

    ``build_web_dataset`` fills these from the imported threshold constants.
    """

    min_comp_100_polls: int  # from ``_MIN_COMP_100_POLLS``
    min_count_comp: int  # from ``_MIN_COUNT_COMP``
    min_confidence_sum: int  # from ``_MIN_CONFIDENCE_SUM``
    min_playable_tier: str  # from ``protondb.MIN_PLAYABLE_TIER``
    hours_per_day_presets: list[float]  # copy of ``HOURS_PER_DAY_PRESETS``
@dataclass
class DefaultSummary:
    """Totals the CLI ``stats`` command would print at default thresholds.

    Used as a parity oracle: the client's own default-filtered totals must
    reproduce these numbers. Built by ``_default_summary``.
    """

    # Number of rows that pass the default confidence/ProtonDB/has-data filters.
    qualifying: int
    # Sums of the positive hour values over the qualifying rows, each rounded
    # to one decimal place.
    rush_total: float
    leisure_total: float
    worst_total: float
@dataclass
class PaceVsHLTB:
    """Player pace calibrated against HLTB rush/leisure averages.

    Derived from completed games that have HLTB detail data. All ratio /
    interpolation fields use ``-1`` to mean "insufficient data", matching the
    cache convention used elsewhere.

    Fields:
        calibration_count: number of completed games used for calibration.
        ratio_vs_rush: actual_hours / rush_hours across calibration games.
        ratio_vs_leisure: actual_hours / leisure_hours (-1 if no leisure data).
        interpolation_t: position between rush (0.0) and leisure (1.0) speed.
            Negative means faster than rush; >1 means slower than leisure.
            -1 means insufficient data.
        player_style: human-readable style label.
    """

    # NOTE(review): ``-1`` doubles as the "insufficient data" sentinel and as a
    # legitimate (if unlikely) computed value of ``interpolation_t``; consumers
    # cannot tell the two apart -- confirm whether that matters for the UI.
    calibration_count: int
    ratio_vs_rush: float
    ratio_vs_leisure: float
    interpolation_t: float
    player_style: str
@dataclass
class WebDataset:
    """Full payload served to the browser.

    Assembled by ``build_web_dataset`` and converted to a plain dict by
    ``dataset_to_payload``.
    """

    # Every incomplete, non-excluded game row.
    games: list[WebGame]
    # Pace inputs and current-assignment metadata.
    state: WebStateInfo
    # CLI default thresholds the UI starts from.
    defaults: WebDefaults
    # Parity totals computed at the default thresholds.
    default_summary: DefaultSummary
    # ``None`` when no completed game has rush data to calibrate against.
    pace_vs_hltb: PaceVsHLTB | None
    # ISO-8601 UTC timestamp set by ``build_web_dataset``; empty by default.
    generated_at: str = field(default="")
def _worst_hours(game: GameInfo, cache_hours: float, leisure: float) -> float:
"""Replicate ``_stats`` worst-case selection exactly.
worst = max of snapshot completionist hours, the HLTB hours-cache value,
and the leisure-100% time considering only positive values.
"""
snap_hours = game.completionist_hours if game.completionist_hours > 0 else -1
candidates = [v for v in (snap_hours, cache_hours, leisure) if v > 0]
return max(candidates) if candidates else -1.0
def _passes_default_confidence(game: WebGame) -> bool:
    """Check a row against the three CLI HLTB-confidence thresholds.

    Args:
        game: Row whose ``comp_100_count`` and ``count_comp`` are inspected.

    Returns:
        ``True`` only when each counter meets its own minimum and their sum
        meets the combined minimum.
    """
    polls_100 = game.comp_100_count
    completions = game.count_comp
    return (
        polls_100 >= _MIN_COMP_100_POLLS
        and completions >= _MIN_COUNT_COMP
        and polls_100 + completions >= _MIN_CONFIDENCE_SUM
    )
def _has_any_time(game: WebGame) -> bool:
"""True if the game has at least one positive time estimate."""
return game.worst_hours > 0 or game.rush_hours > 0 or game.leisure_hours > 0
def _build_games(games: list[GameInfo], exclude: set[int]) -> list[WebGame]:
    """Turn incomplete, non-excluded games into compact ``WebGame`` rows.

    Reads the HLTB raw cache and the ProtonDB cache once each and performs no
    network access.

    Args:
        games: All games from the snapshot.
        exclude: App IDs to leave out regardless of completion state.

    Returns:
        One row per game that is neither complete nor excluded, in input
        order. Missing HLTB values become ``-1`` (hours) or ``0`` (counters);
        games absent from the ProtonDB cache get a default ``ProtonDBRating``.
    """
    hltb_cache = _read_raw_cache()
    proton_cache = _load_cache()
    candidates = (
        g for g in games if not g.is_complete and g.app_id not in exclude
    )

    projected: list[WebGame] = []
    for game in candidates:
        app_id = game.app_id
        hltb = hltb_cache.get(app_id, {})
        rush_hours = float(hltb.get("rush_hours", -1))
        leisure_hours = float(hltb.get("leisure_100h", -1))
        hltb_hours = float(hltb.get("hours", -1))
        completions = int(hltb.get("count_comp", 0))
        polls_100 = int(hltb.get("polls", 0))
        hltb_id = int(hltb.get("hltb_game_id", 0))

        # The ProtonDB cache is keyed by the stringified app ID.
        proton_key = str(app_id)
        rating: ProtonDBRating
        if proton_key in proton_cache:
            rating = _rating_from_cache(app_id, proton_cache[proton_key])
        else:
            rating = ProtonDBRating(app_id=app_id)

        row = WebGame(
            app_id=app_id,
            name=game.name,
            completion_pct=round(game.completion_pct, 1),
            playtime_minutes=game.playtime_minutes,
            rush_hours=rush_hours,
            leisure_hours=leisure_hours,
            worst_hours=_worst_hours(game, hltb_hours, leisure_hours),
            count_comp=completions,
            comp_100_count=polls_100,
            hltb_game_id=hltb_id,
            protondb_tier=rating.tier,
            protondb_trending_tier=rating.trending_tier,
            protondb_score=rating.score,
        )
        projected.append(row)
    return projected
def _default_qualifying(rows: list[WebGame]) -> list[WebGame]:
    """Filter rows with the CLI defaults (confidence, ProtonDB, has-data).

    Args:
        rows: Candidate rows as produced by ``_build_games``.

    Returns:
        The rows, in input order, that pass the default HLTB-confidence
        thresholds, are playable according to ``ProtonDBRating.is_playable``
        and have at least one positive time estimate.
    """

    def _is_playable(row: WebGame) -> bool:
        # Rebuild a rating from the row's tiers so the playability rule
        # stays defined in one place (``ProtonDBRating``).
        return ProtonDBRating(
            app_id=row.app_id,
            tier=row.protondb_tier,
            trending_tier=row.protondb_trending_tier,
        ).is_playable

    return [
        row
        for row in rows
        if _passes_default_confidence(row)
        and _is_playable(row)
        and _has_any_time(row)
    ]
def _sum_positive(rows: list[WebGame], attr: str) -> float:
"""Sum a positive-only hour attribute across rows (matches ``_sum_hours``)."""
total = sum(getattr(g, attr) for g in rows if getattr(g, attr) > 0)
return round(total, 1)
def _default_summary(rows: list[WebGame]) -> DefaultSummary:
    """Build the CLI parity totals for the default thresholds.

    Args:
        rows: Candidate rows as produced by ``_build_games``.

    Returns:
        A ``DefaultSummary`` holding the number of default-qualifying rows and
        the positive-only rush, leisure and worst-case hour totals over them.
    """
    passing = _default_qualifying(rows)
    rush_total = _sum_positive(passing, "rush_hours")
    leisure_total = _sum_positive(passing, "leisure_hours")
    worst_total = _sum_positive(passing, "worst_hours")
    return DefaultSummary(
        qualifying=len(passing),
        rush_total=rush_total,
        leisure_total=leisure_total,
        worst_total=worst_total,
    )
def count_complete_since_start(games: list[GameInfo], started_at: str) -> int:
    """Count complete games finished on or after the enforcement start.

    A game's finish time is taken to be the latest unlock timestamp among its
    achieved achievements. Games without any positive unlock timestamp are
    skipped -- their completion date is unknown, and they were most likely
    finished before Steam began recording unlock timestamps (i.e. before the
    enforcement period).

    Args:
        games: All games from the snapshot.
        started_at: ISO-8601 start timestamp; may be empty.

    Returns:
        The number of complete games whose latest unlock is at or after
        ``started_at``; ``0`` when ``started_at`` is empty or unparseable.
    """
    if not started_at:
        return 0
    try:
        start_moment = datetime.fromisoformat(started_at)
    except ValueError:
        return 0
    else:
        threshold = int(start_moment.timestamp())

    finished = 0
    for game in games:
        if not game.is_complete:
            continue
        latest_unlock = max(
            (
                ach.unlock_time
                for ach in game.achievements
                if ach.achieved and ach.unlock_time > 0
            ),
            default=None,
        )
        if latest_unlock is not None and latest_unlock >= threshold:
            finished += 1
    return finished
def _state_info(
    state: State, games_done: int, games_done_since_start: int
) -> WebStateInfo:
    """Build pace metadata, mirroring ``_print_pace_scenario`` inputs.

    Args:
        state: Enforcer state; reads ``enforcement_started_at``,
            ``current_app_id`` and ``current_game_name``.
        games_done: Total number of complete games.
        games_done_since_start: Complete games finished since enforcement
            started.

    Returns:
        A ``WebStateInfo``. ``days_elapsed`` is at least 1 once the start
        timestamp parses, otherwise 0; ``pace_games_per_day`` is
        ``games_done_since_start / days_elapsed`` rounded to 4 decimals, or
        ``0.0`` when the start is missing/unparseable or nothing was finished.
    """
    days_elapsed = 0
    pace = 0.0
    if state.enforcement_started_at:
        try:
            started = datetime.fromisoformat(state.enforcement_started_at)
        except ValueError:
            started = None
        if started is not None:
            if started.utcoffset() is None:
                # A naive timestamp cannot be subtracted from an aware "now"
                # (TypeError). Interpret it as local time, the same way
                # ``count_complete_since_start`` does via ``.timestamp()``.
                started = started.astimezone(timezone.utc)
            now = datetime.now(timezone.utc)
            # Clamp to 1 so a same-day (or future) start never divides by zero.
            days_elapsed = max(1, (now - started).days)
            if games_done_since_start > 0:
                pace = round(games_done_since_start / days_elapsed, 4)
    return WebStateInfo(
        current_app_id=state.current_app_id,
        current_game_name=state.current_game_name,
        games_done=games_done,
        games_done_since_start=games_done_since_start,
        days_elapsed=days_elapsed,
        enforcement_started_at=state.enforcement_started_at,
        pace_games_per_day=pace,
    )
def _collect_calibration_pairs(
raw_games: list[GameInfo],
raw_cache: dict[int, dict[str, Any]],
) -> tuple[list[tuple[float, float]], list[tuple[float, float, float]]]:
"""Separate complete games into rush-only and rush+leisure sample sets."""
rush_pairs: list[tuple[float, float]] = []
both_pairs: list[tuple[float, float, float]] = []
for game in raw_games:
if not game.is_complete or game.playtime_minutes <= 0:
continue
entry = raw_cache.get(game.app_id, {})
rush = float(entry.get("rush_hours", -1))
leisure = float(entry.get("leisure_100h", -1))
actual = game.playtime_minutes / 60.0
if rush > 0:
rush_pairs.append((actual, rush))
if rush > 0 and leisure > 0:
both_pairs.append((actual, rush, leisure))
return rush_pairs, both_pairs
def _interpolate_from_both(
both_pairs: list[tuple[float, float, float]],
) -> tuple[float, float]:
"""Return (ratio_vs_leisure, interpolation_t) from (actual, rush, leisure) triples.
Returns -1.0 for interpolation_t when leisure <= rush (degenerate data).
"""
sum_actual = sum(p[0] for p in both_pairs)
sum_rush = sum(p[1] for p in both_pairs)
sum_leisure = sum(p[2] for p in both_pairs)
ratio_vs_leisure = round(sum_actual / sum_leisure, 3)
if sum_leisure > sum_rush:
t = round((sum_actual - sum_rush) / (sum_leisure - sum_rush), 3)
else:
t = -1.0
return ratio_vs_leisure, t
def _classify_player_style(interpolation_t: float, ratio_vs_rush: float) -> str:
"""Map calibration metrics to a player-style label."""
if interpolation_t != -1.0:
if interpolation_t < 0:
return "faster_than_rush"
if interpolation_t <= 1.0:
return "rush_to_leisure"
return "slower_than_leisure"
return "faster_than_rush" if ratio_vs_rush < 1.0 else "unknown"
def compute_pace_vs_hltb(
    raw_games: list[GameInfo],
    raw_cache: dict[int, dict[str, Any]],
) -> PaceVsHLTB | None:
    """Measure the player's pace against HLTB rush/leisure averages.

    Completed games with positive playtime serve as calibration samples.
    Steam playtime includes idle time, so ratios > 1 are expected for most
    players.

    Args:
        raw_games: All games from the snapshot (completed + incomplete).
        raw_cache: The full HLTB cache (from ``_read_raw_cache()``).

    Returns:
        A ``PaceVsHLTB`` when at least one completed game has rush data,
        ``None`` when there is no calibration data at all. The leisure ratio
        and interpolation value are ``-1.0`` when no sample has leisure data.
    """
    rush_pairs, both_pairs = _collect_calibration_pairs(raw_games, raw_cache)
    if not rush_pairs:
        return None

    total_actual = sum(actual for actual, _ in rush_pairs)
    total_rush = sum(rush for _, rush in rush_pairs)
    ratio_vs_rush = round(total_actual / total_rush, 3)

    ratio_vs_leisure, interpolation_t = (
        _interpolate_from_both(both_pairs) if both_pairs else (-1.0, -1.0)
    )
    style = _classify_player_style(interpolation_t, ratio_vs_rush)
    return PaceVsHLTB(
        calibration_count=len(rush_pairs),
        ratio_vs_rush=ratio_vs_rush,
        ratio_vs_leisure=ratio_vs_leisure,
        interpolation_t=interpolation_t,
        player_style=style,
    )
def build_web_dataset(state: State) -> WebDataset:
    """Assemble the full web dataset from on-disk caches (no network calls).

    Args:
        state: The loaded enforcer state (current game, finished IDs, pace).

    Returns:
        A ``WebDataset`` with every incomplete candidate game, the CLI default
        thresholds, and a parity summary. A missing snapshot does not raise --
        it yields an empty game list instead.
    """
    snapshot = load_snapshot()
    raw_games: list[GameInfo] = []
    if snapshot is not None:
        raw_games = [GameInfo.from_snapshot(record) for record in snapshot]

    completed_total = len([g for g in raw_games if g.is_complete])
    completed_since_start = count_complete_since_start(
        raw_games, state.enforcement_started_at
    )

    # Finished games and the current assignment are never offered as candidates.
    excluded = set(state.finished_app_ids)
    if state.current_app_id is not None:
        excluded.add(state.current_app_id)

    rows = _build_games(raw_games, excluded)
    pace_vs_hltb = compute_pace_vs_hltb(raw_games, _read_raw_cache())

    state_info = _state_info(state, completed_total, completed_since_start)
    defaults = WebDefaults(
        min_comp_100_polls=_MIN_COMP_100_POLLS,
        min_count_comp=_MIN_COUNT_COMP,
        min_confidence_sum=_MIN_CONFIDENCE_SUM,
        min_playable_tier=MIN_PLAYABLE_TIER,
        hours_per_day_presets=list(HOURS_PER_DAY_PRESETS),
    )
    summary = _default_summary(rows)
    return WebDataset(
        games=rows,
        state=state_info,
        defaults=defaults,
        default_summary=summary,
        pace_vs_hltb=pace_vs_hltb,
        generated_at=datetime.now(timezone.utc).isoformat(),
    )
def dataset_to_payload(dataset: WebDataset) -> dict[str, Any]:
    """Convert a ``WebDataset`` into a JSON-ready dict.

    Args:
        dataset: The dataset to serialize.

    Returns:
        A plain ``dict`` produced by ``dataclasses.asdict``, with nested
        dataclasses converted recursively.
    """
    payload: dict[str, Any] = asdict(dataset)
    return payload