testsAndMisc/python_pkg/steam_backlog_enforcer/_hltb_detail.py
Krzysztof kuhy Rudnicki 61a9e5dc3c steam_backlog_enforcer: fix library_hider crash on invalid AppIDs + improve HLTB hour extraction
- library_hider.py: add safeHide(ids) JS helper that binary-bisects on failure
  to skip problematic DLC/tool IDs without blocking the entire hide pass
- library_hider.py: increase CDP timeout 30s -> 120s; extract richer CDP error
  details from exceptionDetails/exception.description
- _hltb_detail.py: rewrite _extract_base_leisure_hours() to pick the maximum
  (slowest) time across all platform comp_high values and *_h fields; add
  _platform_comp_high_candidates() helper
2026-05-22 15:59:18 +02:00

306 lines
9.7 KiB
Python

"""Detail page parsing and leisure time / DLC fetching for HLTB."""
from __future__ import annotations
import asyncio
from http import HTTPStatus
import json
import logging
import re
from typing import Any
import aiohttp
from python_pkg.steam_backlog_enforcer._hltb_types import (
_SAVE_INTERVAL,
HLTB_BASE_URL,
MAX_CONCURRENT,
HLTBResult,
ProgressCb,
save_hltb_cache,
)
logger = logging.getLogger(__name__)
_NEXT_DATA_RE = re.compile(
r'<script id="__NEXT_DATA__" type="application/json">(.*?)</script>',
)
def _parse_game_page(html: str) -> dict[str, Any] | None:
"""Extract game data dict from a HLTB game page's __NEXT_DATA__."""
match = _NEXT_DATA_RE.search(html)
if not match:
return None
try:
data = json.loads(match.group(1))
result: dict[str, Any] = data["props"]["pageProps"]["game"]["data"]
except (json.JSONDecodeError, KeyError, TypeError):
return None
return result
def _as_positive_int(value: object) -> int:
    """Convert a HLTB numeric JSON value to a non-negative int.

    Integers are clamped at zero, floats are truncated toward zero and then
    clamped, and strings are parsed with ``int()``.

    ``0`` is returned for everything that cannot be read as a number:
    unsupported types, strings ``int()`` rejects, booleans (a JSON ``true``
    is a flag, not a quantity), and the non-finite floats ``NaN`` and
    ``Infinity`` that ``json.loads`` accepts.
    """
    # ``bool`` is a subclass of ``int``, so it has to be excluded explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float, str)):
        return 0
    try:
        # int() raises ValueError for NaN and for non-integer strings, and
        # OverflowError for positive or negative infinity.
        return max(0, int(value))
    except (ValueError, OverflowError):
        return 0
def _platform_comp_high_candidates(game_data: dict[str, Any]) -> list[int]:
    """Return every positive ``comp_high`` found in ``platformData``.

    ``platformData`` is read from ``game_data``.  An empty list is returned
    when it is not a list.  Entries that are not dicts are skipped, and each
    remaining ``comp_high`` is kept only if ``_as_positive_int`` turns it
    into a value above zero.  Order follows the order of ``platformData``.
    """
    entries = game_data.get("platformData", [])
    if not isinstance(entries, list):
        return []
    return [
        high
        for entry in entries
        if isinstance(entry, dict)
        and (high := _as_positive_int(entry.get("comp_high", 0))) > 0
    ]
def _extract_base_leisure_hours(game_data: dict[str, Any]) -> float:
    """Return the base game's slowest completion time, in hours.

    Only the first record of ``game_data["game"]`` is consulted.  The result
    is built from the largest positive value among:

    * ``comp_high`` of each ``platformData`` entry (collected by
      ``_platform_comp_high_candidates``), and
    * the record's ``comp_main_h``, ``comp_plus_h``, ``comp_100_h`` and
      ``comp_all_h`` fields.

    If none of those is positive, the largest of ``comp_main``,
    ``comp_plus`` and ``comp_100`` is used instead.  The chosen value is
    divided by 3600 and rounded to two decimals.

    ``-1`` is returned when ``game`` is not a non-empty list, when its first
    item is not a dict, or when no positive time is found.
    """
    records = game_data.get("game", [])
    usable = isinstance(records, list) and records and isinstance(records[0], dict)
    if not usable:
        return -1
    record = records[0]

    # Slowest-submission values: per-platform highs first, then the ``_h``
    # fields of the game record.
    high_values = _platform_comp_high_candidates(game_data)
    high_values += [
        secs
        for key in ("comp_main_h", "comp_plus_h", "comp_100_h", "comp_all_h")
        if (secs := _as_positive_int(record.get(key, 0))) > 0
    ]
    seconds = max(high_values, default=0)

    # Nothing positive above: fall back to the average completion times.
    if seconds <= 0:
        seconds = max(
            _as_positive_int(record.get(key, 0))
            for key in ("comp_main", "comp_plus", "comp_100")
        )

    if seconds <= 0:
        return -1
    return round(seconds / 3600, 2)
def _extract_dlc_relationships(game_data: dict[str, Any]) -> list[tuple[int, float]]:
    """List ``(dlc_id, fallback_hours)`` for each DLC in ``relationships``.

    An entry counts as DLC when it is a dict whose ``game_type``, converted
    to a lower-cased string, equals ``"dlc"``.  ``dlc_id`` is the entry's
    ``game_id`` passed through ``_as_positive_int``, so it is ``0`` when the
    id is missing or invalid.  ``fallback_hours`` is the entry's ``comp_100``
    divided by 3600 and rounded to two decimals, or ``0.0`` when that value
    is not positive.

    An empty list is returned when ``relationships`` is not a list.
    """
    related = game_data.get("relationships", [])
    if not isinstance(related, list):
        return []

    def _to_hours(seconds: int) -> float:
        return round(seconds / 3600, 2) if seconds > 0 else 0.0

    return [
        (
            _as_positive_int(item.get("game_id", 0)),
            _to_hours(_as_positive_int(item.get("comp_100", 0))),
        )
        for item in related
        if isinstance(item, dict) and str(item.get("game_type", "")).lower() == "dlc"
    ]
def _extract_leisure_hours(game_data: dict[str, Any]) -> float:
    """Return total leisure hours: the base game plus every listed DLC.

    The base figure comes from ``_extract_base_leisure_hours``.  When it is
    not positive the function returns ``-1``.  Otherwise the fallback hours
    of each pair produced by ``_extract_dlc_relationships`` are added to it
    and the sum is rounded to two decimals.
    """
    base = _extract_base_leisure_hours(game_data)
    if base <= 0:
        return -1

    running = base
    # The DLC ids are not needed here; only the fallback hours are summed.
    for _, dlc_hours in _extract_dlc_relationships(game_data):
        running = running + dlc_hours
    return round(running, 2)
async def _fetch_detail_one(
    sem: asyncio.Semaphore,
    session: aiohttp.ClientSession,
    hltb_game_id: int,
) -> dict[str, Any] | None:
    """Fetch a single HLTB game detail page and parse its data.

    The request for ``{HLTB_BASE_URL}/game/{hltb_game_id}`` is made through
    ``session`` while ``sem`` is held.

    Returns the result of ``_parse_game_page`` for a 200 response.  ``None``
    is returned, and the reason logged at debug level, when the status is
    anything else, when the request fails with an ``aiohttp.ClientError`` or
    a timeout, or when the body cannot be decoded to text.  No exception of
    those kinds leaves this coroutine, so one bad page cannot abort a batch
    that awaits many of these calls together.
    """
    url = f"{HLTB_BASE_URL}/game/{hltb_game_id}"
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"
        ),
        "accept": "text/html",
        "referer": "https://howlongtobeat.com/",
    }
    async with sem:
        try:
            async with session.get(url, headers=headers) as resp:
                if resp.status != HTTPStatus.OK:
                    logger.debug(
                        "HLTB detail fetch for game_id=%d returned HTTP %d",
                        hltb_game_id,
                        resp.status,
                    )
                    return None
                html = await resp.text()
        # UnicodeDecodeError: resp.text() decodes strictly and raises when the
        # body does not match the detected charset.
        except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as exc:
            logger.debug(
                "HLTB detail fetch failed for game_id=%d: %s",
                hltb_game_id,
                exc,
            )
            return None
    # Parsing does no I/O, so it runs after the semaphore slot is released.
    return _parse_game_page(html)
async def _fetch_leisure_times(
    search_results: list[HLTBResult],
    cache: dict[int, float],
    polls: dict[int, int],
    progress_cb: ProgressCb | None,
    count_comp: dict[int, int] | None = None,
) -> None:
    """Fetch detail pages for the search results and record leisure hours.

    Only results whose ``hltb_game_id`` is positive are processed; if there
    are none the function returns immediately.  All detail pages are fetched
    concurrently, then the detail pages of every DLC they reference.

    For each result with a positive leisure figure, the value (with DLC
    overrides applied) is stored on ``result.completionist_hours`` and in
    ``cache[result.app_id]``, so ``cache`` is updated in place.

    ``progress_cb``, when given, is called once per processed result with
    ``(done, total, found, game_name)``.  Every ``_SAVE_INTERVAL`` processed
    results, ``save_hltb_cache`` is called with ``cache``, ``polls`` and
    ``count_comp`` (an empty dict when ``count_comp`` is ``None``).
    """
    comp_counts = {} if count_comp is None else count_comp
    targets = [res for res in search_results if res.hltb_game_id > 0]
    if not targets:
        return

    client_timeout = aiohttp.ClientTimeout(total=30, sock_read=20)
    limiter = asyncio.Semaphore(MAX_CONCURRENT)
    pool = aiohttp.TCPConnector(limit=MAX_CONCURRENT, keepalive_timeout=30)
    total = len(targets)
    hits = 0

    async with aiohttp.ClientSession(timeout=client_timeout, connector=pool) as session:
        pages = await asyncio.gather(
            *[_fetch_detail_one(limiter, session, res.hltb_game_id) for res in targets]
        )
        rels_by_app, dlc_ids = _collect_dlc_relationships(targets, pages)
        dlc_hours = await _fetch_dlc_leisure_hours(limiter, session, dlc_ids)

        paired = zip(targets, pages, strict=False)
        for position, (res, page) in enumerate(paired, start=1):
            # A missing page is treated like a page with no usable time.
            hours = -1 if page is None else _extract_leisure_hours(page)
            if hours > 0:
                hours = _apply_dlc_leisure_overrides(
                    hours,
                    rels_by_app.get(res.app_id, []),
                    dlc_hours,
                )
                res.completionist_hours = hours
                cache[res.app_id] = hours
                hits += 1
            if progress_cb is not None:
                progress_cb(position, total, hits, res.game_name)
            if position % _SAVE_INTERVAL == 0:
                save_hltb_cache(cache, polls, comp_counts)
def _collect_dlc_relationships(
    valid: list[HLTBResult],
    details: list[dict[str, Any] | None],
) -> tuple[dict[int, list[tuple[int, float]]], list[int]]:
    """Gather DLC relationships from the base-game detail responses.

    ``valid`` and ``details`` are walked in parallel; pairs whose detail is
    ``None`` are skipped.

    Returns a 2-tuple: a mapping from each result's ``app_id`` to the
    ``(dlc_id, fallback_hours)`` pairs extracted from its detail data, and a
    sorted list of the distinct positive DLC ids seen across all details.
    """
    rels_by_app: dict[int, list[tuple[int, float]]] = {}
    seen_ids: set[int] = set()
    for res, page in zip(valid, details, strict=False):
        if page is None:
            continue
        rels = _extract_dlc_relationships(page)
        rels_by_app[res.app_id] = rels
        # An id of 0 marks a relationship without a usable game id.
        seen_ids.update(dlc_id for dlc_id, _ in rels if dlc_id > 0)
    return rels_by_app, sorted(seen_ids)
async def _fetch_dlc_leisure_hours(
    sem: asyncio.Semaphore,
    session: aiohttp.ClientSession,
    dlc_ids: list[int],
) -> dict[int, float]:
    """Fetch the detail page of each DLC id and map ids to leisure hours.

    All pages are requested concurrently through ``_fetch_detail_one``,
    sharing ``sem`` and ``session``.  A DLC appears in the returned mapping
    only when its page was fetched and ``_extract_base_leisure_hours``
    produced a positive value for it.  An empty ``dlc_ids`` yields an empty
    dict without any request.
    """
    if not dlc_ids:
        return {}
    pages = await asyncio.gather(
        *[_fetch_detail_one(sem, session, dlc_id) for dlc_id in dlc_ids]
    )
    return {
        dlc_id: hours
        for dlc_id, page in zip(dlc_ids, pages, strict=False)
        if page is not None and (hours := _extract_base_leisure_hours(page)) > 0
    }
def _apply_dlc_leisure_overrides(
    base_hours: float,
    dlc_rels: list[tuple[int, float]],
    dlc_hours_by_id: dict[int, float],
) -> float:
    """Swap fallback DLC hours for detailed hours where those are known.

    ``base_hours`` is the starting total.  For each ``(dlc_id,
    fallback_hours)`` pair in ``dlc_rels`` that has a positive entry in
    ``dlc_hours_by_id``, the difference between that entry and the fallback
    is added to the total.  Pairs without a positive entry leave it
    untouched.  The result is rounded to two decimals.
    """
    total = base_hours
    for dlc_id, fallback in dlc_rels:
        detailed = dlc_hours_by_id.get(dlc_id, -1.0)
        if not detailed > 0:
            continue
        # Add the difference, i.e. remove the fallback and add the detail.
        total = total + (detailed - fallback)
    return round(total, 2)