testsAndMisc/python_pkg/steam_backlog_enforcer/_scanning_confidence.py
Krzysztof kuhy Rudnicki 63354a9e2e steam_backlog_enforcer: fix stats command — show real Rush/Leisure/Worst data
Four bugs fixed:
- HLTB search returned 0 results for ~87 games with special chars (™, ®, &,
  standalone -, (Legacy), RHCP, etc.) — add _sanitize_search_name() and
  extend _build_search_variants() with Steam-suffix and edition stripping
- fetch_hltb_detail_missing returned immediately because `app_id not in rush`
  was always False (all keys present with -1) — fix to `rush.get(id,-1) <= 0`
- save_hltb_cache overwrote rush/leisure on confidence-only partial saves —
  now reads existing cache and preserves data when extras dicts are empty
- _filter_qualifying_games excluded 57 games with stale snapshot hours (-1)
  even though HLTB hours cache had valid data — add cache fallback

Result: stats shows Rush 64,670h / Leisure 136,807h / Worst 228,594h for
all 785 qualifying games with full rush+leisure detail.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-28 07:02:48 +02:00

253 lines
8.6 KiB
Python

"""Confidence-checking and candidate-filtering helpers for scanning."""
from __future__ import annotations
import logging
from typing import TYPE_CHECKING
from python_pkg.steam_backlog_enforcer._hltb_types import (
_HLTBExtras,
load_hltb_cache,
load_hltb_count_comp_cache,
load_hltb_polls_cache,
save_hltb_cache,
)
from python_pkg.steam_backlog_enforcer.game_install import _echo
from python_pkg.steam_backlog_enforcer.hltb import fetch_hltb_confidence_cached
if TYPE_CHECKING:
from python_pkg.steam_backlog_enforcer.config import State
from python_pkg.steam_backlog_enforcer.steam_api import GameInfo
logger = logging.getLogger(__name__)
_MIN_COMP_100_POLLS = 3
_MIN_COUNT_COMP = 15
_MIN_CONFIDENCE_SUM = 18
def _apply_cached_confidence_to_candidates(candidates: list[GameInfo]) -> None:
"""Overlay cached confidence counters onto candidate game objects."""
polls_cache = load_hltb_polls_cache()
count_comp_cache = load_hltb_count_comp_cache()
for game in candidates:
if game.app_id in polls_cache:
game.comp_100_count = polls_cache[game.app_id]
if game.app_id in count_comp_cache:
game.count_comp = count_comp_cache[game.app_id]
def _confidence_fail_reasons(game: GameInfo) -> list[str]:
"""Return threshold-failure reasons for a game's HLTB confidence data."""
reasons: list[str] = []
if game.comp_100_count < _MIN_COMP_100_POLLS:
reasons.append(f"comp_100 polls {game.comp_100_count} < {_MIN_COMP_100_POLLS}")
if game.count_comp < _MIN_COUNT_COMP:
reasons.append(f"count_comp {game.count_comp} < {_MIN_COUNT_COMP}")
total = game.comp_100_count + game.count_comp
if total < _MIN_CONFIDENCE_SUM:
reasons.append(f"comp_100+count_comp {total} < {_MIN_CONFIDENCE_SUM}")
return reasons
def _refresh_candidate_confidence(game: GameInfo) -> None:
"""Refresh confidence metrics for one candidate when cache looks stale.
Only refreshes when both metrics are missing (0), which typically means
the game was cached before confidence fields were added.
"""
if game.comp_100_count > 0 or game.count_comp > 0:
return
_refresh_candidate_confidence_batch([game])
def _force_refresh_candidate_confidence(game: GameInfo) -> None:
"""Force-refresh one candidate's confidence metrics from HLTB."""
_refresh_candidate_confidence_batch([game], force=True)
def _refresh_candidate_confidence_batch(
candidates: list[GameInfo],
*,
force: bool = False,
) -> None:
"""Refresh missing confidence metrics for candidates in one HLTB batch.
This prevents O(N) one-game API loops when many snapshot entries predate
confidence fields and therefore have ``comp_100_count==0`` and
``count_comp==0``.
"""
missing = [
game
for game in candidates
if force or (game.comp_100_count == 0 and game.count_comp == 0)
]
if not missing:
return
refresh_slice = missing
if len(refresh_slice) == 1:
game = refresh_slice[0]
_echo(f" Refreshing HLTB confidence for {game.name} (AppID={game.app_id})...")
else:
_echo(f" Refreshing HLTB confidence for {len(refresh_slice)} candidate(s)...")
cache = load_hltb_cache()
polls = load_hltb_polls_cache()
count_comp = load_hltb_count_comp_cache()
app_ids = [game.app_id for game in refresh_slice]
names = [(game.app_id, game.name) for game in refresh_slice]
prior_hours = {aid: cache.get(aid, -1) for aid in app_ids}
for aid in app_ids:
cache.pop(aid, None)
polls.pop(aid, None)
count_comp.pop(aid, None)
save_hltb_cache(cache, polls, _HLTBExtras(count_comp=count_comp))
fetch_hltb_confidence_cached(names)
refreshed_hours = load_hltb_cache()
refreshed_polls = load_hltb_polls_cache()
refreshed_count_comp = load_hltb_count_comp_cache()
for aid, old_hours in prior_hours.items():
if old_hours > 0 and refreshed_hours.get(aid, -1) <= 0:
refreshed_hours[aid] = old_hours
save_hltb_cache(
refreshed_hours, refreshed_polls, _HLTBExtras(count_comp=refreshed_count_comp)
)
for game in refresh_slice:
game.comp_100_count = refreshed_polls.get(game.app_id, 0)
game.count_comp = refreshed_count_comp.get(game.app_id, 0)
def _filter_hltb_confident_candidates(
candidates: list[GameInfo],
) -> list[GameInfo]:
"""Keep only candidates that satisfy HLTB confidence thresholds."""
_refresh_candidate_confidence_batch(candidates)
kept: list[GameInfo] = []
for game in candidates:
reasons = _confidence_fail_reasons(game)
if reasons:
_echo(
f" Skipping {game.name} (AppID={game.app_id}): "
f"HLTB confidence too low ({'; '.join(reasons)})"
)
continue
kept.append(game)
return kept
def _candidate_passes_hltb_confidence(game: GameInfo) -> bool:
"""Return True if candidate passes confidence with cache-first behavior.
Only refreshes when confidence fields are missing (both zero), which keeps
normal runs cache-friendly and avoids repeated refetches for known
low-confidence entries.
"""
reasons = _confidence_fail_reasons(game)
if not reasons:
return True
# Re-check once when confidence fields are missing in cache.
_refresh_candidate_confidence(game)
reasons = _confidence_fail_reasons(game)
if reasons:
_echo(
f" Skipping {game.name} (AppID={game.app_id}): "
f"HLTB confidence too low ({'; '.join(reasons)})"
)
return False
return True
def _backfill_polls_for_finished(
state: State,
games: list[GameInfo],
) -> dict[int, int]:
"""Lazily fetch poll counts for already-finished games missing them.
Reads the polls cache, identifies finished games whose poll count is
still ``0`` (typically because the cache predates the polls schema),
and triggers a one-shot HLTB search to backfill them. Returns the
refreshed polls cache.
"""
polls_cache = load_hltb_polls_cache()
name_by_id = {g.app_id: g.name for g in games}
missing = [
(aid, name_by_id[aid])
for aid in state.finished_app_ids
if aid in name_by_id and polls_cache.get(aid, 0) == 0
]
if not missing:
return polls_cache
logger.info(
"Backfilling HLTB poll counts for %d already-finished games...",
len(missing),
)
# Force a fresh search by removing the hours entries we want to refetch.
# (fetch_hltb_times_cached skips entries already in the hours cache.)
cache = load_hltb_cache()
preserved_hours = {aid: cache[aid] for aid, _ in missing if aid in cache}
for aid, _name in missing:
cache.pop(aid, None)
save_hltb_cache(cache, polls_cache)
fetch_hltb_confidence_cached(missing)
# Restore any previously-known hours that the refetch may have replaced
# with a worse match (we trust prior leisure+dlc estimates).
refreshed_hours = load_hltb_cache()
refreshed_polls = load_hltb_polls_cache()
for aid, prior_hours in preserved_hours.items():
if prior_hours > 0 and refreshed_hours.get(aid, -1) <= 0:
refreshed_hours[aid] = prior_hours
save_hltb_cache(refreshed_hours, refreshed_polls)
return refreshed_polls
def _report_poll_confidence(
chosen: GameInfo,
games: list[GameInfo],
state: State,
) -> None:
"""Print HLTB poll-count confidence info for the just-assigned game.
Shows the chosen game's ``comp_100_count`` (number of polled
completionist times on HowLongToBeat) and the historical minimum
among the user's previously-finished games. Marks a new historical
low so the user can be skeptical of unreliable estimates.
"""
polls_cache = _backfill_polls_for_finished(state, games)
chosen_polls = polls_cache.get(chosen.app_id, chosen.comp_100_count)
chosen.comp_100_count = chosen_polls
finished_polls = [
(polls_cache[aid], aid)
for aid in state.finished_app_ids
if polls_cache.get(aid, 0) > 0
]
if not finished_polls:
_echo(f" HLTB confidence: {chosen_polls} polled completionist times")
return
min_polls, min_aid = min(finished_polls)
name_by_id = {g.app_id: g.name for g in games}
min_name = name_by_id.get(min_aid, f"AppID={min_aid}")
warning = ""
if 0 < chosen_polls < min_polls:
warning = " ⚠ NEW LOW — estimate may be unreliable"
elif chosen_polls == 0:
warning = " ⚠ no polls recorded — estimate may be unreliable"
_echo(f" HLTB confidence: {chosen_polls} polled completionist times{warning}")
_echo(f" Historical min among finished: {min_polls} ({min_name})")