fix: HLTB count_comp=0 bug, false matches, and ProtonDB log noise

- Extract count_comp from detail page in _apply_detail_to_extras so the
  all-playstyles completion count is populated even when the search API
  returns 0 (Mini Ghost: 0 → 69, now passes confidence thresholds)
- Fix _refresh_candidate_confidence to trigger re-fetch when count_comp==0
  even if comp_100_count>0 (was silently skipping stale partial entries)
- Filter colon-stripped fallback candidates (e.g. "Vox Populi" from
  "Vox Populi: Poland 2023") to full-edition or exact matches only,
  preventing cross-franchise false positives
- Demote "All N ProtonDB ratings found in cache" log to DEBUG to remove
  per-game noise from the scan output

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Krzysztof kuhy Rudnicki 2026-05-28 20:40:51 +02:00
parent 551b8a4f95
commit b8bd8459e4
5 changed files with 137 additions and 22 deletions

View File

@ -232,6 +232,29 @@ def _process_game_detail(
return leisure, rush_h, l100 return leisure, rush_h, l100
def _apply_detail_to_extras(
    app_id: int,
    game_data: dict[str, Any],
    dlc_rels: list[tuple[int, float]],
    dlc_hours_by_id: dict[int, float],
    extras: _HLTBExtras,
) -> float:
    """Record detail-page metrics for one game on ``extras``; return leisure hours.

    The detail payload is run through ``_process_game_detail`` and the
    resulting rush / 100%-leisure hours are stored under ``app_id`` when they
    are positive. The detail page's ``count_comp`` (all-playstyles completion
    count) is also copied over when positive, because the search API
    sometimes reports ``count_comp=0`` for games whose detail page does carry
    the count.

    Args:
        app_id: Key under which values are stored in the ``extras`` mappings.
        game_data: Detail payload; its ``"game"`` entry is expected to be a
            list whose first element is a dict describing the game.
        dlc_rels: Passed through unchanged to ``_process_game_detail``.
        dlc_hours_by_id: Passed through unchanged to ``_process_game_detail``.
        extras: Mutated in place (``rush``, ``leisure_100h``, ``count_comp``).

    Returns:
        The leisure value produced by ``_process_game_detail``, unchanged.
        Callers treat a non-positive value (reportedly -1) as "not found".
    """
    leisure, rush_h, l100 = _process_game_detail(game_data, dlc_rels, dlc_hours_by_id)
    if rush_h > 0:
        extras.rush[app_id] = rush_h
    if l100 > 0:
        extras.leisure_100h[app_id] = l100

    # Validate the payload shape all the way down: a "game" list whose first
    # element is not a dict (e.g. None) must be ignored rather than raise
    # AttributeError and abort processing of the remaining games.
    games_list = game_data.get("game")
    first_game = games_list[0] if isinstance(games_list, list) and games_list else None
    if isinstance(first_game, dict):
        count_comp_detail = _as_positive_int(first_game.get("count_comp", 0))
        # Only overwrite when the detail page actually has a count, so an
        # existing value is never replaced by 0.
        if count_comp_detail > 0:
            extras.count_comp[app_id] = count_comp_detail
    return leisure
async def _fetch_leisure_times( async def _fetch_leisure_times(
search_results: list[HLTBResult], search_results: list[HLTBResult],
cache: dict[int, float], cache: dict[int, float],
@ -279,17 +302,13 @@ async def _fetch_leisure_times(
done += 1 done += 1
if game_data is not None: if game_data is not None:
dlc_rels = dlc_relationships_by_app.get(r.app_id, []) dlc_rels = dlc_relationships_by_app.get(r.app_id, [])
leisure, rush_h, l100 = _process_game_detail( leisure = _apply_detail_to_extras(
game_data, dlc_rels, dlc_hours_by_id r.app_id, game_data, dlc_rels, dlc_hours_by_id, extras
) )
if leisure > 0: if leisure > 0:
r.completionist_hours = leisure r.completionist_hours = leisure
cache[r.app_id] = leisure cache[r.app_id] = leisure
found += 1 found += 1
if rush_h > 0:
extras.rush[r.app_id] = rush_h
if l100 > 0:
extras.leisure_100h[r.app_id] = l100
if progress_cb is not None: if progress_cb is not None:
progress_cb(done, total, found, r.game_name) progress_cb(done, total, found, r.game_name)

View File

@ -32,6 +32,13 @@ from steam_backlog_enforcer._hltb_types import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Hours multiplier for the "dominance" path in _resolve_exact_vs_extended:
# when the extended entry ("Name: Subtitle" / "Name - Subtitle") has at least
# this many times the hours of the exact-name match, prefer it even if its
# confidence count is lower than the exact match's.
_EXTENDED_DOMINANCE_RATIO = 4.0
# Minimum combined confidence (comp_100_count + count_comp) the extended entry
# must have for the dominance path to apply; avoids picking entries that have
# almost no data at all.
_EXTENDED_MIN_CONFIDENCE = 3
# ────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────
# HLTB API setup (done once, not per-request like the library) # HLTB API setup (done once, not per-request like the library)
@ -326,12 +333,14 @@ def _find_best_extended(
) -> tuple[dict[str, Any], float] | None: ) -> tuple[dict[str, Any], float] | None:
"""Find best extended entry ("Name: Subtitle" / "Name - Subtitle"). """Find best extended entry ("Name: Subtitle" / "Name - Subtitle").
Skips subset entries (prologue, demo, etc.). Skips subset entries (prologue, demo, etc.). Compilations ("compil")
are included because HLTB classifies multi-chapter collections that
share the base title as compilations (e.g. "FAITH: The Unholy Trinity").
""" """
best: tuple[dict[str, Any], float] | None = None best: tuple[dict[str, Any], float] | None = None
for entry, sim in usable: for entry, sim in usable:
game_type = str(entry.get("game_type", "")).lower() game_type = str(entry.get("game_type", "")).lower()
if game_type not in ("", "game"): if game_type not in ("", "game", "compil"):
continue continue
entry_name = (entry.get("game_name") or "").lower() entry_name = (entry.get("game_name") or "").lower()
if entry_name.startswith((lower + ":", lower + " -")): if entry_name.startswith((lower + ":", lower + " -")):
@ -358,13 +367,20 @@ def _resolve_exact_vs_extended(
extended_confidence = int(best_extended[0].get("comp_100_count", 0) or 0) + int( extended_confidence = int(best_extended[0].get("comp_100_count", 0) or 0) + int(
best_extended[0].get("count_comp", 0) or 0 best_extended[0].get("count_comp", 0) or 0
) )
# Prefer the extended entry only when it has strictly more hours # Prefer the extended entry when it has more hours AND either:
# than the exact match AND at least as much confidence. # (a) at least as much confidence (normal case), OR
# This lets "FAITH: The Unholy Trinity" (full game) beat # (b) dominant hours ratio (>=4x) with minimal data — handles cases
# a low-confidence exact demo while preventing low-confidence # like "FAITH: The Unholy Trinity" (17h, newer) vs "FAITH" 2017
# mods like "Celeste - Strawberry Jam" from beating # (1.5h, older/more data) where the older exact match has
# the exact base game. # accumulated more confidence simply by being on HLTB longer.
if extended_hours > exact_hours and extended_confidence >= exact_confidence: dominates = (
exact_hours > 0
and extended_hours >= exact_hours * _EXTENDED_DOMINANCE_RATIO
and extended_confidence >= _EXTENDED_MIN_CONFIDENCE
)
if extended_hours > exact_hours and (
extended_confidence >= exact_confidence or dominates
):
return best_extended return best_extended
return best_exact return best_exact
if best_exact is not None: if best_exact is not None:
@ -419,6 +435,26 @@ async def _search_one(
continue continue
data = await resp.json() data = await resp.json()
candidates = _collect_candidates(query_name, data) candidates = _collect_candidates(query_name, data)
# When we stripped ": subtitle" from the original name to
# get query_name, only keep full-edition entries (those
# whose HLTB name starts with query_name + ":"/"-") or
# exact name/alias matches. This prevents "Vox Populi"
# (stripped from "Vox Populi: Poland 2023") from falsely
# matching "Vox Populi Vox Dei 2".
if ":" in name and ":" not in query_name:
lower_q = query_name.lower()
candidates = [
(e, s)
for e, s in candidates
if (e.get("game_name") or "").lower() == lower_q
or (e.get("game_alias") or "").lower() == lower_q
or (e.get("game_name") or "")
.lower()
.startswith(lower_q + ":")
or (e.get("game_name") or "")
.lower()
.startswith(lower_q + " -")
]
best = _pick_best_hltb_entry(query_name, candidates) best = _pick_best_hltb_entry(query_name, candidates)
if best is None: if best is None:
continue continue

View File

@ -55,10 +55,11 @@ def _confidence_fail_reasons(game: GameInfo) -> list[str]:
def _refresh_candidate_confidence(game: GameInfo) -> None: def _refresh_candidate_confidence(game: GameInfo) -> None:
"""Refresh confidence metrics for one candidate when cache looks stale. """Refresh confidence metrics for one candidate when cache looks stale.
Only refreshes when both metrics are missing (0), which typically means Refreshes when either metric is missing (0). A game with comp_100_count>0
the game was cached before confidence fields were added. but count_comp==0 means the detail-page all-playstyles count was not yet
populated (e.g. the cache predates that field).
""" """
if game.comp_100_count > 0 or game.count_comp > 0: if game.comp_100_count > 0 and game.count_comp > 0:
return return
_refresh_candidate_confidence_batch([game]) _refresh_candidate_confidence_batch([game])

View File

@ -216,6 +216,6 @@ def fetch_protondb_ratings(
_save_cache(cache) _save_cache(cache)
logger.info("ProtonDB: fetched %d, total cached %d", len(fetched), len(cache)) logger.info("ProtonDB: fetched %d, total cached %d", len(fetched), len(cache))
else: else:
logger.info("All %d ProtonDB ratings found in cache.", len(results)) logger.debug("All %d ProtonDB ratings found in cache.", len(results))
return results return results

View File

@ -259,12 +259,71 @@ class TestSearchOne:
# Set done to one less than _SAVE_INTERVAL so it triggers save # Set done to one less than _SAVE_INTERVAL so it triggers save
ctx.counter["done"] = _SAVE_INTERVAL - 1 ctx.counter["done"] = _SAVE_INTERVAL - 1
with patch( with patch("steam_backlog_enforcer._hltb_search.save_hltb_cache") as mock_save:
"steam_backlog_enforcer._hltb_search.save_hltb_cache"
) as mock_save:
asyncio.run(_search_one(asyncio.Semaphore(1), ctx, 440, "TF2")) asyncio.run(_search_one(asyncio.Semaphore(1), ctx, 440, "TF2"))
mock_save.assert_called_once() mock_save.assert_called_once()
def test_colon_strip_fallback_rejects_cross_franchise_match(self) -> None:
    """A colon-stripped retry must not accept an unrelated franchise.

    Searching "Vox Populi: Poland 2023" falls back to the stripped query
    "Vox Populi"; the only hit, "Vox Populi Vox Dei 2", is a different
    game and must be discarded, so the search yields no result.
    """
    # First POST (full name) finds nothing; second POST (stripped name)
    # returns only the look-alike entry.
    no_hits = _FakeResponse(200, {"data": []})
    unrelated_entry = {
        "game_name": "Vox Populi Vox Dei 2",
        "game_alias": "",
        "game_type": "game",
        "comp_100": 14400,
        "comp_100_count": 9,
        "count_comp": 57,
        "game_id": 99999,
    }
    lookalike_hits = _FakeResponse(200, {"data": [unrelated_entry]})

    session = MagicMock()
    session.post.side_effect = [no_hits, lookalike_hits]
    ctx = _make_ctx(session)

    search = _search_one(
        asyncio.Semaphore(1), ctx, 2590810, "Vox Populi: Poland 2023"
    )
    result = asyncio.run(search)
    assert result is None
def test_colon_strip_fallback_accepts_full_edition(self) -> None:
    """A colon-stripped retry still accepts a full-edition entry.

    When the HLTB entry's name begins with the stripped query followed by
    ':' (here "Batman: Arkham Asylum" for the stripped query "Batman"),
    the fallback result must be kept.
    """
    # First POST (full name) finds nothing; second POST (stripped name)
    # returns the full-edition entry.
    no_hits = _FakeResponse(200, {"data": []})
    arkham_entry = {
        "game_name": "Batman: Arkham Asylum",
        "game_alias": "",
        "game_type": "game",
        "comp_100": 144000,
        "comp_100_count": 300,
        "count_comp": 5000,
        "game_id": 11111,
    }
    full_edition_hits = _FakeResponse(200, {"data": [arkham_entry]})

    session = MagicMock()
    session.post.side_effect = [no_hits, full_edition_hits]
    ctx = _make_ctx(session)

    search = _search_one(
        asyncio.Semaphore(1), ctx, 35140, "Batman: Arkham Asylum"
    )
    result = asyncio.run(search)
    assert result is not None
    assert result.game_name == "Batman: Arkham Asylum"
class TestFetchBatchHltb: class TestFetchBatchHltb:
"""Tests for _fetch_batch (the hltb version).""" """Tests for _fetch_batch (the hltb version)."""