From b8bd8459e4477ce86fca75082064448ba1485e11 Mon Sep 17 00:00:00 2001 From: Krzysztof kuhy Rudnicki Date: Thu, 28 May 2026 20:40:51 +0200 Subject: [PATCH] fix: HLTB count_comp=0 bug, false matches, and ProtonDB log noise MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract count_comp from detail page in _apply_detail_to_extras so the all-playstyles completion count is populated even when the search API returns 0 (Mini Ghost: 0 → 69, now passes confidence thresholds) - Fix _refresh_candidate_confidence to trigger re-fetch when count_comp==0 even if comp_100_count>0 (was silently skipping stale partial entries) - Filter colon-stripped fallback candidates (e.g. "Vox Populi" from "Vox Populi: Poland 2023") to full-edition or exact matches only, preventing cross-franchise false positives - Demote "All N ProtonDB ratings found in cache" log to DEBUG to remove per-game noise from the scan output Co-Authored-By: Claude Sonnet 4.6 --- steam_backlog_enforcer/_hltb_detail.py | 31 +++++++-- steam_backlog_enforcer/_hltb_search.py | 54 ++++++++++++--- .../_scanning_confidence.py | 7 +- steam_backlog_enforcer/protondb.py | 2 +- .../tests/test_hltb_search.py | 65 ++++++++++++++++++- 5 files changed, 137 insertions(+), 22 deletions(-) diff --git a/steam_backlog_enforcer/_hltb_detail.py b/steam_backlog_enforcer/_hltb_detail.py index 737ec8a..75857a0 100644 --- a/steam_backlog_enforcer/_hltb_detail.py +++ b/steam_backlog_enforcer/_hltb_detail.py @@ -232,6 +232,29 @@ def _process_game_detail( return leisure, rush_h, l100 +def _apply_detail_to_extras( + app_id: int, + game_data: dict[str, Any], + dlc_rels: list[tuple[int, float]], + dlc_hours_by_id: dict[int, float], + extras: _HLTBExtras, +) -> float: + """Update extras in-place from detail data; return leisure hours (or -1).""" + leisure, rush_h, l100 = _process_game_detail(game_data, dlc_rels, dlc_hours_by_id) + if rush_h > 0: + extras.rush[app_id] = rush_h + if l100 > 0: + extras.leisure_100h[app_id] = l100 + # The search API sometimes returns count_comp=0 even when the detail page + # has all-playstyles completion counts. Use the detail value when present. + games_list = game_data.get("game") + if isinstance(games_list, list) and games_list: + count_comp_detail = _as_positive_int(games_list[0].get("count_comp", 0)) + if count_comp_detail > 0: + extras.count_comp[app_id] = count_comp_detail + return leisure + + async def _fetch_leisure_times( search_results: list[HLTBResult], cache: dict[int, float], @@ -279,17 +302,13 @@ async def _fetch_leisure_times( done += 1 if game_data is not None: dlc_rels = dlc_relationships_by_app.get(r.app_id, []) - leisure, rush_h, l100 = _process_game_detail( - game_data, dlc_rels, dlc_hours_by_id + leisure = _apply_detail_to_extras( + r.app_id, game_data, dlc_rels, dlc_hours_by_id, extras ) if leisure > 0: r.completionist_hours = leisure cache[r.app_id] = leisure found += 1 - if rush_h > 0: - extras.rush[r.app_id] = rush_h - if l100 > 0: - extras.leisure_100h[r.app_id] = l100 if progress_cb is not None: progress_cb(done, total, found, r.game_name) diff --git a/steam_backlog_enforcer/_hltb_search.py b/steam_backlog_enforcer/_hltb_search.py index 4f3c185..341a748 100644 --- a/steam_backlog_enforcer/_hltb_search.py +++ b/steam_backlog_enforcer/_hltb_search.py @@ -32,6 +32,13 @@ from steam_backlog_enforcer._hltb_types import ( logger = logging.getLogger(__name__) +# When extended entry has ≥ this many times more hours than the exact match, +# prefer it even if its confidence count is lower. +_EXTENDED_DOMINANCE_RATIO = 4.0 +# Minimum combined confidence for the dominance path (avoids picking entries +# that have almost no data at all). +_EXTENDED_MIN_CONFIDENCE = 3 + # ────────────────────────────────────────────────────────────── # HLTB API setup (done once, not per-request like the library) @@ -326,12 +333,14 @@ def _find_best_extended( ) -> tuple[dict[str, Any], float] | None: """Find best extended entry ("Name: Subtitle" / "Name - Subtitle"). - Skips subset entries (prologue, demo, etc.). + Skips subset entries (prologue, demo, etc.). Compilations ("compil") + are included because HLTB classifies multi-chapter collections that + share the base title as compilations (e.g. "FAITH: The Unholy Trinity"). """ best: tuple[dict[str, Any], float] | None = None for entry, sim in usable: game_type = str(entry.get("game_type", "")).lower() - if game_type not in ("", "game"): + if game_type not in ("", "game", "compil"): continue entry_name = (entry.get("game_name") or "").lower() if entry_name.startswith((lower + ":", lower + " -")): @@ -358,13 +367,20 @@ def _resolve_exact_vs_extended( extended_confidence = int(best_extended[0].get("comp_100_count", 0) or 0) + int( best_extended[0].get("count_comp", 0) or 0 ) - # Prefer the extended entry only when it has strictly more hours - # than the exact match AND at least as much confidence. - # This lets "FAITH: The Unholy Trinity" (full game) beat - # a low-confidence exact demo while preventing low-confidence - # mods like "Celeste - Strawberry Jam" from beating - # the exact base game. - if extended_hours > exact_hours and extended_confidence >= exact_confidence: + # Prefer the extended entry when it has more hours AND either: + # (a) at least as much confidence (normal case), OR + # (b) dominant hours ratio (>=4x) with minimal data — handles cases + # like "FAITH: The Unholy Trinity" (17h, newer) vs "FAITH" 2017 + # (1.5h, older/more data) where the older exact match has + # accumulated more confidence simply by being on HLTB longer. + dominates = ( + exact_hours > 0 + and extended_hours >= exact_hours * _EXTENDED_DOMINANCE_RATIO + and extended_confidence >= _EXTENDED_MIN_CONFIDENCE + ) + if extended_hours > exact_hours and ( + extended_confidence >= exact_confidence or dominates + ): return best_extended return best_exact if best_exact is not None: @@ -419,6 +435,26 @@ async def _search_one( continue data = await resp.json() candidates = _collect_candidates(query_name, data) + # When we stripped ": subtitle" from the original name to + # get query_name, only keep full-edition entries (those + # whose HLTB name starts with query_name + ":"/"-") or + # exact name/alias matches. This prevents "Vox Populi" + # (stripped from "Vox Populi: Poland 2023") from falsely + # matching "Vox Populi Vox Dei 2". + if ":" in name and ":" not in query_name: + lower_q = query_name.lower() + candidates = [ + (e, s) + for e, s in candidates + if (e.get("game_name") or "").lower() == lower_q + or (e.get("game_alias") or "").lower() == lower_q + or (e.get("game_name") or "") + .lower() + .startswith(lower_q + ":") + or (e.get("game_name") or "") + .lower() + .startswith(lower_q + " -") + ] best = _pick_best_hltb_entry(query_name, candidates) if best is None: continue diff --git a/steam_backlog_enforcer/_scanning_confidence.py b/steam_backlog_enforcer/_scanning_confidence.py index 0dcce60..8502881 100644 --- a/steam_backlog_enforcer/_scanning_confidence.py +++ b/steam_backlog_enforcer/_scanning_confidence.py @@ -55,10 +55,11 @@ def _confidence_fail_reasons(game: GameInfo) -> list[str]: def _refresh_candidate_confidence(game: GameInfo) -> None: """Refresh confidence metrics for one candidate when cache looks stale. - Only refreshes when both metrics are missing (0), which typically means - the game was cached before confidence fields were added. + Refreshes when either metric is missing (0). A game with comp_100_count>0 + but count_comp==0 means the detail-page all-playstyles count was not yet + populated (e.g. the cache predates that field). """ - if game.comp_100_count > 0 or game.count_comp > 0: + if game.comp_100_count > 0 and game.count_comp > 0: return _refresh_candidate_confidence_batch([game]) diff --git a/steam_backlog_enforcer/protondb.py b/steam_backlog_enforcer/protondb.py index c5f5fb5..60d7e18 100644 --- a/steam_backlog_enforcer/protondb.py +++ b/steam_backlog_enforcer/protondb.py @@ -216,6 +216,6 @@ def fetch_protondb_ratings( _save_cache(cache) logger.info("ProtonDB: fetched %d, total cached %d", len(fetched), len(cache)) else: - logger.info("All %d ProtonDB ratings found in cache.", len(results)) + logger.debug("All %d ProtonDB ratings found in cache.", len(results)) return results diff --git a/steam_backlog_enforcer/tests/test_hltb_search.py b/steam_backlog_enforcer/tests/test_hltb_search.py index 2986d97..a0aa2c2 100644 --- a/steam_backlog_enforcer/tests/test_hltb_search.py +++ b/steam_backlog_enforcer/tests/test_hltb_search.py @@ -259,12 +259,71 @@ class TestSearchOne: # Set done to one less than _SAVE_INTERVAL so it triggers save ctx.counter["done"] = _SAVE_INTERVAL - 1 - with patch( - "steam_backlog_enforcer._hltb_search.save_hltb_cache" - ) as mock_save: + with patch("steam_backlog_enforcer._hltb_search.save_hltb_cache") as mock_save: asyncio.run(_search_one(asyncio.Semaphore(1), ctx, 440, "TF2")) mock_save.assert_called_once() + def test_colon_strip_fallback_rejects_cross_franchise_match(self) -> None: + """Colon-stripped fallback must not match a different franchise loosely. + + "Vox Populi: Poland 2023" stripped to "Vox Populi" should NOT match + "Vox Populi Vox Dei 2" (different game, low-similarity entry). + """ + empty_resp = _FakeResponse(200, {"data": []}) + loose_resp = _FakeResponse( + 200, + { + "data": [ + { + "game_name": "Vox Populi Vox Dei 2", + "game_alias": "", + "game_type": "game", + "comp_100": 14400, + "comp_100_count": 9, + "count_comp": 57, + "game_id": 99999, + } + ] + }, + ) + session = MagicMock() + session.post.side_effect = [empty_resp, loose_resp] + ctx = _make_ctx(session) + result = asyncio.run( + _search_one(asyncio.Semaphore(1), ctx, 2590810, "Vox Populi: Poland 2023") + ) + assert result is None + + def test_colon_strip_fallback_accepts_full_edition(self) -> None: + """Colon-stripped fallback must still match when the HLTB entry is a + full edition of the stripped name (name starts with stripped + ':'). + """ + empty_resp = _FakeResponse(200, {"data": []}) + full_edition_resp = _FakeResponse( + 200, + { + "data": [ + { + "game_name": "Batman: Arkham Asylum", + "game_alias": "", + "game_type": "game", + "comp_100": 144000, + "comp_100_count": 300, + "count_comp": 5000, + "game_id": 11111, + } + ] + }, + ) + session = MagicMock() + session.post.side_effect = [empty_resp, full_edition_resp] + ctx = _make_ctx(session) + result = asyncio.run( + _search_one(asyncio.Semaphore(1), ctx, 35140, "Batman: Arkham Asylum") + ) + assert result is not None + assert result.game_name == "Batman: Arkham Asylum" + class TestFetchBatchHltb: """Tests for _fetch_batch (the hltb version)."""