fix: HLTB count_comp=0 bug, false matches, and ProtonDB log noise

- Extract count_comp from detail page in _apply_detail_to_extras so the
  all-playstyles completion count is populated even when the search API
  returns 0 (Mini Ghost: 0 → 69, now passes confidence thresholds)
- Fix _refresh_candidate_confidence to trigger re-fetch when count_comp==0
  even if comp_100_count>0 (was silently skipping stale partial entries)
- Filter colon-stripped fallback candidates (e.g. "Vox Populi" from
  "Vox Populi: Poland 2023") to full-edition or exact matches only,
  preventing cross-franchise false positives
- Demote "All N ProtonDB ratings found in cache" log to DEBUG to remove
  per-game noise from the scan output

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-07-04 13:43:45 +02:00 · 2026-05-28 20:40:51 +02:00 · 2026-05-28 20:40:51 +02:00 · b8bd8459e4
commit b8bd8459e4
parent 551b8a4f95
5 changed files with 137 additions and 22 deletions
--- a/steam_backlog_enforcer/_hltb_detail.py
+++ b/steam_backlog_enforcer/_hltb_detail.py
@ -232,6 +232,29 @@ def _process_game_detail(
    return leisure, rush_h, l100
 def _apply_detail_to_extras(
    app_id: int,
    game_data: dict[str, Any],
    dlc_rels: list[tuple[int, float]],
    dlc_hours_by_id: dict[int, float],
    extras: _HLTBExtras,
 ) -> float:
    """Update extras in-place from detail data; return leisure hours (or -1).

    Args:
        app_id: Key under which values are stored in the ``extras`` mappings.
        game_data: Detail payload for one game; its ``"game"`` entry is read
            here and the whole mapping is passed to ``_process_game_detail``.
        dlc_rels: Passed through unchanged to ``_process_game_detail``.
        dlc_hours_by_id: Passed through unchanged to ``_process_game_detail``.
        extras: Mutated in place: ``rush``, ``leisure_100h`` and ``count_comp``
            receive an entry for ``app_id`` only when the value is positive.

    Returns:
        The leisure value computed by ``_process_game_detail``.
    """
    leisure, rush_h, l100 = _process_game_detail(game_data, dlc_rels, dlc_hours_by_id)
    if rush_h > 0:
        extras.rush[app_id] = rush_h
    if l100 > 0:
        extras.leisure_100h[app_id] = l100
    # The search API sometimes returns count_comp=0 even when the detail page
    # has all-playstyles completion counts.  Use the detail value when present.
    games_list = game_data.get("game")
    first_game = games_list[0] if isinstance(games_list, list) and games_list else None
    # Guard the element type as well as the container: a malformed payload
    # whose first entry is not a mapping must not raise AttributeError here.
    if isinstance(first_game, dict):
        count_comp_detail = _as_positive_int(first_game.get("count_comp", 0))
        if count_comp_detail > 0:
            extras.count_comp[app_id] = count_comp_detail
    return leisure
 async def _fetch_leisure_times(
    search_results: list[HLTBResult],
    cache: dict[int, float],
@ -279,17 +302,13 @@ async def _fetch_leisure_times(
            done += 1
            if game_data is not None:
                dlc_rels = dlc_relationships_by_app.get(r.app_id, [])
-                leisure, rush_h, l100 = _process_game_detail(
+                leisure = _apply_detail_to_extras(
-                    game_data, dlc_rels, dlc_hours_by_id
+                    r.app_id, game_data, dlc_rels, dlc_hours_by_id, extras
                )
                if leisure > 0:
                    r.completionist_hours = leisure
                    cache[r.app_id] = leisure
                    found += 1
                if rush_h > 0:
                    extras.rush[r.app_id] = rush_h
                if l100 > 0:
                    extras.leisure_100h[r.app_id] = l100
            if progress_cb is not None:
                progress_cb(done, total, found, r.game_name)
--- a/steam_backlog_enforcer/_hltb_search.py
+++ b/steam_backlog_enforcer/_hltb_search.py
@ -32,6 +32,13 @@ from steam_backlog_enforcer._hltb_types import (
 logger = logging.getLogger(__name__)
 # When extended entry has ≥ this many times more hours than the exact match,
 # prefer it even if its confidence count is lower.
 _EXTENDED_DOMINANCE_RATIO = 4.0
 # Minimum combined confidence for the dominance path (avoids picking entries
 # that have almost no data at all).
 _EXTENDED_MIN_CONFIDENCE = 3
 # ──────────────────────────────────────────────────────────────
 # HLTB API setup (done once, not per-request like the library)
@ -326,12 +333,14 @@ def _find_best_extended(
 ) -> tuple[dict[str, Any], float] | None:
    """Find best extended entry ("Name: Subtitle" / "Name - Subtitle").
-    Skips subset entries (prologue, demo, etc.).
+    Skips subset entries (prologue, demo, etc.).  Compilations ("compil")
    are included because HLTB classifies multi-chapter collections that
    share the base title as compilations (e.g. "FAITH: The Unholy Trinity").
    """
    best: tuple[dict[str, Any], float] | None = None
    for entry, sim in usable:
        game_type = str(entry.get("game_type", "")).lower()
-        if game_type not in ("", "game"):
+        if game_type not in ("", "game", "compil"):
            continue
        entry_name = (entry.get("game_name") or "").lower()
        if entry_name.startswith((lower + ":", lower + " -")):
@ -358,13 +367,20 @@ def _resolve_exact_vs_extended(
        extended_confidence = int(best_extended[0].get("comp_100_count", 0) or 0) + int(
            best_extended[0].get("count_comp", 0) or 0
        )
-        # Prefer the extended entry only when it has strictly more hours
+        # Prefer the extended entry when it has more hours AND either:
-        # than the exact match AND at least as much confidence.
+        #  (a) at least as much confidence (normal case), OR
-        # This lets "FAITH: The Unholy Trinity" (full game) beat
+        #  (b) dominant hours ratio (>=4x) with minimal data — handles cases
-        # a low-confidence exact demo while preventing low-confidence
+        #      like "FAITH: The Unholy Trinity" (17h, newer) vs "FAITH" 2017
-        # mods like "Celeste - Strawberry Jam" from beating
+        #      (1.5h, older/more data) where the older exact match has
-        # the exact base game.
+        #      accumulated more confidence simply by being on HLTB longer.
-        if extended_hours > exact_hours and extended_confidence >= exact_confidence:
+        dominates = (
            exact_hours > 0
            and extended_hours >= exact_hours * _EXTENDED_DOMINANCE_RATIO
            and extended_confidence >= _EXTENDED_MIN_CONFIDENCE
        )
        if extended_hours > exact_hours and (
            extended_confidence >= exact_confidence or dominates
        ):
            return best_extended
        return best_exact
    if best_exact is not None:
@ -419,6 +435,26 @@ async def _search_one(
                        continue
                    data = await resp.json()
                    candidates = _collect_candidates(query_name, data)
                    # When we stripped ": subtitle" from the original name to
                    # get query_name, only keep full-edition entries (those
                    # whose HLTB name starts with query_name + ":"/"-") or
                    # exact name/alias matches.  This prevents "Vox Populi"
                    # (stripped from "Vox Populi: Poland 2023") from falsely
                    # matching "Vox Populi Vox Dei 2".
                    if ":" in name and ":" not in query_name:
                        lower_q = query_name.lower()
                        candidates = [
                            (e, s)
                            for e, s in candidates
                            if (e.get("game_name") or "").lower() == lower_q
                            or (e.get("game_alias") or "").lower() == lower_q
                            or (e.get("game_name") or "")
                            .lower()
                            .startswith(lower_q + ":")
                            or (e.get("game_name") or "")
                            .lower()
                            .startswith(lower_q + " -")
                        ]
                    best = _pick_best_hltb_entry(query_name, candidates)
                    if best is None:
                        continue
--- a/steam_backlog_enforcer/_scanning_confidence.py
+++ b/steam_backlog_enforcer/_scanning_confidence.py
@ -55,10 +55,11 @@ def _confidence_fail_reasons(game: GameInfo) -> list[str]:
 def _refresh_candidate_confidence(game: GameInfo) -> None:
    """Refresh confidence metrics for one candidate when cache looks stale.
-    Only refreshes when both metrics are missing (0), which typically means
+    Refreshes when either metric is missing (0).  A game with comp_100_count>0
-    the game was cached before confidence fields were added.
+    but count_comp==0 means the detail-page all-playstyles count was not yet
    populated (e.g. the cache predates that field).
    """
-    if game.comp_100_count > 0 or game.count_comp > 0:
+    if game.comp_100_count > 0 and game.count_comp > 0:
        return
    _refresh_candidate_confidence_batch([game])
--- a/steam_backlog_enforcer/protondb.py
+++ b/steam_backlog_enforcer/protondb.py
@ -216,6 +216,6 @@ def fetch_protondb_ratings(
        _save_cache(cache)
        logger.info("ProtonDB: fetched %d, total cached %d", len(fetched), len(cache))
    else:
-        logger.info("All %d ProtonDB ratings found in cache.", len(results))
+        logger.debug("All %d ProtonDB ratings found in cache.", len(results))
    return results
--- a/steam_backlog_enforcer/tests/test_hltb_search.py
+++ b/steam_backlog_enforcer/tests/test_hltb_search.py
@ -259,12 +259,71 @@ class TestSearchOne:
        # Set done to one less than _SAVE_INTERVAL so it triggers save
        ctx.counter["done"] = _SAVE_INTERVAL - 1
-        with patch(
+        with patch("steam_backlog_enforcer._hltb_search.save_hltb_cache") as mock_save:
            "steam_backlog_enforcer._hltb_search.save_hltb_cache"
        ) as mock_save:
            asyncio.run(_search_one(asyncio.Semaphore(1), ctx, 440, "TF2"))
            mock_save.assert_called_once()
    def test_colon_strip_fallback_rejects_cross_franchise_match(self) -> None:
        """A colon-stripped fallback query must not accept an unrelated title.

        Searching for "Vox Populi: Poland 2023" when the only entry returned is
        "Vox Populi Vox Dei 2" (a different game that merely shares a prefix)
        has to yield no result.
        """
        unrelated_entry = {
            "game_name": "Vox Populi Vox Dei 2",
            "game_alias": "",
            "game_type": "game",
            "comp_100": 14400,
            "comp_100_count": 9,
            "count_comp": 57,
            "game_id": 99999,
        }
        # The first post() call gets an empty result set; the second gets only
        # the unrelated entry.
        queued_responses = [
            _FakeResponse(200, {"data": []}),
            _FakeResponse(200, {"data": [unrelated_entry]}),
        ]
        session = MagicMock()
        session.post.side_effect = queued_responses
        ctx = _make_ctx(session)

        search = _search_one(
            asyncio.Semaphore(1), ctx, 2590810, "Vox Populi: Poland 2023"
        )
        outcome = asyncio.run(search)

        assert outcome is None
    def test_colon_strip_fallback_accepts_full_edition(self) -> None:
        """A colon-stripped fallback query still accepts a full-edition entry.

        An HLTB entry whose name begins with the stripped name followed by ':'
        must be returned as the match.
        """
        full_edition_entry = {
            "game_name": "Batman: Arkham Asylum",
            "game_alias": "",
            "game_type": "game",
            "comp_100": 144000,
            "comp_100_count": 300,
            "count_comp": 5000,
            "game_id": 11111,
        }
        # The first post() call gets an empty result set; the second gets the
        # full-edition entry.
        queued_responses = [
            _FakeResponse(200, {"data": []}),
            _FakeResponse(200, {"data": [full_edition_entry]}),
        ]
        session = MagicMock()
        session.post.side_effect = queued_responses
        ctx = _make_ctx(session)

        search = _search_one(
            asyncio.Semaphore(1), ctx, 35140, "Batman: Arkham Asylum"
        )
        outcome = asyncio.run(search)

        assert outcome is not None
        assert outcome.game_name == "Batman: Arkham Asylum"
 class TestFetchBatchHltb:
    """Tests for _fetch_batch (the hltb version)."""