From b8bd8459e4477ce86fca75082064448ba1485e11 Mon Sep 17 00:00:00 2001
From: Krzysztof kuhy Rudnicki <krzysztofrudnicki0@gmail.com>
Date: Thu, 28 May 2026 20:40:51 +0200
Subject: [PATCH] fix: HLTB count_comp=0 bug, false matches, and ProtonDB log
 noise
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Extract count_comp from detail page in _apply_detail_to_extras so the
  all-playstyles completion count is populated even when the search API
  returns 0 (Mini Ghost: 0 → 69, now passes confidence thresholds)
- Fix _refresh_candidate_confidence to trigger re-fetch when count_comp==0
  even if comp_100_count>0 (was silently skipping stale partial entries)
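- Filter colon-stripped fallback candidates (e.g. "Vox Populi" from
  "Vox Populi: Poland 2023") to full-edition or exact matches only,
  preventing cross-franchise false positives
- Let _find_best_extended consider HLTB compilations ("compil") so
  collections that share the base title (e.g. "FAITH: The Unholy
  Trinity") can be picked as the extended entry
- Let _resolve_exact_vs_extended prefer an extended entry that has at
  least 4x the exact match's hours and a combined confidence of at
  least 3, even when the exact match has more confidence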
- Demote "All N ProtonDB ratings found in cache" log to DEBUG to remove
  per-game noise from the scan output

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 steam_backlog_enforcer/_hltb_detail.py        | 31 +++++++--
 steam_backlog_enforcer/_hltb_search.py        | 54 ++++++++++++---
 .../_scanning_confidence.py                   |  7 +-
 steam_backlog_enforcer/protondb.py            |  2 +-
 .../tests/test_hltb_search.py                 | 65 ++++++++++++++++++-
 5 files changed, 137 insertions(+), 22 deletions(-)

diff --git a/steam_backlog_enforcer/_hltb_detail.py b/steam_backlog_enforcer/_hltb_detail.py
index 737ec8a..75857a0 100644
--- a/steam_backlog_enforcer/_hltb_detail.py
+++ b/steam_backlog_enforcer/_hltb_detail.py
@@ -232,6 +232,29 @@ def _process_game_detail(
     return leisure, rush_h, l100
 
 
+def _apply_detail_to_extras(
+    app_id: int,
+    game_data: dict[str, Any],
+    dlc_rels: list[tuple[int, float]],
+    dlc_hours_by_id: dict[int, float],
+    extras: _HLTBExtras,
+) -> float:
+    """Update extras in-place from detail data; return leisure hours (or -1)."""
+    leisure, rush_h, l100 = _process_game_detail(game_data, dlc_rels, dlc_hours_by_id)
+    if rush_h > 0:
+        extras.rush[app_id] = rush_h
+    if l100 > 0:
+        extras.leisure_100h[app_id] = l100
+    # The search API sometimes returns count_comp=0 even when the detail page
+    # has all-playstyles completion counts.  Use the detail value when present.
+    games_list = game_data.get("game")
+    if isinstance(games_list, list) and games_list:
+        count_comp_detail = _as_positive_int(games_list[0].get("count_comp", 0))
+        if count_comp_detail > 0:
+            extras.count_comp[app_id] = count_comp_detail
+    return leisure
+
+
 async def _fetch_leisure_times(
     search_results: list[HLTBResult],
     cache: dict[int, float],
@@ -279,17 +302,13 @@ async def _fetch_leisure_times(
             done += 1
             if game_data is not None:
                 dlc_rels = dlc_relationships_by_app.get(r.app_id, [])
-                leisure, rush_h, l100 = _process_game_detail(
-                    game_data, dlc_rels, dlc_hours_by_id
+                leisure = _apply_detail_to_extras(
+                    r.app_id, game_data, dlc_rels, dlc_hours_by_id, extras
                 )
                 if leisure > 0:
                     r.completionist_hours = leisure
                     cache[r.app_id] = leisure
                     found += 1
-                if rush_h > 0:
-                    extras.rush[r.app_id] = rush_h
-                if l100 > 0:
-                    extras.leisure_100h[r.app_id] = l100
 
             if progress_cb is not None:
                 progress_cb(done, total, found, r.game_name)
diff --git a/steam_backlog_enforcer/_hltb_search.py b/steam_backlog_enforcer/_hltb_search.py
index 4f3c185..341a748 100644
--- a/steam_backlog_enforcer/_hltb_search.py
+++ b/steam_backlog_enforcer/_hltb_search.py
@@ -32,6 +32,13 @@ from steam_backlog_enforcer._hltb_types import (
 
 logger = logging.getLogger(__name__)
 
+# When an extended entry has at least this many times the exact match's hours,
+# prefer it even if its confidence count is lower.
+_EXTENDED_DOMINANCE_RATIO = 4.0
+# Minimum combined confidence for the dominance path (avoids picking entries
+# that have almost no data at all).
+_EXTENDED_MIN_CONFIDENCE = 3
+
 
 # ──────────────────────────────────────────────────────────────
 # HLTB API setup (done once, not per-request like the library)
@@ -326,12 +333,14 @@ def _find_best_extended(
 ) -> tuple[dict[str, Any], float] | None:
     """Find best extended entry ("Name: Subtitle" / "Name - Subtitle").
 
-    Skips subset entries (prologue, demo, etc.).
+    Skips subset entries (prologue, demo, etc.).  Compilations ("compil")
+    are included because HLTB classifies multi-chapter collections that
+    share the base title as compilations (e.g. "FAITH: The Unholy Trinity").
     """
     best: tuple[dict[str, Any], float] | None = None
     for entry, sim in usable:
         game_type = str(entry.get("game_type", "")).lower()
-        if game_type not in ("", "game"):
+        if game_type not in ("", "game", "compil"):
             continue
         entry_name = (entry.get("game_name") or "").lower()
         if entry_name.startswith((lower + ":", lower + " -")):
@@ -358,13 +367,20 @@ def _resolve_exact_vs_extended(
         extended_confidence = int(best_extended[0].get("comp_100_count", 0) or 0) + int(
             best_extended[0].get("count_comp", 0) or 0
         )
-        # Prefer the extended entry only when it has strictly more hours
-        # than the exact match AND at least as much confidence.
-        # This lets "FAITH: The Unholy Trinity" (full game) beat
-        # a low-confidence exact demo while preventing low-confidence
-        # mods like "Celeste - Strawberry Jam" from beating
-        # the exact base game.
-        if extended_hours > exact_hours and extended_confidence >= exact_confidence:
+        # Prefer the extended entry when it has more hours AND either:
+        #  (a) at least as much confidence (normal case), OR
+        #  (b) dominant hours ratio (>=4x) with minimal data — handles cases
+        #      like "FAITH: The Unholy Trinity" (17h, newer) vs "FAITH" 2017
+        #      (1.5h, older/more data) where the older exact match has
+        #      accumulated more confidence simply by being on HLTB longer.
+        dominates = (
+            exact_hours > 0
+            and extended_hours >= exact_hours * _EXTENDED_DOMINANCE_RATIO
+            and extended_confidence >= _EXTENDED_MIN_CONFIDENCE
+        )
+        if extended_hours > exact_hours and (
+            extended_confidence >= exact_confidence or dominates
+        ):
             return best_extended
         return best_exact
     if best_exact is not None:
@@ -419,6 +435,26 @@ async def _search_one(
                         continue
                     data = await resp.json()
                     candidates = _collect_candidates(query_name, data)
+                    # When we stripped ": subtitle" from the original name to
+                    # get query_name, only keep full-edition entries (those
+                    # whose HLTB name starts with query_name + ":" or " -") or
+                    # exact name/alias matches.  This prevents "Vox Populi"
+                    # (stripped from "Vox Populi: Poland 2023") from falsely
+                    # matching "Vox Populi Vox Dei 2".
+                    if ":" in name and ":" not in query_name:
+                        lower_q = query_name.lower()
+                        candidates = [
+                            (e, s)
+                            for e, s in candidates
+                            if (e.get("game_name") or "").lower() == lower_q
+                            or (e.get("game_alias") or "").lower() == lower_q
+                            or (e.get("game_name") or "")
+                            .lower()
+                            .startswith(lower_q + ":")
+                            or (e.get("game_name") or "")
+                            .lower()
+                            .startswith(lower_q + " -")
+                        ]
                     best = _pick_best_hltb_entry(query_name, candidates)
                     if best is None:
                         continue
diff --git a/steam_backlog_enforcer/_scanning_confidence.py b/steam_backlog_enforcer/_scanning_confidence.py
index 0dcce60..8502881 100644
--- a/steam_backlog_enforcer/_scanning_confidence.py
+++ b/steam_backlog_enforcer/_scanning_confidence.py
@@ -55,10 +55,11 @@ def _confidence_fail_reasons(game: GameInfo) -> list[str]:
 def _refresh_candidate_confidence(game: GameInfo) -> None:
     """Refresh confidence metrics for one candidate when cache looks stale.
 
-    Only refreshes when both metrics are missing (0), which typically means
-    the game was cached before confidence fields were added.
+    Refreshes when either metric is missing (0).  A game with comp_100_count>0
+    but count_comp==0 means the detail-page all-playstyles count was not yet
+    populated (e.g. the cache predates that field).
     """
-    if game.comp_100_count > 0 or game.count_comp > 0:
+    if game.comp_100_count > 0 and game.count_comp > 0:
         return
 
     _refresh_candidate_confidence_batch([game])
diff --git a/steam_backlog_enforcer/protondb.py b/steam_backlog_enforcer/protondb.py
index c5f5fb5..60d7e18 100644
--- a/steam_backlog_enforcer/protondb.py
+++ b/steam_backlog_enforcer/protondb.py
@@ -216,6 +216,6 @@ def fetch_protondb_ratings(
         _save_cache(cache)
         logger.info("ProtonDB: fetched %d, total cached %d", len(fetched), len(cache))
     else:
-        logger.info("All %d ProtonDB ratings found in cache.", len(results))
+        logger.debug("All %d ProtonDB ratings found in cache.", len(results))
 
     return results
diff --git a/steam_backlog_enforcer/tests/test_hltb_search.py b/steam_backlog_enforcer/tests/test_hltb_search.py
index 2986d97..a0aa2c2 100644
--- a/steam_backlog_enforcer/tests/test_hltb_search.py
+++ b/steam_backlog_enforcer/tests/test_hltb_search.py
@@ -259,12 +259,71 @@ class TestSearchOne:
         # Set done to one less than _SAVE_INTERVAL so it triggers save
 
         ctx.counter["done"] = _SAVE_INTERVAL - 1
-        with patch(
-            "steam_backlog_enforcer._hltb_search.save_hltb_cache"
-        ) as mock_save:
+        with patch("steam_backlog_enforcer._hltb_search.save_hltb_cache") as mock_save:
             asyncio.run(_search_one(asyncio.Semaphore(1), ctx, 440, "TF2"))
             mock_save.assert_called_once()
 
+    def test_colon_strip_fallback_rejects_cross_franchise_match(self) -> None:
+        """Colon-stripped fallback must not match a different franchise loosely.
+
+        "Vox Populi: Poland 2023" stripped to "Vox Populi" should NOT match
+        "Vox Populi Vox Dei 2" (different game, low-similarity entry).
+        """
+        empty_resp = _FakeResponse(200, {"data": []})
+        loose_resp = _FakeResponse(
+            200,
+            {
+                "data": [
+                    {
+                        "game_name": "Vox Populi Vox Dei 2",
+                        "game_alias": "",
+                        "game_type": "game",
+                        "comp_100": 14400,
+                        "comp_100_count": 9,
+                        "count_comp": 57,
+                        "game_id": 99999,
+                    }
+                ]
+            },
+        )
+        session = MagicMock()
+        session.post.side_effect = [empty_resp, loose_resp]
+        ctx = _make_ctx(session)
+        result = asyncio.run(
+            _search_one(asyncio.Semaphore(1), ctx, 2590810, "Vox Populi: Poland 2023")
+        )
+        assert result is None
+
+    def test_colon_strip_fallback_accepts_full_edition(self) -> None:
+        """Colon-stripped fallback must still match when the HLTB entry is a
+        full edition of the stripped name (name starts with stripped + ':').
+        """
+        empty_resp = _FakeResponse(200, {"data": []})
+        full_edition_resp = _FakeResponse(
+            200,
+            {
+                "data": [
+                    {
+                        "game_name": "Batman: Arkham Asylum",
+                        "game_alias": "",
+                        "game_type": "game",
+                        "comp_100": 144000,
+                        "comp_100_count": 300,
+                        "count_comp": 5000,
+                        "game_id": 11111,
+                    }
+                ]
+            },
+        )
+        session = MagicMock()
+        session.post.side_effect = [empty_resp, full_edition_resp]
+        ctx = _make_ctx(session)
+        result = asyncio.run(
+            _search_one(asyncio.Semaphore(1), ctx, 35140, "Batman: Arkham Asylum")
+        )
+        assert result is not None
+        assert result.game_name == "Batman: Arkham Asylum"
+
 
 class TestFetchBatchHltb:
     """Tests for _fetch_batch (the hltb version)."""