From 1c468667d506d49c7c4e4320c52210dfccb3fb64 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sun, 18 Jan 2026 13:25:56 +0000
Subject: [PATCH] Add caching to Wikipedia scraper to avoid unnecessary
 requests

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
---
 .gitignore                                    |   3 +
 python_pkg/polish_license_plates/README.md    |   6 +
 .../fetch_license_plates.py                   | 121 +++++++++++++++++-
 3 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index 48cc27b..19fbd41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -273,3 +273,6 @@ python_pkg/download_cats/http_cat_cache/
 
 # Large geojson files that can be downloaded
 python_pkg/warsaw_districts/warszawa-dzielnice.geojson
+
+# Wikipedia cache (can be refreshed)
+python_pkg/polish_license_plates/.wikipedia_cache/
diff --git a/python_pkg/polish_license_plates/README.md b/python_pkg/polish_license_plates/README.md
index 39dd421..5caf4be 100644
--- a/python_pkg/polish_license_plates/README.md
+++ b/python_pkg/polish_license_plates/README.md
@@ -53,9 +53,15 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \
 To fetch the latest data from Wikipedia:
 
 ```bash
+# Use cached data if available (default)
 python -m python_pkg.polish_license_plates.fetch_license_plates
+
+# Force refresh from Wikipedia (ignore cache)
+python -m python_pkg.polish_license_plates.fetch_license_plates --force
 ```
 
+**Caching**: Downloaded Wikipedia HTML is cached for 7 days in `python_pkg/polish_license_plates/.wikipedia_cache/` to avoid unnecessary requests. Use `--force` (or `-f`) to bypass the cache and re-download the page.
+
 This will update `license_plate_data.py` with the current codes from Wikipedia.
 
 **Requirements**: `pip install requests beautifulsoup4 lxml`
diff --git a/python_pkg/polish_license_plates/fetch_license_plates.py b/python_pkg/polish_license_plates/fetch_license_plates.py
index 5b56fcd..43f7025 100755
--- a/python_pkg/polish_license_plates/fetch_license_plates.py
+++ b/python_pkg/polish_license_plates/fetch_license_plates.py
@@ -6,9 +6,17 @@ to extract the official license plate codes and their corresponding locations.
 
 The data is extracted from the wikitable on the page and saved to license_plate_data.py.
 
+Caching:
+    Fetched Wikipedia HTML is cached to avoid unnecessary requests.
+    Cache location: .wikipedia_cache/license_plates.html (next to this script)
+    Cache expires after 7 days (see CACHE_EXPIRY_DAYS).
+
 Usage:
     python -m python_pkg.polish_license_plates.fetch_license_plates
 
+    # Force refresh (ignore cache)
+    python -m python_pkg.polish_license_plates.fetch_license_plates --force
+
 Source:
     https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland
 
@@ -21,9 +29,11 @@ Note:
 
 from __future__ import annotations
 
+import argparse
 from pathlib import Path
 import re
 import sys
+import time
 
 try:
     from bs4 import BeautifulSoup
@@ -38,6 +48,7 @@ except ImportError:
 # Constants
 MIN_TABLE_COLUMNS = 2  # Minimum columns needed to extract code and location
 MAX_CODE_LENGTH = 4  # Maximum length for a valid license plate code
+CACHE_EXPIRY_DAYS = 7  # Days before the cached Wikipedia HTML is considered stale
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
     "AppleWebKit/537.36 (KHTML, like Gecko) "
@@ -45,18 +56,66 @@ USER_AGENT = (
 )
 
 
-def fetch_wikipedia_license_plates() -> dict[str, str]:
-    """Fetch Polish license plate codes from Wikipedia.
+def get_cache_path() -> Path:
+    """Get the path to the cache file.
 
     Returns:
-        Dictionary mapping license plate codes to their locations.
+        Path to the cache file.
+    """
+    script_dir = Path(__file__).parent
+    cache_dir = script_dir / ".wikipedia_cache"
+    cache_dir.mkdir(exist_ok=True)
+    return cache_dir / "license_plates.html"
+
+
+def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool:
+    """Check if the cache file exists and is not expired.
+
+    Args:
+        cache_path: Path to the cache file.
+        max_age_days: Maximum age in days before cache is considered expired.
+
+    Returns:
+        True if cache is valid, False otherwise.
+    """
+    if not cache_path.exists():
+        return False
+
+    # Age is measured from the file's last-modification time (st_mtime)
+    file_age_seconds = time.time() - cache_path.stat().st_mtime
+    max_age_seconds = max_age_days * 24 * 60 * 60
+
+    return file_age_seconds < max_age_seconds
+
+
+def fetch_wikipedia_html(*, force_refresh: bool = False) -> str:
+    """Fetch Wikipedia HTML, using cache if available.
+
+    Args:
+        force_refresh: If True, skip the cache and re-download (cache is rewritten).
+
+    Returns:
+        HTML content of the Wikipedia page.
 
     Raises:
-        RuntimeError: If the page cannot be fetched or parsed.
+        RuntimeError: If the page cannot be fetched.
     """
+    cache_path = get_cache_path()
+
+    # Check if we can use cache
+    if not force_refresh and is_cache_valid(cache_path):
+        sys.stdout.write(f"Using cached data from {cache_path}\n")
+        cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600)
+        sys.stdout.write(f"Cache age: {cache_age_hours} hours\n")
+        return cache_path.read_text(encoding="utf-8")
+
+    # Fetch from Wikipedia
     url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
     headers = {"User-Agent": USER_AGENT}
 
+    if force_refresh:
+        sys.stdout.write("Force refresh: Ignoring cache\n")
+
     sys.stdout.write(f"Fetching data from {url}...\n")
 
     try:
@@ -66,7 +125,26 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
         msg = f"Failed to fetch Wikipedia page: {e}"
         raise RuntimeError(msg) from e
 
-    soup = BeautifulSoup(response.text, "html.parser")
+    # Cache the response
+    cache_path.write_text(response.text, encoding="utf-8")
+    sys.stdout.write(f"Cached response to {cache_path}\n")
+
+    return response.text
+
+
+def parse_license_plates_from_html(html_content: str) -> dict[str, str]:
+    """Parse license plate codes from Wikipedia HTML.
+
+    Args:
+        html_content: HTML content of the Wikipedia page.
+
+    Returns:
+        Dictionary mapping license plate codes to their locations.
+
+    Raises:
+        RuntimeError: If no valid tables are found.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
 
     # Find all wikitables
     tables = soup.find_all("table", {"class": "wikitable"})
@@ -112,6 +190,22 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
     return license_plates
 
 
+def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]:
+    """Fetch Polish license plate codes from Wikipedia.
+
+    Args:
+        force_refresh: If True, skip the cache and re-download (cache is rewritten).
+
+    Returns:
+        Dictionary mapping license plate codes to their locations.
+
+    Raises:
+        RuntimeError: If the page cannot be fetched or parsed.
+    """
+    html_content = fetch_wikipedia_html(force_refresh=force_refresh)
+    return parse_license_plates_from_html(html_content)
+
+
 def generate_license_plate_data_file(
     license_plates: dict[str, str],
     output_path: Path,
@@ -224,9 +318,22 @@ def main() -> int:
     Returns:
         Exit code.
     """
+    parser = argparse.ArgumentParser(
+        description="Fetch Polish license plate codes from Wikipedia.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--force",
+        "-f",
+        action="store_true",
+        help="Force refresh: ignore cache and fetch fresh data from Wikipedia",
+    )
+
+    args = parser.parse_args()
+
     try:
         # Fetch data from Wikipedia
-        license_plates = fetch_wikipedia_license_plates()
+        license_plates = fetch_wikipedia_license_plates(force_refresh=args.force)
 
         # Determine output path
         script_dir = Path(__file__).parent
@@ -247,6 +354,8 @@ def main() -> int:
             "URL: https://en.wikipedia.org/wiki/"
             "Vehicle_registration_plates_of_Poland\n"
         )
+        sys.stdout.write(f"Cache location: {get_cache_path()}\n")
+        sys.stdout.write(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n")
         sys.stdout.write("\n")
         sys.stdout.write("Next steps:\n")
         sys.stdout.write("  1. Review the generated file\n")