Add caching to Wikipedia scraper to avoid unnecessary requests

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
2026-07-04 15:03:01 +02:00 · 2026-01-18 13:25:56 +00:00 · 2026-01-18 13:25:56 +00:00 · 1c468667d5
commit 1c468667d5
parent 2ce6a74a65
3 changed files with 124 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@ -273,3 +273,6 @@ python_pkg/download_cats/http_cat_cache/

 # Large geojson files that can be downloaded
 python_pkg/warsaw_districts/warszawa-dzielnice.geojson
+
+# Wikipedia cache (can be refreshed)
+python_pkg/polish_license_plates/.wikipedia_cache/
--- a/python_pkg/polish_license_plates/README.md
+++ b/python_pkg/polish_license_plates/README.md
@ -53,9 +53,15 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \
 To fetch the latest data from Wikipedia:

 ```bash
+# Use cached data if it exists and is less than 7 days old (default)
 python -m python_pkg.polish_license_plates.fetch_license_plates
+
+# Force refresh from Wikipedia (ignore cache)
+python -m python_pkg.polish_license_plates.fetch_license_plates --force
 ```

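+**Caching**: The downloaded Wikipedia HTML is cached for 7 days in a `.wikipedia_cache/` directory created next to `fetch_license_plates.py`, to avoid unnecessary requests. Use `--force` to bypass the cache; the freshly downloaded page then replaces the cached copy.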
+
 This will update `license_plate_data.py` with the current codes from Wikipedia.

 **Requirements**: `pip install requests beautifulsoup4 lxml`
--- a/python_pkg/polish_license_plates/fetch_license_plates.py
+++ b/python_pkg/polish_license_plates/fetch_license_plates.py
@ -6,9 +6,17 @@ to extract the official license plate codes and their corresponding locations.

 The data is extracted from the wikitable on the page and saved to license_plate_data.py.

+Caching:
+    Fetched Wikipedia HTML is cached to avoid unnecessary requests.
+    Cache location: .wikipedia_cache/license_plates.html (next to this module)
+    Cache expires after 7 days by default.
+
 Usage:
    python -m python_pkg.polish_license_plates.fetch_license_plates

+    # Force refresh (ignore cache)
+    python -m python_pkg.polish_license_plates.fetch_license_plates --force
+
 Source:
    https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland

@ -21,9 +29,11 @@ Note:

 from __future__ import annotations

+import argparse
 from pathlib import Path
 import re
 import sys
+import time

 try:
    from bs4 import BeautifulSoup
@ -38,6 +48,7 @@ except ImportError:
 # Constants
 MIN_TABLE_COLUMNS = 2  # Minimum columns needed to extract code and location
 MAX_CODE_LENGTH = 4  # Maximum length for a valid license plate code
+CACHE_EXPIRY_DAYS = 7  # Cache expires after 7 days
 USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
@ -45,18 +56,66 @@ USER_AGENT = (
 )


-def fetch_wikipedia_license_plates() -> dict[str, str]:
-    """Fetch Polish license plate codes from Wikipedia.
+def get_cache_path() -> Path:
+    """Get the path to the cache file.

    Returns:
-        Dictionary mapping license plate codes to their locations.
+        Path to the cache file.
+    """
+    script_dir = Path(__file__).parent
+    cache_dir = script_dir / ".wikipedia_cache"
+    cache_dir.mkdir(exist_ok=True)
+    return cache_dir / "license_plates.html"
+
+
+def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool:
+    """Check if the cache file exists and is not expired.
+
+    Args:
+        cache_path: Path to the cache file.
+        max_age_days: Maximum age in days before cache is considered expired.
+
+    Returns:
+        True if cache is valid, False otherwise.
+    """
+    if not cache_path.exists():
+        return False
+
+    # Check age
+    file_age_seconds = time.time() - cache_path.stat().st_mtime
+    max_age_seconds = max_age_days * 24 * 60 * 60
+
+    return file_age_seconds < max_age_seconds
+
+
+def fetch_wikipedia_html(*, force_refresh: bool = False) -> str:
+    """Fetch Wikipedia HTML, using cache if available.
+
+    Args:
+        force_refresh: If True, ignore cache and fetch fresh data.
+
+    Returns:
+        HTML content of the Wikipedia page.

    Raises:
-        RuntimeError: If the page cannot be fetched or parsed.
+        RuntimeError: If the page cannot be fetched.
    """
+    cache_path = get_cache_path()
+
+    # Check if we can use cache
+    if not force_refresh and is_cache_valid(cache_path):
+        sys.stdout.write(f"Using cached data from {cache_path}\n")
+        cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600)
+        sys.stdout.write(f"Cache age: {cache_age_hours} hours\n")
+        return cache_path.read_text(encoding="utf-8")
+
+    # Fetch from Wikipedia
    url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
    headers = {"User-Agent": USER_AGENT}

+    if force_refresh:
+        sys.stdout.write("Force refresh: Ignoring cache\n")
+
    sys.stdout.write(f"Fetching data from {url}...\n")

    try:
@ -66,7 +125,26 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
        msg = f"Failed to fetch Wikipedia page: {e}"
        raise RuntimeError(msg) from e

-    soup = BeautifulSoup(response.text, "html.parser")
+    # Cache the response
+    cache_path.write_text(response.text, encoding="utf-8")
+    sys.stdout.write(f"Cached response to {cache_path}\n")
+
+    return response.text
+
+
+def parse_license_plates_from_html(html_content: str) -> dict[str, str]:
+    """Parse license plate codes from Wikipedia HTML.
+
+    Args:
+        html_content: HTML content of the Wikipedia page.
+
+    Returns:
+        Dictionary mapping license plate codes to their locations.
+
+    Raises:
+        RuntimeError: If no valid tables are found.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")

    # Find all wikitables
    tables = soup.find_all("table", {"class": "wikitable"})
@ -112,6 +190,22 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
    return license_plates


+def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]:
+    """Fetch Polish license plate codes from Wikipedia.
+
+    Args:
+        force_refresh: If True, ignore cache and fetch fresh data.
+
+    Returns:
+        Dictionary mapping license plate codes to their locations.
+
+    Raises:
+        RuntimeError: If the page cannot be fetched or parsed.
+    """
+    html_content = fetch_wikipedia_html(force_refresh=force_refresh)
+    return parse_license_plates_from_html(html_content)
+
+
 def generate_license_plate_data_file(
    license_plates: dict[str, str],
    output_path: Path,
@ -224,9 +318,22 @@ def main() -> int:
    Returns:
        Exit code.
    """
+    parser = argparse.ArgumentParser(
+        description="Fetch Polish license plate codes from Wikipedia.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--force",
+        "-f",
+        action="store_true",
+        help="Force refresh: ignore cache and fetch fresh data from Wikipedia",
+    )
+
+    args = parser.parse_args()
+
    try:
        # Fetch data from Wikipedia
-        license_plates = fetch_wikipedia_license_plates()
+        license_plates = fetch_wikipedia_license_plates(force_refresh=args.force)

        # Determine output path
        script_dir = Path(__file__).parent
@ -247,6 +354,8 @@ def main() -> int:
            "URL: https://en.wikipedia.org/wiki/"
            "Vehicle_registration_plates_of_Poland\n"
        )
+        sys.stdout.write(f"Cache location: {get_cache_path()}\n")
+        sys.stdout.write(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n")
        sys.stdout.write("\n")
        sys.stdout.write("Next steps:\n")
        sys.stdout.write("  1. Review the generated file\n")