From 1c468667d506d49c7c4e4320c52210dfccb3fb64 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 13:25:56 +0000 Subject: [PATCH] Add caching to Wikipedia scraper to avoid unnecessary requests Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --- .gitignore | 3 + python_pkg/polish_license_plates/README.md | 6 + .../fetch_license_plates.py | 121 +++++++++++++++++- 3 files changed, 124 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 48cc27b..19fbd41 100644 --- a/.gitignore +++ b/.gitignore @@ -273,3 +273,6 @@ python_pkg/download_cats/http_cat_cache/ # Large geojson files that can be downloaded python_pkg/warsaw_districts/warszawa-dzielnice.geojson + +# Wikipedia cache (can be refreshed) +python_pkg/polish_license_plates/.wikipedia_cache/ diff --git a/python_pkg/polish_license_plates/README.md b/python_pkg/polish_license_plates/README.md index 39dd421..5caf4be 100644 --- a/python_pkg/polish_license_plates/README.md +++ b/python_pkg/polish_license_plates/README.md @@ -53,9 +53,15 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \ To fetch the latest data from Wikipedia: ```bash +# Use cached data if available (default) python -m python_pkg.polish_license_plates.fetch_license_plates + +# Force refresh from Wikipedia (ignore cache) +python -m python_pkg.polish_license_plates.fetch_license_plates --force ``` +**Caching**: Downloaded Wikipedia data is cached for 7 days in `.wikipedia_cache/` to avoid unnecessary requests. Use `--force` to bypass the cache. + This will update `license_plate_data.py` with the current codes from Wikipedia. **Requirements**: `pip install requests beautifulsoup4 lxml` diff --git a/python_pkg/polish_license_plates/fetch_license_plates.py b/python_pkg/polish_license_plates/fetch_license_plates.py index 5b56fcd..43f7025 100755 --- a/python_pkg/polish_license_plates/fetch_license_plates.py +++ b/python_pkg/polish_license_plates/fetch_license_plates.py @@ -6,9 +6,17 @@ to extract the official license plate codes and their corresponding locations. The data is extracted from the wikitable on the page and saved to license_plate_data.py. +Caching: + Fetched Wikipedia HTML is cached to avoid unnecessary requests. + Cache location: .wikipedia_cache/license_plates.html + Cache expires after 7 days by default. + Usage: python -m python_pkg.polish_license_plates.fetch_license_plates + # Force refresh (ignore cache) + python -m python_pkg.polish_license_plates.fetch_license_plates --force + Source: https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland @@ -21,9 +29,11 @@ Note: from __future__ import annotations +import argparse from pathlib import Path import re import sys +import time try: from bs4 import BeautifulSoup @@ -38,6 +48,7 @@ except ImportError: # Constants MIN_TABLE_COLUMNS = 2 # Minimum columns needed to extract code and location MAX_CODE_LENGTH = 4 # Maximum length for a valid license plate code +CACHE_EXPIRY_DAYS = 7 # Cache expires after 7 days USER_AGENT = ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " @@ -45,18 +56,66 @@ USER_AGENT = ( ) -def fetch_wikipedia_license_plates() -> dict[str, str]: - """Fetch Polish license plate codes from Wikipedia. +def get_cache_path() -> Path: + """Get the path to the cache file. Returns: - Dictionary mapping license plate codes to their locations. + Path to the cache file. + """ + script_dir = Path(__file__).parent + cache_dir = script_dir / ".wikipedia_cache" + cache_dir.mkdir(exist_ok=True) + return cache_dir / "license_plates.html" + + +def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool: + """Check if the cache file exists and is not expired. + + Args: + cache_path: Path to the cache file. + max_age_days: Maximum age in days before cache is considered expired. + + Returns: + True if cache is valid, False otherwise. + """ + if not cache_path.exists(): + return False + + # Check age + file_age_seconds = time.time() - cache_path.stat().st_mtime + max_age_seconds = max_age_days * 24 * 60 * 60 + + return file_age_seconds < max_age_seconds + + +def fetch_wikipedia_html(*, force_refresh: bool = False) -> str: + """Fetch Wikipedia HTML, using cache if available. + + Args: + force_refresh: If True, ignore cache and fetch fresh data. + + Returns: + HTML content of the Wikipedia page. Raises: - RuntimeError: If the page cannot be fetched or parsed. + RuntimeError: If the page cannot be fetched. """ + cache_path = get_cache_path() + + # Check if we can use cache + if not force_refresh and is_cache_valid(cache_path): + sys.stdout.write(f"Using cached data from {cache_path}\n") + cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600) + sys.stdout.write(f"Cache age: {cache_age_hours} hours\n") + return cache_path.read_text(encoding="utf-8") + + # Fetch from Wikipedia url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland" headers = {"User-Agent": USER_AGENT} + if force_refresh: + sys.stdout.write("Force refresh: Ignoring cache\n") + sys.stdout.write(f"Fetching data from {url}...\n") try: @@ -66,7 +125,26 @@ def fetch_wikipedia_license_plates() -> dict[str, str]: msg = f"Failed to fetch Wikipedia page: {e}" raise RuntimeError(msg) from e - soup = BeautifulSoup(response.text, "html.parser") + # Cache the response + cache_path.write_text(response.text, encoding="utf-8") + sys.stdout.write(f"Cached response to {cache_path}\n") + + return response.text + + +def parse_license_plates_from_html(html_content: str) -> dict[str, str]: + """Parse license plate codes from Wikipedia HTML. + + Args: + html_content: HTML content of the Wikipedia page. + + Returns: + Dictionary mapping license plate codes to their locations. + + Raises: + RuntimeError: If no valid tables are found. + """ + soup = BeautifulSoup(html_content, "html.parser") # Find all wikitables tables = soup.find_all("table", {"class": "wikitable"}) @@ -112,6 +190,22 @@ def fetch_wikipedia_license_plates() -> dict[str, str]: return license_plates +def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]: + """Fetch Polish license plate codes from Wikipedia. + + Args: + force_refresh: If True, ignore cache and fetch fresh data. + + Returns: + Dictionary mapping license plate codes to their locations. + + Raises: + RuntimeError: If the page cannot be fetched or parsed. + """ + html_content = fetch_wikipedia_html(force_refresh=force_refresh) + return parse_license_plates_from_html(html_content) + + def generate_license_plate_data_file( license_plates: dict[str, str], output_path: Path, @@ -224,9 +318,22 @@ def main() -> int: Returns: Exit code. """ + parser = argparse.ArgumentParser( + description="Fetch Polish license plate codes from Wikipedia.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--force", + "-f", + action="store_true", + help="Force refresh: ignore cache and fetch fresh data from Wikipedia", + ) + + args = parser.parse_args() + try: # Fetch data from Wikipedia - license_plates = fetch_wikipedia_license_plates() + license_plates = fetch_wikipedia_license_plates(force_refresh=args.force) # Determine output path script_dir = Path(__file__).parent @@ -247,6 +354,8 @@ def main() -> int: "URL: https://en.wikipedia.org/wiki/" "Vehicle_registration_plates_of_Poland\n" ) + sys.stdout.write(f"Cache location: {get_cache_path()}\n") + sys.stdout.write(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n") sys.stdout.write("\n") sys.stdout.write("Next steps:\n") sys.stdout.write(" 1. Review the generated file\n")