Add caching to Wikipedia scraper to avoid unnecessary requests

Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot] 2026-01-18 13:25:56 +00:00
parent 2ce6a74a65
commit 1c468667d5
3 changed files with 124 additions and 6 deletions

3
.gitignore vendored
View File

@ -273,3 +273,6 @@ python_pkg/download_cats/http_cat_cache/
# Large geojson files that can be downloaded
python_pkg/warsaw_districts/warszawa-dzielnice.geojson
# Wikipedia cache (can be refreshed)
python_pkg/polish_license_plates/.wikipedia_cache/

View File

@ -53,9 +53,15 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \
To fetch the latest data from Wikipedia:
```bash
# Use cached data if available (default)
python -m python_pkg.polish_license_plates.fetch_license_plates
# Force refresh from Wikipedia (ignore cache)
python -m python_pkg.polish_license_plates.fetch_license_plates --force
```
**Caching**: Downloaded Wikipedia data is cached for 7 days in `.wikipedia_cache/` (inside the `python_pkg/polish_license_plates/` directory) to avoid unnecessary requests. Use `--force` (or `-f`) to bypass the cache.
This will update `license_plate_data.py` with the current codes from Wikipedia.
**Requirements**: `pip install requests beautifulsoup4 lxml`

View File

@ -6,9 +6,17 @@ to extract the official license plate codes and their corresponding locations.
The data is extracted from the wikitable on the page and saved to license_plate_data.py.
Caching:
Fetched Wikipedia HTML is cached to avoid unnecessary requests.
Cache location: .wikipedia_cache/license_plates.html (relative to this module's directory)
Cache expires after 7 days by default.
Usage:
python -m python_pkg.polish_license_plates.fetch_license_plates
# Force refresh (ignore cache)
python -m python_pkg.polish_license_plates.fetch_license_plates --force
Source:
https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland
@ -21,9 +29,11 @@ Note:
from __future__ import annotations
import argparse
from pathlib import Path
import re
import sys
import time
try:
from bs4 import BeautifulSoup
@ -38,6 +48,7 @@ except ImportError:
# Constants
MIN_TABLE_COLUMNS = 2 # Minimum columns needed to extract code and location
MAX_CODE_LENGTH = 4 # Maximum length for a valid license plate code
CACHE_EXPIRY_DAYS = 7 # Cache expires after 7 days
USER_AGENT = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
@ -45,18 +56,66 @@ USER_AGENT = (
)
def fetch_wikipedia_license_plates() -> dict[str, str]:
"""Fetch Polish license plate codes from Wikipedia.
def get_cache_path() -> Path:
    """Return the location of the cached Wikipedia HTML file.

    The file lives in a ``.wikipedia_cache`` directory next to this module;
    that directory is created on first use if it does not exist yet.

    Returns:
        Path to the cache file.
    """
    cache_file = Path(__file__).parent / ".wikipedia_cache" / "license_plates.html"
    # Only the directory is created here; the file itself is written elsewhere.
    cache_file.parent.mkdir(exist_ok=True)
    return cache_file
def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool:
    """Check whether the cache file exists, is non-empty, and is not expired.

    Args:
        cache_path: Path to the cache file.
        max_age_days: Maximum age in days before cache is considered expired.

    Returns:
        True if cache is valid, False otherwise.
    """
    # A single stat() call replaces the exists()-then-stat() pair, so a file
    # that disappears between the two calls can no longer raise here.
    try:
        cache_stat = cache_path.stat()
    except OSError:
        return False

    # An empty file (e.g. left behind by an interrupted write) holds nothing
    # usable, so it is reported as invalid instead of being reused until it
    # ages out.
    if cache_stat.st_size == 0:
        return False

    # Age is measured from the file's last modification time.
    file_age_seconds = time.time() - cache_stat.st_mtime
    max_age_seconds = max_age_days * 24 * 60 * 60
    return file_age_seconds < max_age_seconds
def fetch_wikipedia_html(*, force_refresh: bool = False) -> str:
"""Fetch Wikipedia HTML, using cache if available.
Args:
force_refresh: If True, ignore cache and fetch fresh data.
Returns:
HTML content of the Wikipedia page.
Raises:
RuntimeError: If the page cannot be fetched or parsed.
RuntimeError: If the page cannot be fetched.
"""
cache_path = get_cache_path()
# Check if we can use cache
if not force_refresh and is_cache_valid(cache_path):
sys.stdout.write(f"Using cached data from {cache_path}\n")
cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600)
sys.stdout.write(f"Cache age: {cache_age_hours} hours\n")
return cache_path.read_text(encoding="utf-8")
# Fetch from Wikipedia
url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
headers = {"User-Agent": USER_AGENT}
if force_refresh:
sys.stdout.write("Force refresh: Ignoring cache\n")
sys.stdout.write(f"Fetching data from {url}...\n")
try:
@ -66,7 +125,26 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
msg = f"Failed to fetch Wikipedia page: {e}"
raise RuntimeError(msg) from e
soup = BeautifulSoup(response.text, "html.parser")
# Cache the response
cache_path.write_text(response.text, encoding="utf-8")
sys.stdout.write(f"Cached response to {cache_path}\n")
return response.text
def parse_license_plates_from_html(html_content: str) -> dict[str, str]:
"""Parse license plate codes from Wikipedia HTML.
Args:
html_content: HTML content of the Wikipedia page.
Returns:
Dictionary mapping license plate codes to their locations.
Raises:
RuntimeError: If no valid tables are found.
"""
soup = BeautifulSoup(html_content, "html.parser")
# Find all wikitables
tables = soup.find_all("table", {"class": "wikitable"})
@ -112,6 +190,22 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
return license_plates
def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]:
    """Obtain the Wikipedia page HTML and parse license plate codes from it.

    Args:
        force_refresh: If True, ignore cache and fetch fresh data.

    Returns:
        Dictionary mapping license plate codes to their locations.

    Raises:
        RuntimeError: If the page cannot be fetched or parsed.
    """
    # Retrieval (cached or fresh) and parsing are delegated to the two helpers.
    return parse_license_plates_from_html(
        fetch_wikipedia_html(force_refresh=force_refresh)
    )
def generate_license_plate_data_file(
license_plates: dict[str, str],
output_path: Path,
@ -224,9 +318,22 @@ def main() -> int:
Returns:
Exit code.
"""
parser = argparse.ArgumentParser(
description="Fetch Polish license plate codes from Wikipedia.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--force",
"-f",
action="store_true",
help="Force refresh: ignore cache and fetch fresh data from Wikipedia",
)
args = parser.parse_args()
try:
# Fetch data from Wikipedia
license_plates = fetch_wikipedia_license_plates()
license_plates = fetch_wikipedia_license_plates(force_refresh=args.force)
# Determine output path
script_dir = Path(__file__).parent
@ -247,6 +354,8 @@ def main() -> int:
"URL: https://en.wikipedia.org/wiki/"
"Vehicle_registration_plates_of_Poland\n"
)
sys.stdout.write(f"Cache location: {get_cache_path()}\n")
sys.stdout.write(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n")
sys.stdout.write("\n")
sys.stdout.write("Next steps:\n")
sys.stdout.write(" 1. Review the generated file\n")