testsAndMisc/python_pkg/polish_license_plates/fetch_license_plates.py
copilot-swe-agent[bot] 1c468667d5 Add caching to Wikipedia scraper to avoid unnecessary requests
Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
2026-01-18 13:25:56 +00:00

379 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""Fetch Polish license plate codes from Wikipedia.
This script scrapes the Wikipedia page "Vehicle registration plates of Poland"
to extract the official license plate codes and their corresponding locations.
The data is extracted from the wikitable on the page and saved to license_plate_data.py.
Caching:
Fetched Wikipedia HTML is cached to avoid unnecessary requests.
Cache location: .wikipedia_cache/license_plates.html
Cache expires after 7 days by default.
Usage:
python -m python_pkg.polish_license_plates.fetch_license_plates
# Force refresh (ignore cache)
python -m python_pkg.polish_license_plates.fetch_license_plates --force
Source:
https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland
Note:
This script requires internet access and the following packages:
- requests
- beautifulsoup4
- lxml
"""
from __future__ import annotations
import argparse
from pathlib import Path
import re
import sys
import time
try:
from bs4 import BeautifulSoup
import requests
except ImportError:
sys.stderr.write(
"Error: Required packages not installed.\n"
"Install with: pip install requests beautifulsoup4 lxml\n"
)
sys.exit(1)
# Module-level configuration shared by the cache, fetch and parse helpers.
MIN_TABLE_COLUMNS = 2  # A row needs a code cell and a location cell to be parsed
MAX_CODE_LENGTH = 4  # Cleaned codes longer than this are skipped by the parser
CACHE_EXPIRY_DAYS = 7  # Default maximum age (in days) of the cached HTML
# Value sent as the User-Agent header when requesting the Wikipedia page.
# NOTE(review): this string imitates a desktop browser; Wikimedia's User-Agent
# policy reportedly asks scripts to identify themselves - confirm and consider
# switching to a descriptive agent string.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"  # Chrome 120 browser signature
)
def get_cache_path() -> Path:
    """Return the location of the cached Wikipedia HTML.

    The ``.wikipedia_cache`` directory lives next to this script and is
    created on demand, so the returned path can be written to directly.

    Returns:
        Path to the cache file.
    """
    cache_file = Path(__file__).parent / ".wikipedia_cache" / "license_plates.html"
    # Make sure the containing directory exists before handing the path out.
    cache_file.parent.mkdir(exist_ok=True)
    return cache_file
def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool:
    """Check whether the cache file is present, non-empty and not expired.

    Args:
        cache_path: Path to the cache file.
        max_age_days: Maximum age in days before cache is considered expired.

    Returns:
        True if cache is valid, False otherwise. A file that cannot be
        inspected (missing, unreadable) or that is empty is never valid.
    """
    # A single stat() call avoids the exists()/stat() race in which the file
    # disappears between the two checks and stat() raises.
    try:
        cache_stat = cache_path.stat()
    except OSError:
        return False

    # An empty file (for example left behind by an interrupted write) holds
    # no usable HTML, so do not keep serving it until it expires.
    if cache_stat.st_size == 0:
        return False

    file_age_seconds = time.time() - cache_stat.st_mtime
    max_age_seconds = max_age_days * 24 * 60 * 60
    return file_age_seconds < max_age_seconds
def fetch_wikipedia_html(*, force_refresh: bool = False) -> str:
    """Fetch Wikipedia HTML, using cache if available.

    The cache is treated as an optimisation only: if a valid-looking cache
    file cannot be read, the page is downloaded instead, and if the fresh
    download cannot be stored, a warning is printed and the HTML is still
    returned.

    Args:
        force_refresh: If True, ignore cache and fetch fresh data.

    Returns:
        HTML content of the Wikipedia page.

    Raises:
        RuntimeError: If the page cannot be fetched.
    """
    cache_path = get_cache_path()

    # Serve from cache when allowed and possible.
    if not force_refresh and is_cache_valid(cache_path):
        try:
            cached_html = cache_path.read_text(encoding="utf-8")
            cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600)
        except (OSError, UnicodeDecodeError) as e:
            # The file vanished, is unreadable or is not valid UTF-8:
            # fall through to a fresh download rather than crashing.
            sys.stderr.write(f"Warning: could not read cache {cache_path}: {e}\n")
        else:
            sys.stdout.write(f"Using cached data from {cache_path}\n")
            sys.stdout.write(f"Cache age: {cache_age_hours} hours\n")
            return cached_html

    # Fetch from Wikipedia
    url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
    headers = {"User-Agent": USER_AGENT}
    if force_refresh:
        sys.stdout.write("Force refresh: Ignoring cache\n")
    sys.stdout.write(f"Fetching data from {url}...\n")
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        msg = f"Failed to fetch Wikipedia page: {e}"
        raise RuntimeError(msg) from e

    html = response.text

    # Write to a sibling temp file and rename it into place so that an
    # interrupted write never leaves a truncated file as the cache.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    try:
        tmp_path.write_text(html, encoding="utf-8")
        tmp_path.replace(cache_path)
    except OSError as e:
        # Failing to cache must not discard a successful download.
        sys.stderr.write(f"Warning: could not write cache {cache_path}: {e}\n")
    else:
        sys.stdout.write(f"Cached response to {cache_path}\n")
    return html
def parse_license_plates_from_html(html_content: str) -> dict[str, str]:
    """Parse license plate codes from Wikipedia HTML.

    Every ``wikitable`` on the page is scanned. For each row after the first,
    the first cell is reduced to its upper-case letters to form the code and
    the second cell becomes the location. Rows with fewer than
    ``MIN_TABLE_COLUMNS`` cells, an empty code, a code longer than
    ``MAX_CODE_LENGTH`` or an empty location are skipped. When a code occurs
    more than once, the last occurrence wins.

    Args:
        html_content: HTML content of the Wikipedia page.

    Returns:
        Dictionary mapping license plate codes to their locations.

    Raises:
        RuntimeError: If no valid tables are found.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all wikitables
    tables = soup.find_all("table", {"class": "wikitable"})
    if not tables:
        msg = "No wikitable found on the page"
        raise RuntimeError(msg)
    sys.stdout.write(f"Found {len(tables)} tables on the page\n")

    # Compiled once, outside the row loops.
    non_letters = re.compile(r"[^A-Z]")
    numeric_citation = re.compile(r"\[[0-9]+\]")

    license_plates: dict[str, str] = {}
    for table_number, table in enumerate(tables, start=1):
        rows = table.find_all("tr")
        sys.stdout.write(f"Processing table {table_number} with {len(rows)} rows...\n")
        for row in rows[1:]:  # Skip header row
            cells = row.find_all(["td", "th"])
            if len(cells) < MIN_TABLE_COLUMNS:
                continue

            # Clean up the code (remove spaces, keep only letters)
            code = non_letters.sub("", cells[0].get_text(strip=True).upper())
            if not code or len(code) > MAX_CODE_LENGTH:
                continue

            # Use plain get_text() here: with strip=True each text node is
            # stripped and the pieces are joined without a separator, which
            # glues words split across inline tags ("<a>Foo</a> County" ->
            # "FooCounty"). Whitespace is normalised explicitly below.
            location = numeric_citation.sub("", cells[1].get_text())
            location = " ".join(location.split())
            if location:
                license_plates[code] = location

    sys.stdout.write(f"Extracted {len(license_plates)} license plate codes\n")
    return license_plates
def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]:
    """Download (or load from cache) the Wikipedia page and parse its codes.

    Args:
        force_refresh: If True, ignore cache and fetch fresh data.

    Returns:
        Dictionary mapping license plate codes to their locations.

    Raises:
        RuntimeError: If the page cannot be fetched or parsed.
    """
    # Retrieval and parsing are delegated to the two dedicated helpers.
    return parse_license_plates_from_html(
        fetch_wikipedia_html(force_refresh=force_refresh)
    )
def generate_license_plate_data_file(
    license_plates: dict[str, str],
    output_path: Path,
) -> None:
    """Generate license_plate_data.py file with the extracted data.

    The generated module contains a docstring header followed by a single
    ``LICENSE_PLATE_CODES`` dict literal. Entries are sorted by code and
    grouped under a comment per first letter (voivodeship). Locations are
    escaped so that backslashes and double quotes survive a round trip
    through the generated Python source.

    Args:
        license_plates: Dictionary mapping codes to locations.
        output_path: Path to the output file.
    """
    # Group by first letter (voivodeship); iterating in sorted order keeps
    # the codes inside each group sorted as well.
    voivodeships: dict[str, list[tuple[str, str]]] = {}
    for code, location in sorted(license_plates.items()):
        voivodeships.setdefault(code[0], []).append((code, location))

    # Voivodeship names
    voivodeship_names = {
        "B": "Podlaskie",
        "C": "Kujawsko-Pomorskie",
        "D": "Dolnośląskie",
        "E": "Łódzkie",
        "F": "Lubuskie",
        "G": "Pomorskie",
        "K": "Małopolskie",
        "L": "Lubelskie",
        "N": "Warmińsko-Mazurskie",
        "O": "Opolskie",
        "P": "Wielkopolskie",
        "R": "Podkarpackie",
        "S": "Śląskie",
        "T": "Świętokrzyskie",
        "W": "Mazowieckie",
        "Z": "Zachodniopomorskie",
    }

    # Fixed header of the generated module, up to the opening brace.
    header = '''"""Database of Polish car license plate registration codes.
This module contains a comprehensive mapping of Polish vehicle registration
plate codes to their corresponding locations (cities, powiats, voivodeships).
Polish license plates use a system where:
- First letter indicates the voivodeship (province)
- Following 1-2 letters indicate the specific city or powiat (county)
The database is organized by voivodeships in alphabetical order:
- B: Podlaskie
- C: Kujawsko-Pomorskie
- D: Dolnośląskie
- E: Łódzkie
- F: Lubuskie
- G: Pomorskie
- K: Małopolskie
- L: Lubelskie
- N: Warmińsko-Mazurskie
- O: Opolskie
- P: Wielkopolskie
- R: Podkarpackie
- S: Śląskie
- T: Świętokrzyskie
- W: Mazowieckie
- Z: Zachodniopomorskie
Data source:
Wikipedia - Vehicle registration plates of Poland
https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland
Auto-generated by:
python -m python_pkg.polish_license_plates.fetch_license_plates
Examples:
WA = Warszawa (Warsaw)
KR = Kraków
GD = Gdańsk
"""
from __future__ import annotations
LICENSE_PLATE_CODES: dict[str, str] = {
'''

    # Collect the pieces and join once instead of repeated string "+=".
    parts: list[str] = [header]
    for letter in sorted(voivodeships):
        voivodeship_name = voivodeship_names.get(letter, f"Voivodeship {letter}")
        codes = voivodeships[letter]
        parts.append(f" # {letter} - {voivodeship_name} ({len(codes)} codes)\n")
        for code, location in codes:
            # Escape backslashes first, then quotes; escaping only the quotes
            # would let a backslash in the text start an escape sequence (or
            # swallow the closing quote) in the generated literal.
            location_escaped = location.replace("\\", "\\\\").replace('"', '\\"')
            parts.append(f' "{code}": "{location_escaped}",\n')
        parts.append("\n")

    # Drop the trailing comma/newlines of the last entry, then close the dict
    content = "".join(parts).rstrip(",\n") + "\n}\n"

    # Write to file
    output_path.write_text(content, encoding="utf-8")
    sys.stdout.write(f"Generated {output_path}\n")
def main() -> int:
    """Run the command-line workflow: fetch, regenerate, report.

    Parses the ``--force``/``-f`` flag, obtains the license plate codes,
    writes ``license_plate_data.py`` next to this script and prints a summary
    with follow-up steps.

    Returns:
        Exit code: 0 on success, 1 if a RuntimeError was reported.
    """
    cli = argparse.ArgumentParser(
        description="Fetch Polish license plate codes from Wikipedia.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "--force",
        "-f",
        action="store_true",
        help="Force refresh: ignore cache and fetch fresh data from Wikipedia",
    )
    options = cli.parse_args()

    emit = sys.stdout.write
    try:
        # Fetch data from Wikipedia (or the local cache)
        plates = fetch_wikipedia_license_plates(force_refresh=options.force)

        # The generated module is written alongside this script
        destination = Path(__file__).parent / "license_plate_data.py"
        generate_license_plate_data_file(plates, destination)

        # Summary banner
        rule = "=" * 70 + "\n"
        emit("\n")
        emit(rule)
        emit("LICENSE PLATE DATA UPDATE COMPLETE\n")
        emit(rule)
        emit(f"Total codes: {len(plates)}\n")
        emit(f"Output file: {destination}\n")
        emit("\n")
        emit("Data source: Wikipedia\n")
        emit("URL: https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland\n")
        emit(f"Cache location: {get_cache_path()}\n")
        emit(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n")
        emit("\n")

        # Follow-up instructions for the operator
        emit("Next steps:\n")
        emit(" 1. Review the generated file\n")
        emit(" 2. Run tests: pytest python_pkg/polish_license_plates/tests/\n")
        emit(
            " 3. Regenerate Anki package: "
            "python -m python_pkg.polish_license_plates.polish_license_plates_anki\n"
        )
    except RuntimeError as e:
        sys.stderr.write(f"Error: {e}\n")
        return 1
    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())