mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 15:03:01 +02:00
Add caching to Wikipedia scraper to avoid unnecessary requests
Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com>
This commit is contained in:
parent
2ce6a74a65
commit
1c468667d5
3
.gitignore
vendored
3
.gitignore
vendored
@ -273,3 +273,6 @@ python_pkg/download_cats/http_cat_cache/
|
||||
|
||||
# Large geojson files that can be downloaded
|
||||
python_pkg/warsaw_districts/warszawa-dzielnice.geojson
|
||||
|
||||
# Wikipedia cache (can be refreshed)
|
||||
python_pkg/polish_license_plates/.wikipedia_cache/
|
||||
|
||||
@ -53,9 +53,15 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \
|
||||
To fetch the latest data from Wikipedia:
|
||||
|
||||
```bash
|
||||
# Use cached data if available (default)
|
||||
python -m python_pkg.polish_license_plates.fetch_license_plates
|
||||
|
||||
# Force refresh from Wikipedia (ignore cache)
|
||||
python -m python_pkg.polish_license_plates.fetch_license_plates --force
|
||||
```
|
||||
|
||||
**Caching**: Downloaded Wikipedia data is cached for 7 days in `.wikipedia_cache/` to avoid unnecessary requests. Use `--force` to bypass the cache.
|
||||
|
||||
This will update `license_plate_data.py` with the codes from Wikipedia (served from the local cache when it is less than 7 days old, unless `--force` is given).
|
||||
|
||||
**Requirements**: `pip install requests beautifulsoup4 lxml`
|
||||
|
||||
@ -6,9 +6,17 @@ to extract the official license plate codes and their corresponding locations.
|
||||
|
||||
The data is extracted from the wikitable on the page and saved to license_plate_data.py.
|
||||
|
||||
Caching:
|
||||
Fetched Wikipedia HTML is cached to avoid unnecessary requests.
|
||||
Cache location: .wikipedia_cache/license_plates.html (inside this module's directory)
|
||||
Cache expires after 7 days by default.
|
||||
|
||||
Usage:
|
||||
python -m python_pkg.polish_license_plates.fetch_license_plates
|
||||
|
||||
# Force refresh (ignore cache)
|
||||
python -m python_pkg.polish_license_plates.fetch_license_plates --force
|
||||
|
||||
Source:
|
||||
https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland
|
||||
|
||||
@ -21,9 +29,11 @@ Note:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
@ -38,6 +48,7 @@ except ImportError:
|
||||
# Constants
|
||||
MIN_TABLE_COLUMNS = 2 # Minimum columns needed to extract code and location
|
||||
MAX_CODE_LENGTH = 4 # Maximum length for a valid license plate code
|
||||
CACHE_EXPIRY_DAYS = 7 # Cache expires after 7 days
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
@ -45,18 +56,66 @@ USER_AGENT = (
|
||||
)
|
||||
|
||||
|
||||
def fetch_wikipedia_license_plates() -> dict[str, str]:
|
||||
"""Fetch Polish license plate codes from Wikipedia.
|
||||
def get_cache_path() -> Path:
    """Get the path to the cache file.

    The cache directory ``.wikipedia_cache`` is located next to this module
    and is created if it is missing, so the returned path's parent directory
    exists when this function returns.

    Returns:
        Path to the cache file.
    """
    script_dir = Path(__file__).parent
    cache_dir = script_dir / ".wikipedia_cache"
    # exist_ok=True makes repeated calls harmless once the directory exists.
    cache_dir.mkdir(exist_ok=True)
    return cache_dir / "license_plates.html"
|
||||
|
||||
|
||||
def is_cache_valid(cache_path: Path, max_age_days: int = CACHE_EXPIRY_DAYS) -> bool:
    """Check if the cache file exists and is not expired.

    Args:
        cache_path: Path to the cache file.
        max_age_days: Maximum age in days before cache is considered expired.

    Returns:
        True if the file can be stat'ed and its modification time is less
        than ``max_age_days`` old, False otherwise.
    """
    try:
        # A single stat() replaces the exists()-then-stat() pair, so a cache
        # file removed between the two calls can no longer raise here.
        modified_at = cache_path.stat().st_mtime
    except OSError:
        # Missing or inaccessible cache: report it as invalid so the caller
        # falls back to fetching fresh data.
        return False

    file_age_seconds = time.time() - modified_at
    max_age_seconds = max_age_days * 24 * 60 * 60
    return file_age_seconds < max_age_seconds
|
||||
|
||||
|
||||
def fetch_wikipedia_html(*, force_refresh: bool = False) -> str:
|
||||
"""Fetch Wikipedia HTML, using cache if available.
|
||||
|
||||
Args:
|
||||
force_refresh: If True, ignore cache and fetch fresh data.
|
||||
|
||||
Returns:
|
||||
HTML content of the Wikipedia page.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If the page cannot be fetched or parsed.
|
||||
RuntimeError: If the page cannot be fetched.
|
||||
"""
|
||||
cache_path = get_cache_path()
|
||||
|
||||
# Check if we can use cache
|
||||
if not force_refresh and is_cache_valid(cache_path):
|
||||
sys.stdout.write(f"Using cached data from {cache_path}\n")
|
||||
cache_age_hours = int((time.time() - cache_path.stat().st_mtime) / 3600)
|
||||
sys.stdout.write(f"Cache age: {cache_age_hours} hours\n")
|
||||
return cache_path.read_text(encoding="utf-8")
|
||||
|
||||
# Fetch from Wikipedia
|
||||
url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
|
||||
headers = {"User-Agent": USER_AGENT}
|
||||
|
||||
if force_refresh:
|
||||
sys.stdout.write("Force refresh: Ignoring cache\n")
|
||||
|
||||
sys.stdout.write(f"Fetching data from {url}...\n")
|
||||
|
||||
try:
|
||||
@ -66,7 +125,26 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
|
||||
msg = f"Failed to fetch Wikipedia page: {e}"
|
||||
raise RuntimeError(msg) from e
|
||||
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
# Cache the response
|
||||
cache_path.write_text(response.text, encoding="utf-8")
|
||||
sys.stdout.write(f"Cached response to {cache_path}\n")
|
||||
|
||||
return response.text
|
||||
|
||||
|
||||
def parse_license_plates_from_html(html_content: str) -> dict[str, str]:
|
||||
"""Parse license plate codes from Wikipedia HTML.
|
||||
|
||||
Args:
|
||||
html_content: HTML content of the Wikipedia page.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping license plate codes to their locations.
|
||||
|
||||
Raises:
|
||||
RuntimeError: If no valid tables are found.
|
||||
"""
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
|
||||
# Find all wikitables
|
||||
tables = soup.find_all("table", {"class": "wikitable"})
|
||||
@ -112,6 +190,22 @@ def fetch_wikipedia_license_plates() -> dict[str, str]:
|
||||
return license_plates
|
||||
|
||||
|
||||
def fetch_wikipedia_license_plates(*, force_refresh: bool = False) -> dict[str, str]:
    """Return the Polish license plate codes listed on Wikipedia.

    Obtains the page HTML through fetch_wikipedia_html() and passes it to
    parse_license_plates_from_html().

    Args:
        force_refresh: Forwarded to fetch_wikipedia_html(); if True, ignore
            cache and fetch fresh data.

    Returns:
        Dictionary mapping license plate codes to their locations.

    Raises:
        RuntimeError: If the page cannot be fetched or parsed.
    """
    return parse_license_plates_from_html(
        fetch_wikipedia_html(force_refresh=force_refresh),
    )
|
||||
|
||||
|
||||
def generate_license_plate_data_file(
|
||||
license_plates: dict[str, str],
|
||||
output_path: Path,
|
||||
@ -224,9 +318,22 @@ def main() -> int:
|
||||
Returns:
|
||||
Exit code.
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Fetch Polish license plate codes from Wikipedia.",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force",
|
||||
"-f",
|
||||
action="store_true",
|
||||
help="Force refresh: ignore cache and fetch fresh data from Wikipedia",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
# Fetch data from Wikipedia
|
||||
license_plates = fetch_wikipedia_license_plates()
|
||||
license_plates = fetch_wikipedia_license_plates(force_refresh=args.force)
|
||||
|
||||
# Determine output path
|
||||
script_dir = Path(__file__).parent
|
||||
@ -247,6 +354,8 @@ def main() -> int:
|
||||
"URL: https://en.wikipedia.org/wiki/"
|
||||
"Vehicle_registration_plates_of_Poland\n"
|
||||
)
|
||||
sys.stdout.write(f"Cache location: {get_cache_path()}\n")
|
||||
sys.stdout.write(f"Cache expiry: {CACHE_EXPIRY_DAYS} days\n")
|
||||
sys.stdout.write("\n")
|
||||
sys.stdout.write("Next steps:\n")
|
||||
sys.stdout.write(" 1. Review the generated file\n")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user