From d050e004d3e8586f6632b01849425e22d09a45ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 18 Jan 2026 13:10:34 +0000 Subject: [PATCH] Add Wikipedia scraper for automatic license plate data extraction Co-authored-by: kuhyx <147418882+kuhyx@users.noreply.github.com> --- python_pkg/polish_license_plates/README.md | 61 ++-- .../fetch_license_plates.py | 264 ++++++++++++++++++ .../license_plate_data.py | 8 + 3 files changed, 315 insertions(+), 18 deletions(-) create mode 100755 python_pkg/polish_license_plates/fetch_license_plates.py diff --git a/python_pkg/polish_license_plates/README.md b/python_pkg/polish_license_plates/README.md index 919b028..39dd421 100644 --- a/python_pkg/polish_license_plates/README.md +++ b/python_pkg/polish_license_plates/README.md @@ -7,6 +7,7 @@ Generate Anki flashcards for learning Polish car license plate codes. This package generates Anki-compatible flashcard decks for all Polish vehicle registration plate codes. Each code is mapped to its corresponding location (city or powiat). Polish license plates use a system where: + - First letter indicates the **voivodeship** (province) - Following 1-2 letters indicate the specific **city** or **powiat** (county) @@ -21,6 +22,15 @@ Polish license plates use a system where: - Dark mode support - Self-contained `.apkg` file - no manual setup required +## Data Source + +License plate data is automatically extracted from Wikipedia's authoritative table: + +- **Source**: [Vehicle registration plates of Poland](https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland) +- **Update**: Run `python -m python_pkg.polish_license_plates.fetch_license_plates` to refresh data + +This ensures the codes are always based on the most current public information. 
+ ## Usage ### Generate Flashcards @@ -38,6 +48,18 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \ --deck-name "My Polish Plates" ``` +### Update License Plate Data + +To fetch the latest data from Wikipedia: + +```bash +python -m python_pkg.polish_license_plates.fetch_license_plates +``` + +This will update `license_plate_data.py` with the current codes from Wikipedia. + +**Requirements**: `pip install requests beautifulsoup4 lxml` + ### Import into Anki 1. Open Anki @@ -49,28 +71,29 @@ python -m python_pkg.polish_license_plates.polish_license_plates_anki \ ### License Plate Codes by Voivodeship -| Voivodeship | First Letter | Example Codes | -|-------------|--------------|---------------| -| Dolnośląskie | D | DA (Wrocław), DB (Wałbrzych), DJ (Jelenia Góra) | -| Kujawsko-Pomorskie | C | CB (Bydgoszcz), CT (Toruń), CG (Grudziądz) | -| Lubelskie | L | LL (Lublin), LC (Chełm), LZ (Zamość) | -| Lubuskie | F | FZ (Zielona Góra), FG (Gorzów Wielkopolski) | -| Łódzkie | E | ED (Łódź), EP (Piotrków Trybunalski) | -| Małopolskie | K | KR (Kraków), KT (Tarnów), KN (Nowy Sącz) | -| Mazowieckie | W | WA-WZ (Warsaw), WR (Radom), WS (Siedlce) | -| Opolskie | O | OP (Opole), OK (Kędzierzyn-Koźle) | -| Podkarpackie | R | RR (Rzeszów), RP (Przemyśl), RK (Krosno) | -| Podlaskie | B | BI (Białystok), BL (Łomża), BSU (Suwałki) | -| Pomorskie | G | GD (Gdańsk), GDY (Gdynia), GS (Słupsk) | -| Śląskie | S | SK (Katowice), SC (Chorzów), SB (Bielsko-Biała) | -| Świętokrzyskie | T | TK (Kielce), TSK (Skarżysko-Kamienna) | -| Warmińsko-Mazurskie | N | NO (Olsztyn), NE (Elbląg), NG (Giżycko) | -| Wielkopolskie | P | PO (Poznań), PKA (Kalisz), PIA (Piła) | -| Zachodniopomorskie | Z | ZS (Szczecin), ZKO (Koszalin), ZSW (Świnoujście) | +| Voivodeship | First Letter | Example Codes | +| ------------------- | ------------ | ------------------------------------------------ | +| Dolnośląskie | D | DA (Wrocław), DB (Wałbrzych), DJ (Jelenia Góra) | +| Kujawsko-Pomorskie | 
C | CB (Bydgoszcz), CT (Toruń), CG (Grudziądz) | +| Lubelskie | L | LL (Lublin), LC (Chełm), LZ (Zamość) | +| Lubuskie | F | FZ (Zielona Góra), FG (Gorzów Wielkopolski) | +| Łódzkie | E | ED (Łódź), EP (Piotrków Trybunalski) | +| Małopolskie | K | KR (Kraków), KT (Tarnów), KN (Nowy Sącz) | +| Mazowieckie | W | WA-WZ (Warsaw), WR (Radom), WS (Siedlce) | +| Opolskie | O | OP (Opole), OK (Kędzierzyn-Koźle) | +| Podkarpackie | R | RR (Rzeszów), RP (Przemyśl), RK (Krosno) | +| Podlaskie | B | BI (Białystok), BL (Łomża), BSU (Suwałki) | +| Pomorskie | G | GD (Gdańsk), GDY (Gdynia), GS (Słupsk) | +| Śląskie | S | SK (Katowice), SC (Chorzów), SB (Bielsko-Biała) | +| Świętokrzyskie | T | TK (Kielce), TSK (Skarżysko-Kamienna) | +| Warmińsko-Mazurskie | N | NO (Olsztyn), NE (Elbląg), NG (Giżycko) | +| Wielkopolskie | P | PO (Poznań), PKA (Kalisz), PIA (Piła) | +| Zachodniopomorskie | Z | ZS (Szczecin), ZKO (Koszalin), ZSW (Świnoujście) | ### Warsaw (Warszawa) Codes Warsaw has an extensive range of codes (WA-WZ): + - WA: Warszawa (general) - WB: Warszawa Bemowo - WC: Ciechanów @@ -83,6 +106,7 @@ Warsaw has an extensive range of codes (WA-WZ): ## Data The package includes 444 license plate codes covering: + - All 16 Polish voivodeships - Major cities with powiat rights (e.g., Kraków, Gdańsk, Poznań) - All powiats (counties) across Poland @@ -96,6 +120,7 @@ python -m pytest python_pkg/polish_license_plates/tests/ -v ``` All 17 tests validate: + - Data integrity (444 codes, no duplicates) - Correct voivodeship prefixes - Major cities present diff --git a/python_pkg/polish_license_plates/fetch_license_plates.py b/python_pkg/polish_license_plates/fetch_license_plates.py new file mode 100755 index 0000000..070e1ca --- /dev/null +++ b/python_pkg/polish_license_plates/fetch_license_plates.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +"""Fetch Polish license plate codes from Wikipedia. 
# Footnote markers Wikipedia appends to cell text, e.g. "[1]", "[a]", "[note 2]".
_CITATION_RE = re.compile(r"\[(?:\d+|[a-z]|(?:note|nb) \d+)\]")
# A usable code cell is nothing but 1-4 ASCII capitals once whitespace is removed.
_PLATE_CODE_RE = re.compile(r"[A-Z]{1,4}")
# A data row needs at least a code cell and a location cell.
_MIN_CELLS = 2


def _clean_cell_text(raw_text: str) -> str:
    """Drop footnote markers from *raw_text* and collapse runs of whitespace."""
    return " ".join(_CITATION_RE.sub("", raw_text).split())


def _parse_plate_code(raw_text: str) -> str | None:
    """Return the plate code held in a cell, or None if the cell is not a code.

    The cell must consist solely of capital ASCII letters (whitespace and
    footnote markers aside). Anything else - header words, ranges such as
    "WA-WZ", place names - is rejected instead of being squeezed into a
    code-looking string.
    """
    candidate = "".join(_clean_cell_text(raw_text).split())
    return candidate if _PLATE_CODE_RE.fullmatch(candidate) else None


def fetch_wikipedia_license_plates() -> dict[str, str]:
    """Fetch Polish license plate codes from Wikipedia.

    Every table with the ``wikitable`` class is scanned. In each row after
    the first, the first cell is taken as the code and the second as the
    location. When the same code appears more than once, the last occurrence
    wins.

    Returns:
        Dictionary mapping license plate codes to their locations.

    Raises:
        RuntimeError: If the page cannot be fetched, contains no wikitable,
            or yields no plate codes at all.
    """
    url = "https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    }

    sys.stdout.write(f"Fetching data from {url}...\n")

    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        msg = f"Failed to fetch Wikipedia page: {e}"
        raise RuntimeError(msg) from e

    soup = BeautifulSoup(response.text, "html.parser")
    tables = soup.find_all("table", {"class": "wikitable"})

    if not tables:
        msg = "No wikitable found on the page"
        raise RuntimeError(msg)

    sys.stdout.write(f"Found {len(tables)} tables on the page\n")

    license_plates: dict[str, str] = {}

    for table_idx, table in enumerate(tables, start=1):
        rows = table.find_all("tr")

        sys.stdout.write(f"Processing table {table_idx} with {len(rows)} rows...\n")

        for row in rows[1:]:  # Skip header row
            cells = row.find_all(["td", "th"])
            if len(cells) < _MIN_CELLS:
                continue

            # get_text() is called without strip=True on purpose: stripping
            # each text node before joining glues neighbours together
            # ("Biała Podlaska" + " (city)" -> "Biała Podlaska(city)").
            # Whitespace is normalised afterwards instead.
            code = _parse_plate_code(cells[0].get_text())
            location = _clean_cell_text(cells[1].get_text())

            if code is None or not location:
                continue

            license_plates[code] = location

    if not license_plates:
        # Fail loudly rather than hand back an empty mapping: the caller would
        # otherwise write an empty data file over the existing one.
        msg = "No license plate codes could be extracted from the page"
        raise RuntimeError(msg)

    sys.stdout.write(f"Extracted {len(license_plates)} license plate codes\n")

    return license_plates
def generate_license_plate_data_file(
    license_plates: dict[str, str],
    output_path: Path,
) -> None:
    """Generate license_plate_data.py file with the extracted data.

    The output is a Python module defining ``LICENSE_PLATE_CODES``. Entries
    are sorted by code and grouped under one comment line per first letter.
    The file at ``output_path`` is overwritten.

    Args:
        license_plates: Dictionary mapping codes to locations.
        output_path: Path to the output file.
    """
    # Local import: json is only needed to render safe string literals here.
    import json

    def literal(text: str) -> str:
        # json.dumps yields a double-quoted literal that is also valid Python
        # and escapes backslashes, quotes and control characters. Escaping
        # only '"' would corrupt any value that contains a backslash.
        return json.dumps(text, ensure_ascii=False)

    # Group by first letter (voivodeship); sorted input keeps groups ordered.
    voivodeships: dict[str, list[tuple[str, str]]] = {}
    for code, location in sorted(license_plates.items()):
        voivodeships.setdefault(code[0], []).append((code, location))

    voivodeship_names = {
        "B": "Podlaskie",
        "C": "Kujawsko-Pomorskie",
        "D": "Dolnośląskie",
        "E": "Łódzkie",
        "F": "Lubuskie",
        "G": "Pomorskie",
        "K": "Małopolskie",
        "L": "Lubelskie",
        "N": "Warmińsko-Mazurskie",
        "O": "Opolskie",
        "P": "Wielkopolskie",
        "R": "Podkarpackie",
        "S": "Śląskie",
        "T": "Świętokrzyskie",
        "W": "Mazowieckie",
        "Z": "Zachodniopomorskie",
    }

    header = '''"""Database of Polish car license plate registration codes.

This module contains a comprehensive mapping of Polish vehicle registration
plate codes to their corresponding locations (cities, powiats, voivodeships).

Polish license plates use a system where:
- First letter indicates the voivodeship (province)
- Following 1-2 letters indicate the specific city or powiat (county)

The database is organized by voivodeships in alphabetical order:
- B: Podlaskie
- C: Kujawsko-Pomorskie
- D: Dolnośląskie
- E: Łódzkie
- F: Lubuskie
- G: Pomorskie
- K: Małopolskie
- L: Lubelskie
- N: Warmińsko-Mazurskie
- O: Opolskie
- P: Wielkopolskie
- R: Podkarpackie
- S: Śląskie
- T: Świętokrzyskie
- W: Mazowieckie
- Z: Zachodniopomorskie

Data source:
    Wikipedia - Vehicle registration plates of Poland
    https://en.wikipedia.org/wiki/Vehicle_registration_plates_of_Poland

Auto-generated by:
    python -m python_pkg.polish_license_plates.fetch_license_plates

Examples:
    WA = Warszawa (Warsaw)
    KR = Kraków
    GD = Gdańsk
"""

from __future__ import annotations

LICENSE_PLATE_CODES: dict[str, str] = {
'''

    # Collect the pieces and join once instead of growing a string in a loop.
    parts = [header]
    for letter in sorted(voivodeships):
        # Letters outside the 16 voivodeship prefixes get a generic label.
        voivodeship_name = voivodeship_names.get(letter, f"Voivodeship {letter}")
        codes = voivodeships[letter]

        parts.append(f"    # {letter} - {voivodeship_name} ({len(codes)} codes)\n")
        parts.extend(
            f"    {literal(code)}: {literal(location)},\n" for code, location in codes
        )
        parts.append("\n")

    # Remove the last comma and blank line, then close the dict.
    content = "".join(parts).rstrip(",\n") + "\n}\n"

    output_path.write_text(content, encoding="utf-8")
    sys.stdout.write(f"Generated {output_path}\n")
def main() -> int:
    """Fetch the plate codes, regenerate license_plate_data.py and report.

    The output file is written next to this script. A summary and suggested
    follow-up steps are printed to stdout on success; failures are reported
    on stderr.

    Returns:
        Exit code: 0 on success, 1 if fetching, parsing or writing failed.
    """
    output_path = Path(__file__).parent / "license_plate_data.py"

    # Only the two calls that can actually fail live inside the try block.
    try:
        license_plates = fetch_wikipedia_license_plates()
        generate_license_plate_data_file(license_plates, output_path)
    except (RuntimeError, OSError) as e:
        # OSError covers an unwritable output file, which would otherwise
        # surface as a traceback instead of a clean non-zero exit.
        sys.stderr.write(f"Error: {e}\n")
        return 1

    rule = "=" * 70 + "\n"
    summary = (
        "\n",
        rule,
        "LICENSE PLATE DATA UPDATE COMPLETE\n",
        rule,
        f"Total codes: {len(license_plates)}\n",
        f"Output file: {output_path}\n",
        "\n",
        "Data source: Wikipedia\n",
        "URL: https://en.wikipedia.org/wiki/"
        "Vehicle_registration_plates_of_Poland\n",
        "\n",
        "Next steps:\n",
        "  1. Review the generated file\n",
        "  2. Run tests: pytest python_pkg/polish_license_plates/tests/\n",
        "  3. Regenerate Anki package: "
        "python -m python_pkg.polish_license_plates.polish_license_plates_anki\n",
    )
    sys.stdout.write("".join(summary))
    return 0