testsAndMisc/python_pkg/geo_data.py

"""Shared geographic data module for Warsaw and Poland Anki generators.

This module handles downloading and caching geographic data from various sources:
- OpenStreetMap via Overpass API
- Geofabrik OSM extracts
- GitHub repositories with pre-processed GeoJSON

All data is cached locally to avoid repeated downloads.
"""

from __future__ import annotations

import contextlib
import json
from pathlib import Path
import shutil
import sys
import time
from typing import TYPE_CHECKING
from urllib.request import urlopen

import geopandas as gpd
import requests
from shapely.geometry import (
    GeometryCollection,
    LineString,
    MultiLineString,
    MultiPolygon,
    Polygon,
)

if TYPE_CHECKING:
    from typing import Any

# Shared cache directory for all geo data
CACHE_DIR = Path(__file__).parent / "geo_cache"

# Overpass API endpoints (multiple for redundancy)
# Note: kumi.systems is more reliable, so it's first
OVERPASS_ENDPOINTS = [
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass-api.de/api/interpreter",
    "https://maps.mail.ru/osm/tools/overpass/api/interpreter",
]

# GitHub URLs for pre-processed data
POLSKA_GEOJSON_BASE = "https://raw.githubusercontent.com/ppatrzyk/polska-geojson/master"

# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Request timeout and retry settings
REQUEST_TIMEOUT = 180
MAX_RETRIES = 3
RETRY_DELAY = 5

# Data thresholds for filtering
MIN_PEAK_ELEVATION = 300  # meters
MIN_LAKE_AREA_KM2 = 0.5  # km²
MIN_RIVER_LENGTH_KM = 10  # km
MIN_LINE_COORDS = 2  # minimum coordinates for a line
MIN_RING_COORDS = 4  # minimum coordinates for a polygon ring


def _ensure_cache_dir() -> None:
    """Create cache directory if it doesn't exist."""
    CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _extract_polygonal_geometry(
    geom: Polygon | MultiPolygon | GeometryCollection,
) -> Polygon | MultiPolygon | None:
    """Extract only polygonal geometry from a geometry that may be mixed.

    Some OSM data comes as GeometryCollections containing polygons mixed with
    lines. This function extracts only the polygon/multipolygon parts.

    Args:
        geom: Input geometry (Polygon, MultiPolygon, or GeometryCollection).

    Returns:
        Polygon or MultiPolygon with only the polygonal parts, or None if empty.
    """
    if isinstance(geom, Polygon | MultiPolygon):
        return geom

    if isinstance(geom, GeometryCollection):
        polygons = [g for g in geom.geoms if isinstance(g, Polygon | MultiPolygon)]
        if not polygons:
            return None
        if len(polygons) == 1:
            return polygons[0]
        # Flatten MultiPolygons and combine all polygons
        all_polys = []
        for p in polygons:
            if isinstance(p, Polygon):
                all_polys.append(p)
            elif isinstance(p, MultiPolygon):
                all_polys.extend(p.geoms)
        return MultiPolygon(all_polys)

    return None


def _query_wikidata(query: str) -> list[dict[str, Any]]:
    """Query Wikidata SPARQL endpoint.

    Args:
        query: SPARQL query string.

    Returns:
        List of result bindings.
    """
    response = requests.get(
        WIKIDATA_SPARQL,
        params={"query": query, "format": "json"},
        timeout=60,
    )
    response.raise_for_status()
    return response.json()["results"]["bindings"]


def _get_powiaty_population() -> dict[str, int]:
    """Get population data for all Polish powiaty from Wikidata.

    Returns:
        Dictionary mapping powiat name to population.
    """
    cache_path = CACHE_DIR / "powiaty_population.json"

    if cache_path.exists():
        return json.loads(cache_path.read_text())

    # Query Wikidata for all powiaty (Q247073) in Poland (Q36) with population
    # Filter to only current Polish powiaty using country=Poland filter
    query = """
    SELECT ?powiat ?powiatLabel ?population WHERE {
      ?powiat wdt:P31 wd:Q247073.
      ?powiat wdt:P17 wd:Q36.
      ?powiat wdt:P1082 ?population.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "pl,en". }
    }
    ORDER BY DESC(?population)
    """

    sys.stdout.write("Fetching powiaty population data from Wikidata...\n")
    results = _query_wikidata(query)

    population_map: dict[str, int] = {}
    for item in results:
        label = item.get("powiatLabel", {}).get("value", "")
        pop = item.get("population", {}).get("value", "0")
        if label and pop:
            # Remove "powiat" prefix if present for matching
            clean_label = label.replace("powiat ", "").strip()
            with contextlib.suppress(ValueError):
                population_map[clean_label] = int(pop)

    _ensure_cache_dir()
    cache_path.write_text(json.dumps(population_map, ensure_ascii=False, indent=2))

    sys.stdout.write(f"Cached population data for {len(population_map)} powiaty.\n")
    return population_map


def _try_single_request(
    endpoint: str, query: str
) -> tuple[dict[str, Any] | None, Exception | None]:
    """Try a single request to an endpoint.

    Args:
        endpoint: Overpass API endpoint URL.
        query: Overpass QL query string.

    Returns:
        Tuple of (result, error). One will be None.
    """
    try:
        sys.stdout.write(f"  Querying {endpoint}...\n")
        response = requests.post(
            endpoint,
            data={"data": query},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, requests.Timeout, ValueError) as e:
        return None, e
    else:
        # Check for valid response with elements
        if not isinstance(result, dict) or "elements" not in result:
            return None, ValueError("Invalid response format")
        return result, None


def _overpass_query(query: str) -> dict[str, Any]:
    """Execute an Overpass API query with retry logic.

    Args:
        query: Overpass QL query string.

    Returns:
        JSON response from the API.

    Raises:
        RuntimeError: If all endpoints fail.
    """
    last_error: Exception | None = None

    for endpoint in OVERPASS_ENDPOINTS:
        for attempt in range(MAX_RETRIES):
            result, error = _try_single_request(endpoint, query)
            if result is not None:
                return result
            last_error = error
            sys.stdout.write(f"  Attempt {attempt + 1} failed: {error}\n")
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    msg = f"All Overpass API endpoints failed. Last error: {last_error}"
    raise RuntimeError(msg)


def _download_github_geojson(url: str, cache_path: Path) -> gpd.GeoDataFrame:
    """Download GeoJSON from GitHub and cache it.

    Args:
        url: URL to download from.
        cache_path: Path to cache the data.

    Returns:
        GeoDataFrame with the data.
    """
    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write(f"Downloading from {url}...\n")
    if not url.startswith(("http://", "https://")):
        msg = f"Unsupported URL scheme: {url}"
        raise ValueError(msg)
    with urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        data = json.loads(response.read().decode())

    _ensure_cache_dir()
    cache_path.write_text(json.dumps(data))

    return gpd.GeoDataFrame.from_features(data["features"], crs="EPSG:4326")


# =============================================================================
# Warsaw Data
# =============================================================================


def get_warsaw_boundary() -> gpd.GeoDataFrame:
    """Get Warsaw city boundary.

    Returns:
        GeoDataFrame with Warsaw boundary polygon.
    """
    cache_path = CACHE_DIR / "warsaw_boundary.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    # Try to use districts file first
    districts_path = (
        Path(__file__).parent
        / "anki_decks"
        / "warsaw_districts"
        / "warszawa-dzielnice.geojson"
    )
    if districts_path.exists():
        warsaw_gdf = gpd.read_file(districts_path)
        warsaw_boundary = warsaw_gdf[warsaw_gdf["name"] == "Warszawa"]
        if len(warsaw_boundary) == 0:
            warsaw_boundary = gpd.GeoDataFrame(
                geometry=[warsaw_gdf.union_all()], crs=warsaw_gdf.crs
            )
        _ensure_cache_dir()
        warsaw_boundary.to_file(cache_path, driver="GeoJSON")
        return warsaw_boundary

    # Fallback to Overpass query
    sys.stdout.write("Fetching Warsaw boundary from OpenStreetMap...\n")
    query = """
    [out:json][timeout:60];
    relation["name"="Warszawa"]["admin_level"="6"];
    out geom;
    """

    data = _overpass_query(query)

    features = []
    for element in data.get("elements", []):
        if element.get("type") == "relation":
            coords = []
            for member in element.get("members", []):
                if member.get("role") == "outer" and "geometry" in member:
                    coords.extend([(p["lon"], p["lat"]) for p in member["geometry"]])
            if coords:
                features.append(
                    {
                        "type": "Feature",
                        "properties": {"name": "Warszawa"},
                        "geometry": {"type": "Polygon", "coordinates": [coords]},
                    }
                )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


def get_warsaw_districts() -> gpd.GeoDataFrame:
    """Get Warsaw districts (dzielnice).

    Returns:
        GeoDataFrame with district boundaries.
    """
    districts_path = (
        Path(__file__).parent
        / "anki_decks"
        / "warsaw_districts"
        / "warszawa-dzielnice.geojson"
    )
    if districts_path.exists():
        gdf = gpd.read_file(districts_path)
        return gdf[gdf["name"] != "Warszawa"].copy()

    msg = "Warsaw districts GeoJSON not found"
    raise FileNotFoundError(msg)


def get_vistula_river() -> gpd.GeoDataFrame:
    """Get Vistula river in Warsaw.

    Returns:
        GeoDataFrame with river geometry.
    """
    cache_path = CACHE_DIR / "warsaw_vistula.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write("Fetching Vistula river data...\n")
    query = """
    [out:json][timeout:60];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    (
      way["waterway"="river"]["name"="Wisła"](area.warsaw);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    min_coords = 2
    for element in data.get("elements", []):
        if element.get("type") == "way" and "geometry" in element:
            coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
            if len(coords) >= min_coords:
                features.append(
                    {
                        "type": "Feature",
                        "properties": {"name": "Wisła"},
                        "geometry": {"type": "LineString", "coordinates": coords},
                    }
                )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


def get_warsaw_bridges() -> gpd.GeoDataFrame:
    """Get Warsaw bridges over the Vistula.

    Returns:
        GeoDataFrame with bridge geometries.
    """
    cache_path = CACHE_DIR / "warsaw_bridges.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write("Fetching Warsaw bridges data...\n")

    # First get the Vistula to filter bridges
    vistula = get_vistula_river()
    vistula_union = vistula.union_all()
    vistula_buffer = vistula_union.buffer(0.002)  # ~200m buffer

    # Query for bridges with "Most" in name - smaller query
    query = """
    [out:json][timeout:90];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    way["bridge"="yes"]["name"~"^Most"](area.warsaw);
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_coords = 2

    for element in data.get("elements", []):
        if element.get("type") != "way" or "geometry" not in element:
            continue

        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
        if len(coords) < min_coords:
            continue

        line = LineString(coords)

        # Check if bridge crosses/is near Vistula
        if line.intersects(vistula_buffer):
            seen_names.add(name)
            features.append(
                {
                    "type": "Feature",
                    "properties": {"name": name, "osm_id": element.get("id")},
                    "geometry": {"type": "LineString", "coordinates": coords},
                }
            )

    # Merge segments of the same bridge
    merged_features = _merge_bridge_segments(features)

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": merged_features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(merged_features)} bridges.\n")
    return gpd.GeoDataFrame.from_features(merged_features, crs="EPSG:4326")


def _merge_bridge_segments(features: list[dict]) -> list[dict]:
    """Merge bridge segments with the same name.

    Args:
        features: List of GeoJSON features.

    Returns:
        List of merged features.
    """
    by_name: dict[str, list[list[tuple[float, float]]]] = {}

    for feature in features:
        name = feature["properties"]["name"]
        coords = feature["geometry"]["coordinates"]
        if name not in by_name:
            by_name[name] = []
        by_name[name].append(coords)

    merged = []
    for name, coord_lists in by_name.items():
        if len(coord_lists) == 1:
            geom = {"type": "LineString", "coordinates": coord_lists[0]}
        else:
            geom = {"type": "MultiLineString", "coordinates": coord_lists}

        merged.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geom}
        )

    return merged


def get_warsaw_metro_stations() -> gpd.GeoDataFrame:
    """Get Warsaw metro stations with line information.

    Returns:
        GeoDataFrame with station points and line info (M1, M2, or M1/M2).
    """
    cache_path = CACHE_DIR / "warsaw_metro.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    # Known stations for each line (as of 2024)
    m1_stations = {
        "Kabaty",
        "Natolin",
        "Imielin",
        "Stokłosy",
        "Ursynów",
        "Służew",
        "Wilanowska",
        "Wierzbno",
        "Racławicka",
        "Pole Mokotowskie",
        "Politechnika",
        "Centrum",
        "Świętokrzyska",  # Also M2
        "Ratusz-Arsenał",
        "Dworzec Gdański",
        "Plac Wilsona",
        "Marymont",
        "Słodowiec",
        "Stare Bielany",
        "Wawrzyszew",
        "Młociny",
    }
    m2_stations = {
        "Bródno",
        "Kondratowicza",
        "Zacisze",
        "Targówek Mieszkaniowy",
        "Trocka",
        "Szwedzka",
        "Dworzec Wileński",
        "Świętokrzyska",  # Also M1
        "Nowy Świat-Uniwersytet",
        "Centrum Nauki Kopernik",
        "Stadion Narodowy",
        "Rondo ONZ",
        "Rondo Daszyńskiego",
        "Płocka",
        "Młynów",
        "Księcia Janusza",
        "Ulrychów",
        "Bemowo",
    }

    sys.stdout.write("Fetching metro station data...\n")
    query = """
    [out:json][timeout:60];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    (
      node["railway"="station"]["station"="subway"](area.warsaw);
      node["railway"="station"]["network"~"Metro"](area.warsaw);
    );
    out body;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        if element.get("type") == "node":
            name = element.get("tags", {}).get("name", "")
            if name and name not in seen_names:
                seen_names.add(name)
                # Determine line from known station lists
                in_m1 = name in m1_stations
                in_m2 = name in m2_stations
                if in_m1 and in_m2:
                    line = "M1/M2"
                elif in_m1:
                    line = "M1"
                elif in_m2:
                    line = "M2"
                else:
                    line = "?"  # Unknown station

                features.append(
                    {
                        "type": "Feature",
                        "properties": {
                            "name": name,
                            "line": line,
                        },
                        "geometry": {
                            "type": "Point",
                            "coordinates": [element["lon"], element["lat"]],
                        },
                    }
                )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(features)} metro stations.\n")
    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


def get_warsaw_streets(min_length: int = 500) -> gpd.GeoDataFrame:
    """Get major Warsaw streets.

    Args:
        min_length: Minimum street length in meters.

    Returns:
        GeoDataFrame with street geometries.
    """
    cache_path = CACHE_DIR / "warsaw_streets.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        # Filter by length if needed
        return _filter_streets_by_length(gdf, min_length)

    sys.stdout.write("Fetching street data from OpenStreetMap...\n")
    query = """
    [out:json][timeout:120];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    (
      way["highway"="primary"]["name"](area.warsaw);
      way["highway"="secondary"]["name"](area.warsaw);
      way["highway"="tertiary"]["name"](area.warsaw);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    min_coords = 2

    for element in data.get("elements", []):
        if element.get("type") == "way" and "geometry" in element:
            coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
            if len(coords) >= min_coords:
                features.append(
                    {
                        "type": "Feature",
                        "properties": {
                            "name": element.get("tags", {}).get("name", "Unknown"),
                            "highway": element.get("tags", {}).get("highway", ""),
                        },
                        "geometry": {"type": "LineString", "coordinates": coords},
                    }
                )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(features)} street segments.\n")

    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    return _filter_streets_by_length(gdf, min_length)


def _filter_streets_by_length(
    gdf: gpd.GeoDataFrame, min_length: int
) -> gpd.GeoDataFrame:
    """Filter and merge streets by name, keeping only those above min_length.

    Args:
        gdf: GeoDataFrame with street segments.
        min_length: Minimum length in meters.

    Returns:
        GeoDataFrame with merged streets, sorted by length (longest first).
    """
    # Group by street name
    streets: dict[str, list] = {}
    for _, row in gdf.iterrows():
        name = row.get("name", "Unknown")
        if name and name != "Unknown":
            if name not in streets:
                streets[name] = []
            streets[name].append(row.geometry)

    # Merge and filter
    result_rows = []
    for name, geometries in streets.items():
        merged = geometries[0] if len(geometries) == 1 else MultiLineString(geometries)

        # Create temp GeoDataFrame for length calculation
        temp_gdf = gpd.GeoDataFrame(geometry=[merged], crs="EPSG:4326")
        temp_proj = temp_gdf.to_crs("EPSG:2180")  # Polish coordinate system
        length = temp_proj.geometry.length.iloc[0]

        if length >= min_length:
            result_rows.append({"name": name, "geometry": merged, "length_m": length})

    # Sort by length (longest first)
    result_rows.sort(key=lambda x: x["length_m"], reverse=True)

    return gpd.GeoDataFrame(result_rows, crs="EPSG:4326")


def get_warsaw_landmarks() -> gpd.GeoDataFrame:
    """Get Warsaw landmarks (museums, monuments, parks, etc.).

    Returns:
        GeoDataFrame with landmark points.
    """
    cache_path = CACHE_DIR / "warsaw_landmarks.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write("Fetching landmark data...\n")
    # Simplified query - just museums and major attractions
    query = """
    [out:json][timeout:60];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    (
      node["tourism"="museum"]["name"](area.warsaw);
      node["tourism"="attraction"]["name"](area.warsaw);
      node["historic"="monument"]["name"](area.warsaw);
      way["tourism"="museum"]["name"](area.warsaw);
      way["tourism"="attraction"]["name"](area.warsaw);
    );
    out center;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        # Get coordinates
        if element.get("type") == "node":
            lon, lat = element["lon"], element["lat"]
        elif "center" in element:
            lon, lat = element["center"]["lon"], element["center"]["lat"]
        else:
            continue

        seen_names.add(name)
        landmark_type = (
            element.get("tags", {}).get("tourism")
            or element.get("tags", {}).get("historic")
            or element.get("tags", {}).get("leisure")
            or "landmark"
        )

        features.append(
            {
                "type": "Feature",
                "properties": {"name": name, "type": landmark_type},
                "geometry": {"type": "Point", "coordinates": [lon, lat]},
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(features)} landmarks.\n")

    if not features:
        return gpd.GeoDataFrame(
            {"name": [], "type": [], "geometry": []}, crs="EPSG:4326"
        )
    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


def _extract_osiedla_rings(
    element: dict[str, Any], min_coords: int
) -> tuple[list[list[tuple[float, float]]], list[list[tuple[float, float]]]]:
    """Extract outer and inner rings from an OSM relation.

    Args:
        element: OSM relation element.
        min_coords: Minimum number of coordinates for a valid ring.

    Returns:
        Tuple of (outer_rings, inner_rings).
    """
    outer_rings: list[list[tuple[float, float]]] = []
    inner_rings: list[list[tuple[float, float]]] = []

    for member in element.get("members", []):
        if "geometry" not in member:
            continue
        ring = [(p["lon"], p["lat"]) for p in member["geometry"]]
        if len(ring) < min_coords:
            continue
        # Close the ring if not closed
        if ring[0] != ring[-1]:
            ring.append(ring[0])
        if member.get("role") == "outer":
            outer_rings.append(ring)
        elif member.get("role") == "inner":
            inner_rings.append(ring)

    return outer_rings, inner_rings


def _build_osiedla_geometry(
    outer_rings: list[list[tuple[float, float]]],
    inner_rings: list[list[tuple[float, float]]],
) -> dict[str, Any]:
    """Build GeoJSON geometry from outer and inner rings.

    Args:
        outer_rings: List of outer ring coordinates.
        inner_rings: List of inner ring coordinates.

    Returns:
        GeoJSON geometry dict.
    """
    if len(outer_rings) == 1:
        return {
            "type": "Polygon",
            "coordinates": [outer_rings[0], *inner_rings],
        }
    # Multiple outer rings - create MultiPolygon
    # Each polygon in a MultiPolygon is [exterior, hole1, hole2, ...]
    return {
        "type": "MultiPolygon",
        "coordinates": [[ring] for ring in outer_rings],
    }


def _extract_polygon_from_element(
    element: dict[str, Any],
) -> dict[str, Any] | None:
    """Extract polygon geometry from an OSM relation or way element.

    Args:
        element: OSM element (relation or way).

    Returns:
        GeoJSON geometry dict, or None if extraction fails.
    """
    if element.get("type") == "relation":
        outer_rings, inner_rings = _extract_osiedla_rings(element, MIN_RING_COORDS)
        if not outer_rings:
            return None
        return _build_osiedla_geometry(outer_rings, inner_rings)

    if element.get("type") == "way" and "geometry" in element:
        coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
        if len(coords) < MIN_RING_COORDS:
            return None
        if coords[0] != coords[-1]:
            coords.append(coords[0])
        return {"type": "Polygon", "coordinates": [coords]}

    return None


def _extract_line_from_way(element: dict[str, Any]) -> dict[str, Any] | None:
    """Extract line geometry from an OSM way element.

    Args:
        element: OSM way element.

    Returns:
        GeoJSON LineString geometry dict, or None if extraction fails.
    """
    if element.get("type") != "way" or "geometry" not in element:
        return None

    coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
    if len(coords) < MIN_LINE_COORDS:
        return None

    return {"type": "LineString", "coordinates": coords}


def _add_area_column(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Add area_km2 column to a GeoDataFrame.

    Args:
        gdf: GeoDataFrame with polygon geometries.

    Returns:
        GeoDataFrame with area_km2 column added.
    """
    if len(gdf) == 0:
        return gdf
    gdf_proj = gdf.to_crs("EPSG:2180")  # Polish coordinate system
    gdf["area_km2"] = gdf_proj.geometry.area / 1_000_000
    return gdf


def _add_length_column(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Add length_km column to a GeoDataFrame.

    Args:
        gdf: GeoDataFrame with line geometries.

    Returns:
        GeoDataFrame with length_km column added.
    """
    if len(gdf) == 0:
        return gdf
    gdf_proj = gdf.to_crs("EPSG:2180")  # Polish coordinate system
    gdf["length_km"] = gdf_proj.geometry.length / 1000
    return gdf


def _extract_coastal_geometry(
    element: dict[str, Any],
    natural_type: str,
    line_types: tuple[str, ...],
) -> dict[str, Any] | None:
    """Extract geometry from a coastal feature element.

    For cliffs and beaches, returns LineString. For others, returns Polygon.

    Args:
        element: OSM element.
        natural_type: The natural= tag value.
        line_types: Tuple of natural types that should be lines.

    Returns:
        GeoJSON geometry dict, or None if extraction fails.
    """
    if element.get("type") == "relation":
        return _extract_polygon_from_element(element)

    if element.get("type") != "way" or "geometry" not in element:
        return None

    coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
    if len(coords) < MIN_LINE_COORDS:
        return None

    # For cliffs and beaches, keep as linestring
    if natural_type in line_types:
        return {"type": "LineString", "coordinates": coords}

    # Otherwise try to make a polygon
    if len(coords) >= MIN_RING_COORDS:
        if coords[0] != coords[-1]:
            coords.append(coords[0])
        return {"type": "Polygon", "coordinates": [coords]}

    return None


def get_warsaw_osiedla() -> gpd.GeoDataFrame:
    """Get Warsaw osiedla (neighborhoods).

    Returns:
        GeoDataFrame with osiedla boundaries.
    """
    cache_path = CACHE_DIR / "warsaw_osiedla.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write("Fetching osiedla data...\n")
    query = """
    [out:json][timeout:180];
    area["name"="Warszawa"]["admin_level"="6"]->.warsaw;
    relation["boundary"="administrative"]["admin_level"="11"]["name"](area.warsaw);
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        if element.get("type") != "relation":
            continue

        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
        if not outer_rings:
            continue

        seen_names.add(name)
        features.append(
            {
                "type": "Feature",
                "properties": {"name": name},
                "geometry": _build_osiedla_geometry(outer_rings, inner_rings),
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(features)} osiedla.\n")
    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


# =============================================================================
# Poland Data
# =============================================================================


def get_polish_wojewodztwa() -> gpd.GeoDataFrame:
    """Get Polish województwa (voivodeships).

    Returns:
        GeoDataFrame with województwa boundaries.
    """
    url = f"{POLSKA_GEOJSON_BASE}/wojewodztwa/wojewodztwa-min.geojson"
    cache_path = CACHE_DIR / "polish_wojewodztwa.geojson"
    return _download_github_geojson(url, cache_path)


def get_polish_powiaty() -> gpd.GeoDataFrame:
    """Get Polish powiaty (counties), sorted by population descending.

    Returns:
        GeoDataFrame with powiat boundaries and population.
    """
    url = f"{POLSKA_GEOJSON_BASE}/powiaty/powiaty-min.geojson"
    cache_path = CACHE_DIR / "polish_powiaty.geojson"
    gdf = _download_github_geojson(url, cache_path)

    # Get population data from Wikidata
    population_map = _get_powiaty_population()

    # Add population column
    def get_population(nazwa: str) -> int:
        """Match powiat name to population data."""
        if not nazwa:
            return 0
        # Remove "powiat " prefix for matching
        clean_name = nazwa.replace("powiat ", "").strip()
        # Try direct match
        if clean_name in population_map:
            return population_map[clean_name]
        # Try lowercase
        name_lower = clean_name.lower()
        for pop_name, pop in population_map.items():
            if pop_name.lower() == name_lower:
                return pop
        return 0

    gdf["population"] = gdf["nazwa"].apply(get_population)

    # Sort by population descending
    return gdf.sort_values("population", ascending=False).reset_index(drop=True)


def get_polish_gminy() -> gpd.GeoDataFrame:
    """Get Polish gminy (municipalities) from OSM, sorted by area descending.

    Returns:
        GeoDataFrame with gminy boundaries.
    """
    cache_path = CACHE_DIR / "polish_gminy.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching gminy data from OSM (this may take a while)...\n")
    # Polish gminy are admin_level=7 in OSM
    query = """
    [out:json][timeout:300];
    area["ISO3166-1"="PL"]->.pl;
    relation["boundary"="administrative"]["admin_level"="7"]["name"](area.pl);
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        if element.get("type") != "relation":
            continue

        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
        if not outer_rings:
            continue

        seen_names.add(name)
        features.append(
            {
                "type": "Feature",
                "properties": {"name": name},
                "geometry": _build_osiedla_geometry(outer_rings, inner_rings),
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson))

    sys.stdout.write(f"Cached {len(features)} gminy.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")

    # Add area column
    gdf = _add_area_column(gdf)

    return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)


def get_poland_boundary() -> gpd.GeoDataFrame:
    """Get Poland country boundary.

    Returns:
        GeoDataFrame with Poland boundary.
    """
    cache_path = CACHE_DIR / "poland_boundary.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    # Dissolve from województwa
    woj = get_polish_wojewodztwa()
    # Fix invalid geometries with buffer(0)
    woj["geometry"] = woj["geometry"].buffer(0)
    poland = gpd.GeoDataFrame(geometry=[woj.union_all()], crs=woj.crs)

    _ensure_cache_dir()
    poland.to_file(cache_path, driver="GeoJSON")

    return poland


# =============================================================================
# Polish Natural Features
# =============================================================================


def get_polish_mountain_peaks() -> gpd.GeoDataFrame:
    """Get Polish mountain peaks, sorted by elevation descending.

    Returns:
        GeoDataFrame with mountain peak points and elevation.
    """
    cache_path = CACHE_DIR / "polish_mountain_peaks.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        return gdf.sort_values("elevation", ascending=False).reset_index(drop=True)

    sys.stdout.write("Fetching mountain peaks data from OSM...\n")
    query = """
    [out:json][timeout:120];
    area["ISO3166-1"="PL"]->.pl;
    (
      node["natural"="peak"]["name"]["ele"](area.pl);
    );
    out;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        if element.get("type") != "node":
            continue

        name = element.get("tags", {}).get("name", "")
        ele_str = element.get("tags", {}).get("ele", "")

        if not name or not ele_str or name in seen_names:
            continue

        with contextlib.suppress(ValueError):
            elevation = float(ele_str.replace(",", ".").split()[0])
            if elevation < MIN_PEAK_ELEVATION:
                continue

            seen_names.add(name)
            features.append(
                {
                    "type": "Feature",
                    "properties": {"name": name, "elevation": elevation},
                    "geometry": {
                        "type": "Point",
                        "coordinates": [element["lon"], element["lat"]],
                    },
                }
            )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} mountain peaks.\n")

    if not features:
        msg = "No mountain peaks found in OSM data"
        raise ValueError(msg)

    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    return gdf.sort_values("elevation", ascending=False).reset_index(drop=True)


def get_polish_mountain_ranges() -> gpd.GeoDataFrame:
    """Get Polish mountain ranges, sorted by area descending.

    Returns:
        GeoDataFrame with mountain range polygons.
    """
    cache_path = CACHE_DIR / "polish_mountain_ranges.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        # Fix invalid geometries from OSM data and extract only polygons
        gdf["geometry"] = gdf.geometry.make_valid()
        gdf["geometry"] = gdf.geometry.apply(_extract_polygonal_geometry)
        gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching mountain ranges data from OSM...\n")
    query = """
    [out:json][timeout:180];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["natural"="mountain_range"]["name"](area.pl);
      way["natural"="mountain_range"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        if element.get("type") == "relation":
            outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
            if not outer_rings:
                continue
            geometry = _build_osiedla_geometry(outer_rings, inner_rings)
        elif element.get("type") == "way" and "geometry" in element:
            coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
            if len(coords) < min_ring_coords:
                continue
            if coords[0] != coords[-1]:
                coords.append(coords[0])
            geometry = {"type": "Polygon", "coordinates": [coords]}
        else:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} mountain ranges.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")

    # Fix invalid geometries from OSM data and extract only polygons
    gdf["geometry"] = gdf.geometry.make_valid()
    gdf["geometry"] = gdf.geometry.apply(_extract_polygonal_geometry)
    gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]

    # Calculate area in km²
    gdf_proj = gdf.to_crs("EPSG:2180")  # Polish coordinate system
    gdf["area_km2"] = gdf_proj.geometry.area / 1_000_000

    return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)


def get_polish_national_parks() -> gpd.GeoDataFrame:
    """Get all 23 Polish national parks, sorted by area descending.

    Returns:
        GeoDataFrame with national park polygons.
    """
    cache_path = CACHE_DIR / "polish_national_parks.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching national parks data from OSM...\n")
    query = """
    [out:json][timeout:180];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["boundary"="national_park"]["name"](area.pl);
      relation["leisure"="nature_reserve"]["name"]["protect_class"="2"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        if element.get("type") != "relation":
            continue

        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        # Filter to only include "Park Narodowy" in name
        if "Narodowy" not in name and "narodowy" not in name.lower():
            continue

        outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
        if not outer_rings:
            continue

        seen_names.add(name)
        features.append(
            {
                "type": "Feature",
                "properties": {"name": name},
                "geometry": _build_osiedla_geometry(outer_rings, inner_rings),
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} national parks.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")

    # Calculate area in km²
    gdf_proj = gdf.to_crs("EPSG:2180")
    gdf["area_km2"] = gdf_proj.geometry.area / 1_000_000

    return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)


def get_polish_forests() -> gpd.GeoDataFrame:
    """Get major Polish forests (puszcze), sorted by area descending.

    Returns:
        GeoDataFrame with forest polygons.
    """
    cache_path = CACHE_DIR / "polish_forests.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching forests data from OSM...\n")
    # Query for named forests, especially "Puszcza" type
    query = """
    [out:json][timeout:300];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["natural"="wood"]["name"](area.pl);
      relation["landuse"="forest"]["name"~"Puszcza|Bory|Las"](area.pl);
      way["natural"="wood"]["name"~"Puszcza|Bory"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)
    forest_keywords = ("Puszcza", "Bory", "Las ", "Lasy ")

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue
        if not any(keyword in name for keyword in forest_keywords):
            continue

        geometry = _extract_polygon_from_element(element)
        if geometry is None:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} forests.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_area_column(gdf)

    if len(gdf) > 0:
        return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
    return gdf


def get_polish_lakes() -> gpd.GeoDataFrame:
    """Get Polish lakes, sorted by area descending.

    Returns:
        GeoDataFrame with lake polygons.
    """
    cache_path = CACHE_DIR / "polish_lakes.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching lakes data from OSM...\n")
    query = """
    [out:json][timeout:300];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["natural"="water"]["water"="lake"]["name"](area.pl);
      way["natural"="water"]["water"="lake"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        geometry = _extract_polygon_from_element(element)
        if geometry is None:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} lakes.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_area_column(gdf)

    if len(gdf) > 0:
        # Filter to lakes > MIN_LAKE_AREA_KM2 to exclude tiny ponds
        gdf = gdf[gdf["area_km2"] > MIN_LAKE_AREA_KM2]
        return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)

    return gdf


def _extract_river_coords_from_element(
    element: dict[str, Any],
) -> list[list[tuple[float, float]]]:
    """Extract coordinate lists from a river element.

    Args:
        element: OSM element (way or relation).

    Returns:
        List of coordinate lists (line segments).
    """
    coord_lists: list[list[tuple[float, float]]] = []

    if element.get("type") == "way" and "geometry" in element:
        coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
        if len(coords) >= MIN_LINE_COORDS:
            coord_lists.append(coords)
    elif element.get("type") == "relation":
        for member in element.get("members", []):
            if member.get("type") == "way" and "geometry" in member:
                coords = [(p["lon"], p["lat"]) for p in member["geometry"]]
                if len(coords) >= MIN_LINE_COORDS:
                    coord_lists.append(coords)

    return coord_lists


def get_polish_rivers() -> gpd.GeoDataFrame:
    """Get Polish rivers, sorted by length descending.

    Rivers with the same name but in different locations are kept separate
    by using unique IDs from OSM when available.

    Returns:
        GeoDataFrame with river linestrings.
    """
    cache_path = CACHE_DIR / "polish_rivers.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "length_km" in gdf.columns:
            return gdf.sort_values("length_km", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching rivers data from OSM...\n")
    query = """
    [out:json][timeout:300];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["waterway"="river"]["name"](area.pl);
      way["waterway"="river"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    # Group ways by river name AND wikidata ID (or OSM ID for uniqueness)
    # This prevents merging different rivers with the same name
    rivers_by_key: dict[str, list[list[tuple[float, float]]]] = {}
    river_names: dict[str, str] = {}  # key -> display name

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name:
            continue

        # Use wikidata ID if available, otherwise use element type+id
        wikidata = element.get("tags", {}).get("wikidata", "")
        if wikidata:
            key = f"{name}_{wikidata}"
        else:
            # Fall back to element ID for grouping related ways
            key = f"{name}_{element.get('type')}_{element.get('id')}"

        coord_lists = _extract_river_coords_from_element(element)
        if coord_lists:
            rivers_by_key.setdefault(key, []).extend(coord_lists)
            river_names[key] = name

    features = []
    for key, coord_lists in rivers_by_key.items():
        name = river_names[key]
        geometry: dict[str, Any]
        if len(coord_lists) == 1:
            geometry = {"type": "LineString", "coordinates": coord_lists[0]}
        else:
            geometry = {"type": "MultiLineString", "coordinates": coord_lists}

        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} rivers.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_length_column(gdf)

    if len(gdf) > 0:
        gdf = gdf[gdf["length_km"] > MIN_RIVER_LENGTH_KM]
        return gdf.sort_values("length_km", ascending=False).reset_index(drop=True)

    return gdf


def get_polish_islands() -> gpd.GeoDataFrame:
    """Get Polish islands, sorted by area descending.

    Returns:
        GeoDataFrame with island polygons.
    """
    cache_path = CACHE_DIR / "polish_islands.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching islands data from OSM...\n")
    query = """
    [out:json][timeout:180];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["place"="island"]["name"](area.pl);
      way["place"="island"]["name"](area.pl);
      relation["place"="islet"]["name"](area.pl);
      way["place"="islet"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        geometry = _extract_polygon_from_element(element)
        if geometry is None:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} islands.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_area_column(gdf)

    if len(gdf) > 0:
        return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
    return gdf


def get_polish_coastal_features() -> gpd.GeoDataFrame:
    """Get Polish coastal features (peninsulas, spits, cliffs), sorted by length.

    Returns:
        GeoDataFrame with coastal feature geometries.
    """
    cache_path = CACHE_DIR / "polish_coastal_features.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "length_km" in gdf.columns:
            return gdf.sort_values("length_km", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching coastal features data from OSM...\n")
    query = """
    [out:json][timeout:180];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["natural"="peninsula"]["name"](area.pl);
      way["natural"="peninsula"]["name"](area.pl);
      relation["natural"="spit"]["name"](area.pl);
      way["natural"="spit"]["name"](area.pl);
      relation["natural"="cliff"]["name"](area.pl);
      way["natural"="cliff"]["name"](area.pl);
      relation["natural"="coastline"]["name"](area.pl);
      way["natural"="beach"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)
    line_types = ("cliff", "beach", "coastline")

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        natural_type = element.get("tags", {}).get("natural", "")
        if not name or name in seen_names:
            continue

        geometry = _extract_coastal_geometry(element, natural_type, line_types)
        if geometry is None:
            continue

        seen_names.add(name)
        features.append(
            {
                "type": "Feature",
                "properties": {"name": name, "type": natural_type},
                "geometry": geometry,
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} coastal features.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_length_column(gdf)

    if len(gdf) > 0:
        return gdf.sort_values("length_km", ascending=False).reset_index(drop=True)
    return gdf


def get_polish_unesco_sites() -> gpd.GeoDataFrame:
    """Get Polish UNESCO World Heritage Sites, sorted by inscription year.

    Returns:
        GeoDataFrame with UNESCO site geometries.
    """
    cache_path = CACHE_DIR / "polish_unesco_sites.geojson"

    if cache_path.exists():
        return gpd.read_file(cache_path)

    sys.stdout.write("Fetching UNESCO sites data from OSM...\n")
    query = """
    [out:json][timeout:180];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["heritage"="world_heritage_site"]["name"](area.pl);
      way["heritage"="world_heritage_site"]["name"](area.pl);
      node["heritage"="world_heritage_site"]["name"](area.pl);
      relation["heritage:operator"="whc"]["name"](area.pl);
      way["heritage:operator"="whc"]["name"](area.pl);
      node["heritage:operator"="whc"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        if element.get("type") == "node":
            geometry = {
                "type": "Point",
                "coordinates": [element["lon"], element["lat"]],
            }
        elif element.get("type") == "relation":
            outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
            if not outer_rings:
                continue
            geometry = _build_osiedla_geometry(outer_rings, inner_rings)
        elif element.get("type") == "way" and "geometry" in element:
            coords = [(p["lon"], p["lat"]) for p in element["geometry"]]
            if len(coords) < min_ring_coords:
                continue
            if coords[0] != coords[-1]:
                coords.append(coords[0])
            geometry = {"type": "Polygon", "coordinates": [coords]}
        else:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} UNESCO sites.\n")
    return gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")


def get_polish_nature_reserves() -> gpd.GeoDataFrame:
    """Get Polish nature reserves, sorted by area descending.

    Returns:
        GeoDataFrame with nature reserve polygons.
    """
    cache_path = CACHE_DIR / "polish_nature_reserves.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write(
        "Fetching nature reserves data from OSM (this may take a while)...\n"
    )
    query = """
    [out:json][timeout:600];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["leisure"="nature_reserve"]["name"](area.pl);
      way["leisure"="nature_reserve"]["name"](area.pl);
      relation["boundary"="protected_area"]["protect_class"="4"]["name"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()

    for element in data.get("elements", []):
        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        geometry = _extract_polygon_from_element(element)
        if geometry is None:
            continue

        seen_names.add(name)
        features.append(
            {"type": "Feature", "properties": {"name": name}, "geometry": geometry}
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} nature reserves.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")
    gdf = _add_area_column(gdf)

    if len(gdf) > 0:
        return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
    return gdf


def get_polish_landscape_parks() -> gpd.GeoDataFrame:
    """Get Polish landscape parks, sorted by area descending.

    Returns:
        GeoDataFrame with landscape park polygons.
    """
    cache_path = CACHE_DIR / "polish_landscape_parks.geojson"

    if cache_path.exists():
        gdf = gpd.read_file(cache_path)
        # Fix invalid geometries from OSM data and extract only polygons
        gdf["geometry"] = gdf.geometry.make_valid()
        gdf["geometry"] = gdf.geometry.apply(_extract_polygonal_geometry)
        # Remove any rows where geometry extraction failed
        gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]
        if "area_km2" in gdf.columns:
            return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)
        return gdf

    sys.stdout.write("Fetching landscape parks data from OSM...\n")
    query = """
    [out:json][timeout:300];
    area["ISO3166-1"="PL"]->.pl;
    (
      relation["boundary"="protected_area"]["protect_class"="5"]["name"](area.pl);
      relation["leisure"="nature_reserve"]["name"~"Park Krajobrazowy"](area.pl);
    );
    out geom;
    """

    data = _overpass_query(query)

    features = []
    seen_names: set[str] = set()
    min_ring_coords = 4

    for element in data.get("elements", []):
        if element.get("type") != "relation":
            continue

        name = element.get("tags", {}).get("name", "")
        if not name or name in seen_names:
            continue

        outer_rings, inner_rings = _extract_osiedla_rings(element, min_ring_coords)
        if not outer_rings:
            continue

        seen_names.add(name)
        features.append(
            {
                "type": "Feature",
                "properties": {"name": name},
                "geometry": _build_osiedla_geometry(outer_rings, inner_rings),
            }
        )

    _ensure_cache_dir()
    geojson = {"type": "FeatureCollection", "features": features}
    cache_path.write_text(json.dumps(geojson, ensure_ascii=False))

    sys.stdout.write(f"Cached {len(features)} landscape parks.\n")
    gdf = gpd.GeoDataFrame.from_features(features, crs="EPSG:4326")

    # Fix invalid geometries from OSM data and extract only polygons
    gdf["geometry"] = gdf.geometry.make_valid()
    gdf["geometry"] = gdf.geometry.apply(_extract_polygonal_geometry)
    # Remove any rows where geometry extraction failed
    gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]

    if len(gdf) > 0:
        gdf_proj = gdf.to_crs("EPSG:2180")
        gdf["area_km2"] = gdf_proj.geometry.area / 1_000_000
        return gdf.sort_values("area_km2", ascending=False).reset_index(drop=True)

    return gdf


# =============================================================================
# Utility Functions
# =============================================================================


def download_all_warsaw_data() -> None:
    """Download and cache all Warsaw geographic data.

    Call this once to pre-populate the cache.
    """
    sys.stdout.write("Downloading all Warsaw geographic data...\n")
    sys.stdout.write("=" * 60 + "\n")

    sys.stdout.write("\n1. Warsaw boundary...\n")
    get_warsaw_boundary()

    sys.stdout.write("\n2. Vistula river...\n")
    get_vistula_river()

    sys.stdout.write("\n3. Warsaw bridges...\n")
    get_warsaw_bridges()

    sys.stdout.write("\n4. Metro stations...\n")
    get_warsaw_metro_stations()

    sys.stdout.write("\n5. Major streets...\n")
    get_warsaw_streets()

    sys.stdout.write("\n6. Landmarks...\n")
    get_warsaw_landmarks()

    sys.stdout.write("\n7. Osiedla...\n")
    get_warsaw_osiedla()

    sys.stdout.write("\n" + "=" * 60 + "\n")
    sys.stdout.write("All Warsaw data cached successfully!\n")


def download_all_poland_data() -> None:
    """Download and cache all Poland geographic data.

    Call this once to pre-populate the cache.
    """
    sys.stdout.write("Downloading all Poland geographic data...\n")
    sys.stdout.write("=" * 60 + "\n")

    sys.stdout.write("\n1. Województwa...\n")
    get_polish_wojewodztwa()

    sys.stdout.write("\n2. Powiaty...\n")
    get_polish_powiaty()

    sys.stdout.write("\n3. Gminy (this may take a while)...\n")
    get_polish_gminy()

    sys.stdout.write("\n4. Poland boundary...\n")
    get_poland_boundary()

    sys.stdout.write("\n" + "=" * 60 + "\n")
    sys.stdout.write("All Poland data cached successfully!\n")


def clear_cache() -> None:
    """Clear all cached data."""
    if CACHE_DIR.exists():
        shutil.rmtree(CACHE_DIR)
        sys.stdout.write("Cache cleared.\n")
    else:
        sys.stdout.write("Cache directory does not exist.\n")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Manage geographic data cache")
    parser.add_argument(
        "--download-warsaw",
        action="store_true",
        help="Download all Warsaw data",
    )
    parser.add_argument(
        "--download-poland",
        action="store_true",
        help="Download all Poland data",
    )
    parser.add_argument(
        "--download-all",
        action="store_true",
        help="Download all data",
    )
    parser.add_argument(
        "--clear-cache",
        action="store_true",
        help="Clear cached data",
    )

    args = parser.parse_args()

    if args.clear_cache:
        clear_cache()
    elif args.download_warsaw:
        download_all_warsaw_data()
    elif args.download_poland:
        download_all_poland_data()
    elif args.download_all:
        download_all_warsaw_data()
        download_all_poland_data()
    else:
        parser.print_help()