testsAndMisc-archive/python_pkg/fm24_searcher/html_parser.py

312 lines
9.0 KiB
Python
Raw Normal View History

"""HTML import parser for FM24 exported views.
FM24 allows exporting search/scout views via Ctrl+P (Printing).
The result is an HTML file containing player data in tables.
This module parses that HTML to extract player attributes.
Supported: the default FM24 HTML export format with
<table> containing player rows and attribute columns.
"""
from __future__ import annotations
import contextlib
from dataclasses import dataclass, field
import html
import re
from typing import TYPE_CHECKING
from python_pkg.fm24_searcher.models import ALL_VISIBLE_ATTRS, GOALKEEPER_ATTRS, Player
if TYPE_CHECKING:
from pathlib import Path
# Common FM attribute header normalizations.
_HEADER_MAP: dict[str, str] = {
"cor": "Corners",
"cro": "Crossing",
"dri": "Dribbling",
"fin": "Finishing",
"fir": "First Touch",
"fre": "Free Kick",
"hea": "Heading",
"lon": "Long Shots",
"l th": "Long Throws",
"mar": "Marking",
"pas": "Passing",
"pen": "Penalty Taking",
"tck": "Tackling",
"tec": "Technique",
"agg": "Aggression",
"ant": "Anticipation",
"bra": "Bravery",
"cmp": "Composure",
"cnt": "Concentration",
"dec": "Decisions",
"det": "Determination",
"fla": "Flair",
"ldr": "Leadership",
"otb": "Off The Ball",
"pos": "Positioning",
"tea": "Teamwork",
"vis": "Vision",
"wor": "Work Rate",
"acc": "Acceleration",
"agi": "Agility",
"bal": "Balance",
"jum": "Jumping Reach",
"nat": "Natural Fitness",
"pac": "Pace",
"sta": "Stamina",
"str": "Strength",
# Goalkeeper
"aer": "Aerial Reach",
"cmd": "Command of Area",
"com": "Communication",
"ecc": "Eccentricity",
"han": "Handling",
"kic": "Kicking",
"1v1": "One on Ones",
"pun": "Punching (Tendency)",
"ref": "Reflexes",
"rus": "Rushing Out (Tendency)",
"thr": "Throwing",
# Alternative spellings
"wk r": "Work Rate",
"work rate": "Work Rate",
"corners": "Corners",
"crossing": "Crossing",
"dribbling": "Dribbling",
"finishing": "Finishing",
"first touch": "First Touch",
"heading": "Heading",
"long shots": "Long Shots",
"long throws": "Long Throws",
"marking": "Marking",
"passing": "Passing",
"tackling": "Tackling",
"technique": "Technique",
}
# Build reverse lookup: normalized attr name → canonical.
_ALL_ATTRS_LOWER = {a.lower(): a for a in ALL_VISIBLE_ATTRS + GOALKEEPER_ATTRS}
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")
def _strip_html(text: str) -> str:
"""Remove HTML tags and decode entities."""
text = _TAG_RE.sub("", text)
text = html.unescape(text)
return _WS_RE.sub(" ", text).strip()
def _normalize_header(raw: str) -> str | None:
"""Map an HTML column header to a canonical attribute name."""
clean = _strip_html(raw).strip().lower()
# Direct lookup.
if clean in _HEADER_MAP:
return _HEADER_MAP[clean]
if clean in _ALL_ATTRS_LOWER:
return _ALL_ATTRS_LOWER[clean]
# Truncated header: try first 3 chars.
short = clean[:3]
if short in _HEADER_MAP:
return _HEADER_MAP[short]
return None
def _extract_tables(html_content: str) -> list[list[list[str]]]:
"""Parse HTML tables into a list of row lists.
Each row is a list of cell strings. Returns list of tables.
"""
tables: list[list[list[str]]] = []
table_re = re.compile(
r"<table[^>]*>(.*?)</table>",
re.DOTALL | re.IGNORECASE,
)
row_re = re.compile(
r"<tr[^>]*>(.*?)</tr>",
re.DOTALL | re.IGNORECASE,
)
cell_re = re.compile(
r"<t[hd][^>]*>(.*?)</t[hd]>",
re.DOTALL | re.IGNORECASE,
)
for table_match in table_re.finditer(html_content):
rows: list[list[str]] = []
for row_match in row_re.finditer(table_match.group(1)):
cells = [
_strip_html(c.group(1)) for c in cell_re.finditer(row_match.group(1))
]
if cells:
rows.append(cells)
if rows:
tables.append(rows)
return tables
_MIN_TABLE_ROWS = 2
_MIN_ATTR_VAL = 1
_MAX_ATTR_VAL = 20
# Map from lowercase header text to _ColMap field name.
_HDR_FIELD: dict[str, str] = {
"name": "name",
"player": "name",
"club": "club",
"team": "club",
"nat": "nat",
"nationality": "nat",
"position": "pos",
"pos": "pos",
"ca": "ca",
"ability": "ca",
"pa": "pa",
"potential": "pa",
"value": "value",
"val": "value",
"wage": "wage",
}
# Map from _ColMap field name to Player attribute name.
_FIELD_ATTR: list[tuple[str, str]] = [
("club", "club"),
("nat", "nationality"),
("pos", "position"),
("value", "value"),
("wage", "wage"),
]
@dataclass
class _ColMap:
"""Column index mapping from parsed HTML table headers."""
name: int | None = None
club: int | None = None
nat: int | None = None
pos: int | None = None
ca: int | None = None
pa: int | None = None
value: int | None = None
wage: int | None = None
attrs: dict[int, str] = field(default_factory=dict)
def _build_col_map(headers: list[str]) -> _ColMap:
"""Build column index mapping from table header cells."""
cols = _ColMap()
for i, hdr in enumerate(headers):
h = hdr.strip().lower()
if field := _HDR_FIELD.get(h):
setattr(cols, field, i)
elif attr_name := _normalize_header(hdr):
cols.attrs[i] = attr_name
return cols
def _apply_attr(player: Player, attr_name: str, val_str: str) -> None:
"""Parse val_str and set an attribute on player if value is in range."""
if "-" in val_str and val_str[0].isdigit():
val_str = val_str.split("-", maxsplit=1)[0]
with contextlib.suppress(ValueError):
val = int(val_str)
if _MIN_ATTR_VAL <= val <= _MAX_ATTR_VAL:
if attr_name in ALL_VISIBLE_ATTRS:
player.attributes[attr_name] = val
else:
player.gk_attributes[attr_name] = val
def _parse_player_row(row: list[str], cols: _ColMap) -> Player | None:
"""Parse one data row into a Player; returns None if row is invalid."""
if cols.name is None or len(row) <= cols.name:
return None
name = row[cols.name].strip()
if not name:
return None
player = Player(name=name, source="html")
def _get(col: int | None) -> str | None:
return row[col].strip() if col is not None and col < len(row) else None
for col_field, attr in _FIELD_ATTR:
if val := _get(getattr(cols, col_field)):
setattr(player, attr, val)
with contextlib.suppress(ValueError, TypeError):
if raw := _get(cols.ca):
player.current_ability = int(raw)
with contextlib.suppress(ValueError, TypeError):
if raw := _get(cols.pa):
player.potential_ability = int(raw)
for col_idx, attr_name in cols.attrs.items():
if col_idx < len(row):
_apply_attr(player, attr_name, row[col_idx].strip())
return player
def parse_html_export(filepath: Path) -> list[Player]:
"""Parse an FM24 HTML export file into Player objects.
Looks for tables where column headers map to known FM
attributes. The 'Name' column is required.
"""
content = filepath.read_text(encoding="utf-8", errors="replace")
all_players: list[Player] = []
for table in _extract_tables(content):
if len(table) < _MIN_TABLE_ROWS:
continue
cols = _build_col_map(table[0])
if cols.name is None:
continue
for row in table[1:]:
player = _parse_player_row(row, cols)
if player is not None:
all_players.append(player)
return all_players
def merge_players(
binary_players: list[Player],
html_players: list[Player],
) -> list[Player]:
"""Merge binary-parsed data with HTML-imported attributes.
Matches by name (case-insensitive). Binary provides
DOB/CA/personality; HTML provides visible attributes.
"""
html_by_name: dict[str, Player] = {}
for p in html_players:
html_by_name[p.name.lower()] = p
merged: list[Player] = []
matched_names: set[str] = set()
for bp in binary_players:
key = bp.name.lower()
if key in html_by_name:
hp = html_by_name[key]
bp.attributes = hp.attributes
bp.gk_attributes = hp.gk_attributes
bp.club = hp.club or bp.club
bp.nationality = hp.nationality or bp.nationality
bp.position = hp.position or bp.position
bp.value = hp.value or bp.value
bp.wage = hp.wage or bp.wage
if hp.current_ability > 0:
bp.current_ability = hp.current_ability
if hp.potential_ability > 0:
bp.potential_ability = hp.potential_ability
bp.source = "merged"
matched_names.add(key)
merged.append(bp)
# Add HTML-only players not matched.
merged.extend(hp for hp in html_players if hp.name.lower() not in matched_names)
return merged