"""HTML import parser for FM24 exported views. FM24 allows exporting search/scout views via Ctrl+P (Printing). The result is an HTML file containing player data in tables. This module parses that HTML to extract player attributes. Supported: the default FM24 HTML export format with containing player rows and attribute columns. """ from __future__ import annotations import contextlib from dataclasses import dataclass, field import html import re from typing import TYPE_CHECKING from python_pkg.fm24_searcher.models import ALL_VISIBLE_ATTRS, GOALKEEPER_ATTRS, Player if TYPE_CHECKING: from pathlib import Path # Common FM attribute header normalizations. _HEADER_MAP: dict[str, str] = { "cor": "Corners", "cro": "Crossing", "dri": "Dribbling", "fin": "Finishing", "fir": "First Touch", "fre": "Free Kick", "hea": "Heading", "lon": "Long Shots", "l th": "Long Throws", "mar": "Marking", "pas": "Passing", "pen": "Penalty Taking", "tck": "Tackling", "tec": "Technique", "agg": "Aggression", "ant": "Anticipation", "bra": "Bravery", "cmp": "Composure", "cnt": "Concentration", "dec": "Decisions", "det": "Determination", "fla": "Flair", "ldr": "Leadership", "otb": "Off The Ball", "pos": "Positioning", "tea": "Teamwork", "vis": "Vision", "wor": "Work Rate", "acc": "Acceleration", "agi": "Agility", "bal": "Balance", "jum": "Jumping Reach", "nat": "Natural Fitness", "pac": "Pace", "sta": "Stamina", "str": "Strength", # Goalkeeper "aer": "Aerial Reach", "cmd": "Command of Area", "com": "Communication", "ecc": "Eccentricity", "han": "Handling", "kic": "Kicking", "1v1": "One on Ones", "pun": "Punching (Tendency)", "ref": "Reflexes", "rus": "Rushing Out (Tendency)", "thr": "Throwing", # Alternative spellings "wk r": "Work Rate", "work rate": "Work Rate", "corners": "Corners", "crossing": "Crossing", "dribbling": "Dribbling", "finishing": "Finishing", "first touch": "First Touch", "heading": "Heading", "long shots": "Long Shots", "long throws": "Long Throws", "marking": "Marking", "passing": "Passing", "tackling": "Tackling", "technique": "Technique", } # Build reverse lookup: normalized attr name → canonical. _ALL_ATTRS_LOWER = {a.lower(): a for a in ALL_VISIBLE_ATTRS + GOALKEEPER_ATTRS} _TAG_RE = re.compile(r"<[^>]+>") _WS_RE = re.compile(r"\s+") def _strip_html(text: str) -> str: """Remove HTML tags and decode entities.""" text = _TAG_RE.sub("", text) text = html.unescape(text) return _WS_RE.sub(" ", text).strip() def _normalize_header(raw: str) -> str | None: """Map an HTML column header to a canonical attribute name.""" clean = _strip_html(raw).strip().lower() # Direct lookup. if clean in _HEADER_MAP: return _HEADER_MAP[clean] if clean in _ALL_ATTRS_LOWER: return _ALL_ATTRS_LOWER[clean] # Truncated header: try first 3 chars. short = clean[:3] if short in _HEADER_MAP: return _HEADER_MAP[short] return None def _extract_tables(html_content: str) -> list[list[list[str]]]: """Parse HTML tables into a list of row lists. Each row is a list of cell strings. Returns list of tables. """ tables: list[list[list[str]]] = [] table_re = re.compile( r"]*>(.*?)
", re.DOTALL | re.IGNORECASE, ) row_re = re.compile( r"]*>(.*?)", re.DOTALL | re.IGNORECASE, ) cell_re = re.compile( r"]*>(.*?)", re.DOTALL | re.IGNORECASE, ) for table_match in table_re.finditer(html_content): rows: list[list[str]] = [] for row_match in row_re.finditer(table_match.group(1)): cells = [ _strip_html(c.group(1)) for c in cell_re.finditer(row_match.group(1)) ] if cells: rows.append(cells) if rows: tables.append(rows) return tables _MIN_TABLE_ROWS = 2 _MIN_ATTR_VAL = 1 _MAX_ATTR_VAL = 20 # Map from lowercase header text to _ColMap field name. _HDR_FIELD: dict[str, str] = { "name": "name", "player": "name", "club": "club", "team": "club", "nat": "nat", "nationality": "nat", "position": "pos", "pos": "pos", "ca": "ca", "ability": "ca", "pa": "pa", "potential": "pa", "value": "value", "val": "value", "wage": "wage", } # Map from _ColMap field name to Player attribute name. _FIELD_ATTR: list[tuple[str, str]] = [ ("club", "club"), ("nat", "nationality"), ("pos", "position"), ("value", "value"), ("wage", "wage"), ] @dataclass class _ColMap: """Column index mapping from parsed HTML table headers.""" name: int | None = None club: int | None = None nat: int | None = None pos: int | None = None ca: int | None = None pa: int | None = None value: int | None = None wage: int | None = None attrs: dict[int, str] = field(default_factory=dict) def _build_col_map(headers: list[str]) -> _ColMap: """Build column index mapping from table header cells.""" cols = _ColMap() for i, hdr in enumerate(headers): h = hdr.strip().lower() if field := _HDR_FIELD.get(h): setattr(cols, field, i) elif attr_name := _normalize_header(hdr): cols.attrs[i] = attr_name return cols def _apply_attr(player: Player, attr_name: str, val_str: str) -> None: """Parse val_str and set an attribute on player if value is in range.""" if "-" in val_str and val_str[0].isdigit(): val_str = val_str.split("-", maxsplit=1)[0] with contextlib.suppress(ValueError): val = int(val_str) if _MIN_ATTR_VAL <= val <= _MAX_ATTR_VAL: if attr_name in ALL_VISIBLE_ATTRS: player.attributes[attr_name] = val else: player.gk_attributes[attr_name] = val def _parse_player_row(row: list[str], cols: _ColMap) -> Player | None: """Parse one data row into a Player; returns None if row is invalid.""" if cols.name is None or len(row) <= cols.name: return None name = row[cols.name].strip() if not name: return None player = Player(name=name, source="html") def _get(col: int | None) -> str | None: return row[col].strip() if col is not None and col < len(row) else None for col_field, attr in _FIELD_ATTR: if val := _get(getattr(cols, col_field)): setattr(player, attr, val) with contextlib.suppress(ValueError, TypeError): if raw := _get(cols.ca): player.current_ability = int(raw) with contextlib.suppress(ValueError, TypeError): if raw := _get(cols.pa): player.potential_ability = int(raw) for col_idx, attr_name in cols.attrs.items(): if col_idx < len(row): _apply_attr(player, attr_name, row[col_idx].strip()) return player def parse_html_export(filepath: Path) -> list[Player]: """Parse an FM24 HTML export file into Player objects. Looks for tables where column headers map to known FM attributes. The 'Name' column is required. """ content = filepath.read_text(encoding="utf-8", errors="replace") all_players: list[Player] = [] for table in _extract_tables(content): if len(table) < _MIN_TABLE_ROWS: continue cols = _build_col_map(table[0]) if cols.name is None: continue for row in table[1:]: player = _parse_player_row(row, cols) if player is not None: all_players.append(player) return all_players def merge_players( binary_players: list[Player], html_players: list[Player], ) -> list[Player]: """Merge binary-parsed data with HTML-imported attributes. Matches by name (case-insensitive). Binary provides DOB/CA/personality; HTML provides visible attributes. """ html_by_name: dict[str, Player] = {} for p in html_players: html_by_name[p.name.lower()] = p merged: list[Player] = [] matched_names: set[str] = set() for bp in binary_players: key = bp.name.lower() if key in html_by_name: hp = html_by_name[key] bp.attributes = hp.attributes bp.gk_attributes = hp.gk_attributes bp.club = hp.club or bp.club bp.nationality = hp.nationality or bp.nationality bp.position = hp.position or bp.position bp.value = hp.value or bp.value bp.wage = hp.wage or bp.wage if hp.current_ability > 0: bp.current_ability = hp.current_ability if hp.potential_ability > 0: bp.potential_ability = hp.potential_ability bp.source = "merged" matched_names.add(key) merged.append(bp) # Add HTML-only players not matched. merged.extend(hp for hp in html_players if hp.name.lower() not in matched_names) return merged