testsAndMisc-archive/python_pkg/extract_links/main.py
Krzysztof kuhy Rudnicki 78c1d77144 fix: resolve all pre-commit hook failures after file splits
- Remove all # type: ignore and # noqa comments (banned by no-noqa hook)
- Add mypy --disable-error-code flags to pre-commit config for error
  codes previously suppressed by inline comments
- Fix broken imports after ruff auto-removed re-exports:
  steam_backlog_enforcer, stockfish_analysis, word_frequency, lichess_bot
- Re-add re-exports with __all__ in translator.py, screen_lock.py
- Split _process_epc_fc.py (524 lines) into _process_epc_fc.py + _process_fc.py
- Fix test failures: keyboard_coop, stockfish_analysis, tag_divider
- Add per-file-ignores for PLC0415 (deferred imports) in 7 files
- Mark shebang scripts as executable
- Add __init__.py for generate_images and repo_explorer packages
- Fix codespell, eslint, ruff-format, prettier issues
- Update copilot-instructions.md with --no-verify ban
2026-03-18 22:20:05 +01:00

98 lines
2.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""Extract hosts from href attributes in an HTML file and write them as *host* per line.
Usage:
python main.py INPUT_HTML [OUTPUT_TXT]
If OUTPUT_TXT is not provided, the script writes to <INPUT_BASENAME>_links.txt
alongside the input file.
"""
from __future__ import annotations
import argparse
from html.parser import HTMLParser
import logging
from pathlib import Path
from urllib.parse import urlparse
# Module-level logger; main() uses it to report how many hosts were written.
_logger = logging.getLogger(__name__)
class _HrefParser(HTMLParser):
    """HTML parser that records the value of every ``href`` attribute it sees.

    Values are appended to ``hrefs`` in the order the start tags are parsed.
    Attributes that have no value (``<a href>``) are not recorded.
    """

    def __init__(self) -> None:
        super().__init__()
        # Raw href strings, untouched, in document order.
        self.hrefs: list[str] = []

    def handle_starttag(self, _tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Collect href attributes from start tags."""
        self.hrefs.extend(
            value
            for name, value in attrs
            if value is not None and name.lower() == "href"
        )
def extract_hosts_from_html(html_text: str) -> list[str]:
    """Parse HTML text, extract href values, and return a list of hostnames.

    Rules:
    - Only http/https URLs are considered.
    - Output is the network location (host[:port]) without scheme or path.
    - Duplicates are removed, preserving first-seen order.
    - Whitespace surrounding an href value is ignored.
    - hrefs that ``urlparse`` rejects as malformed are skipped.

    Args:
        html_text: HTML markup to scan for ``href`` attributes.

    Returns:
        The unique network locations, in order of first appearance.
    """
    parser = _HrefParser()
    parser.feed(html_text)

    seen: set[str] = set()
    hosts: list[str] = []
    for href in parser.hrefs:
        try:
            # Strip first: stray whitespace in the attribute value would
            # otherwise end up in the netloc and defeat de-duplication.
            parsed = urlparse(href.strip())
        except ValueError:
            # urlparse raises on malformed authorities such as an unbalanced
            # IPv6 bracket ("http://[::1"); one bad link must not abort the
            # whole extraction.
            continue
        host = parsed.netloc
        if parsed.scheme not in {"http", "https"} or not host or host in seen:
            continue
        seen.add(host)
        hosts.append(host)
    return hosts
def main() -> int:
    """Run the command-line interface: read an HTML file, write its hosts.

    The positional ``input_html`` argument names the file to scan. The
    optional ``output_txt`` argument names the destination; when omitted, the
    output goes to ``<input stem>_links.txt`` in the input file's directory.
    Each host is written on its own line, wrapped in asterisks.

    Returns:
        ``0`` once the output file has been written.

    Raises:
        SystemExit: If ``input_html`` does not point to an existing file.
    """
    cli = argparse.ArgumentParser(
        description="Extract hosts from hrefs in an HTML file."
    )
    cli.add_argument("input_html", help="Path to input HTML file")
    cli.add_argument(
        "output_txt",
        nargs="?",
        help=(
            "Path to output text file "
            "(defaults to <input_basename>_links.txt in the same directory)"
        ),
    )
    opts = cli.parse_args()

    source = Path(opts.input_html)
    if not source.is_file():
        raise SystemExit(f"Input file not found: {source}")

    # An omitted (or empty) output argument falls back to a sibling file.
    if opts.output_txt:
        target = Path(opts.output_txt)
    else:
        target = source.parent / f"{source.stem}_links.txt"

    # Undecodable bytes are dropped rather than treated as fatal.
    markup = source.read_text(encoding="utf-8", errors="ignore")
    found = extract_hosts_from_html(markup)

    target.write_text("".join(f"*{host}*\n" for host in found), encoding="utf-8")
    _logger.info("Wrote %s host(s) to %s", len(found), target)
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    status = main()
    raise SystemExit(status)