testsAndMisc/PYTHON/extractLinks/main.py
Krzysztof kuhy Rudnicki e3f9e6dc0b fix: correct shebang and executable permissions
- Add +x to Python scripts with shebangs (3 files)
- Remove -x from non-script files like .cpp, .txt, makefile (23 files)
- Move shebang to first line in C/imageViewer/lint.sh
2025-11-30 13:42:16 +01:00

90 lines
2.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""Extract hosts from href attributes in an HTML file and write them as *host* per line.
Usage:
python main.py INPUT_HTML [OUTPUT_TXT]
If OUTPUT_TXT is not provided, the script writes to <INPUT_BASENAME>_links.txt
alongside the input file.
"""
from __future__ import annotations
import argparse
from html.parser import HTMLParser
import os
from urllib.parse import urlparse
class _HrefParser(HTMLParser):
    """HTML parser that records the value of every ``href`` attribute it is fed.

    Values are accumulated in ``self.hrefs`` in the order the start tags are
    handled. ``href`` attributes written without a value are skipped.
    """

    def __init__(self) -> None:
        super().__init__()
        # Raw href strings as reported by HTMLParser; no filtering by scheme here.
        self.hrefs: list[str] = []

    def handle_starttag(self, tag: str, attrs):  # type: ignore[override]
        """Append every ``href`` value found in *attrs* to ``self.hrefs``.

        The tag name is not inspected, so ``href`` on any element is collected.
        """
        self.hrefs.extend(
            value
            for name, value in attrs
            if value is not None and name.lower() == "href"
        )
def extract_hosts_from_html(html_text: str) -> list[str]:
    """Return the unique hosts referenced by ``href`` attributes in *html_text*.

    Rules:
    - Only http/https URLs are considered.
    - Each entry is the URL's network location (``netloc`` as returned by
      ``urlparse``), without scheme or path.
    - Duplicates are removed, preserving first-seen order.
    - An href that ``urlparse`` rejects with ``ValueError`` (for example an
      unbalanced IPv6 bracket such as ``http://[::1``) is skipped.
    """
    parser = _HrefParser()
    parser.feed(html_text)

    hosts: list[str] = []
    seen: set[str] = set()
    for href in parser.hrefs:
        try:
            parsed = urlparse(href)
        except ValueError:
            # One malformed link must not abort extraction for the whole document.
            continue
        # Relative links, mailto:, javascript:, etc. carry no usable host.
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            continue
        if parsed.netloc not in seen:
            seen.add(parsed.netloc)
            hosts.append(parsed.netloc)
    return hosts
def main() -> int:
    """Command-line entry point: extract hosts from an HTML file into a text file.

    Parses ``INPUT_HTML`` and the optional ``OUTPUT_TXT`` from the command
    line. When no output path is given, the output is written next to the
    input as ``<input_basename>_links.txt``. Each host is written on its own
    line wrapped in asterisks, and a summary line is printed.

    Returns:
        0 on success.

    Raises:
        SystemExit: with an error message if the input path is not a file.
    """
    arg_parser = argparse.ArgumentParser(
        description="Extract hosts from hrefs in an HTML file."
    )
    arg_parser.add_argument("input_html", help="Path to input HTML file")
    arg_parser.add_argument(
        "output_txt",
        nargs="?",
        help="Path to output text file (defaults to <input_basename>_links.txt in the same directory)",
    )
    options = arg_parser.parse_args()

    source_path = options.input_html
    if not os.path.isfile(source_path):
        raise SystemExit(f"Input file not found: {source_path}")

    if options.output_txt:
        target_path = options.output_txt
    else:
        # Derive "<dir>/<stem>_links.txt" from the input path.
        stem, _ext = os.path.splitext(os.path.basename(source_path))
        target_path = os.path.join(os.path.dirname(source_path), f"{stem}_links.txt")

    # Undecodable bytes are dropped rather than aborting the run.
    with open(source_path, encoding="utf-8", errors="ignore") as src:
        html_text = src.read()
    hosts = extract_hosts_from_html(html_text)

    with open(target_path, "w", encoding="utf-8") as dst:
        dst.writelines(f"*{host}*\n" for host in hosts)

    print(f"Wrote {len(hosts)} host(s) to {target_path}")
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)