testsAndMisc-archive/python_pkg/scrape_website/scrape_comics.py
Krzysztof kuhy Rudnicki 077a31cb54 fix: address all pylint warnings
- R0914 (too many locals): Extract helper functions in generate_jpeg.py,
  engine.py, lichess_api.py, main.py
- R0902 (too many instance attributes): Use dataclasses in keyboard_coop
- W0621 (redefined outer name): Rename parameters/variables to avoid shadowing
- W0201 (attribute outside init): Initialize all attrs in __init__
- R1705 (no-else-return): Remove unnecessary else after return
- C1805 (implicit booleaness): Use implicit boolean checks
- R1732 (consider-using-with): Use context manager for subprocess.Popen
- E0401 (import-error): Add pylint disable for optional deps (selenium, mitmproxy)
- Clean up pyproject.toml: update comments, remove redundant settings

Pylint score: 10.00/10
2025-12-01 16:11:15 +01:00

90 lines
2.8 KiB
Python

"""Download comic images from a website using Selenium."""
import argparse
import logging
from pathlib import Path
from urllib.parse import urlparse
import requests
# pylint: disable=import-error
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# pylint: enable=import-error
_logger = logging.getLogger(__name__)
REQUEST_TIMEOUT = 30 # seconds
# Command-line interface: the single positional argument is the page to start
# scraping from.
parser = argparse.ArgumentParser(description="Download images from a comic website.")
parser.add_argument(
    "url", type=str, help="The URL of the website to start downloading images from"
)
args = parser.parse_args()

# Without explicit configuration the root logger only shows WARNING and above,
# which would hide every progress message this script emits at INFO level.
# basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

# Start the browser session (Chrome here; use the driver matching your browser).
driver = webdriver.Chrome()

# Load the starting page given on the command line.
url = args.url
_logger.info("Opening the website: %s", url)
try:
    driver.get(url)
except Exception:
    # If the very first navigation fails, the rest of the script never runs,
    # so shut the browser down here before propagating the error.
    driver.quit()
    raise
# A function to download images by URL
def download_image(image_url: str) -> bool:
    """Download an image from a URL and save it in the current directory.

    The local file name is the last path component of ``image_url``. If a
    file with that name already exists, nothing is downloaded.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        True if the image was fetched and written to disk, False if the
        download was skipped because the target file already exists.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeout.
    """
    # Derive the local file name from the URL path (query string is ignored).
    image_name = Path(urlparse(image_url).path).name
    image_path = Path(image_name)

    # Skip work that was already done on a previous run.
    if image_path.exists():
        _logger.info("Image %s already exists, skipping download.", image_name)
        return False

    _logger.info("Downloading image from URL: %s", image_url)
    response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors: otherwise an error page would be saved under
    # the image's name and every later run would skip it as "already exists".
    response.raise_for_status()
    image_path.write_bytes(response.content)
    _logger.info("Image %s downloaded successfully", image_name)
    return True
# Walk the comic page by page until there is no 'Next' link left.
# `count` is the ordinal of the next image to be downloaded; pages whose image
# is already on disk do not advance it.
count = 1
try:
    while True:
        _logger.info("Processing image %s...", count)
        # The comic image is located by its element ID.
        image_element = driver.find_element(By.ID, "cc-comic")
        # The image URL is taken from the element's 'src' attribute.
        current_image_url = image_element.get_attribute("src")
        _logger.info("Found image URL: %s", current_image_url)
        # Increment count only if the image was actually downloaded.
        if download_image(current_image_url):
            count += 1
        # Look up the 'Next' link; only this lookup is expected to raise
        # NoSuchElementException, so the try body is kept to that one call.
        try:
            _logger.info("Clicking the 'Next' button to load the next image...")
            next_button = driver.find_element(By.CSS_SELECTOR, "a.cc-next")
        except NoSuchElementException:
            # No 'Next' link means the last image has been reached.
            _logger.info("No 'Next' button found. Reached the end of images.")
            break
        # Navigate directly to the link target instead of clicking the element.
        next_button_url = next_button.get_attribute("href")
        if not next_button_url:
            # An anchor without an href cannot be navigated to; treat it as
            # the end rather than passing None to driver.get().
            _logger.info("'Next' button has no link. Reached the end of images.")
            break
        driver.get(next_button_url)
    _logger.info("All images processed, closing the browser.")
finally:
    # Always close the browser, even when a lookup or download raised, so the
    # browser process is not left running.
    driver.quit()