2025-11-30 14:45:55 +01:00
|
|
|
"""Download comic images from a website using Selenium."""
|
|
|
|
|
|
2025-11-30 13:42:16 +01:00
|
|
|
import argparse
|
2025-11-30 14:36:13 +01:00
|
|
|
import logging
|
2025-11-30 23:03:03 +01:00
|
|
|
from pathlib import Path
|
2025-11-30 13:42:16 +01:00
|
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
2024-09-16 16:21:36 +02:00
|
|
|
import requests
|
|
|
|
|
from selenium import webdriver
|
2025-11-30 21:37:47 +01:00
|
|
|
from selenium.common.exceptions import NoSuchElementException
|
2024-09-16 16:21:36 +02:00
|
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
|
2025-11-30 21:59:24 +01:00
|
|
|
_logger = logging.getLogger(__name__)

REQUEST_TIMEOUT = 30  # seconds; upper bound passed to every image HTTP request

# This script reports its progress exclusively through INFO-level log records.
# With no logging configuration those records are silently dropped (the
# default threshold is WARNING), so enable basic console logging here.
# basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO)

# Initialize argument parser to accept the website URL as an argument
parser = argparse.ArgumentParser(description="Download images from a comic website.")
parser.add_argument(
    "url", type=str, help="The URL of the website to start downloading images from"
)
args = parser.parse_args()

# Initialize WebDriver (Use the appropriate driver for your browser)
driver = webdriver.Chrome()

# Open the website from the passed argument
url = args.url
_logger.info("Opening the website: %s", url)
driver.get(url)
|
|
|
|
|
|
2025-11-30 13:42:16 +01:00
|
|
|
|
2024-09-16 16:21:36 +02:00
|
|
|
# A function to download images by URL
|
2025-11-30 15:49:40 +01:00
|
|
|
def download_image(url: str) -> bool:
    """Download an image from a URL and save it in the current directory.

    The local file name is the last component of the URL's path. Nothing is
    written when that file already exists, when no file name can be derived
    from the URL, or when the server answers with an HTTP error status.

    Args:
        url: Absolute URL of the image to fetch.

    Returns:
        True if the image was fetched and written to disk, False if the
        download was skipped or the server returned an error status.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the request exceeds ``REQUEST_TIMEOUT``.
    """
    # Extract image name from URL
    image_name = Path(urlparse(url).path).name
    if not image_name:
        # A path ending in "/" yields an empty name; Path("") is the current
        # directory, which would otherwise be reported as "already exists".
        _logger.warning("Cannot derive a file name from URL: %s", url)
        return False
    image_path = Path(image_name)

    # Check if the image already exists
    if image_path.exists():
        _logger.info("Image %s already exists, skipping download.", image_name)
        return False

    _logger.info("Downloading image from URL: %s", url)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        # Do not store an error page (404, 500, ...) under the image's name.
        _logger.error("Failed to download %s: HTTP %s", url, response.status_code)
        return False

    image_path.write_bytes(response.content)
    _logger.info("Image %s downloaded successfully", image_name)
    return True
|
|
|
|
|
|
2024-09-16 16:21:36 +02:00
|
|
|
|
|
|
|
|
# Walk the comic page by page until no further 'Next' link is available.
# `count` numbers the image being processed and only advances after an
# actual download, so skipped images do not consume a number.
count = 1

try:
    while True:
        _logger.info("Processing image %s...", count)

        # Find the image element by its ID
        image_element = driver.find_element(By.ID, "cc-comic")

        # Get the image URL from the 'src' attribute
        image_url = image_element.get_attribute("src")
        _logger.info("Found image URL: %s", image_url)

        if not image_url:
            # get_attribute() yields None when the attribute is absent.
            _logger.warning("Comic element has no 'src' attribute, skipping it.")
        elif download_image(image_url):
            count += 1  # Increment count only if the image was downloaded

        # Try to find the 'Next' button by its class. Only the lookup is
        # guarded so that unrelated errors are not mistaken for "last page".
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, "a.cc-next")
        except NoSuchElementException:
            # If the 'Next' button is not found, we've reached the last image
            _logger.info("No 'Next' button found. Reached the end of images.")
            break

        # Navigate to the URL in the 'href' of the next button
        next_button_url = next_button.get_attribute("href")
        if not next_button_url:
            # A 'Next' link without a target cannot be followed.
            _logger.info("'Next' button has no link. Reached the end of images.")
            break

        _logger.info("Clicking the 'Next' button to load the next image...")
        driver.get(next_button_url)

    _logger.info("All images processed, closing the browser.")
finally:
    # Close the browser even when an error interrupts the crawl, so no
    # browser process is left running.
    driver.quit()
|