From b5fc4fda2e67569eb3489c95f7c4e46e0b16b61b Mon Sep 17 00:00:00 2001
From: Krzysztof Rudnicki
Date: Mon, 16 Sep 2024 16:21:36 +0200
Subject: [PATCH] feat: python script for scraping webcomics

---
 PYTHON/downloadSMBC/.gitignore       |  63 +++++++++++++++
 PYTHON/downloadSMBC/requirements.txt |   2 +
 PYTHON/downloadSMBC/scrape_comics.py | 111 +++++++++++++++++++++++++++
 3 files changed, 176 insertions(+)
 create mode 100644 PYTHON/downloadSMBC/.gitignore
 create mode 100644 PYTHON/downloadSMBC/requirements.txt
 create mode 100644 PYTHON/downloadSMBC/scrape_comics.py

diff --git a/PYTHON/downloadSMBC/.gitignore b/PYTHON/downloadSMBC/.gitignore
new file mode 100644
index 0000000..ad534f6
--- /dev/null
+++ b/PYTHON/downloadSMBC/.gitignore
@@ -0,0 +1,63 @@
+# JPEG
+*.jpg
+*.jpeg
+*.jpe
+*.jif
+*.jfif
+*.jfi
+
+# JPEG 2000
+*.jp2
+*.j2k
+*.jpf
+*.jpx
+*.jpm
+*.mj2
+
+# JPEG XR
+*.jxr
+*.hdp
+*.wdp
+
+# Graphics Interchange Format
+*.gif
+
+# RAW
+*.raw
+
+# Web P
+*.webp
+
+# Portable Network Graphics
+*.png
+
+# Animated Portable Network Graphics
+*.apng
+
+# Multiple-image Network Graphics
+*.mng
+
+# Tagged Image File Format
+*.tiff
+*.tif
+
+# Scalable Vector Graphics
+*.svg
+*.svgz
+
+# Portable Document Format
+*.pdf
+
+# X BitMap
+*.xbm
+
+# BMP
+*.bmp
+*.dib
+
+# ICO
+*.ico
+
+# 3D Images
+*.3dm
+*.max
\ No newline at end of file
diff --git a/PYTHON/downloadSMBC/requirements.txt b/PYTHON/downloadSMBC/requirements.txt
new file mode 100644
index 0000000..ae62ce7
--- /dev/null
+++ b/PYTHON/downloadSMBC/requirements.txt
@@ -0,0 +1,2 @@
+selenium
+requests
diff --git a/PYTHON/downloadSMBC/scrape_comics.py b/PYTHON/downloadSMBC/scrape_comics.py
new file mode 100644
index 0000000..a18dd17
--- /dev/null
+++ b/PYTHON/downloadSMBC/scrape_comics.py
@@ -0,0 +1,111 @@
+"""Download every comic image from a website by following its 'Next' links.
+
+Usage: python scrape_comics.py URL
+"""
+import os
+import requests
+import argparse
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from urllib.parse import urlparse
+
+# Seconds requests may wait for the image server to connect or send data
+REQUEST_TIMEOUT = 30
+
+
+def download_image(url):
+    """Save the image at ``url`` into the current working directory.
+
+    The file is named after the last path component of the URL.
+
+    Returns:
+        True if the image was downloaded, False if a file with that name
+        already exists.
+
+    Raises:
+        ValueError: if the URL path has no file name component.
+        requests.RequestException: if the request fails, times out, or the
+            server answers with an HTTP error status.
+    """
+    # Extract image name from URL
+    image_name = os.path.basename(urlparse(url).path)
+    if not image_name:
+        raise ValueError(f"Cannot derive a file name from URL: {url}")
+
+    # Check if the image already exists
+    if os.path.exists(image_name):
+        print(f"Image {image_name} already exists, skipping download.")
+        return False
+
+    print(f"Downloading image from URL: {url}")
+    response = requests.get(url, timeout=REQUEST_TIMEOUT)
+    # Without this check an HTML error page would be saved under the image name
+    response.raise_for_status()
+    with open(image_name, 'wb') as handler:
+        handler.write(response.content)
+    print(f'Image {image_name} downloaded successfully')
+    return True
+
+
+def main():
+    """Open the start URL and download comics until no 'Next' link remains."""
+    # Initialize argument parser to accept the website URL as an argument
+    parser = argparse.ArgumentParser(description='Download images from a comic website.')
+    parser.add_argument('url', type=str, help='The URL of the website to start downloading images from')
+    args = parser.parse_args()
+
+    # Initialize WebDriver (Use the appropriate driver for your browser)
+    driver = webdriver.Chrome()
+
+    # try/finally so the browser is closed even when a page or download fails
+    try:
+        # Open the website from the passed argument
+        url = args.url
+        print(f"Opening the website: {url}")
+        driver.get(url)
+
+        # Number of the image being processed; it only advances when a
+        # download actually happened (already-present files are skipped)
+        count = 1
+
+        while True:
+            print(f"Processing image {count}...")
+
+            # Find the image element by its ID
+            image_element = driver.find_element(By.ID, 'cc-comic')
+
+            # Get the image URL from the 'src' attribute
+            image_url = image_element.get_attribute('src')
+            print(f"Found image URL: {image_url}")
+
+            # Download the image if it doesn't already exist
+            if download_image(image_url):
+                count += 1  # Increment count only if the image was downloaded
+
+            # Only a missing 'Next' link means the end; any other error
+            # (including Ctrl+C) must propagate instead of being swallowed
+            try:
+                next_button = driver.find_element(By.CSS_SELECTOR, 'a.cc-next')
+            except NoSuchElementException:
+                next_button_url = None
+            else:
+                next_button_url = next_button.get_attribute('href')
+
+            # A link without an 'href' cannot be followed, so it also ends the run
+            if not next_button_url:
+                print("No 'Next' button found. Reached the end of images.")
+                break
+
+            # Navigate to the URL in the 'href' of the next button
+            print("Clicking the 'Next' button to load the next image...")
+            driver.get(next_button_url)
+
+        print("All images processed, closing the browser.")
+    finally:
+        # Close the browser
+        driver.quit()
+
+
+if __name__ == "__main__":
+    main()