feat: python script for scraping webcomics

2026-07-04 14:43:01 +02:00 · 2024-09-16 16:21:36 +02:00 · 2024-09-16 16:21:36 +02:00 · b5fc4fda2e
commit b5fc4fda2e
parent 479d0eed9d
3 changed files with 136 additions and 0 deletions
--- a/PYTHON/downloadSMBC/.gitignore
+++ b/PYTHON/downloadSMBC/.gitignore
@ -0,0 +1,63 @@
+# JPEG
+*.jpg
+*.jpeg
+*.jpe
+*.jif
+*.jfif
+*.jfi
+
+# JPEG 2000
+*.jp2
+*.j2k
+*.jpf
+*.jpx
+*.jpm
+*.mj2
+
+# JPEG XR
+*.jxr
+*.hdp
+*.wdp
+
+# Graphics Interchange Format
+*.gif
+
+# RAW
+*.raw
+
+# WebP
+*.webp
+
+# Portable Network Graphics
+*.png
+
+# Animated Portable Network Graphics
+*.apng
+
+# Multiple-image Network Graphics
+*.mng
+
+# Tagged Image File Format
+*.tiff
+*.tif
+
+# Scalable Vector Graphics
+*.svg
+*.svgz
+
+# Portable Document Format
+*.pdf
+
+# X BitMap
+*.xbm
+
+# BMP
+*.bmp
+*.dib
+
+# ICO
+*.ico
+
+# 3D Images
+*.3dm
+*.max
--- a/PYTHON/downloadSMBC/requirements.txt
+++ b/PYTHON/downloadSMBC/requirements.txt
@ -0,0 +1,2 @@
+selenium
+requests
--- a/PYTHON/downloadSMBC/scrape_comics.py
+++ b/PYTHON/downloadSMBC/scrape_comics.py
@ -0,0 +1,71 @@
+import os
+import requests
+import argparse
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from urllib.parse import urlparse
+
+# Command line: a single positional argument, the page to start scraping from.
+parser = argparse.ArgumentParser(
+    description='Download images from a comic website.')
+parser.add_argument('url', type=str,
+                    help='The URL of the website to start downloading images from')
+args = parser.parse_args()
+url = args.url
+
+# Launch the browser (swap webdriver.Chrome for your browser's driver), then load the page.
+driver = webdriver.Chrome()
+print(f"Opening the website: {url}")
+driver.get(url)
+
+def download_image(url, timeout=30):
+    """Save the image at *url* into the current directory.
+
+    Returns False if the file already exists, True once it is written. Raises
+    requests.RequestException on an HTTP error or *timeout* seconds of silence."""
+    image_name = os.path.basename(urlparse(url).path)  # last URL path segment
+    if os.path.exists(image_name):
+        print(f"Image {image_name} already exists, skipping download.")
+        return False
+    print(f"Downloading image from URL: {url}")
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()  # never save an HTTP error page as an image
+    with open(image_name, 'wb') as handler:
+        handler.write(response.content)
+    print(f'Image {image_name} downloaded successfully')
+    return True
+
+# Local import: NoSuchElementException is the only "expected" failure below.
+from selenium.common.exceptions import NoSuchElementException
+
+# `count` numbers the image being processed; it advances only when a file was
+# actually written, so already-downloaded images do not bump it.
+count = 1
+
+try:  # the finally clause guarantees the browser is closed on any exit path
+    while True:
+        print(f"Processing image {count}...")
+        # The comic image is looked up by element ID; if it is missing, the
+        # NoSuchElementException propagates (after the browser is closed).
+        image_url = driver.find_element(By.ID, 'cc-comic').get_attribute('src')
+        print(f"Found image URL: {image_url}")
+
+        if download_image(image_url):
+            count += 1
+
+        # Only a missing 'Next' link (or one without href) means the end; any
+        # other error surfaces instead of being mistaken for a normal finish.
+        print("Clicking the 'Next' button to load the next image...")
+        try:
+            next_button = driver.find_element(By.CSS_SELECTOR, 'a.cc-next')
+            next_button_url = next_button.get_attribute('href')
+        except NoSuchElementException:
+            next_button_url = None
+        if not next_button_url:
+            print("No 'Next' button found. Reached the end of images.")
+            break
+        driver.get(next_button_url)
+
+    print("All images processed, closing the browser.")
+finally:
+    driver.quit()