feat: python script for scraping webcomics

This commit is contained in:
Krzysztof Rudnicki 2024-09-16 16:21:36 +02:00
parent 479d0eed9d
commit b5fc4fda2e
3 changed files with 136 additions and 0 deletions

63
PYTHON/downloadSMBC/.gitignore vendored Normal file
View File

@ -0,0 +1,63 @@
# JPEG
*.jpg
*.jpeg
*.jpe
*.jif
*.jfif
*.jfi
# JPEG 2000
*.jp2
*.j2k
*.jpf
*.jpx
*.jpm
*.mj2
# JPEG XR
*.jxr
*.hdp
*.wdp
# Graphics Interchange Format
*.gif
# RAW
*.raw
# WebP
*.webp
# Portable Network Graphics
*.png
# Animated Portable Network Graphics
*.apng
# Multiple-image Network Graphics
*.mng
# Tagged Image File Format
*.tiff
*.tif
# Scalable Vector Graphics
*.svg
*.svgz
# Portable Document Format
*.pdf
# X BitMap
*.xbm
# BMP
*.bmp
*.dib
# ICO
*.ico
# 3D Images
*.3dm
*.max

View File

@ -0,0 +1,2 @@
selenium
requests

View File

@ -0,0 +1,71 @@
import argparse
import os
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
# Command-line interface: the single positional argument is the page to start from
parser = argparse.ArgumentParser(
    description='Download images from a comic website.',
)
parser.add_argument(
    'url',
    type=str,
    help='The URL of the website to start downloading images from',
)
args = parser.parse_args()
url = args.url
# Start a Chrome session (swap in another webdriver class for a different browser)
driver = webdriver.Chrome()
# Load the starting page given on the command line
print(f"Opening the website: {url}")
driver.get(url)
# A function to download images by URL
def download_image(url, timeout=30):
    """Download the image at *url* into the current working directory.

    The file is named after the last path component of the URL. Nothing is
    fetched when a file with that name already exists.

    Args:
        url: Absolute URL of the image. ``None`` or an empty string is
            tolerated and treated as "nothing to download".
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if a new file was written; False if the download was skipped
        (missing URL, no usable file name, file already present) or failed.
    """
    if not url:
        print("No image URL given, skipping download.")
        return False
    # Extract image name from URL (query string and fragment are ignored)
    image_name = os.path.basename(urlparse(url).path)
    if not image_name:
        # A path ending in '/' has no file name to save under
        print(f"Could not derive a file name from URL: {url}, skipping download.")
        return False
    # Check if the image already exists
    if os.path.exists(image_name):
        print(f"Image {image_name} already exists, skipping download.")
        return False
    print(f"Downloading image from URL: {url}")
    try:
        response = requests.get(url, timeout=timeout)
        # Without this an HTTP error page would be saved as if it were the image
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"Failed to download {url}: {err}")
        return False
    # The file is only created once the full body has been received
    with open(image_name, 'wb') as handler:
        handler.write(response.content)
    print(f'Image {image_name} downloaded successfully')
    return True
# Walk the comic page by page, following the 'Next' link until there is none.
# `count` numbers the image currently being processed; it only advances when
# a new file was actually downloaded.
count = 1
# Links already followed, so a 'Next' link that points back cannot loop forever
visited_urls = set()
try:
    while True:
        print(f"Processing image {count}...")
        # Find the image element by its ID
        try:
            image_element = driver.find_element(By.ID, 'cc-comic')
        except NoSuchElementException:
            print("No comic image found on this page. Stopping.")
            break
        # Get the image URL from the 'src' attribute
        image_url = image_element.get_attribute('src')
        print(f"Found image URL: {image_url}")
        # Download the image if it doesn't already exist
        if download_image(image_url):
            count += 1  # Increment count only if the image was downloaded
        # Try to find the 'Next' button by its class; only a missing element
        # means the end was reached, any other error should surface
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'a.cc-next')
        except NoSuchElementException:
            print("No 'Next' button found. Reached the end of images.")
            break
        next_button_url = next_button.get_attribute('href')
        if not next_button_url:
            # A link without a target cannot be followed
            print("No 'Next' button found. Reached the end of images.")
            break
        if next_button_url in visited_urls:
            print("'Next' button points to an already visited page. Stopping.")
            break
        visited_urls.add(next_button_url)
        # Navigate to the URL in the 'href' of the next button
        print("Clicking the 'Next' button to load the next image...")
        driver.get(next_button_url)
    print("All images processed, closing the browser.")
finally:
    # Close the browser even when the run is interrupted or fails
    driver.quit()