feat: translated scrape website comics to C

This commit is contained in:
Krzysztof Rudnicki 2024-09-16 16:52:48 +02:00
parent 2008ecf405
commit 0c33244cda
3 changed files with 274 additions and 0 deletions

65
C/scrapeWebsite/.gitignore vendored Normal file
View File

@ -0,0 +1,65 @@
# JPEG
*.jpg
*.jpeg
*.jpe
*.jif
*.jfif
*.jfi
# JPEG 2000
*.jp2
*.j2k
*.jpf
*.jpx
*.jpm
*.mj2
# JPEG XR
*.jxr
*.hdp
*.wdp
# Graphics Interchange Format
*.gif
# RAW
*.raw
# Web P
*.webp
# Portable Network Graphics
*.png
# Animated Portable Network Graphics
*.apng
# Multiple-image Network Graphics
*.mng
# Tagged Image File Format
*.tiff
*.tif
# Scalable Vector Graphics
*.svg
*.svgz
# Portable Document Format
*.pdf
# X BitMap
*.xbm
# BMP
*.bmp
*.dib
# ICO
*.ico
# 3D Images
*.3dm
*.max
scrape

31
C/scrapeWebsite/Makefile Normal file
View File

@ -0,0 +1,31 @@
# Compiler
CC = gcc
# Compiler flags
CFLAGS = -Wall -O3 -march=native -I/usr/include/libxml2
# Libraries
LIBS = -lcurl -lxml2
# Source files
SRCS = scrape.c
# Output executable
TARGET = scrape
# Installation prefix; override with `make install PREFIX=/some/dir`.
# DESTDIR is honoured for staged (packaging) installs and is empty by default.
PREFIX ?= /usr/local

# Default target
all: $(TARGET)

# Link and compile the program (recipe lines must start with a TAB)
$(TARGET): $(SRCS)
	$(CC) $(CFLAGS) -o $(TARGET) $(SRCS) $(LIBS)

# Clean up build artifacts
clean:
	rm -f $(TARGET)

# Install the program (optional); creates the bin directory if it is missing
install: $(TARGET)
	install -d $(DESTDIR)$(PREFIX)/bin
	install -m 755 $(TARGET) $(DESTDIR)$(PREFIX)/bin/

.PHONY: all clean install

178
C/scrapeWebsite/scrape.c Normal file
View File

@ -0,0 +1,178 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <unistd.h>
// Growable byte buffer that accumulates the body of an HTTP response.
struct MemoryStruct {
    char *memory;   // heap block holding the bytes received so far
    size_t size;    // number of payload bytes currently stored in `memory`
};
// libcurl write callback: appends the bytes in `contents` (size * nmemb of
// them) to the struct MemoryStruct passed through `userp`, keeping the buffer
// NUL-terminated. Returns the number of bytes stored, or 0 when the buffer
// could not be enlarged (the existing buffer is left untouched in that case).
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    struct MemoryStruct *buffer = userp;
    const size_t incoming = size * nmemb;

    // One extra byte leaves room for the terminating NUL.
    char *grown = realloc(buffer->memory, buffer->size + incoming + 1);
    if (!grown) {
        printf("Not enough memory!\n");
        return 0;
    }

    memcpy(grown + buffer->size, contents, incoming);
    buffer->memory = grown;
    buffer->size += incoming;
    grown[buffer->size] = '\0';
    return incoming;
}
// Create a curl easy handle that fetches `url`, follows redirects and streams
// the response body into `chunk` through WriteMemoryCallback.
// Returns the configured handle, or NULL if curl_easy_init() fails.
CURL* init_curl_request(const char *url, struct MemoryStruct *chunk) {
    CURL *handle = curl_easy_init();
    if (handle == NULL) {
        return NULL;
    }

    curl_easy_setopt(handle, CURLOPT_URL, url);
    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(handle, CURLOPT_WRITEDATA, (void *)chunk);
    curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
    return handle;
}
// Download the image file
int download_image(const char *url, const char *image_name) {
if(access(image_name, F_OK) != -1) {
printf("Image %s already exists, skipping download.\n", image_name);
return 0;
}
CURL *curl = curl_easy_init();
if(curl) {
FILE *fp = fopen(image_name, "wb");
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
CURLcode res = curl_easy_perform(curl);
fclose(fp);
curl_easy_cleanup(curl);
return res == CURLE_OK ? 1 : 0;
}
return 0;
}
// Evaluate `xpathExpr` against `doc` and return the string content of the
// first matching node (for an attribute match such as ".../@src" this is the
// attribute value). Returns NULL when nothing matches or evaluation fails.
// The callers in this file release the returned string with xmlFree().
xmlChar* get_xpath_value(htmlDocPtr doc, const char *xpathExpr) {
    xmlChar *text = NULL;
    xmlXPathContextPtr ctx = xmlXPathNewContext(doc);
    xmlXPathObjectPtr found = xmlXPathEvalExpression((const xmlChar *)xpathExpr, ctx);

    if (found != NULL) {
        xmlNodeSetPtr nodes = found->nodesetval;
        if (!xmlXPathNodeSetIsEmpty(nodes)) {
            // Only the first match is used; its children carry the text.
            xmlNodePtr first = nodes->nodeTab[0];
            text = xmlNodeListGetString(doc, first->xmlChildrenNode, 1);
        }
    }

    xmlXPathFreeObject(found);
    xmlXPathFreeContext(ctx);
    return text;
}
// Locate the comic image in `doc` (the element with id "cc-comic"), and
// download it into the current directory under the last path component of
// its URL.
//
// `url` is the address of the page `doc` was loaded from; it is used as the
// base for resolving a relative `src` attribute, since curl needs an absolute
// URL. If resolution is not possible the raw attribute value is used as-is.
// Nothing happens when the page has no such image or the URL has no usable
// file name (no '/' or nothing after the last '/').
void extract_and_download_image(htmlDocPtr doc, const char *url) {
    xmlChar *image_src = get_xpath_value(doc, "//*[@id='cc-comic']/@src");
    if (image_src == NULL) {
        return;
    }

    // xmlBuildURI returns a copy of an already-absolute reference unchanged.
    xmlChar *resolved = NULL;
    if (url != NULL) {
        resolved = xmlBuildURI(image_src, (const xmlChar *)url);
    }
    const char *image_url = (const char *)(resolved != NULL ? resolved : image_src);
    printf("Found image URL: %s\n", image_url);

    const char *slash = strrchr(image_url, '/');
    if (slash != NULL && slash[1] != '\0') {
        // The file name is whatever follows the final '/'.
        download_image(image_url, slash + 1);
    }

    if (resolved != NULL) {
        xmlFree(resolved);
    }
    xmlFree(image_src);
}
// Find the "next" link in `doc` (an <a> whose class contains "cc-next") and
// return its target as a newly allocated string that the caller must free().
//
// A relative href is resolved against the document's own URL (doc->URL) when
// that is available, so the result can be requested directly. Returns NULL
// when there is no next link, the href is empty, or memory runs out.
char* find_next_button_url(htmlDocPtr doc) {
    xmlChar *href = get_xpath_value(doc, "//a[contains(@class,'cc-next')]/@href");
    if (href == NULL) {
        return NULL;
    }
    if (href[0] == '\0') {
        // An empty href does not name another page.
        xmlFree(href);
        return NULL;
    }

    const xmlChar *base = (doc != NULL) ? doc->URL : NULL;
    xmlChar *resolved = (base != NULL) ? xmlBuildURI(href, base) : NULL;

    // Hand back a plain malloc'd copy so callers can release it with free().
    char *url_copy = strdup((const char *)(resolved != NULL ? resolved : href));

    if (resolved != NULL) {
        xmlFree(resolved);
    }
    xmlFree(href);
    return url_copy;
}
// Mark the buffer as empty so the next response is written from offset 0.
// The allocation itself is kept; only the stored length is cleared.
void reset_chunk_size(struct MemoryStruct *chunk) {
    (*chunk).size = (size_t)0;
}
// Run the transfer configured on `curl` and hand back libcurl's status code.
// A failure is reported on stdout with libcurl's description of the error.
CURLcode perform_curl_request(CURL *curl) {
    const CURLcode status = curl_easy_perform(curl);
    if (status == CURLE_OK) {
        return status;
    }

    printf("curl_easy_perform() failed: %s\n", curl_easy_strerror(status));
    return status;
}
// Build an HTML document tree from the bytes held in `chunk`, passing `url`
// to the parser as the document's URL. Parser errors and warnings are
// suppressed. Returns whatever htmlReadMemory() returns.
htmlDocPtr parse_html_from_chunk(struct MemoryStruct *chunk, const char *url) {
    const int quiet_options = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
    htmlDocPtr parsed = htmlReadMemory(chunk->memory, chunk->size, url, NULL, quiet_options);
    return parsed;
}
// Handle one comic page: download its image, then look for the next page.
//
// Returns 1 and stores the next page's address in *url when a "next" link
// exists; that string is heap-allocated by find_next_button_url(). Returns 0
// and leaves *url unchanged when there is no next page.
int process_html_document(htmlDocPtr doc, const char **url) {
    extract_and_download_image(doc, *url);

    char *following = find_next_button_url(doc);
    if (following == NULL) {
        printf("Reached the end of images.\n");
        return 0;
    }

    printf("Next URL: %s\n", following);
    *url = following;
    return 1;
}
// Release the curl handle and the response buffer used during processing.
// The buffer fields are reset afterwards so the struct no longer holds a
// dangling pointer or a stale length if it is inspected or cleaned up again.
void clean_up(CURL *curl, struct MemoryStruct *chunk) {
    curl_easy_cleanup(curl);
    if (chunk != NULL) {
        free(chunk->memory);
        chunk->memory = NULL;
        chunk->size = 0;
    }
}
// Process the images and follow the next button
void process_images(const char *url) {
struct MemoryStruct chunk = {malloc(1), 0};
CURL *curl = init_curl_request(url, &chunk);
CURLcode res;
if (curl) {
do {
reset_chunk_size(&chunk);
res = perform_curl_request(curl);
if (res != CURLE_OK) break;
htmlDocPtr doc = parse_html_from_chunk(&chunk, url);
if (!doc) break;
if (!process_html_document(doc, &url)) break;
xmlFreeDoc(doc);
} while (res == CURLE_OK);
clean_up(curl, &chunk);
}
}
int main() {
const char *url = "..."; // Replace with your actual URL
process_images(url);
printf("All images processed.\n");
return 0;
}