From dc46a88993091fd6a0449d02215164ec65b6b23c Mon Sep 17 00:00:00 2001
From: Krzysztof Rudnicki
Date: Mon, 16 Sep 2024 16:52:48 +0200
Subject: [PATCH] feat: translated scrape website comics to C

---
 C/scrapeWebsite/.gitignore |  65 +++++++++
 C/scrapeWebsite/Makefile   |  31 ++++
 C/scrapeWebsite/scrape.c   | 270 +++++++++++++++++++++++++++++++++++++
 3 files changed, 366 insertions(+)
 create mode 100644 C/scrapeWebsite/.gitignore
 create mode 100644 C/scrapeWebsite/Makefile
 create mode 100644 C/scrapeWebsite/scrape.c

diff --git a/C/scrapeWebsite/.gitignore b/C/scrapeWebsite/.gitignore
new file mode 100644
index 0000000..1774c0e
--- /dev/null
+++ b/C/scrapeWebsite/.gitignore
@@ -0,0 +1,65 @@
+# JPEG
+*.jpg
+*.jpeg
+*.jpe
+*.jif
+*.jfif
+*.jfi
+
+# JPEG 2000
+*.jp2
+*.j2k
+*.jpf
+*.jpx
+*.jpm
+*.mj2
+
+# JPEG XR
+*.jxr
+*.hdp
+*.wdp
+
+# Graphics Interchange Format
+*.gif
+
+# RAW
+*.raw
+
+# Web P
+*.webp
+
+# Portable Network Graphics
+*.png
+
+# Animated Portable Network Graphics
+*.apng
+
+# Multiple-image Network Graphics
+*.mng
+
+# Tagged Image File Format
+*.tiff
+*.tif
+
+# Scalable Vector Graphics
+*.svg
+*.svgz
+
+# Portable Document Format
+*.pdf
+
+# X BitMap
+*.xbm
+
+# BMP
+*.bmp
+*.dib
+
+# ICO
+*.ico
+
+# 3D Images
+*.3dm
+*.max
+
+scrape
\ No newline at end of file
diff --git a/C/scrapeWebsite/Makefile b/C/scrapeWebsite/Makefile
new file mode 100644
index 0000000..33264d6
--- /dev/null
+++ b/C/scrapeWebsite/Makefile
@@ -0,0 +1,31 @@
+# Compiler
+CC = gcc
+
+# Compiler flags
+CFLAGS = -Wall -O3 -march=native -I/usr/include/libxml2
+
+# Libraries
+LIBS = -lcurl -lxml2
+
+# Source files
+SRCS = scrape.c
+
+# Output executable
+TARGET = scrape
+
+# Default target
+all: $(TARGET)
+
+# Link and compile the program
+$(TARGET): $(SRCS)
+	$(CC) $(CFLAGS) -o $(TARGET) $(SRCS) $(LIBS)
+
+# Clean up build artifacts
+clean:
+	rm -f $(TARGET)
+
+# Install the program (optional)
+install: $(TARGET)
+	install -m 755 $(TARGET) /usr/local/bin/
+
+.PHONY: all clean install
diff --git a/C/scrapeWebsite/scrape.c b/C/scrapeWebsite/scrape.c
new file mode 100644
index 0000000..16a24a7
--- /dev/null
+++ b/C/scrapeWebsite/scrape.c
@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/tree.h>
+#include <libxml/xpath.h>
+
+// Growable buffer holding the body of the most recent HTTP response.
+// After every successful write, memory[size] is a terminating NUL.
+struct MemoryStruct {
+    char *memory;
+    size_t size;
+};
+
+// libcurl write callback: appends size * nmemb bytes from contents to the
+// MemoryStruct passed through CURLOPT_WRITEDATA.
+// Returns the number of bytes stored; returning 0 on allocation failure makes
+// libcurl abort the transfer.
+static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
+    size_t realsize = size * nmemb;
+    struct MemoryStruct *mem = (struct MemoryStruct *)userp;
+
+    // Keep the old pointer until realloc succeeds so nothing leaks on failure.
+    char *ptr = realloc(mem->memory, mem->size + realsize + 1);
+    if (ptr == NULL) {
+        fprintf(stderr, "Not enough memory!\n");
+        return 0;
+    }
+
+    mem->memory = ptr;
+    memcpy(&(mem->memory[mem->size]), contents, realsize);
+    mem->size += realsize;
+    mem->memory[mem->size] = 0;
+
+    return realsize;
+}
+
+// Creates a curl easy handle that fetches url into chunk and follows
+// redirects. Returns NULL if the handle cannot be created; otherwise the
+// caller releases it with curl_easy_cleanup().
+CURL *init_curl_request(const char *url, struct MemoryStruct *chunk) {
+    CURL *curl = curl_easy_init();
+    if (curl) {
+        curl_easy_setopt(curl, CURLOPT_URL, url);
+        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+        curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
+        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    }
+    return curl;
+}
+
+// Downloads url into the file image_name.
+// Returns 1 when the file was downloaded, 0 when it already existed or the
+// download failed. A failed download removes the partial file so that a later
+// run does not mistake it for a finished image and skip it.
+int download_image(const char *url, const char *image_name) {
+    if (access(image_name, F_OK) == 0) {
+        printf("Image %s already exists, skipping download.\n", image_name);
+        return 0;
+    }
+
+    CURL *curl = curl_easy_init();
+    if (curl == NULL) {
+        return 0;
+    }
+
+    FILE *fp = fopen(image_name, "wb");
+    if (fp == NULL) {
+        perror(image_name);
+        curl_easy_cleanup(curl);
+        return 0;
+    }
+
+    curl_easy_setopt(curl, CURLOPT_URL, url);
+    // A NULL write function selects libcurl's default, which fwrite()s to fp.
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    // Treat HTTP errors (>= 400) as failures instead of saving the error page.
+    curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+
+    CURLcode res = curl_easy_perform(curl);
+    curl_easy_cleanup(curl);
+    if (res != CURLE_OK) {
+        fprintf(stderr, "Failed to download %s: %s\n", url, curl_easy_strerror(res));
+    }
+
+    // fclose() flushes buffered data, so its result is part of the outcome.
+    int ok = (fclose(fp) == 0) && (res == CURLE_OK);
+    if (!ok) {
+        remove(image_name);
+    }
+    return ok;
+}
+
+// Evaluates xpathExpr against doc and returns the text of the first matching
+// node, or NULL when nothing matches or evaluation fails.
+// The caller frees the result with xmlFree().
+xmlChar *get_xpath_value(htmlDocPtr doc, const char *xpathExpr) {
+    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+    if (xpathCtx == NULL) {
+        return NULL;
+    }
+
+    xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *)xpathExpr, xpathCtx);
+    xmlChar *result = NULL;
+
+    if (xpathObj && !xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
+        result = xmlNodeListGetString(doc, xpathObj->nodesetval->nodeTab[0]->xmlChildrenNode, 1);
+    }
+    xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return result;
+}
+
+// Finds the comic image (the element with id "cc-comic") in doc and saves it
+// under the last path component of its URL. url is the address of the page
+// being processed; it is not needed to locate the image.
+void extract_and_download_image(htmlDocPtr doc, const char *url) {
+    (void)url;
+
+    xmlChar *image_url = get_xpath_value(doc, "//*[@id='cc-comic']/@src");
+    if (image_url) {
+        printf("Found image URL: %s\n", image_url);
+        char *image_name = strrchr((char *)image_url, '/');
+        // Skip the '/'; a URL that ends in one has no file name to save under.
+        if (image_name && image_name[1] != '\0') {
+            download_image((char *)image_url, image_name + 1);
+        }
+        xmlFree(image_url);
+    }
+}
+
+// Returns a malloc()ed copy of the "next" button's href, or NULL when the page
+// has no such link. The caller owns the string and must free() it.
+char *find_next_button_url(htmlDocPtr doc) {
+    xmlChar *next_url = get_xpath_value(doc, "//a[contains(@class,'cc-next')]/@href");
+    if (next_url) {
+        char *url_copy = strdup((char *)next_url);
+        xmlFree(next_url);
+        return url_copy;
+    }
+    return NULL;
+}
+
+// Empties the response buffer before the next request; the allocation is kept
+// and reused by WriteMemoryCallback().
+void reset_chunk_size(struct MemoryStruct *chunk) {
+    chunk->size = 0;
+}
+
+// Runs the transfer configured on curl, reporting any failure on stderr.
+CURLcode perform_curl_request(CURL *curl) {
+    CURLcode res = curl_easy_perform(curl);
+    if (res != CURLE_OK) {
+        fprintf(stderr, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
+    }
+    return res;
+}
+
+// Parses the downloaded page in chunk as HTML, using url as the document's
+// base URL, with parser errors and warnings suppressed.
+// Returns NULL on failure; the caller frees the result with xmlFreeDoc().
+htmlDocPtr parse_html_from_chunk(struct MemoryStruct *chunk, const char *url) {
+    return htmlReadMemory(chunk->memory, (int)chunk->size, url, NULL,
+                          HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+}
+
+// Downloads the image on the current page and looks for the next page.
+// Returns 1 and stores a malloc()ed URL in *url when there is a next page;
+// the caller owns that string and must free() it. Returns 0 and leaves *url
+// untouched when there is no next page.
+int process_html_document(htmlDocPtr doc, const char **url) {
+    extract_and_download_image(doc, *url);
+    char *next_url = find_next_button_url(doc);
+
+    if (next_url) {
+        printf("Next URL: %s\n", next_url);
+        *url = next_url;
+        return 1;
+    }
+
+    printf("Reached the end of images.\n");
+    return 0;
+}
+
+// Releases the curl handle and the response buffer.
+void clean_up(CURL *curl, struct MemoryStruct *chunk) {
+    curl_easy_cleanup(curl);
+    free(chunk->memory);
+    chunk->memory = NULL;
+    chunk->size = 0;
+}
+
+// Starting at url, downloads the comic image on each page and follows the
+// "next" link until a page has none, a request fails, or a page links back to
+// itself.
+void process_images(const char *url) {
+    struct MemoryStruct chunk = { .memory = malloc(1), .size = 0 };
+    if (chunk.memory == NULL) {
+        fprintf(stderr, "Not enough memory!\n");
+        return;
+    }
+
+    CURL *curl = init_curl_request(url, &chunk);
+    if (curl == NULL) {
+        free(chunk.memory);
+        return;
+    }
+
+    char *owned_url = NULL; // heap copy of the current page URL, once there is one
+
+    for (;;) {
+        // The handle keeps fetching whatever URL it was last given, so point
+        // it at the current page before every request.
+        curl_easy_setopt(curl, CURLOPT_URL, url);
+        reset_chunk_size(&chunk);
+        if (perform_curl_request(curl) != CURLE_OK) {
+            break;
+        }
+
+        htmlDocPtr doc = parse_html_from_chunk(&chunk, url);
+        if (doc == NULL) {
+            break;
+        }
+
+        const char *next_url = url;
+        int has_next = process_html_document(doc, &next_url);
+        xmlFreeDoc(doc); // released on every path, including the last page
+        if (!has_next) {
+            break;
+        }
+
+        // A page whose "next" link points at itself would loop forever.
+        int same_page = (strcmp(next_url, url) == 0);
+        free(owned_url);
+        owned_url = (char *)next_url; // malloc()ed by process_html_document()
+        url = owned_url;
+        if (same_page) {
+            printf("Reached the end of images.\n");
+            break;
+        }
+    }
+
+    free(owned_url);
+    clean_up(curl, &chunk);
+}
+
+// Usage: scrape [start-url]
+// Without an argument the built-in placeholder URL is used.
+int main(int argc, char **argv) {
+    const char *url = "..."; // Replace with your actual URL
+    if (argc > 1) {
+        url = argv[1];
+    }
+
+    if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
+        fprintf(stderr, "curl_global_init() failed\n");
+        return 1;
+    }
+
+    process_images(url);
+    printf("All images processed.\n");
+
+    xmlCleanupParser();
+    curl_global_cleanup();
+    return 0;
+}