mirror of
https://github.com/kuhyx/testsAndMisc.git
synced 2026-07-04 14:43:01 +02:00
feat: translated scrape website comics to C
This commit is contained in:
parent
2008ecf405
commit
0c33244cda
65
C/scrapeWebsite/.gitignore
vendored
Normal file
65
C/scrapeWebsite/.gitignore
vendored
Normal file
@ -0,0 +1,65 @@
|
||||
# JPEG
|
||||
*.jpg
|
||||
*.jpeg
|
||||
*.jpe
|
||||
*.jif
|
||||
*.jfif
|
||||
*.jfi
|
||||
|
||||
# JPEG 2000
|
||||
*.jp2
|
||||
*.j2k
|
||||
*.jpf
|
||||
*.jpx
|
||||
*.jpm
|
||||
*.mj2
|
||||
|
||||
# JPEG XR
|
||||
*.jxr
|
||||
*.hdp
|
||||
*.wdp
|
||||
|
||||
# Graphics Interchange Format
|
||||
*.gif
|
||||
|
||||
# RAW
|
||||
*.raw
|
||||
|
||||
# WebP
|
||||
*.webp
|
||||
|
||||
# Portable Network Graphics
|
||||
*.png
|
||||
|
||||
# Animated Portable Network Graphics
|
||||
*.apng
|
||||
|
||||
# Multiple-image Network Graphics
|
||||
*.mng
|
||||
|
||||
# Tagged Image File Format
|
||||
*.tiff
|
||||
*.tif
|
||||
|
||||
# Scalable Vector Graphics
|
||||
*.svg
|
||||
*.svgz
|
||||
|
||||
# Portable Document Format
|
||||
*.pdf
|
||||
|
||||
# X BitMap
|
||||
*.xbm
|
||||
|
||||
# BMP
|
||||
*.bmp
|
||||
*.dib
|
||||
|
||||
# ICO
|
||||
*.ico
|
||||
|
||||
# 3D Images
|
||||
*.3dm
|
||||
*.max
|
||||
|
||||
scrape
|
||||
31
C/scrapeWebsite/Makefile
Normal file
31
C/scrapeWebsite/Makefile
Normal file
@ -0,0 +1,31 @@
|
||||
# Toolchain and flags (libxml2 headers live under /usr/include/libxml2)
CC = gcc
CFLAGS = -Wall -O3 -march=native -I/usr/include/libxml2
LIBS = -lcurl -lxml2

# Input sources and the executable built from them
SRCS = scrape.c
TARGET = scrape

.PHONY: all clean install

# Default target: build the executable
all: $(TARGET)

# Compile and link in one step ($@ is the target, $^ the source list)
$(TARGET): $(SRCS)
	$(CC) $(CFLAGS) -o $@ $^ $(LIBS)

# Remove the built executable
clean:
	rm -f $(TARGET)

# Copy the executable into /usr/local/bin (optional)
install: $(TARGET)
	install -m 755 $(TARGET) /usr/local/bin/
|
||||
178
C/scrapeWebsite/scrape.c
Normal file
178
C/scrapeWebsite/scrape.c
Normal file
@ -0,0 +1,178 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/xpath.h>
|
||||
#include <libxml/uri.h>
|
||||
#include <unistd.h>
|
||||
|
||||
// Growable buffer that accumulates downloaded data.
struct MemoryStruct {
    char *memory;  // heap buffer: `size` payload bytes followed by a '\0'
    size_t size;   // number of payload bytes stored (terminator not counted)
};

// Write callback for curl: appends the received bytes to the MemoryStruct
// passed through `userp` and keeps the buffer NUL-terminated.
//
//   contents - received data, size * nmemb bytes long
//   size     - size of one element
//   nmemb    - number of elements
//   userp    - the struct MemoryStruct to append to
//
// Returns the number of bytes stored. Returns 0 without touching the buffer
// when the byte count would overflow size_t or the buffer cannot be grown;
// libcurl treats a short count as a write error.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    struct MemoryStruct *mem = userp;
    const size_t max_size = (size_t)-1;

    // size * nmemb must fit in size_t, otherwise the copy length would wrap.
    if (nmemb != 0 && size > max_size / nmemb) {
        fprintf(stderr, "Not enough memory!\n");
        return 0;
    }
    size_t realsize = size * nmemb;

    // Old data + new data + terminator must fit as well; a wrapped total
    // would shrink the buffer and make the memcpy below run past its end.
    if (realsize >= max_size - mem->size) {
        fprintf(stderr, "Not enough memory!\n");
        return 0;
    }

    // Grow through a temporary so the old buffer stays valid on failure.
    char *ptr = realloc(mem->memory, mem->size + realsize + 1);
    if (ptr == NULL) {
        fprintf(stderr, "Not enough memory!\n");
        return 0;
    }

    mem->memory = ptr;
    memcpy(&(mem->memory[mem->size]), contents, realsize);
    mem->size += realsize;
    mem->memory[mem->size] = 0;

    return realsize;
}
|
||||
|
||||
// Initialize the curl request for the URL
|
||||
CURL* init_curl_request(const char *url, struct MemoryStruct *chunk) {
|
||||
CURL *curl = curl_easy_init();
|
||||
if(curl) {
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
}
|
||||
return curl;
|
||||
}
|
||||
|
||||
// Download the image file
|
||||
int download_image(const char *url, const char *image_name) {
|
||||
if(access(image_name, F_OK) != -1) {
|
||||
printf("Image %s already exists, skipping download.\n", image_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if(curl) {
|
||||
FILE *fp = fopen(image_name, "wb");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
fclose(fp);
|
||||
curl_easy_cleanup(curl);
|
||||
return res == CURLE_OK ? 1 : 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Parse HTML and find the XPath expression
|
||||
xmlChar* get_xpath_value(htmlDocPtr doc, const char *xpathExpr) {
|
||||
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
|
||||
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((xmlChar *)xpathExpr, xpathCtx);
|
||||
xmlChar *result = NULL;
|
||||
|
||||
if (xpathObj && !xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
|
||||
result = xmlNodeListGetString(doc, xpathObj->nodesetval->nodeTab[0]->xmlChildrenNode, 1);
|
||||
}
|
||||
xmlXPathFreeObject(xpathObj);
|
||||
xmlXPathFreeContext(xpathCtx);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Extract the image URL and download it
|
||||
void extract_and_download_image(htmlDocPtr doc, const char *url) {
|
||||
xmlChar *image_url = get_xpath_value(doc, "//*[@id='cc-comic']/@src");
|
||||
if(image_url) {
|
||||
printf("Found image URL: %s\n", image_url);
|
||||
char *image_name = strrchr((char *)image_url, '/');
|
||||
if(image_name) {
|
||||
image_name++; // Skip the '/'
|
||||
download_image((char *)image_url, image_name);
|
||||
}
|
||||
xmlFree(image_url);
|
||||
}
|
||||
}
|
||||
|
||||
// Find and return the next button URL
|
||||
char* find_next_button_url(htmlDocPtr doc) {
|
||||
xmlChar *next_url = get_xpath_value(doc, "//a[contains(@class,'cc-next')]/@href");
|
||||
if(next_url) {
|
||||
char *url_copy = strdup((char *)next_url);
|
||||
xmlFree(next_url);
|
||||
return url_copy;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Reset chunk memory size before performing curl request
|
||||
void reset_chunk_size(struct MemoryStruct *chunk) {
|
||||
chunk->size = 0;
|
||||
}
|
||||
|
||||
// Perform curl request and return result
|
||||
CURLcode perform_curl_request(CURL *curl) {
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if(res != CURLE_OK) {
|
||||
printf("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
// Parse the HTML document from the chunk memory
|
||||
htmlDocPtr parse_html_from_chunk(struct MemoryStruct *chunk, const char *url) {
|
||||
return htmlReadMemory(chunk->memory, chunk->size, url, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
}
|
||||
|
||||
// Handle processing of the current HTML document
|
||||
int process_html_document(htmlDocPtr doc, const char **url) {
|
||||
extract_and_download_image(doc, *url);
|
||||
char *next_url = find_next_button_url(doc);
|
||||
|
||||
if (next_url) {
|
||||
printf("Next URL: %s\n", next_url);
|
||||
*url = next_url;
|
||||
return 1;
|
||||
} else {
|
||||
printf("Reached the end of images.\n");
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up resources used during processing
|
||||
void clean_up(CURL *curl, struct MemoryStruct *chunk) {
|
||||
curl_easy_cleanup(curl);
|
||||
free(chunk->memory);
|
||||
}
|
||||
|
||||
// Process the images and follow the next button
|
||||
void process_images(const char *url) {
|
||||
struct MemoryStruct chunk = {malloc(1), 0};
|
||||
CURL *curl = init_curl_request(url, &chunk);
|
||||
CURLcode res;
|
||||
|
||||
if (curl) {
|
||||
do {
|
||||
reset_chunk_size(&chunk);
|
||||
res = perform_curl_request(curl);
|
||||
|
||||
if (res != CURLE_OK) break;
|
||||
|
||||
htmlDocPtr doc = parse_html_from_chunk(&chunk, url);
|
||||
if (!doc) break;
|
||||
|
||||
if (!process_html_document(doc, &url)) break;
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
} while (res == CURLE_OK);
|
||||
|
||||
clean_up(curl, &chunk);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Entry point. The start URL is taken from the first command-line argument;
// without one, the built-in placeholder below is used.
int main(int argc, char *argv[]) {
    const char *url = "..."; // Replace with your actual URL
    if (argc > 1) {
        url = argv[1];
    }

    process_images(url);
    printf("All images processed.\n");
    return 0;
}
|
||||
Loading…
Reference in New Issue
Block a user