testsAndMisc-archive/C/scrapeWebsite/scrape.c

207 lines
5.3 KiB
C

#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
// Growable byte buffer that accumulates the body of a downloaded response.
// WriteMemoryCallback appends to it; clean_up releases it.
struct MemoryStruct
{
// Heap buffer holding the bytes received so far. It is resized with
// realloc() and kept NUL-terminated by WriteMemoryCallback.
char *memory;
// Number of payload bytes stored in `memory` (the terminating NUL is not counted).
size_t size;
};
// libcurl write callback: appends the `size * nmemb` bytes at `contents` to
// the MemoryStruct passed through `userp` and keeps the buffer NUL-terminated.
// Returns the number of bytes stored, or 0 when the buffer cannot be grown
// (in which case the previously stored data is left untouched).
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
    struct MemoryStruct *buf = userp;
    const size_t incoming = size * nmemb;

    // Grow through a temporary so the existing buffer survives a failed realloc.
    char *grown = realloc(buf->memory, buf->size + incoming + 1);
    if (!grown)
    {
        printf("Not enough memory!\n");
        return 0;
    }

    memcpy(grown + buf->size, contents, incoming);
    buf->memory = grown;
    buf->size += incoming;
    grown[buf->size] = '\0';
    return incoming;
}
// Create a curl easy handle set up to fetch `url` into `chunk`.
// The response body is delivered through WriteMemoryCallback and HTTP
// redirects are followed.
// Returns the configured handle, or NULL when curl_easy_init() fails.
CURL *init_curl_request(const char *url, struct MemoryStruct *chunk)
{
    CURL *handle = curl_easy_init();
    if (handle == NULL)
    {
        return NULL;
    }

    curl_easy_setopt(handle, CURLOPT_URL, url);
    curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(handle, CURLOPT_WRITEDATA, (void *)chunk);
    curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
    return handle;
}
// Download the image file
int download_image(const char *url, const char *image_name)
{
if (access(image_name, F_OK) != -1)
{
printf("Image %s already exists, skipping download.\n", image_name);
return 0;
}
CURL *curl = curl_easy_init();
if (curl)
{
FILE *fp = fopen(image_name, "wb");
curl_easy_setopt(curl, CURLOPT_URL, url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, NULL);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
CURLcode res = curl_easy_perform(curl);
fclose(fp);
curl_easy_cleanup(curl);
return res == CURLE_OK ? 1 : 0;
}
return 0;
}
// Evaluate `xpathExpr` against `doc` and return the text content of the first
// node in the result set (for an attribute match, the attribute's value).
// Returns NULL when the expression cannot be evaluated or matches nothing.
// A non-NULL result is allocated by libxml2; callers release it with xmlFree().
xmlChar *get_xpath_value(htmlDocPtr doc, const char *xpathExpr)
{
    xmlXPathContextPtr ctx = xmlXPathNewContext(doc);
    xmlXPathObjectPtr found = xmlXPathEvalExpression((const xmlChar *)xpathExpr, ctx);
    xmlChar *text = NULL;

    if (found != NULL)
    {
        xmlNodeSetPtr nodes = found->nodesetval;
        if (!xmlXPathNodeSetIsEmpty(nodes))
        {
            xmlNodePtr first = nodes->nodeTab[0];
            text = xmlNodeListGetString(doc, first->xmlChildrenNode, 1);
        }
    }

    xmlXPathFreeObject(found);
    xmlXPathFreeContext(ctx);
    return text;
}
// Locate the comic image in `doc` (the `src` attribute of the element with
// id "cc-comic") and download it into a file named after the last path
// segment of its URL.
//
// `url` is the address of the page `doc` was loaded from. It is used as the
// base for resolving a relative or protocol-relative `src`, so that curl is
// always handed an absolute URL. If resolution fails, the raw attribute value
// is used unchanged.
//
// Nothing is downloaded when the page has no such image or when the URL does
// not end in a usable file name (for example, it ends with '/').
void extract_and_download_image(htmlDocPtr doc, const char *url)
{
    xmlChar *src = get_xpath_value(doc, "//*[@id='cc-comic']/@src");
    if (!src)
    {
        return;
    }

    // xmlBuildURI returns a newly allocated string, or NULL on error.
    xmlChar *absolute = xmlBuildURI(src, (const xmlChar *)url);
    const char *image_url = absolute ? (const char *)absolute : (const char *)src;
    printf("Found image URL: %s\n", image_url);

    const char *slash = strrchr(image_url, '/');
    if (slash && slash[1] != '\0')
    {
        // The file name is everything after the final '/'.
        download_image(image_url, slash + 1);
    }

    if (absolute)
    {
        xmlFree(absolute);
    }
    xmlFree(src);
}
// Look up the "next" link in `doc`: the href of the first <a> whose class
// attribute contains "cc-next".
// Returns a strdup()'d copy of the href that the caller must free(), or NULL
// when the page has no such link.
char *find_next_button_url(htmlDocPtr doc)
{
    xmlChar *href = get_xpath_value(doc, "//a[contains(@class,'cc-next')]/@href");
    if (href == NULL)
    {
        return NULL;
    }

    // Hand back a plain malloc-family string so the libxml2 buffer can be released here.
    char *copy = strdup((char *)href);
    xmlFree(href);
    return copy;
}
// Mark the buffer as empty so the next transfer writes from its start.
// Only the length is reset; the allocation in chunk->memory is kept.
void reset_chunk_size(struct MemoryStruct *chunk)
{
    chunk->size = 0;
}
// Run the transfer configured on `curl`.
// On failure the libcurl error text is printed. The CURLcode is returned
// to the caller in every case.
CURLcode perform_curl_request(CURL *curl)
{
    const CURLcode status = curl_easy_perform(curl);
    if (status == CURLE_OK)
    {
        return status;
    }

    printf("curl_easy_perform() failed: %s\n", curl_easy_strerror(status));
    return status;
}
// Parse the bytes held in `chunk` as an HTML document, passing `url` to
// libxml2 as the document's URL. Parser errors and warnings are suppressed.
// Returns whatever htmlReadMemory() returns; process_images treats NULL as failure.
htmlDocPtr parse_html_from_chunk(struct MemoryStruct *chunk, const char *url)
{
    const int options = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING;
    htmlDocPtr doc = htmlReadMemory(chunk->memory, chunk->size, url, NULL, options);
    return doc;
}
// Handle one page: download its image, then look for the link to the next page.
//
// Returns 1 when a next link exists. In that case `*url` is overwritten with a
// heap-allocated string (from find_next_button_url) that the caller now owns;
// the previous value of `*url` is not freed here.
// Returns 0 when there is no next link; `*url` is left unchanged.
int process_html_document(htmlDocPtr doc, const char **url)
{
    extract_and_download_image(doc, *url);

    char *next = find_next_button_url(doc);
    if (next == NULL)
    {
        printf("Reached the end of images.\n");
        return 0;
    }

    printf("Next URL: %s\n", next);
    *url = next;
    return 1;
}
// Release the resources of a crawl: the curl easy handle first, then the
// download buffer owned by `chunk`.
void clean_up(CURL *curl, struct MemoryStruct *chunk)
{
    curl_easy_cleanup(curl);

    char *buffer = chunk->memory;
    free(buffer);
}
// Process the images and follow the next button
void process_images(const char *url)
{
struct MemoryStruct chunk = {malloc(1), 0};
CURL *curl = init_curl_request(url, &chunk);
CURLcode res;
if (curl)
{
do
{
reset_chunk_size(&chunk);
res = perform_curl_request(curl);
if (res != CURLE_OK)
break;
htmlDocPtr doc = parse_html_from_chunk(&chunk, url);
if (!doc)
break;
if (!process_html_document(doc, &url))
break;
xmlFreeDoc(doc);
} while (res == CURLE_OK);
clean_up(curl, &chunk);
}
}
// Program entry point: crawl the comic starting at the configured page and
// report when the crawl has finished.
int main()
{
    const char *start_url = "..."; // Replace with your actual URL
    process_images(start_url);
    printf("All images processed.\n");
    return 0;
}