WIP: parse both atom and rss feeds

Signed-off-by: William Brawner <me@wbrawner.com>
This commit is contained in:
William Brawner 2020-08-06 17:19:31 -07:00
parent abe77be5a7
commit 273c060674
3 changed files with 92 additions and 73 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@ out/
tmp/ tmp/
*.sln *.sln
*.swp *.swp
tags

View file

@ -10,7 +10,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
int main(int argc, char **argv) { int main(int argc, char** argv) {
if (argc < 2) { if (argc < 2) {
printf("Please provide the URL of a feed to parse.\n"); printf("Please provide the URL of a feed to parse.\n");
return 1; return 1;
@ -18,18 +18,15 @@ int main(int argc, char **argv) {
printf("Attempting to retrieve XML for URL: %s\n", argv[1]); printf("Attempting to retrieve XML for URL: %s\n", argv[1]);
} }
sqlite3 *db; sqlite3* db;
setup_database(db); setup_database(db);
fead_xml(argv[1]); fead_xml(argv[1]);
close_database(db); close_database(db);
return 0; return 0;
} }
void fead_xml(char* url) { void fead_xml(char* url) {
xml *x = malloc(sizeof(xml)); xml* x = malloc(sizeof(xml));
get_xml_ptr(x, url); get_xml_ptr(x, url);
int res = get_xml(x); int res = get_xml(x);
@ -37,13 +34,9 @@ void fead_xml(char* url) {
printf("XML data retrieved from server\n"); printf("XML data retrieved from server\n");
} else { } else {
printf("Unable to retrieve XML for URL: %s\n", x->url); printf("Unable to retrieve XML for URL: %s\n", x->url);
printf("%s\n", x->errBuf);
} }
parse_xml_items(x); parse_xml_items(x);
print_xml_elements(x); print_xml_elements(x);
cleanup_xml(x); cleanup_xml(x);
} }
@ -54,21 +47,18 @@ void get_xml_ptr(xml* x, char* url) {
x->url = url; x->url = url;
} }
size_t my_write_callback(char *ptr, size_t size, size_t nmemb, xml *x) { size_t my_write_callback(char* ptr, size_t size, size_t nmemb, xml* x) {
int byte_size = (size * nmemb); int byte_size = (size * nmemb);
printf("byte_size: %d\n", byte_size);
int new_size = x->size + byte_size; int new_size = x->size + byte_size;
printf("new_size size: %d\n", new_size);
x->data = realloc(x->data, new_size + 1); x->data = realloc(x->data, new_size + 1);
memcpy(x->data + x->size, ptr, byte_size); memcpy(x->data + x->size, ptr, byte_size);
x->data[new_size] = '\0'; x->data[new_size] = '\0';
x->size = new_size; x->size = new_size;
printf("New xml size: %d\n", x->size);
return byte_size; return byte_size;
} }
size_t get_xml(xml* x) { size_t get_xml(xml* x) {
CURL *curl = curl_easy_init(); CURL* curl = curl_easy_init();
if (!curl) { if (!curl) {
printf("Unable to instantiate curl object. Aborting"); printf("Unable to instantiate curl object. Aborting");
@ -76,74 +66,101 @@ size_t get_xml(xml* x) {
} }
curl_easy_setopt(curl, CURLOPT_URL, x->url); curl_easy_setopt(curl, CURLOPT_URL, x->url);
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, *my_write_callback); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, my_write_callback);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, x); curl_easy_setopt(curl, CURLOPT_WRITEDATA, x);
curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, x->errBuf);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
// curl_easy_setopt(curl, CURLOPT_VERBOSE, 1);
CURLcode res = curl_easy_perform(curl); CURLcode res = curl_easy_perform(curl);
curl_easy_cleanup(curl); curl_easy_cleanup(curl);
curl_global_cleanup(); curl_global_cleanup();
return res; return res;
} }
void parse_xml_items(xml *x) { void parse_xml_items(xml* x) {
xmlChar* xChar = xmlCharStrndup(x->data, x->size); xmlChar* xChar = xmlCharStrndup(x->data, x->size);
x->xdp = xmlReadDoc( x->xdp = xmlReadDoc(xChar, x->url, NULL, XML_PARSE_RECOVER);
xChar,
x->url,
NULL,
XML_PARSE_RECOVER
);
xmlFree(xChar); xmlFree(xChar);
} }
/*
 * Placeholder for Atom <entry> handling.
 *
 * The function currently has no body: it accepts a node and does nothing
 * with it. NOTE(review): presumably `root` is meant to be an <entry> node
 * (or the node whose children are the entry's fields) -- confirm once the
 * implementation is written.
 */
void parse_entry(xmlNode *root)
{
    (void)root; /* intentionally unused until the parser is implemented */
}
/*
 * Print the fields of one RSS <item>.
 *
 * `item` is the first node of a sibling list (the caller passes the
 * children of an <item> element); every sibling is visited via ->next.
 * Nodes named exactly "text" are skipped. For every other node the text of
 * its first child is printed, or only its length when it is longer than
 * 200 characters.
 *
 * A NULL `item` (an element with no children) and nodes without a child or
 * without text content are handled: the former prints just the header and
 * trailer, the latter print an empty value.
 *
 * Returns NULL for now: no `article` is built yet.
 */
article *parse_item(xmlNode *item)
{
    printf("Article info:\n");

    /* A for loop (rather than do/while) so a NULL list is never dereferenced. */
    for (; item != NULL; item = item->next) {
        const char *name = (const char *)item->name;

        /* Exact comparison: a length-limited strncmp would also treat
         * names such as "t" or "te" as "text". */
        if (name == NULL || strcmp(name, "text") == 0) {
            continue;
        }

        /* Empty elements have no child node, and non-text children carry
         * no content; fall back to an empty string instead of crashing. */
        const char *value = "";
        if (item->children != NULL && item->children->content != NULL) {
            value = (const char *)item->children->content;
        }

        size_t value_len = strlen(value);
        if (value_len > 200) {
            /* %zu is the matching conversion for size_t. */
            printf("%s: %zu chars\n", name, value_len);
        } else {
            printf("%s: %s\n", name, value);
        }
    }

    printf("\n");
    return NULL;
}
/*
 * Locate the first <entry> of an Atom feed.
 *
 * `feed` may be the <feed> root element itself (which is what the caller in
 * this file passes) or the first node of a sibling list to scan. When it is
 * the root, the scan starts at its children, because the entries of a feed
 * are nested inside it rather than following it as siblings.
 *
 * Returns NULL for now: the located entry is not processed yet and no
 * `feed` object is built.
 */
feed *parse_feed(xmlNode *feed)
{
    /* Descend when handed the <feed> root: its siblings never contain entries. */
    if (feed != NULL && feed->name != NULL
        && strcmp((const char *)feed->name, "feed") == 0) {
        feed = feed->children;
    }

    while (feed != NULL
           && (feed->name == NULL
               || strcmp((const char *)feed->name, "entry") != 0)) {
        feed = feed->next;
    }

    /* TODO: parse the entries starting at `feed` and build the result. */
    return NULL;
}
/*
 * Walk an RSS document and print its channel metadata and items.
 *
 * `rss` is the <rss> root element. Its children are searched for the
 * <channel> element; within the channel, <title> and <description> are
 * printed as "name: text" and every <item> is handed to parse_item().
 *
 * Returns NULL when `rss` is NULL or has no <channel> child. It also
 * returns NULL on success for now, because no `feed` object is built yet.
 */
feed *parse_rss(xmlNode *rss)
{
    if (rss == NULL) {
        return NULL;
    }

    /* Start at the first child: skipping it would miss <channel> in feeds
     * that have no whitespace node between <rss> and <channel>. */
    xmlNode *channel = rss->children;
    while (channel != NULL
           && (channel->name == NULL
               || strcmp((const char *)channel->name, "channel") != 0)) {
        channel = channel->next;
    }
    if (channel == NULL) {
        return NULL;
    }

    for (xmlNode *child = channel->children; child != NULL; child = child->next) {
        const char *name = (const char *)child->name;
        if (name == NULL) {
            continue;
        }

        /* Exact name comparisons: length-limited strncmp matches prefixes. */
        if (strcmp(name, "title") == 0 || strcmp(name, "description") == 0) {
            if (child->children != NULL && child->children->content != NULL) {
                printf("%s: %s\n", name, child->children->content);
            }
        } else if (strcmp(name, "item") == 0) {
            /* An empty <item/> has no children; there is nothing to print. */
            if (child->children != NULL) {
                // TODO: Store articles in array to return
                (void)parse_item(child->children);
            }
        }
    }

    return NULL;
}
void print_xml_elements(xml* x) { void print_xml_elements(xml* x) {
if (x->xdp == NULL || x->xdp->children == NULL) { if (x->xdp == NULL || x->xdp->children == NULL) {
printf("Unable to parse XML\n"); printf("Unable to parse XML\n");
} }
xmlNode* rss = x->xdp->children; xmlNode* root = x->xdp->children;
xmlNode* channel = rss->children->next; if (strcmp(root->name, "rss") == 0) {
xmlNode* channelChild = channel->children; parse_rss(root);
int articleCount = 0; } else if (strcmp(root->name, "feed") == 0){
while (channelChild != NULL) { parse_feed(root);
if (strncmp("title", channelChild->name, strlen(channelChild->name)) == 0) {
if (channelChild->children != NULL) {
printf("%s: %s\n", channelChild->name, channelChild->children->content);
}
} else if (strncmp("description", channelChild->name, strlen(channelChild->name)) == 0) {
if (channelChild->children != NULL) {
printf("%s: %s\n", channelChild->name, channelChild->children->content);
}
} else if (strncmp("item", channelChild->name, strlen(channelChild->name)) == 0) {
articleCount++;
xmlNode* itemData = channelChild->children;
printf("Article info:\n");
do {
if (strncmp("text", itemData->name, strlen(itemData->name)) != 0) {
if (strlen(itemData->children->content) > 200) {
printf("%s: %ld chars\n", itemData->name, strlen(itemData->children->content));
} else { } else {
printf("%s: %s\n", itemData->name, itemData->children->content); printf("Unable to parse XML\n");
} // TODO: Return error
} return;
itemData = itemData->next;
} while (itemData != NULL);
printf("\n");
}
channelChild = channelChild->next;
} }
rss = NULL; int articleCount = 0;
channel = NULL;
root = NULL;
printf("Found %d articles in the feed\n", articleCount); printf("Found %d articles in the feed\n", articleCount);
} }
void cleanup_xml(xml* x) { void cleanup_xml(xml* x) {
free(x->data); if (!x) return;
free(x->errBuf); if (x->data) free(x->data);
xmlFreeDoc(x->xdp); if (x->xdp) xmlFreeDoc(x->xdp);
free(x); free(x);
} }
@ -160,6 +177,4 @@ void setup_database(sqlite3* handle) {
} }
} }
void close_database(sqlite3* handle) { void close_database(sqlite3* handle) { sqlite3_close(handle); }
sqlite3_close(handle);
}

View file

@ -8,7 +8,6 @@ typedef struct {
int size; int size;
long totalSize; long totalSize;
char *url; char *url;
char *errBuf;
xmlDocPtr xdp; xmlDocPtr xdp;
} xml; } xml;
@ -19,11 +18,11 @@ typedef struct {
long feed_id; long feed_id;
char *author; char *author;
bool is_favorite; bool is_favorite;
char *featured_image text; char *featured_image;
char *content text; char *content;
char *excerpt text; char *excerpt;
bool is_read; bool is_read;
char *published_date date; char *published_date;
} article; } article;
typedef struct { typedef struct {
@ -33,11 +32,15 @@ typedef struct {
char *feed_url; char *feed_url;
char *icon; char *icon;
bool is_favorite; bool is_favorite;
char *last_polled date; char *last_polled;
char *filter_accept; char *filter_accept;
char *filter_reject; char *filter_reject;
} feed; } feed;
typedef struct {
} feed_with_articles;
size_t my_write_callback(char *ptr, size_t size, size_t nmemb, xml *x); size_t my_write_callback(char *ptr, size_t size, size_t nmemb, xml *x);
size_t get_xml(xml *x); size_t get_xml(xml *x);