X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/a8bbe4703654e6d6131368c0c42995f8933b3dce..26c05eec96c89590ddf4d516ae0cf9fc6bdbec82:/wikiq.c diff --git a/wikiq.c b/wikiq.c deleted file mode 100644 index a8cf97d..0000000 --- a/wikiq.c +++ /dev/null @@ -1,492 +0,0 @@ -/* - * An XML parser for Wikipedia Data dumps. - * Converts XML files to tab-separated values files readable by spreadsheets - * and statistical packages. - */ - -#include -#include -#include -#include -#include "expat.h" -#include -#include "disorder.h" -#include "md5.h" - -// timestamp of the form 2003-11-07T00:43:23Z -#define DATE_LENGTH 10 -#define TIME_LENGTH 8 -#define TIMESTAMP_LENGTH 20 - -#define MEGABYTE 1048576 -#define FIELD_BUFFER_SIZE 1024 -// 2048 KB in bytes + 1 -//#define TEXT_BUFFER_SIZE 2097153 -//#define TEXT_BUFFER_SIZE 10485760 - -enum elements { - TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, - EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT -}; - -enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP }; - -enum outtype { FULL, SIMPLE }; - -typedef struct { - - // pointers to once-allocated buffers - char *title; - char *articleid; - char *revid; - char *date; - char *time; - char *timestamp; - char *anon; - char *editor; - char *editorid; - char *comment; - char *text; - - // track string size of the elements, to prevent O(N^2) processing in charhndl - // when we have to take strlen for every character which we append to the buffer - size_t title_size; - size_t articleid_size; - size_t revid_size; - size_t date_size; - size_t time_size; - size_t timestamp_size; - size_t anon_size; - size_t editor_size; - size_t editorid_size; - size_t comment_size; - size_t text_size; - - bool minor; - - enum elements element; - enum block position; - enum outtype output_type; - -} revisionData; - - -/* free_data and clean_data - * Takes a pointer to the data struct and an integer {0,1} indicating if the - * title data needs to be cleared as well. - * Also, frees memory dynamically allocated to store data. - */ -static void -clean_data(revisionData *data, int title) -{ - // reset title (if we are switching articles) - if (title) { - data->title[0] = '\0'; - data->articleid[0] = '\0'; - data->title_size = 0; - data->articleid_size = 0; - } - - // reset text fields - data->revid[0] = '\0'; - data->date[0] = '\0'; - data->time[0] = '\0'; - data->timestamp[0] = '\0'; - data->anon[0] = '\0'; - data->editor[0] = '\0'; - data->editorid[0] = '\0'; - data->comment[0] = '\0'; - data->text[0] = '\0'; - - // reset length tracking - data->revid_size = 0; - data->date_size = 0; - data->time_size = 0; - data->timestamp_size = 0; - data->anon_size = 0; - data->editor_size = 0; - data->editorid_size = 0; - data->comment_size = 0; - data->text_size = 0; - - // reset flags and element type info - data->minor = false; - data->element = UNUSED; - -} - -// presently unused -static void -free_data(revisionData *data, int title) -{ - if (title) { - //printf("freeing article\n"); - free(data->title); - free(data->articleid); - } - free(data->revid); - free(data->date); - free(data->time); - free(data->timestamp); - free(data->anon); - free(data->editor); - free(data->editorid); - free(data->comment); - free(data->text); -} - -void cleanup_revision(revisionData *data) { - clean_data(data, 0); -} - -void cleanup_article(revisionData *data) { - clean_data(data, 1); -} - - -static void -init_data(revisionData *data, outtype output_type) -{ - data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'? - data->comment = (char*) malloc(FIELD_BUFFER_SIZE); - data->title = (char*) malloc(FIELD_BUFFER_SIZE); - data->articleid = (char*) malloc(FIELD_BUFFER_SIZE); - data->revid = (char*) malloc(FIELD_BUFFER_SIZE); - data->date = (char*) malloc(FIELD_BUFFER_SIZE); - data->time = (char*) malloc(FIELD_BUFFER_SIZE); - data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE); - data->anon = (char*) malloc(FIELD_BUFFER_SIZE); - data->editor = (char*) malloc(FIELD_BUFFER_SIZE); - data->editorid = (char*) malloc(FIELD_BUFFER_SIZE); - data->minor = false; - - // resets the data fields, null terminates strings, sets lengths - clean_data(data, 1); - - data->output_type = output_type; -} - -/* for debugging only, prints out the state of the data struct - */ -static void -print_state(revisionData *data) -{ - printf("element = %i\n", data->element); - printf("output_type = %i\n", data->output_type); - printf("title = %s\n", data->title); - printf("articleid = %s\n", data->articleid); - printf("revid = %s\n", data->revid); - printf("date = %s\n", data->date); - printf("time = %s\n", data->time); - printf("anon = %s\n", data->anon); - printf("editor = %s\n", data->editor); - printf("editorid = %s\n", data->editorid); - printf("minor = %s\n", (data->minor ? "1" : "0")); - printf("comment = %s\n", data->comment); - printf("text = %s\n", data->text); - printf("\n"); - -} - -/* Write a header for the comma-separated output - */ -static void -write_header() -{ - // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n"); -// printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n"); - -} - - -/* - * write a line of comma-separated value formatted data to standard out - * follows the form: - * title,articleid,revid,date,time,anon,editor,editorid,minor,comment - * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str) - * - * it is called right before cleanup_revision() and cleanup_article() - */ -static void -write_row(revisionData *data) -{ - - // get md5sum - md5_state_t state; - md5_byte_t digest[16]; - char md5_hex_output[2 * 16 + 1]; - md5_init(&state); - md5_append(&state, (const md5_byte_t *)data->text, data->text_size); - md5_finish(&state, digest); - int di; - for (di = 0; di < 16; ++di) { - sprintf(md5_hex_output + di * 2, "%02x", digest[di]); - } - - // print line of tsv output - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n", - data->title, - data->articleid, - data->revid, - data->date, - data->time, - (data->editor[0] != '\0') ? "0" : "1", // anon? - data->editor, - data->editorid, - (data->minor) ? "1" : "0", - (unsigned int) data->text_size, - shannon_H(data->text, data->text_size), - md5_hex_output - ); - - // - if (data->output_type == FULL) { - printf("comment:%s\ntext:\n%s\n", data->comment, data->text); - } - -} - -void -split_timestamp(revisionData *data) -{ - char *t = data->timestamp; - strncpy(data->date, data->timestamp, DATE_LENGTH); - char *timeinstamp = &data->timestamp[DATE_LENGTH+1]; - strncpy(data->time, timeinstamp, TIME_LENGTH); -} - -// like strncat but with previously known length -char* -strlcatn(char *dest, const char *src, size_t dest_len, size_t n) -{ - //size_t dest_len = strlen(dest); - size_t i; - - for (i = 0 ; i < n && src[i] != '\0' ; i++) - dest[dest_len + i] = src[i]; - dest[dest_len + i] = '\0'; - - return dest; -} - -static void -charhndl(void* vdata, const XML_Char* s, int len) -{ - revisionData* data = (revisionData*) vdata; - if (data->element != UNUSED && data->position != SKIP) { - //char t[len]; - //strncpy(t,s,len); - //t[len] = '\0'; // makes t a well-formed string - switch (data->element) { - case TEXT: - // printf("buffer length = %i, text: %s\n", len, t); - strlcatn(data->text, s, data->text_size, len); - data->text_size += len; - break; - case COMMENT: - strlcatn(data->comment, s, data->comment_size, len); - data->comment_size += len; - break; - case TITLE: - strlcatn(data->title, s, data->title_size, len); - data->title_size += len; - break; - case ARTICLEID: - // printf("articleid = %s\n", t); - strlcatn(data->articleid, s, data->articleid_size, len); - data->articleid_size += len; - break; - case REVID: - // printf("revid = %s\n", t); - strlcatn(data->revid, s, data->revid_size, len); - data->revid_size += len; - break; - case TIMESTAMP: - strlcatn(data->timestamp, s, data->timestamp_size, len); - data->timestamp_size += len; - if (strlen(data->timestamp) == TIMESTAMP_LENGTH) - split_timestamp(data); - break; - case EDITOR: - strlcatn(data->editor, s, data->editor_size, len); - data->editor_size += len; - break; - case EDITORID: - //printf("editorid = %s\n", t); - strlcatn(data->editorid, s, data->editorid_size, len); - data->editorid_size += len; - break; - /* the following are implied or skipped: - case MINOR: - printf("found minor element\n"); doesn't work - break; minor tag is just a tag - case UNUSED: - */ - default: break; - } - } -} - -static void -start(void* vdata, const XML_Char* name, const XML_Char** attr) -{ - revisionData* data = (revisionData*) vdata; - - if (strcmp(name,"title") == 0) { - cleanup_article(data); // cleans up data from last article - data->element = TITLE; - data->position = TITLE_BLOCK; - } else if (data->position != SKIP) { - if (strcmp(name,"revision") == 0) { - data->element = REVISION; - data->position = REVISION_BLOCK; - } else if (strcmp(name, "contributor") == 0) { - data->element = CONTRIBUTOR; - data->position = CONTRIBUTOR_BLOCK; - } else if (strcmp(name,"id") == 0) - switch (data->position) { - case TITLE_BLOCK: - data->element = ARTICLEID; - break; - case REVISION_BLOCK: - data->element = REVID; - break; - case CONTRIBUTOR_BLOCK: - data->element = EDITORID; - break; - } - - // minor tag has no character data, so we parse here - else if (strcmp(name,"minor") == 0) { - data->element = MINOR; - data->minor = true; - } - else if (strcmp(name,"timestamp") == 0) - data->element = TIMESTAMP; - - else if (strcmp(name, "username") == 0) - data->element = EDITOR; - - else if (strcmp(name,"ip") == 0) - data->element = EDITORID; - - else if (strcmp(name,"comment") == 0) - data->element = COMMENT; - - else if (strcmp(name,"text") == 0) - data->element = TEXT; - - else if (strcmp(name,"page") == 0 - || strcmp(name,"mediawiki") == 0 - || strcmp(name,"restrictions") == 0 - || strcmp(name,"siteinfo") == 0) - data->element = UNUSED; - } - -} - - -static void -end(void* vdata, const XML_Char* name) -{ - revisionData* data = (revisionData*) vdata; - if (strcmp(name, "revision") == 0 && data->position != SKIP) { - write_row(data); // crucial... :) - cleanup_revision(data); // also crucial - } else { - data->element = UNUSED; // sets our state to "not-in-useful" - } // thus avoiding unpleasant character data - // b/w tags (newlines etc.) -} - -void print_usage(char* argv[]) { - fprintf(stderr, "usage: | %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -t print text and comments after each line of tab separated data\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); - fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "author: Erik Garrison \n"); -} - - -int -main(int argc, char *argv[]) -{ - - enum outtype output_type; - int dry_run = 0; - // in "simple" output, we don't print text and comments - output_type = SIMPLE; - char c; - - while ((c = getopt(argc, argv, "ht")) != -1) - switch (c) - { - case 'd': - dry_run = 1; - break; - case 't': - output_type = FULL; - break; - case 'h': - print_usage(argv); - exit(0); - break; - } - - if (dry_run) { // lets us print initialization options - printf("simple_output = %i\n", output_type); - exit(1); - } - - // create a new instance of the expat parser - XML_Parser parser = XML_ParserCreate("UTF-8"); - - // initialize the user data struct which is passed to callback functions - revisionData data; - // initialize the elements of the struct to default values - init_data(&data, output_type); - - - // makes the parser pass "data" as the first argument to every callback - XML_SetUserData(parser, &data); - void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start; - void (*endFnPtr)(void*, const XML_Char*) = end; - void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl; - - // sets start and end to be the element start and end handlers - XML_SetElementHandler(parser, startFnPtr, endFnPtr); - // sets charhndl to be the callback for character data - XML_SetCharacterDataHandler(parser, charHandlerFnPtr); - - bool done; - char buf[BUFSIZ]; - - // shovel data into the parser - do { - - // read into buf a bufferfull of data from standard input - size_t len = fread(buf, 1, BUFSIZ, stdin); - done = len < BUFSIZ; // checks if we've got the last bufferfull - - // passes the buffer of data to the parser and checks for error - // (this is where the callbacks are invoked) - if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) { - fprintf(stderr, - "%s at line %d\n", - XML_ErrorString(XML_GetErrorCode(parser)), - (int) XML_GetCurrentLineNumber(parser)); - return 1; - } - } while (!done); - - - XML_ParserFree(parser); - - return 0; -}