X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/b685a77fb13d526f5b4b216844a216b0dbab1ed3..86aeece4b3a7ffa495b4c09441cffeec8e579f34:/wikiq.c diff --git a/wikiq.c b/wikiq.c index 554e145..4d254f2 100644 --- a/wikiq.c +++ b/wikiq.c @@ -11,12 +11,17 @@ #include "expat.h" #include -#define BUFFER_SIZE 80 // timestamp of the form 2003-11-07T00:43:23Z #define DATE_LENGTH 10 #define TIME_LENGTH 8 #define TIMESTAMP_LENGTH 20 +#define MEGABYTE 1048576 +#define FIELD_BUFFER_SIZE 1024 +// 2048 KB in bytes + 1 +//#define TEXT_BUFFER_SIZE 2097153 +//#define TEXT_BUFFER_SIZE 10485760 + enum elements { TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT @@ -24,31 +29,44 @@ enum elements { enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP }; -enum outtype { NORMAL, SIMPLE }; +enum outtype { FULL, SIMPLE }; typedef struct { - struct { - char *title; - char *articleid; - char *revid; - char *date; - char *time; - char *timestamp; - char *anon; - char *editor; - char *editorid; - char *minor; - char *comment; - char *text; - } rev; + // pointers to once-allocated buffers + char *title; + char *articleid; + char *revid; + char *date; + char *time; + char *timestamp; + char *anon; + char *editor; + char *editorid; + char *comment; + char *text; + + // track string size of the elements, to prevent O(N^2) processing in charhndl + // when we have to take strlen for every character which we append to the buffer + size_t title_size; + size_t articleid_size; + size_t revid_size; + size_t date_size; + size_t time_size; + size_t timestamp_size; + size_t anon_size; + size_t editor_size; + size_t editorid_size; + size_t comment_size; + size_t text_size; + + bool minor; - char *dropstr; enum elements element; enum block position; enum outtype output_type; -} parseData; +} revisionData; /* free_data and clean_data @@ -57,83 +75,113 @@ typedef struct { * Also, frees memory dynamically allocated to store data. */ static void -clean_data(parseData *data, int title) +clean_data(revisionData *data, int title) { + // reset title (if we are switching articles) if (title) { - data->rev.title = NULL; - data->rev.articleid = NULL; + data->title[0] = '\0'; + data->articleid[0] = '\0'; + data->title_size = 0; + data->articleid_size = 0; } - data->rev.revid = NULL; - data->rev.date = NULL; - data->rev.time = NULL; - data->rev.timestamp = NULL; - data->rev.anon = NULL; - data->rev.editor = NULL; - data->rev.editorid = NULL; - data->rev.minor = NULL; - data->rev.comment = NULL; - data->rev.text = NULL; + + // reset text fields + data->revid[0] = '\0'; + data->date[0] = '\0'; + data->time[0] = '\0'; + data->timestamp[0] = '\0'; + data->anon[0] = '\0'; + data->editor[0] = '\0'; + data->editorid[0] = '\0'; + data->comment[0] = '\0'; + data->text[0] = '\0'; + + // reset length tracking + data->revid_size = 0; + data->date_size = 0; + data->time_size = 0; + data->timestamp_size = 0; + data->anon_size = 0; + data->editor_size = 0; + data->editorid_size = 0; + data->comment_size = 0; + data->text_size = 0; + + // reset flags and element type info + data->minor = false; data->element = UNUSED; - //data->position = + } +// presently unused static void -free_data(parseData *data, int title) +free_data(revisionData *data, int title) { if (title) { //printf("freeing article\n"); - free(data->rev.title); - free(data->rev.articleid); + free(data->title); + free(data->articleid); } - free(data->rev.revid); - free(data->rev.date); - free(data->rev.time); - free(data->rev.timestamp); - free(data->rev.anon); - free(data->rev.editor); - free(data->rev.editorid); - free(data->rev.minor); - free(data->rev.comment); - free(data->rev.text); + free(data->revid); + free(data->date); + free(data->time); + free(data->timestamp); + free(data->anon); + free(data->editor); + free(data->editorid); + free(data->comment); + free(data->text); } -cleanup_revision(parseData *data) { - free_data(data, 0); +void cleanup_revision(revisionData *data) { clean_data(data, 0); } -cleanup_article(parseData *data) { - free_data(data, 1); +void cleanup_article(revisionData *data) { clean_data(data, 1); } static void -init_data(parseData *data, char *dropstr, int output_type) +init_data(revisionData *data, outtype output_type) { - clean_data(data, 1); // sets every element to null... - data->dropstr = dropstr; + data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'? + data->comment = (char*) malloc(FIELD_BUFFER_SIZE); + data->title = (char*) malloc(FIELD_BUFFER_SIZE); + data->articleid = (char*) malloc(FIELD_BUFFER_SIZE); + data->revid = (char*) malloc(FIELD_BUFFER_SIZE); + data->date = (char*) malloc(FIELD_BUFFER_SIZE); + data->time = (char*) malloc(FIELD_BUFFER_SIZE); + data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE); + data->anon = (char*) malloc(FIELD_BUFFER_SIZE); + data->editor = (char*) malloc(FIELD_BUFFER_SIZE); + data->editorid = (char*) malloc(FIELD_BUFFER_SIZE); + data->minor = false; + + // resets the data fields, null terminates strings, sets lengths + clean_data(data, 1); + data->output_type = output_type; } /* for debugging only, prints out the state of the data struct */ static void -print_state(parseData *data) +print_state(revisionData *data) { printf("element = %i\n", data->element); printf("output_type = %i\n", data->output_type); - printf("title = %s\n", data->rev.title); - printf("articleid = %s\n", data->rev.articleid); - printf("revid = %s\n", data->rev.revid); - printf("date = %s\n", data->rev.date); - printf("time = %s\n", data->rev.time); - printf("anon = %s\n", data->rev.anon); - printf("editor = %s\n", data->rev.editor); - printf("editorid = %s\n", data->rev.editorid); - printf("minor = %s\n", data->rev.minor); - printf("comment = %s\n", data->rev.comment); - printf("text = %s\n", data->rev.text); + printf("title = %s\n", data->title); + printf("articleid = %s\n", data->articleid); + printf("revid = %s\n", data->revid); + printf("date = %s\n", data->date); + printf("time = %s\n", data->time); + printf("anon = %s\n", data->anon); + printf("editor = %s\n", data->editor); + printf("editorid = %s\n", data->editorid); + printf("minor = %s\n", (data->minor ? "1" : "0")); + printf("comment = %s\n", data->comment); + printf("text = %s\n", data->text); printf("\n"); } @@ -158,138 +206,76 @@ write_header() * it is called right before cleanup_revision() and cleanup_article() */ static void -write_row(parseData *data) +write_row(revisionData *data) { - // define temporary variables to hold output values: - char *title, *articleid; - char *revid, *date, *time, *anon, *editor, *editorid; - char *minor, *comment; - char *text; - // perform some simple logic to obtain correct output values - - if (data->rev.minor == NULL) - minor = "0"; - else minor = data->rev.minor; - - if (data->rev.editor == NULL) - anon = "1"; - else anon = "0"; - - if (data->rev.title == NULL) - title = ""; - else title = data->rev.title; - - if (data->rev.articleid == NULL) - articleid = ""; - else articleid = data->rev.articleid; - - if (data->rev.revid == NULL) - revid = ""; - else revid = data->rev.revid; - - if (data->rev.date == NULL) - date = ""; - else date = data->rev.date; - - if (data->rev.time == NULL) - time = ""; - else time = data->rev.time; - - if (data->rev.editor == NULL) - editor = ""; - else editor = data->rev.editor; - - if (data->rev.editorid == NULL) - editorid = ""; - else editorid = data->rev.editorid; - - if (data->rev.text == NULL) - text = ""; - else text = data->rev.text; - - - if (data->rev.comment == NULL) - comment = ""; - else comment = data->rev.comment; - // TODO: make it so you can specify fields to output // note that date and time are separated by a space, to match postgres's // timestamp format + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s", + data->title, + data->articleid, + data->revid, + data->date, + data->time, + (data->editor[0] != '\0') ? "0" : "1", // anon? + data->editor, + data->editorid, + (data->minor) ? "1" : "0"); switch (data->output_type) { - case NORMAL: - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n", - title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text); - break; case SIMPLE: - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n", - title,articleid,revid,date,time,anon,editor,editorid,minor); + printf("\t%i\n", (unsigned int) strlen(data->text)); + //printf("\n"); + break; + case FULL: + printf("\t%s\t%s\n", data->comment, data->text); break; } } -static char -*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1]) -{ - char *timeinstamp = ×tamp[DATE_LENGTH+1]; - strncpy(time_buffer, timeinstamp, TIME_LENGTH); - time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string -} - - -static char -*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1]) -{ - strncpy(date_buffer, timestamp, DATE_LENGTH); - date_buffer[DATE_LENGTH] = '\0'; -} - char -*append(char *entry, char *new) +*append(char *entry, char *newstr) { char *newbuff; int len; - len = (strlen(entry)+strlen(new))*sizeof(char) + 1; - newbuff = realloc(entry, len); - strcat(newbuff, new); + len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1; + newbuff = (char*) realloc(entry, len); + strcat(newbuff, newstr); return newbuff; } char -*cache(char *entry, char *new) +*cache(char *entry, char *newstr) { char *newbuff; int len; - len = strlen(new)*sizeof(char) + 1; // include space for the '\0' ! - newbuff = malloc(len); - strcpy(newbuff,new); + len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' ! + newbuff = (char*) malloc(len); + strcpy(newbuff,newstr); return newbuff; } char -*store(char *entry, char *new) +*store(char *entry, char *newstr) { char *newbuff; if (entry == NULL) - newbuff = cache(entry, new); + newbuff = cache(entry, newstr); else - newbuff = append(entry, new); + newbuff = append(entry, newstr); return newbuff; } void -split_timestamp(parseData *data) +split_timestamp(revisionData *data) { - char *t = data->rev.timestamp; - char date_buffer[DATE_LENGTH+1]; - char time_buffer[TIME_LENGTH+1]; - datestr(t, date_buffer); - timestr(t, time_buffer); - data->rev.date = store(data->rev.date, date_buffer); - data->rev.time = store(data->rev.time, time_buffer); + char *t = data->timestamp; + strncpy(data->date, data->timestamp, DATE_LENGTH); + char *timeinstamp = &data->timestamp[DATE_LENGTH+1]; + strncpy(data->time, timeinstamp, TIME_LENGTH); } /* currently unused */ @@ -305,65 +291,66 @@ is_whitespace(char *string) { return 0; } -static void -squeeze(char *s, int c) { - int i, j; - for (i = j = 0; s[i] != '\0'; i++) - if (s[i] != c) - s[j++] = s[i]; - s[j] = '\0'; -} - -int -contains(char *s, char *t) +// like strncat but with previously known length +char* +strlcatn(char *dest, const char *src, size_t dest_len, size_t n) { - char c = t[0]; //just get the first character of t - int i = 0; - while (s[i] != '\0') { - if (s[i] == c) - return 1; - i++; - } + //size_t dest_len = strlen(dest); + size_t i; + + for (i = 0 ; i < n && src[i] != '\0' ; i++) + dest[dest_len + i] = src[i]; + dest[dest_len + i] = '\0'; + + return dest; } static void -charhndl(parseData *data, char *s, int len) +charhndl(void* vdata, const XML_Char* s, int len) { + revisionData* data = (revisionData*) vdata; if (data->element != UNUSED && data->position != SKIP) { - char t[len]; - strncpy(t,s,len); - t[len] = '\0'; // makes t a well-formed string + //char t[len]; + //strncpy(t,s,len); + //t[len] = '\0'; // makes t a well-formed string switch (data->element) { + case TEXT: + // printf("buffer length = %i, text: %s\n", len, t); + strlcatn(data->text, s, data->text_size, len); + data->text_size += len; + break; + case COMMENT: + strlcatn(data->comment, s, data->comment_size, len); + data->comment_size += len; + break; case TITLE: - { - data->rev.title = store(data->rev.title, t); - // skip any articles with bad characters in their titles - if (contains(t, data->dropstr)) { - data->position = SKIP; - //printf("found a baddie\n"); - } + strlcatn(data->title, s, data->title_size, len); + data->title_size += len; break; - } case ARTICLEID: // printf("articleid = %s\n", t); - data->rev.articleid = store(data->rev.articleid, t); + strlcatn(data->articleid, s, data->articleid_size, len); + data->articleid_size += len; break; case REVID: // printf("revid = %s\n", t); - data->rev.revid = store(data->rev.revid, t); + strlcatn(data->revid, s, data->revid_size, len); + data->revid_size += len; break; case TIMESTAMP: - data->rev.timestamp = store(data->rev.timestamp, t); - if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH) + strlcatn(data->timestamp, s, data->timestamp_size, len); + data->timestamp_size += len; + if (strlen(data->timestamp) == TIMESTAMP_LENGTH) split_timestamp(data); break; - case EDITOR: { - data->rev.editor = store(data->rev.editor, t); + case EDITOR: + strlcatn(data->editor, s, data->editor_size, len); + data->editor_size += len; break; - } case EDITORID: //printf("editorid = %s\n", t); - data->rev.editorid = store(data->rev.editorid, t); + strlcatn(data->editorid, s, data->editorid_size, len); + data->editorid_size += len; break; /* the following are implied or skipped: case MINOR: @@ -371,21 +358,15 @@ charhndl(parseData *data, char *s, int len) break; minor tag is just a tag case UNUSED: */ - case COMMENT: - // printf("row: comment is %s\n", t); - data->rev.comment = store(data->rev.comment, t); - break; - case TEXT: - data->rev.text = store(data->rev.text, t); - break; default: break; } } } static void -start(parseData *data, const char *name, const char **attr) +start(void* vdata, const XML_Char* name, const XML_Char** attr) { + revisionData* data = (revisionData*) vdata; if (strcmp(name,"title") == 0) { cleanup_article(data); // cleans up data from last article @@ -414,7 +395,7 @@ start(parseData *data, const char *name, const char **attr) // minor tag has no character data, so we parse here else if (strcmp(name,"minor") == 0) { data->element = MINOR; - data->rev.minor = store(data->rev.minor, "1"); + data->minor = true; } else if (strcmp(name,"timestamp") == 0) data->element = TIMESTAMP; @@ -442,8 +423,9 @@ start(parseData *data, const char *name, const char **attr) static void -end(parseData *data, const char *name) +end(void* vdata, const XML_Char* name) { + revisionData* data = (revisionData*) vdata; if (strcmp(name, "revision") == 0 && data->position != SKIP) { write_row(data); // crucial... :) cleanup_revision(data); // also crucial @@ -462,7 +444,7 @@ void print_usage(char* argv[]) { fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); fprintf(stderr, "\n"); - fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n"); + fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n"); fprintf(stderr, "\n"); fprintf(stderr, "author: Erik Garrison \n"); } @@ -472,24 +454,20 @@ int main(int argc, char *argv[]) { - char *dropstr = ""; enum outtype output_type; int dry_run = 0; // in "simple" output, we don't print text and comments output_type = SIMPLE; char c; - while ((c = getopt(argc, argv, "hr:sd")) != -1) + while ((c = getopt(argc, argv, "ht")) != -1) switch (c) { - case 'r': - dropstr = optarg; - break; case 'd': dry_run = 1; break; case 't': - output_type = NORMAL; + output_type = FULL; break; case 'h': print_usage(argv); @@ -499,37 +477,38 @@ main(int argc, char *argv[]) if (dry_run) { // lets us print initialization options printf("simple_output = %i\n", output_type); - printf("dropstr = %s\n", dropstr); exit(1); } // create a new instance of the expat parser - XML_Parser parser = XML_ParserCreate(NULL); + XML_Parser parser = XML_ParserCreate("UTF-8"); // initialize the user data struct which is passed to callback functions - parseData data; + revisionData data; // initialize the elements of the struct to default values - init_data(&data, dropstr, output_type); + init_data(&data, output_type); // makes the parser pass "data" as the first argument to every callback XML_SetUserData(parser, &data); + void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start; + void (*endFnPtr)(void*, const XML_Char*) = end; + void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl; + // sets start and end to be the element start and end handlers - XML_SetElementHandler(parser, (void *) start, (void *) end); - // sets charhndl to be the callback for raw character data - XML_SetCharacterDataHandler(parser, (void *) charhndl); + XML_SetElementHandler(parser, startFnPtr, endFnPtr); + // sets charhndl to be the callback for character data + XML_SetCharacterDataHandler(parser, charHandlerFnPtr); - int done; + bool done; char buf[BUFSIZ]; - write_header(); - // shovel data into the parser do { // read into buf a bufferfull of data from standard input - size_t len = fread(buf, 1, sizeof(buf), stdin); - done = len < sizeof(buf); // checks if we've got the last bufferfull + size_t len = fread(buf, 1, BUFSIZ, stdin); + done = len < BUFSIZ; // checks if we've got the last bufferfull // passes the buffer of data to the parser and checks for error // (this is where the callbacks are invoked)