From b685a77fb13d526f5b4b216844a216b0dbab1ed3 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Thu, 7 Oct 2010 00:44:59 -0400 Subject: [PATCH] first produced in the summer of 2006, finally published! --- Makefile | 6 + README | 13 ++ wikiq.c | 549 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 568 insertions(+) create mode 100644 Makefile create mode 100644 README create mode 100644 wikiq.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..eec9e75 --- /dev/null +++ b/Makefile @@ -0,0 +1,6 @@ +all: wikiq + +wikiq: wikiq.c + gcc wikiq.c -o wikiq -lexpat + +.PHONY: all diff --git a/README b/README new file mode 100644 index 0000000..ee39e87 --- /dev/null +++ b/README @@ -0,0 +1,13 @@ +wikiq: a WikiMedia XML data dump to .tsv parser + +author: Erik Garrison + +wikiq is written in C using expat. It is designed to enable researchers to +rapidly extract revision histories (minus text and comments) from impossibly +large XML datasets. + +To use, first make sure you have libexpat installed, then: + + % make + % ./wikiq -h # prints usage + % 7za e -so hugewikidatadump.xml | ./wikiq >hugewikidatadump.tsv diff --git a/wikiq.c b/wikiq.c new file mode 100644 index 0000000..554e145 --- /dev/null +++ b/wikiq.c @@ -0,0 +1,549 @@ +/* + * An XML parser for Wikipedia Data dumps. + * Converts XML files to tab-separated values files readable by spreadsheets + * and statistical packages. + */ + +#include +#include +#include +#include +#include "expat.h" +#include + +#define BUFFER_SIZE 80 +// timestamp of the form 2003-11-07T00:43:23Z +#define DATE_LENGTH 10 +#define TIME_LENGTH 8 +#define TIMESTAMP_LENGTH 20 + +enum elements { + TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, + EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT +}; + +enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP }; + +enum outtype { NORMAL, SIMPLE }; + +typedef struct { + + struct { + char *title; + char *articleid; + char *revid; + char *date; + char *time; + char *timestamp; + char *anon; + char *editor; + char *editorid; + char *minor; + char *comment; + char *text; + } rev; + + char *dropstr; + enum elements element; + enum block position; + enum outtype output_type; + +} parseData; + + +/* free_data and clean_data + * Takes a pointer to the data struct and an integer {0,1} indicating if the + * title data needs to be cleared as well. + * Also, frees memory dynamically allocated to store data. + */ +static void +clean_data(parseData *data, int title) +{ + if (title) { + data->rev.title = NULL; + data->rev.articleid = NULL; + } + data->rev.revid = NULL; + data->rev.date = NULL; + data->rev.time = NULL; + data->rev.timestamp = NULL; + data->rev.anon = NULL; + data->rev.editor = NULL; + data->rev.editorid = NULL; + data->rev.minor = NULL; + data->rev.comment = NULL; + data->rev.text = NULL; + data->element = UNUSED; + //data->position = +} + +static void +free_data(parseData *data, int title) +{ + if (title) { + //printf("freeing article\n"); + free(data->rev.title); + free(data->rev.articleid); + } + free(data->rev.revid); + free(data->rev.date); + free(data->rev.time); + free(data->rev.timestamp); + free(data->rev.anon); + free(data->rev.editor); + free(data->rev.editorid); + free(data->rev.minor); + free(data->rev.comment); + free(data->rev.text); +} + +cleanup_revision(parseData *data) { + free_data(data, 0); + clean_data(data, 0); +} + +cleanup_article(parseData *data) { + free_data(data, 1); + clean_data(data, 1); +} + + +static void +init_data(parseData *data, char *dropstr, int output_type) +{ + clean_data(data, 1); // sets every element to null... + data->dropstr = dropstr; + data->output_type = output_type; +} + +/* for debugging only, prints out the state of the data struct + */ +static void +print_state(parseData *data) +{ + printf("element = %i\n", data->element); + printf("output_type = %i\n", data->output_type); + printf("title = %s\n", data->rev.title); + printf("articleid = %s\n", data->rev.articleid); + printf("revid = %s\n", data->rev.revid); + printf("date = %s\n", data->rev.date); + printf("time = %s\n", data->rev.time); + printf("anon = %s\n", data->rev.anon); + printf("editor = %s\n", data->rev.editor); + printf("editorid = %s\n", data->rev.editorid); + printf("minor = %s\n", data->rev.minor); + printf("comment = %s\n", data->rev.comment); + printf("text = %s\n", data->rev.text); + printf("\n"); + +} + +/* Write a header for the comma-separated output + */ +static void +write_header() +{ + // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n"); +// printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n"); + +} + + +/* + * write a line of comma-separated value formatted data to standard out + * follows the form: + * title,articleid,revid,date,time,anon,editor,editorid,minor,comment + * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str) + * + * it is called right before cleanup_revision() and cleanup_article() + */ +static void +write_row(parseData *data) +{ + // define temporary variables to hold output values: + char *title, *articleid; + char *revid, *date, *time, *anon, *editor, *editorid; + char *minor, *comment; + char *text; + // perform some simple logic to obtain correct output values + + if (data->rev.minor == NULL) + minor = "0"; + else minor = data->rev.minor; + + if (data->rev.editor == NULL) + anon = "1"; + else anon = "0"; + + if (data->rev.title == NULL) + title = ""; + else title = data->rev.title; + + if (data->rev.articleid == NULL) + articleid = ""; + else articleid = data->rev.articleid; + + if (data->rev.revid == NULL) + revid = ""; + else revid = data->rev.revid; + + if (data->rev.date == NULL) + date = ""; + else date = data->rev.date; + + if (data->rev.time == NULL) + time = ""; + else time = data->rev.time; + + if (data->rev.editor == NULL) + editor = ""; + else editor = data->rev.editor; + + if (data->rev.editorid == NULL) + editorid = ""; + else editorid = data->rev.editorid; + + if (data->rev.text == NULL) + text = ""; + else text = data->rev.text; + + + if (data->rev.comment == NULL) + comment = ""; + else comment = data->rev.comment; + + + // TODO: make it so you can specify fields to output + // note that date and time are separated by a space, to match postgres's + // timestamp format + switch (data->output_type) + { + case NORMAL: + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n", + title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text); + break; + case SIMPLE: + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n", + title,articleid,revid,date,time,anon,editor,editorid,minor); + break; + } + +} + +static char +*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1]) +{ + char *timeinstamp = ×tamp[DATE_LENGTH+1]; + strncpy(time_buffer, timeinstamp, TIME_LENGTH); + time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string +} + + +static char +*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1]) +{ + strncpy(date_buffer, timestamp, DATE_LENGTH); + date_buffer[DATE_LENGTH] = '\0'; +} + +char +*append(char *entry, char *new) +{ + char *newbuff; + int len; + len = (strlen(entry)+strlen(new))*sizeof(char) + 1; + newbuff = realloc(entry, len); + strcat(newbuff, new); + return newbuff; +} + +char +*cache(char *entry, char *new) +{ + char *newbuff; + int len; + len = strlen(new)*sizeof(char) + 1; // include space for the '\0' ! + newbuff = malloc(len); + strcpy(newbuff,new); + return newbuff; + +} + +char +*store(char *entry, char *new) +{ + char *newbuff; + if (entry == NULL) + newbuff = cache(entry, new); + else + newbuff = append(entry, new); + return newbuff; +} + +void +split_timestamp(parseData *data) +{ + char *t = data->rev.timestamp; + char date_buffer[DATE_LENGTH+1]; + char time_buffer[TIME_LENGTH+1]; + datestr(t, date_buffer); + timestr(t, time_buffer); + data->rev.date = store(data->rev.date, date_buffer); + data->rev.time = store(data->rev.time, time_buffer); +} + +/* currently unused */ +static int +is_whitespace(char *string) { + int len = strlen(string); + while (isspace(string[0]) && strlen(string) > 0) { + string++; + } + if (strcmp(string, "") == 0) + return 1; + else + return 0; +} + +static void +squeeze(char *s, int c) { + int i, j; + for (i = j = 0; s[i] != '\0'; i++) + if (s[i] != c) + s[j++] = s[i]; + s[j] = '\0'; +} + +int +contains(char *s, char *t) +{ + char c = t[0]; //just get the first character of t + int i = 0; + while (s[i] != '\0') { + if (s[i] == c) + return 1; + i++; + } +} + +static void +charhndl(parseData *data, char *s, int len) +{ + if (data->element != UNUSED && data->position != SKIP) { + char t[len]; + strncpy(t,s,len); + t[len] = '\0'; // makes t a well-formed string + switch (data->element) { + case TITLE: + { + data->rev.title = store(data->rev.title, t); + // skip any articles with bad characters in their titles + if (contains(t, data->dropstr)) { + data->position = SKIP; + //printf("found a baddie\n"); + } + break; + } + case ARTICLEID: + // printf("articleid = %s\n", t); + data->rev.articleid = store(data->rev.articleid, t); + break; + case REVID: + // printf("revid = %s\n", t); + data->rev.revid = store(data->rev.revid, t); + break; + case TIMESTAMP: + data->rev.timestamp = store(data->rev.timestamp, t); + if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH) + split_timestamp(data); + break; + case EDITOR: { + data->rev.editor = store(data->rev.editor, t); + break; + } + case EDITORID: + //printf("editorid = %s\n", t); + data->rev.editorid = store(data->rev.editorid, t); + break; + /* the following are implied or skipped: + case MINOR: + printf("found minor element\n"); doesn't work + break; minor tag is just a tag + case UNUSED: + */ + case COMMENT: + // printf("row: comment is %s\n", t); + data->rev.comment = store(data->rev.comment, t); + break; + case TEXT: + data->rev.text = store(data->rev.text, t); + break; + default: break; + } + } +} + +static void +start(parseData *data, const char *name, const char **attr) +{ + + if (strcmp(name,"title") == 0) { + cleanup_article(data); // cleans up data from last article + data->element = TITLE; + data->position = TITLE_BLOCK; + } else if (data->position != SKIP) { + if (strcmp(name,"revision") == 0) { + data->element = REVISION; + data->position = REVISION_BLOCK; + } else if (strcmp(name, "contributor") == 0) { + data->element = CONTRIBUTOR; + data->position = CONTRIBUTOR_BLOCK; + } else if (strcmp(name,"id") == 0) + switch (data->position) { + case TITLE_BLOCK: + data->element = ARTICLEID; + break; + case REVISION_BLOCK: + data->element = REVID; + break; + case CONTRIBUTOR_BLOCK: + data->element = EDITORID; + break; + } + + // minor tag has no character data, so we parse here + else if (strcmp(name,"minor") == 0) { + data->element = MINOR; + data->rev.minor = store(data->rev.minor, "1"); + } + else if (strcmp(name,"timestamp") == 0) + data->element = TIMESTAMP; + + else if (strcmp(name, "username") == 0) + data->element = EDITOR; + + else if (strcmp(name,"ip") == 0) + data->element = EDITORID; + + else if (strcmp(name,"comment") == 0) + data->element = COMMENT; + + else if (strcmp(name,"text") == 0) + data->element = TEXT; + + else if (strcmp(name,"page") == 0 + || strcmp(name,"mediawiki") == 0 + || strcmp(name,"restrictions") == 0 + || strcmp(name,"siteinfo") == 0) + data->element = UNUSED; + } + +} + + +static void +end(parseData *data, const char *name) +{ + if (strcmp(name, "revision") == 0 && data->position != SKIP) { + write_row(data); // crucial... :) + cleanup_revision(data); // also crucial + } else { + data->element = UNUSED; // sets our state to "not-in-useful" + } // thus avoiding unpleasant character data + // b/w tags (newlines etc.) +} + +void print_usage(char* argv[]) { + fprintf(stderr, "usage: | %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -t print text and comments after each line of tab separated data\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); + fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "author: Erik Garrison \n"); +} + + +int +main(int argc, char *argv[]) +{ + + char *dropstr = ""; + enum outtype output_type; + int dry_run = 0; + // in "simple" output, we don't print text and comments + output_type = SIMPLE; + char c; + + while ((c = getopt(argc, argv, "hr:sd")) != -1) + switch (c) + { + case 'r': + dropstr = optarg; + break; + case 'd': + dry_run = 1; + break; + case 't': + output_type = NORMAL; + break; + case 'h': + print_usage(argv); + exit(0); + break; + } + + if (dry_run) { // lets us print initialization options + printf("simple_output = %i\n", output_type); + printf("dropstr = %s\n", dropstr); + exit(1); + } + + // create a new instance of the expat parser + XML_Parser parser = XML_ParserCreate(NULL); + + // initialize the user data struct which is passed to callback functions + parseData data; + // initialize the elements of the struct to default values + init_data(&data, dropstr, output_type); + + + // makes the parser pass "data" as the first argument to every callback + XML_SetUserData(parser, &data); + // sets start and end to be the element start and end handlers + XML_SetElementHandler(parser, (void *) start, (void *) end); + // sets charhndl to be the callback for raw character data + XML_SetCharacterDataHandler(parser, (void *) charhndl); + + int done; + char buf[BUFSIZ]; + + write_header(); + + // shovel data into the parser + do { + + // read into buf a bufferfull of data from standard input + size_t len = fread(buf, 1, sizeof(buf), stdin); + done = len < sizeof(buf); // checks if we've got the last bufferfull + + // passes the buffer of data to the parser and checks for error + // (this is where the callbacks are invoked) + if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) { + fprintf(stderr, + "%s at line %d\n", + XML_ErrorString(XML_GetErrorCode(parser)), + (int) XML_GetCurrentLineNumber(parser)); + return 1; + } + } while (!done); + + + XML_ParserFree(parser); + + return 0; +} -- 2.39.5