first produced in the summer of 2006, finally published!
authorErik Garrison <erik@hypervolu.me>
Thu, 7 Oct 2010 04:44:59 +0000 (00:44 -0400)
committerErik Garrison <erik@hypervolu.me>
Thu, 7 Oct 2010 04:44:59 +0000 (00:44 -0400)
Makefile [new file with mode: 0644]
README [new file with mode: 0644]
wikiq.c [new file with mode: 0644]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..eec9e75
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+all: wikiq
+
+wikiq: wikiq.c
+       gcc wikiq.c -o wikiq -lexpat
+
+.PHONY: all
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..ee39e87
--- /dev/null
+++ b/README
@@ -0,0 +1,13 @@
+wikiq: a WikiMedia XML data dump to .tsv parser
+
+author: Erik Garrison <erik@hypervolu.me>
+
+wikiq is written in C using expat.  It is designed to enable researchers to
+rapidly extract revision histories (minus text and comments) from impossibly
+large XML datasets.
+
+To use, first make sure you have libexpat installed, then:
+
+    % make
+    % ./wikiq -h  # prints usage
+    % 7za e -so hugewikidatadump.xml | ./wikiq >hugewikidatadump.tsv
diff --git a/wikiq.c b/wikiq.c
new file mode 100644 (file)
index 0000000..554e145
--- /dev/null
+++ b/wikiq.c
@@ -0,0 +1,549 @@
+/* 
+ * An XML parser for Wikipedia Data dumps.
+ * Converts XML files to tab-separated values files readable by spreadsheets
+ * and statistical packages.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include "expat.h"
+#include <getopt.h>
+
+#define BUFFER_SIZE 80
+// timestamp of the form 2003-11-07T00:43:23Z
+#define DATE_LENGTH 10
+#define TIME_LENGTH 8
+#define TIMESTAMP_LENGTH 20
+
+enum elements { 
+    TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, 
+    EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
+}; 
+
+enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
+
+enum outtype { NORMAL, SIMPLE };
+
+typedef struct {
+
+    struct {
+        char *title;
+        char *articleid;
+        char *revid;
+        char *date;
+        char *time;
+        char *timestamp;
+        char *anon;
+        char *editor;
+        char *editorid;
+        char *minor;
+        char *comment;
+        char *text;
+    } rev;
+    
+    char *dropstr;
+    enum elements element;
+    enum block position;
+    enum outtype output_type;
+    
+} parseData;
+
+
+/* free_data and clean_data
+ * Takes a pointer to the data struct and an integer {0,1} indicating if the 
+ * title data needs to be cleared as well.
+ * Also, frees memory dynamically allocated to store data.
+ */ 
+static void
+clean_data(parseData *data, int title)
+{
+    if (title) {
+        data->rev.title = NULL;
+        data->rev.articleid = NULL;
+    }
+    data->rev.revid = NULL;
+    data->rev.date = NULL;
+    data->rev.time = NULL;
+    data->rev.timestamp = NULL;
+    data->rev.anon = NULL;
+    data->rev.editor = NULL;
+    data->rev.editorid = NULL;
+    data->rev.minor = NULL;
+    data->rev.comment = NULL; 
+    data->rev.text = NULL;
+    data->element = UNUSED;
+    //data->position = 
+}
+
+static void
+free_data(parseData *data, int title)
+{
+    if (title) {
+        //printf("freeing article\n");
+        free(data->rev.title);
+        free(data->rev.articleid);
+    }
+    free(data->rev.revid);
+    free(data->rev.date);
+    free(data->rev.time);
+    free(data->rev.timestamp);
+    free(data->rev.anon);
+    free(data->rev.editor);
+    free(data->rev.editorid);
+    free(data->rev.minor);
+    free(data->rev.comment);
+    free(data->rev.text);
+}
+
+cleanup_revision(parseData *data) {
+    free_data(data, 0);
+    clean_data(data, 0);
+}
+
+cleanup_article(parseData *data) {
+    free_data(data, 1);
+    clean_data(data, 1);
+}
+
+
+static void 
+init_data(parseData *data, char *dropstr, int output_type)
+{
+    clean_data(data, 1); // sets every element to null...
+    data->dropstr = dropstr;
+    data->output_type = output_type;
+}
+
+/* for debugging only, prints out the state of the data struct
+ */
+static void
+print_state(parseData *data) 
+{
+    printf("element = %i\n", data->element);
+    printf("output_type = %i\n", data->output_type);
+    printf("title = %s\n", data->rev.title);
+    printf("articleid = %s\n", data->rev.articleid);
+    printf("revid = %s\n", data->rev.revid);
+    printf("date = %s\n", data->rev.date);
+    printf("time = %s\n", data->rev.time);
+    printf("anon = %s\n", data->rev.anon);
+    printf("editor = %s\n", data->rev.editor);
+    printf("editorid = %s\n", data->rev.editorid);
+    printf("minor = %s\n", data->rev.minor);
+    printf("comment = %s\n", data->rev.comment); 
+    printf("text = %s\n", data->rev.text);
+    printf("\n");
+
+}
+
+/* Write a header for the comma-separated output
+ */
+static void
+write_header()
+{
+ //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
+//    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
+
+}
+
+
+/* 
+ * write a line of comma-separated value formatted data to standard out
+ * follows the form:
+ * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
+ * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
+ *
+ * it is called right before cleanup_revision() and cleanup_article()
+ */
+static void
+write_row(parseData *data)
+{
+    // define temporary variables to hold output values:
+    char *title, *articleid; 
+    char *revid, *date, *time, *anon, *editor, *editorid;
+    char *minor, *comment;
+    char *text;
+    // perform some simple logic to obtain correct output values
+
+    if (data->rev.minor == NULL)
+        minor = "0";
+    else minor = data->rev.minor;
+
+    if (data->rev.editor == NULL)
+        anon = "1";
+    else anon = "0";
+
+    if (data->rev.title ==  NULL)
+        title = "";
+    else title = data->rev.title;
+
+    if (data->rev.articleid == NULL)
+        articleid = "";
+    else articleid = data->rev.articleid;
+
+    if (data->rev.revid == NULL)
+        revid = "";
+    else revid = data->rev.revid;
+
+    if (data->rev.date == NULL)
+        date = "";
+    else date = data->rev.date;
+
+    if (data->rev.time == NULL)
+        time = "";
+    else time = data->rev.time;
+
+    if (data->rev.editor == NULL)
+        editor = "";
+    else editor = data->rev.editor;
+    
+    if (data->rev.editorid == NULL)
+        editorid = "";
+    else editorid = data->rev.editorid;
+
+    if (data->rev.text == NULL)
+        text = "";
+    else text = data->rev.text;
+
+    
+    if (data->rev.comment == NULL)
+        comment = "";
+    else comment = data->rev.comment;
+    
+
+    // TODO: make it so you can specify fields to output
+    // note that date and time are separated by a space, to match postgres's 
+    // timestamp format
+    switch (data->output_type)
+    {
+        case NORMAL:
+            printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n",
+                title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text);
+            break;
+        case SIMPLE:
+            printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n",
+                title,articleid,revid,date,time,anon,editor,editorid,minor);
+            break;
+    }
+
+}
+
+static char
+*timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
+{
+    char *timeinstamp = &timestamp[DATE_LENGTH+1];
+    strncpy(time_buffer, timeinstamp, TIME_LENGTH);
+    time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
+}
+
+
+static char
+*datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
+{
+    strncpy(date_buffer, timestamp, DATE_LENGTH);
+    date_buffer[DATE_LENGTH] = '\0';
+}
+
+char
+*append(char *entry, char *new)
+{
+    char *newbuff;
+    int len;
+    len = (strlen(entry)+strlen(new))*sizeof(char) + 1;
+    newbuff = realloc(entry, len);
+    strcat(newbuff, new);
+    return newbuff;
+}
+
+char
+*cache(char *entry, char *new)
+{
+    char *newbuff;
+    int len;
+    len = strlen(new)*sizeof(char) + 1; // include space for the '\0' !
+    newbuff = malloc(len);
+    strcpy(newbuff,new);
+    return newbuff;
+
+}
+
+char
+*store(char *entry, char *new)
+{
+    char *newbuff;
+    if (entry == NULL)
+        newbuff = cache(entry, new);
+    else 
+        newbuff = append(entry, new);
+    return newbuff;
+}
+
+void
+split_timestamp(parseData *data) 
+{
+    char *t = data->rev.timestamp;
+    char date_buffer[DATE_LENGTH+1];
+    char time_buffer[TIME_LENGTH+1];
+    datestr(t, date_buffer);
+    timestr(t, time_buffer);
+    data->rev.date = store(data->rev.date, date_buffer);
+    data->rev.time = store(data->rev.time, time_buffer);
+}
+
+/* currently unused */
+static int
+is_whitespace(char *string) {
+    int len = strlen(string);
+    while (isspace(string[0]) && strlen(string) > 0) {
+        string++;
+    }
+    if (strcmp(string, "") == 0)
+        return 1;
+    else
+        return 0;
+}
+
+static void
+squeeze(char *s, int c) {
+    int i, j;
+    for (i = j = 0; s[i] != '\0'; i++)
+        if (s[i] != c)
+            s[j++] = s[i];
+    s[j] = '\0';
+}
+
+int
+contains(char *s, char *t)
+{
+    char c = t[0]; //just get the first character of t
+    int i = 0;
+    while (s[i] != '\0') {
+        if (s[i] == c) 
+            return 1;
+        i++;
+    }
+}
+
+static void
+charhndl(parseData *data, char *s, int len)
+{ 
+    if (data->element != UNUSED && data->position != SKIP) {
+        char t[len];
+        strncpy(t,s,len);
+        t[len] = '\0'; // makes t a well-formed string
+        switch (data->element) {
+            case TITLE:
+                {
+                    data->rev.title = store(data->rev.title, t);
+                    // skip any articles with bad characters in their titles
+                    if (contains(t, data->dropstr)) {
+                        data->position = SKIP;
+                        //printf("found a baddie\n");
+                    }
+                    break;
+                }
+            case ARTICLEID:
+                   // printf("articleid = %s\n", t);
+                    data->rev.articleid = store(data->rev.articleid, t);
+                    break;
+            case REVID:
+                   // printf("revid = %s\n", t);
+                    data->rev.revid = store(data->rev.revid, t);
+                    break;
+            case TIMESTAMP: 
+                    data->rev.timestamp = store(data->rev.timestamp, t); 
+                    if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
+                        split_timestamp(data);
+                    break;
+            case EDITOR: {
+                    data->rev.editor = store(data->rev.editor, t);
+                    break;
+                    }
+            case EDITORID: 
+                    //printf("editorid = %s\n", t);
+                    data->rev.editorid = store(data->rev.editorid, t);
+                    break;
+            /* the following are implied or skipped:
+            case MINOR: 
+                    printf("found minor element\n");  doesn't work
+                    break;                   minor tag is just a tag
+            case UNUSED: 
+            */
+            case COMMENT: 
+                   // printf("row: comment is %s\n", t);
+                    data->rev.comment = store(data->rev.comment, t);
+                    break;
+            case TEXT:
+                   data->rev.text = store(data->rev.text, t);
+                   break; 
+            default: break;
+        }
+    }
+}
+
+static void
+start(parseData *data, const char *name, const char **attr)
+{
+    
+    if (strcmp(name,"title") == 0) {
+        cleanup_article(data); // cleans up data from last article
+        data->element = TITLE;
+        data->position = TITLE_BLOCK;
+    } else if (data->position != SKIP) {
+        if (strcmp(name,"revision") == 0) {
+            data->element = REVISION;
+            data->position = REVISION_BLOCK;
+        } else if (strcmp(name, "contributor") == 0) {
+            data->element = CONTRIBUTOR;
+            data->position = CONTRIBUTOR_BLOCK;
+        } else if (strcmp(name,"id") == 0)
+            switch (data->position) {
+                case TITLE_BLOCK:
+                    data->element = ARTICLEID;
+                    break;
+                case REVISION_BLOCK: 
+                    data->element = REVID;
+                    break;
+                case CONTRIBUTOR_BLOCK:
+                    data->element = EDITORID;
+                    break;
+            }
+    
+        // minor tag has no character data, so we parse here
+        else if (strcmp(name,"minor") == 0) {
+            data->element = MINOR;
+            data->rev.minor = store(data->rev.minor, "1"); 
+        }
+        else if (strcmp(name,"timestamp") == 0)
+            data->element = TIMESTAMP;
+
+        else if (strcmp(name, "username") == 0)
+            data->element = EDITOR;
+
+        else if (strcmp(name,"ip") == 0) 
+            data->element = EDITORID;
+
+        else if (strcmp(name,"comment") == 0)
+            data->element = COMMENT;
+
+        else if (strcmp(name,"text") == 0)
+            data->element = TEXT;
+
+        else if (strcmp(name,"page") == 0 
+                || strcmp(name,"mediawiki") == 0
+                || strcmp(name,"restrictions") == 0
+                || strcmp(name,"siteinfo") == 0)
+            data->element = UNUSED;
+    }
+
+}
+
+
+static void
+end(parseData *data, const char *name)
+{
+    if (strcmp(name, "revision") == 0 && data->position != SKIP) {
+        write_row(data); // crucial... :)
+        cleanup_revision(data);  // also crucial
+    } else {
+        data->element = UNUSED; // sets our state to "not-in-useful"
+    }                           // thus avoiding unpleasant character data 
+                                // b/w tags (newlines etc.)
+}
+
+void print_usage(char* argv[]) {
+    fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
+    fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
+    fprintf(stderr, "\n");
+    fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
+}
+
+
+int
+main(int argc, char *argv[])
+{
+    
+    char *dropstr = "";
+    enum outtype output_type;
+    int dry_run = 0;
+    // in "simple" output, we don't print text and comments
+    output_type = SIMPLE;
+    char c;
+
+    while ((c = getopt(argc, argv, "hr:sd")) != -1)
+        switch (c)
+        {
+            case 'r':
+                dropstr = optarg;
+                break;
+            case 'd':
+                dry_run = 1;
+                break;
+            case 't':
+                output_type = NORMAL;
+                break;
+            case 'h':
+                print_usage(argv);
+                exit(0);
+                break;
+        }
+
+    if (dry_run) { // lets us print initialization options
+        printf("simple_output = %i\n", output_type);
+        printf("dropstr = %s\n", dropstr);
+        exit(1);
+    }
+
+    // create a new instance of the expat parser
+    XML_Parser parser = XML_ParserCreate(NULL);
+
+    // initialize the user data struct which is passed to callback functions
+    parseData data;  
+    // initialize the elements of the struct to default values
+    init_data(&data, dropstr, output_type);
+
+
+    // makes the parser pass "data" as the first argument to every callback 
+    XML_SetUserData(parser, &data);
+    // sets start and end to be the element start and end handlers
+    XML_SetElementHandler(parser, (void *) start, (void *) end);
+    // sets charhndl to be the callback for raw character data
+    XML_SetCharacterDataHandler(parser, (void *) charhndl);
+
+    int done;
+    char buf[BUFSIZ];
+    
+    write_header();
+
+    // shovel data into the parser
+    do {
+        
+        // read into buf a bufferfull of data from standard input
+        size_t len = fread(buf, 1, sizeof(buf), stdin);
+        done = len < sizeof(buf); // checks if we've got the last bufferfull
+        
+        // passes the buffer of data to the parser and checks for error
+        //   (this is where the callbacks are invoked)
+        if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
+            fprintf(stderr,
+                "%s at line %d\n",
+                XML_ErrorString(XML_GetErrorCode(parser)),
+                (int) XML_GetCurrentLineNumber(parser));
+            return 1;
+        }
+    } while (!done);
+   
+
+    XML_ParserFree(parser);
+
+    return 0;
+}

Benjamin Mako Hill || Want to submit a patch?