2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
// Size and length constants used by the parser.
14 #define BUFFER_SIZE 80
15 // timestamp of the form 2003-11-07T00:43:23Z
// The date part of that example (2003-11-07) is 10 characters long.
16 #define DATE_LENGTH 10
// The complete example timestamp above is 20 characters long.
18 #define TIMESTAMP_LENGTH 20
20 // 2048 KB in bytes + 1
// i.e. 2048 * 1024 + 1 = 2097153; this sizes the fixed text[] buffer of the
// data struct. The extra byte is presumably for the string terminator.
21 #define TEXT_BUFFER_SIZE 2097153
22 #define FIELD_BUFFER_SIZE 1024
// Element states: start() stores one of these in data->element and charhndl()
// switches on it to decide where incoming character data goes. UNUSED means
// character data is ignored.
25 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
26 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Section of the page the parser is currently inside (stored in
// data->position). While the position is SKIP, charhndl() stores nothing,
// start() only reacts to a title element, and end() writes no row.
29 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output mode. main() defaults to SIMPLE, which omits text and comments.
31 enum outtype { FULL, SIMPLE };
// Fixed-size buffer for the revision text; charhndl() appends to it with strcat().
46 char text[TEXT_BUFFER_SIZE];
// Element currently being parsed: set in start(), reset to UNUSED in end().
48 enum elements element;
// Output mode selected on the command line; assigned by init_data().
50 enum outtype output_type;
55 /* free_data and clean_data
56 * Takes a pointer to the data struct and an integer {0,1} indicating if the
57 * title data needs to be cleared as well.
58 * Also, frees memory dynamically allocated to store data.
// Resets pointer fields of *data to NULL and sets the element state to UNUSED.
// No free() call is visible here; free_data() is the function that frees.
// NOTE(review): the lines that use the title argument are not visible in this
// excerpt; per the header comment it selects whether title data is cleared too.
61 clean_data(revisionData *data, int title)
65 data->articleid = NULL;
70 data->timestamp = NULL;
73 data->editorid = NULL;
77 data->element = UNUSED;
// Releases the heap strings held by *data (articleid and timestamp are the
// ones visible here). It takes the same title flag as clean_data().
// NOTE(review): the pointers are not reset in the visible lines; presumably
// callers pair this with clean_data() - confirm in the cleanup_* functions.
82 free_data(revisionData *data, int title)
85 //printf("freeing article\n");
87 free(data->articleid);
92 free(data->timestamp);
// Per-revision cleanup; end() calls this right after write_row() for every
// closed revision element. The body is not visible in this excerpt.
101 void cleanup_revision(revisionData *data) {
// Per-article cleanup; start() calls this when a new title element opens, to
// drop data left over from the previous article. The body is not visible in
// this excerpt.
106 void cleanup_article(revisionData *data) {
// Puts *data into its initial state: clears the fields via clean_data() with
// the title flag set, then records the requested output mode.
113 init_data(revisionData *data, outtype output_type)
115 clean_data(data, 1); // sets every element to null...
116 data->output_type = output_type;
119 /* for debugging only, prints out the state of the data struct
// Debug helper: dumps every field of *data to standard out, one per line.
// NOTE(review): several of these fields are set to NULL by clean_data();
// passing a NULL pointer to %s is undefined behavior in standard C, so this
// is only safe once the fields have been populated.
122 print_state(revisionData *data)
124 printf("element = %i\n", data->element);
125 printf("output_type = %i\n", data->output_type);
126 printf("title = %s\n", data->title);
127 printf("articleid = %s\n", data->articleid);
128 printf("revid = %s\n", data->revid);
129 printf("date = %s\n", data->date);
130 printf("time = %s\n", data->time);
131 printf("anon = %s\n", data->anon);
132 printf("editor = %s\n", data->editor);
133 printf("editorid = %s\n", data->editorid);
// minor is tested as a truth value and printed as 1 or 0
134 printf("minor = %s\n", (data->minor ? "1" : "0"));
135 printf("comment = %s\n", data->comment);
136 printf("text = %s\n", data->text);
141 /* Write a header for the tab-separated output
146 // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
147 // printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
153 * write a line of tab-separated value formatted data to standard out
155 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
156 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
158 * it is called right before cleanup_revision() and cleanup_article()
// Prints one tab-separated row for the current revision to standard out.
// String fields that are NULL are printed as empty strings; date and time
// share a single column, separated by a space.
161 write_row(revisionData *data)
164 // TODO: make it so you can specify fields to output
165 // note that date and time are separated by a space, to match postgres's
167 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
168 (data->title != NULL) ? data->title : "",
169 (data->articleid != NULL) ? data->articleid : "",
170 (data->revid != NULL) ? data->revid : "",
171 (data->date != NULL) ? data->date : "",
172 (data->time != NULL) ? data->time : "",
// anon column: 0 when a username was stored, 1 when editor is still NULL
173 (data->editor != NULL) ? "0" : "1",
174 (data->editor != NULL) ? data->editor : "",
175 (data->editorid != NULL) ? data->editorid : "",
176 (data->minor) ? "1" : "0");
// The row is terminated according to the output mode; the case labels are
// not visible in this excerpt.
177 switch (data->output_type)
// Prints the length of the text rather than the text itself.
// NOTE(review): %i is paired with an unsigned int argument; %u would match.
180 printf("\t%i\n", (unsigned int) strlen(data->text));
// NOTE(review): data->comment is passed to %s without the NULL guard used for
// the other fields - confirm it cannot be NULL for revisions without a comment.
183 printf("\t%s\t%s\n", data->comment, data->text);
// Copies the time-of-day part of a timestamp into time_buffer and terminates
// it. The time part starts DATE_LENGTH + 1 characters in, i.e. after the date
// and the single separator character that follows it.
190 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
// NOTE(review): the initializer below looks like an encoding corruption of
// the address-of expression on timestamp[DATE_LENGTH+1] (an HTML entity
// decoded into a multiplication sign) - confirm against the original file.
192 char *timeinstamp = ×tamp[DATE_LENGTH+1];
193 strncpy(time_buffer, timeinstamp, TIME_LENGTH);
194 time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
// Copies the first DATE_LENGTH characters of timestamp (the date part) into
// date_buffer and terminates the result explicitly, since strncpy() does not
// add a terminator when the source is longer than the count.
199 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
201 strncpy(date_buffer, timestamp, DATE_LENGTH);
202 date_buffer[DATE_LENGTH] = '\0';
// Grows the heap string entry so that newstr fits after it, then
// concatenates newstr onto the end.
// NOTE(review): no check of the realloc() result is visible; if it returns
// NULL the strcat() below dereferences a null pointer.
206 *append(char *entry, char *newstr)
// combined length of both strings plus one byte for the terminator
210 len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
211 newbuff = (char*) realloc(entry, len);
212 strcat(newbuff, newstr);
// Allocates a fresh heap buffer and copies newstr into it.
// The entry argument is not used in the visible lines.
// NOTE(review): no check of the malloc() result is visible before strcpy().
217 *cache(char *entry, char *newstr)
221 len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
222 newbuff = (char*) malloc(len);
223 strcpy(newbuff,newstr);
// Stores newstr into a field: either starts a new heap string via cache() or
// extends the existing one via append(). Callers assign the result back to
// the field, e.g. data->title = store(data->title, t).
// NOTE(review): the condition choosing between the two is not visible here;
// presumably cache() is used when entry is still NULL - confirm.
229 *store(char *entry, char *newstr)
233 newbuff = cache(entry, newstr);
235 newbuff = append(entry, newstr);
// Splits data->timestamp into its date and time parts and stores them in
// data->date and data->time. charhndl() calls this once the accumulated
// timestamp has reached TIMESTAMP_LENGTH characters.
240 split_timestamp(revisionData *data)
242 char *t = data->timestamp;
// stack scratch buffers, each with one extra byte for the terminator
243 char date_buffer[DATE_LENGTH+1];
244 char time_buffer[TIME_LENGTH+1];
245 datestr(t, date_buffer);
246 timestr(t, time_buffer);
// store() copies the scratch buffers to the heap
247 data->date = store(data->date, date_buffer);
248 data->time = store(data->time, time_buffer);
251 /* currently unused */
// Tests whether string consists only of whitespace; the loop body and the
// return statements are not visible in this excerpt.
// NOTE(review): the loop condition reads string[0] before checking that the
// string is non-empty, and passes a plain char to isspace(); a cast to
// unsigned char is the portable form.
253 is_whitespace(char *string) {
254 int len = strlen(string);
255 while (isspace(string[0]) && strlen(string) > 0) {
258 if (strcmp(string, "") == 0)
// Character data callback registered with the parser in main().
// vdata is the revisionData struct installed with XML_SetUserData(); s and len
// describe the chunk of character data. The chunk is routed to the field
// that matches the current element state. Fields are accumulated with
// store(), so data arriving in several chunks is concatenated.
265 charhndl(void* vdata, const XML_Char* s, int len)
267 revisionData* data = (revisionData*) vdata;
// nothing is stored between interesting elements or while skipping an article
268 if (data->element != UNUSED && data->position != SKIP) {
// NOTE(review): the declaration of t is not visible; presumably it is a local
// copy of the len characters at s - confirm.
271 t[len] = '\0'; // makes t a well-formed string
272 switch (data->element) {
275 data->title = store(data->title, t);
276 // skip any articles with bad characters in their titles
280 // printf("articleid = %s\n", t);
281 data->articleid = store(data->articleid, t);
284 // printf("revid = %s\n", t);
285 data->revid = store(data->revid, t);
288 data->timestamp = store(data->timestamp, t);
// split into date and time only once the whole timestamp has been collected
289 if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
290 split_timestamp(data);
293 data->editor = store(data->editor, t);
297 //printf("editorid = %s\n", t);
298 data->editorid = store(data->editorid, t);
300 /* the following are implied or skipped:
302 printf("found minor element\n"); doesn't work
303 break; minor tag is just a tag
307 // printf("row: comment is %s\n", t);
308 //if (data->output_type == FULL) {
309 data->comment = store(data->comment, t);
313 //if (data->output_type == FULL) {
314 //data->text = store(data->text, t);
// NOTE(review): text is a fixed array of TEXT_BUFFER_SIZE bytes and no bounds
// check is visible before this strcat() - a longer revision would overflow it.
316 strcat(data->text, t);
// Element-start callback registered with the parser in main().
// Maps the element name onto the parser state: data->element selects which
// field charhndl() fills, and data->position records the enclosing section.
// The attr argument is not used in the visible lines.
325 start(void* vdata, const XML_Char* name, const XML_Char** attr)
327 revisionData* data = (revisionData*) vdata;
// a title element starts a new article, and is handled even while skipping
329 if (strcmp(name,"title") == 0) {
330 cleanup_article(data); // cleans up data from last article
331 data->element = TITLE;
332 data->position = TITLE_BLOCK;
// every other element is ignored while the current article is being skipped
333 } else if (data->position != SKIP) {
334 if (strcmp(name,"revision") == 0) {
335 data->element = REVISION;
336 data->position = REVISION_BLOCK;
337 } else if (strcmp(name, "contributor") == 0) {
338 data->element = CONTRIBUTOR;
339 data->position = CONTRIBUTOR_BLOCK;
// an id element means article id, revision id or editor id, depending on the
// enclosing section (only the CONTRIBUTOR_BLOCK case label is visible here)
340 } else if (strcmp(name,"id") == 0)
341 switch (data->position) {
343 data->element = ARTICLEID;
346 data->element = REVID;
348 case CONTRIBUTOR_BLOCK:
349 data->element = EDITORID;
353 // minor tag has no character data, so we parse here
354 else if (strcmp(name,"minor") == 0) {
355 data->element = MINOR;
358 else if (strcmp(name,"timestamp") == 0)
359 data->element = TIMESTAMP;
361 else if (strcmp(name, "username") == 0)
362 data->element = EDITOR;
// an ip element is stored in the editorid field; editor stays NULL, which
// write_row() reports as anon = 1
364 else if (strcmp(name,"ip") == 0)
365 data->element = EDITORID;
367 else if (strcmp(name,"comment") == 0)
368 data->element = COMMENT;
370 else if (strcmp(name,"text") == 0)
371 data->element = TEXT;
// structural elements carry no data of interest
373 else if (strcmp(name,"page") == 0
374 || strcmp(name,"mediawiki") == 0
375 || strcmp(name,"restrictions") == 0
376 || strcmp(name,"siteinfo") == 0)
377 data->element = UNUSED;
// Element-end callback registered with the parser in main().
// When a revision element closes and the article is not being skipped, the
// collected row is written and the per-revision data is cleaned up.
384 end(void* vdata, const XML_Char* name)
386 revisionData* data = (revisionData*) vdata;
387 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
388 write_row(data); // crucial... :)
389 cleanup_revision(data); // also crucial
// NOTE(review): the line between here and the previous statement is not
// visible; this reset appears to run for every closing tag - confirm.
391 data->element = UNUSED; // sets our state to "not-in-useful"
392 } // thus avoiding unpleasant character data
393 // b/w tags (newlines etc.)
// Prints the usage text to standard error. argv[0] is used as the program
// name in the usage line; no other element of argv is read.
396 void print_usage(char* argv[]) {
397 fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
398 fprintf(stderr, "\n");
399 fprintf(stderr, "options:\n");
400 fprintf(stderr, " -t print text and comments after each line of tab separated data\n");
401 fprintf(stderr, "\n");
402 fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
403 fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
404 fprintf(stderr, "\n");
// column list of the output rows
405 fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
406 fprintf(stderr, "\n");
407 fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
// Program entry point: parses the command line options, sets up the Expat
// parser with the three callbacks, then feeds standard input to the parser
// one buffer at a time. Rows are emitted by the callbacks during XML_Parse().
412 main(int argc, char *argv[])
415 enum outtype output_type;
417 // in "simple" output, we don't print text and comments
418 output_type = SIMPLE;
// recognised option letters are h and t; the handling of each letter is not
// visible in this excerpt
421 while ((c = getopt(argc, argv, "ht")) != -1)
436 if (dry_run) { // lets us print initialization options
437 printf("simple_output = %i\n", output_type);
441 // create a new instance of the expat parser
// NOTE(review): no check of the returned parser handle is visible.
442 XML_Parser parser = XML_ParserCreate(NULL);
444 // initialize the user data struct which is passed to callback functions
446 // initialize the elements of the struct to default values
447 init_data(&data, output_type);
450 // makes the parser pass "data" as the first argument to every callback
451 XML_SetUserData(parser, &data);
// typed function pointers for the three callbacks defined in this file
452 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
453 void (*endFnPtr)(void*, const XML_Char*) = end;
454 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
456 // sets start and end to be the element start and end handlers
457 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
458 // sets charhndl to be the callback for raw character data
459 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
466 // shovel data into the parser
469 // read into buf a bufferfull of data from standard input
470 size_t len = fread(buf, 1, sizeof(buf), stdin);
// NOTE(review): a short read is treated as end of input; fread() also returns
// a short count on a read error, which the visible lines do not distinguish.
471 done = len < sizeof(buf); // checks if we've got the last bufferfull
473 // passes the buffer of data to the parser and checks for error
474 // (this is where the callbacks are invoked)
475 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
// the error report uses the parser error string and the current line number
478 XML_ErrorString(XML_GetErrorCode(parser)),
479 (int) XML_GetCurrentLineNumber(parser));
485 XML_ParserFree(parser);