2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
// NOTE(review): BUFFER_SIZE presumably sizes the stdin read buffer used in
// main; the declaration that uses it is not visible in this excerpt.
14 #define BUFFER_SIZE 80
15 // timestamp of the form 2003-11-07T00:43:23Z
// Number of leading timestamp characters that form the date part (2003-11-07).
16 #define DATE_LENGTH 10
// Length of a complete timestamp in the form shown above; charhndl only
// splits a timestamp once exactly this many characters have accumulated.
18 #define TIMESTAMP_LENGTH 20
// Enumerators naming the XML element whose character data is being read.
// start() assigns one of these to data->element and charhndl() switches on
// it; UNUSED means the character data is ignored.
21 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
22 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Which section of a page the parser is currently inside; kept in
// data->position. SKIP is set by charhndl when a title matches the drop
// string and makes the handlers ignore input until the next title.
25 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output flavour chosen in main and consulted by write_row. Per the comment
// in main, SIMPLE output leaves out text and comments.
27 enum outtype { NORMAL, SIMPLE };
// Two members of the parse-state struct that the handlers reach through
// data->element and data->output_type. The rest of the struct definition
// is not visible in this excerpt.
47 enum elements element;
49 enum outtype output_type;
54 /* free_data and clean_data
55 * Takes a pointer to the data struct and an integer {0,1} indicating if the
56 * title data needs to be cleared as well.
57 * Also, frees memory dynamically allocated to store data.
// Resets the revision fields of *data to NULL and marks the current element
// as UNUSED. Only the pointers are overwritten; nothing is freed here, so
// call free_data first if the fields own heap memory.
// NOTE(review): the lines that consult the title argument are not visible in
// this excerpt; per the header comment it presumably gates the title and
// articleid resets. Confirm against the full file.
60 clean_data(parseData *data, int title)
63 data->rev.title = NULL;
64 data->rev.articleid = NULL;
66 data->rev.revid = NULL;
67 data->rev.date = NULL;
68 data->rev.time = NULL;
69 data->rev.timestamp = NULL;
70 data->rev.anon = NULL;
71 data->rev.editor = NULL;
72 data->rev.editorid = NULL;
73 data->rev.minor = NULL;
74 data->rev.comment = NULL;
75 data->rev.text = NULL;
// back to the ignore-character-data state
76 data->element = UNUSED;
// Releases the heap strings held in data->rev. No pointer is reset in the
// visible lines; clean_data is what sets the fields back to NULL.
// NOTE(review): the title argument presumably gates the title and articleid
// frees, mirroring clean_data; the conditional is not visible in this excerpt.
// NOTE(review): no free of rev.date, rev.time or rev.text is visible below,
// although split_timestamp and charhndl fill those through store(). Confirm
// they are released on lines not shown, otherwise they leak once per revision.
81 free_data(parseData *data, int title)
84 //printf("freeing article\n");
85 free(data->rev.title);
86 free(data->rev.articleid);
88 free(data->rev.revid);
91 free(data->rev.timestamp);
93 free(data->rev.editor);
94 free(data->rev.editorid);
95 free(data->rev.minor);
96 free(data->rev.comment);
// Per-revision cleanup: end() calls this right after write_row() when a
// revision element closes.
// NOTE(review): the body is not visible in this excerpt; presumably it frees
// and clears the revision fields while keeping the title data. Confirm.
100 cleanup_revision(parseData *data) {
// Per-article cleanup: start() calls this whenever a new title element opens,
// before any state for the new article is recorded.
// NOTE(review): the body is not visible in this excerpt; presumably it frees
// and clears everything including the title data. Confirm.
105 cleanup_article(parseData *data) {
// Puts *data into its initial state: the revision fields are cleared through
// clean_data (with the title flag set), then the drop string and the output
// type are recorded. dropstr is stored as given, not copied, so it must
// outlive the parse.
112 init_data(parseData *data, char *dropstr, int output_type)
114 clean_data(data, 1); // sets every element to null...
115 data->dropstr = dropstr;
116 data->output_type = output_type;
119 /* for debugging only, prints out the state of the data struct
// Debug helper: dumps the current element, output type and every revision
// field of *data to standard output, one name = value pair per line.
// NOTE(review): fields that are still NULL (clean_data sets them all to NULL)
// are passed straight to %s. The C standard leaves that undefined; some
// libcs print a placeholder instead of crashing, but do not rely on it.
122 print_state(parseData *data)
124 printf("element = %i\n", data->element);
125 printf("output_type = %i\n", data->output_type);
126 printf("title = %s\n", data->rev.title);
127 printf("articleid = %s\n", data->rev.articleid);
128 printf("revid = %s\n", data->rev.revid);
129 printf("date = %s\n", data->rev.date);
130 printf("time = %s\n", data->rev.time);
131 printf("anon = %s\n", data->rev.anon);
132 printf("editor = %s\n", data->rev.editor);
133 printf("editorid = %s\n", data->rev.editorid);
134 printf("minor = %s\n", data->rev.minor);
135 printf("comment = %s\n", data->rev.comment);
136 printf("text = %s\n", data->rev.text);
141 /* Write a header for the tab-separated output
146 // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
147 // printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
153 * write a line of tab-separated value formatted data to standard out
155 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
156 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
158 * it is called right before cleanup_revision() and cleanup_article()
// Prints one output row for the current revision to standard output.
// Each field is first copied into a local; a field that is still NULL gets a
// fallback value instead (the fallback assignments are on lines not visible
// in this excerpt). The row layout then depends on data->output_type: one
// printf emits eleven tab-separated fields ending in comment and text, the
// other emits nine and stops at minor.
// NOTE(review): the case labels are not visible; going by the comment in
// main, the nine-field form is presumably SIMPLE. Confirm.
// NOTE(review): field values are printed as-is in the visible lines; a tab or
// newline inside comment or text would break the row structure unless it is
// removed on lines not shown.
161 write_row(parseData *data)
163 // define temporary variables to hold output values:
164 char *title, *articleid;
165 char *revid, *date, *time, *anon, *editor, *editorid;
// NOTE(review): text is assigned and printed below but is not among the
// locals declared in the visible lines; presumably declared on a line not shown.
166 char *minor, *comment;
168 // perform some simple logic to obtain correct output values
170 if (data->rev.minor == NULL)
172 else minor = data->rev.minor;
// NOTE(review): this first test on rev.editor presumably derives anon (no
// other assignment to anon is visible); the branch bodies are not shown.
174 if (data->rev.editor == NULL)
178 if (data->rev.title == NULL)
180 else title = data->rev.title;
182 if (data->rev.articleid == NULL)
184 else articleid = data->rev.articleid;
186 if (data->rev.revid == NULL)
188 else revid = data->rev.revid;
190 if (data->rev.date == NULL)
192 else date = data->rev.date;
194 if (data->rev.time == NULL)
196 else time = data->rev.time;
198 if (data->rev.editor == NULL)
200 else editor = data->rev.editor;
202 if (data->rev.editorid == NULL)
204 else editorid = data->rev.editorid;
206 if (data->rev.text == NULL)
208 else text = data->rev.text;
211 if (data->rev.comment == NULL)
213 else comment = data->rev.comment;
216 // TODO: make it so you can specify fields to output
217 // note that date and time are separated by a space, to match postgres's
219 switch (data->output_type)
// eleven fields: the nine below plus comment and text
222 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n",
223 title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text);
// nine fields: date and time share one column, separated by a space
226 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n",
227 title,articleid,revid,date,time,anon,editor,editorid,minor);
234 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
236 char *timeinstamp = ×tamp[DATE_LENGTH+1];
237 strncpy(time_buffer, timeinstamp, TIME_LENGTH);
238 time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
// Copies the first DATE_LENGTH characters of timestamp (the date part) into
// date_buffer and NUL-terminates it. date_buffer must have room for
// DATE_LENGTH + 1 bytes.
243 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
245 strncpy(date_buffer, timestamp, DATE_LENGTH);
// strncpy leaves the buffer unterminated when the source is at least
// DATE_LENGTH characters long, so terminate explicitly
246 date_buffer[DATE_LENGTH] = '\0';
/* append
 * Grows the heap string entry and concatenates new onto its end.
 *
 * entry  NUL-terminated string previously obtained from malloc/realloc;
 *        ownership passes to this function
 * new    NUL-terminated string to add (not modified)
 *
 * Returns the (possibly moved) buffer holding entry followed by new; the
 * caller must use the returned pointer in place of entry.  If the memory
 * cannot be obtained the failure is reported on standard error and the
 * program exits, instead of writing through a NULL pointer.
 */
char *
append(char *entry, char *new)
{
    size_t entry_len = strlen(entry);
    size_t new_len = strlen(new);
    char *newbuff = realloc(entry, entry_len + new_len + 1);

    if (newbuff == NULL) {
        // realloc leaves entry untouched on failure, but there is no way to
        // carry on parsing without the space, so stop cleanly
        fprintf(stderr, "append: out of memory\n");
        exit(EXIT_FAILURE);
    }
    // the old length is already known, so copy straight to the end
    // (including the terminating NUL) rather than rescanning the buffer
    memcpy(newbuff + entry_len, new, new_len + 1);
    return newbuff;
}
// Allocates a fresh heap buffer sized for a copy of the string new
// (its length plus the terminating NUL).
// NOTE(review): the copy and return statements are not visible in this
// excerpt; presumably the buffer is filled from new and returned. Confirm.
// NOTE(review): the malloc result is not checked in the visible lines;
// confirm a NULL return is handled before the buffer is written.
// NOTE(review): entry is not used in the visible lines.
261 *cache(char *entry, char *new)
265 len = strlen(new)*sizeof(char) + 1; // include space for the '\0' !
266 newbuff = malloc(len);
// Adds the string new to the field value entry by delegating to cache() or
// append(). Callers assign the result back to the field they passed in,
// since append() may move the buffer.
// NOTE(review): the condition choosing between the two calls is not visible
// in this excerpt; presumably cache() is used when entry is NULL and
// append() otherwise. Confirm.
273 *store(char *entry, char *new)
277 newbuff = cache(entry, new);
279 newbuff = append(entry, new);
// Splits data->rev.timestamp into its date and time parts and records them
// in data->rev.date and data->rev.time through store().
// charhndl calls this once the accumulated timestamp is exactly
// TIMESTAMP_LENGTH characters long, so both parts are fully present.
284 split_timestamp(parseData *data)
286 char *t = data->rev.timestamp;
// stack scratch buffers, each with room for the terminating NUL
287 char date_buffer[DATE_LENGTH+1];
288 char time_buffer[TIME_LENGTH+1];
// the return values are not needed here; both helpers fill the buffer passed in
289 datestr(t, date_buffer);
290 timestr(t, time_buffer);
// store() works on heap memory, so the fields do not point at the stack buffers
291 data->rev.date = store(data->rev.date, date_buffer);
292 data->rev.time = store(data->rev.time, time_buffer);
295 /* currently unused */
// Whitespace test on a NUL-terminated string: the loop runs while the first
// character is a space character, then the string is compared with the
// empty string.
// NOTE(review): the loop body and the return statements are not visible in
// this excerpt; presumably the loop advances string past leading whitespace
// and the result says whether nothing else remained. Confirm.
// NOTE(review): len is computed but not used in the visible lines.
// NOTE(review): isspace is given a plain char; it should be cast to unsigned
// char first, because negative char values are undefined for the ctype.h
// functions.
297 is_whitespace(char *string) {
298 int len = strlen(string);
299 while (isspace(string[0]) && strlen(string) > 0) {
302 if (strcmp(string, "") == 0)
// Takes a NUL-terminated string s and a character code c. The visible loop
// walks s to its terminator with index i while a second index j starts at 0.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// copies every character other than c down to position j, removing c from s
// in place. Confirm.
309 squeeze(char *s, int c) {
311 for (i = j = 0; s[i] != '\0'; i++)
// Scans the NUL-terminated string s using a character taken from t.
// Only t[0] is consulted, so t acts as a single-character needle; t must be
// non-NULL because its first character is read unconditionally.
// NOTE(review): the comparison and return statements are not visible in this
// excerpt; charhndl treats a non-zero result as a match.
318 contains(char *s, char *t)
320 char c = t[0]; //just get the first character of t
322 while (s[i] != '\0') {
// expat character-data callback. s holds len characters of text and is not
// NUL-terminated, so the text is first turned into a proper string t and then
// stored into the field selected by data->element. expat may deliver the text
// of one element in several pieces, which is why every field goes through
// store() rather than being assigned directly.
// NOTE(review): the declaration and filling of t are on lines not visible in
// this excerpt, as are the case labels of the switch.
330 charhndl(parseData *data, char *s, int len)
// ignore text between elements and everything inside a skipped article
332 if (data->element != UNUSED && data->position != SKIP) {
335 t[len] = '\0'; // makes t a well-formed string
336 switch (data->element) {
339 data->rev.title = store(data->rev.title, t);
340 // skip any articles with bad characters in their titles
// NOTE(review): contains() reads the first character of its second argument
// unconditionally, so data->dropstr must be non-NULL here; confirm main
// always supplies one. Only the current piece t is tested, not the whole title.
341 if (contains(t, data->dropstr)) {
342 data->position = SKIP;
343 //printf("found a baddie\n");
348 // printf("articleid = %s\n", t);
349 data->rev.articleid = store(data->rev.articleid, t);
352 // printf("revid = %s\n", t);
353 data->rev.revid = store(data->rev.revid, t);
356 data->rev.timestamp = store(data->rev.timestamp, t);
// split only once the whole timestamp has arrived
357 if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
358 split_timestamp(data);
361 data->rev.editor = store(data->rev.editor, t);
365 //printf("editorid = %s\n", t);
366 data->rev.editorid = store(data->rev.editorid, t);
368 /* the following are implied or skipped:
370 printf("found minor element\n"); doesn't work
371 break; minor tag is just a tag
375 // printf("row: comment is %s\n", t);
376 data->rev.comment = store(data->rev.comment, t);
379 data->rev.text = store(data->rev.text, t);
// expat start-tag callback. Records in data->element which element the
// following character data belongs to and, for title, revision and
// contributor tags, which section of the page has been entered
// (data->position).
// A title tag always begins a new article: the previous article is cleaned
// up and position becomes TITLE_BLOCK, which also ends any SKIP state.
// Every other tag is ignored while position is SKIP.
// attr is not used in the visible lines.
387 start(parseData *data, const char *name, const char **attr)
390 if (strcmp(name,"title") == 0) {
391 cleanup_article(data); // cleans up data from last article
392 data->element = TITLE;
393 data->position = TITLE_BLOCK;
394 } else if (data->position != SKIP) {
395 if (strcmp(name,"revision") == 0) {
396 data->element = REVISION;
397 data->position = REVISION_BLOCK;
398 } else if (strcmp(name, "contributor") == 0) {
399 data->element = CONTRIBUTOR;
400 data->position = CONTRIBUTOR_BLOCK;
// an id tag means article id, revision id or editor id depending on the
// section it appears in
// NOTE(review): only the CONTRIBUTOR_BLOCK label is visible; the labels for
// the ARTICLEID and REVID branches are presumably TITLE_BLOCK and
// REVISION_BLOCK. Confirm.
401 } else if (strcmp(name,"id") == 0)
402 switch (data->position) {
404 data->element = ARTICLEID;
407 data->element = REVID;
409 case CONTRIBUTOR_BLOCK:
410 data->element = EDITORID;
414 // minor tag has no character data, so we parse here
415 else if (strcmp(name,"minor") == 0) {
416 data->element = MINOR;
417 data->rev.minor = store(data->rev.minor, "1");
419 else if (strcmp(name,"timestamp") == 0)
420 data->element = TIMESTAMP;
422 else if (strcmp(name, "username") == 0)
423 data->element = EDITOR;
// the text of an ip element is stored in the editorid field
425 else if (strcmp(name,"ip") == 0)
426 data->element = EDITORID;
428 else if (strcmp(name,"comment") == 0)
429 data->element = COMMENT;
431 else if (strcmp(name,"text") == 0)
432 data->element = TEXT;
// structural wrappers whose own character data is never wanted
434 else if (strcmp(name,"page") == 0
435 || strcmp(name,"mediawiki") == 0
436 || strcmp(name,"restrictions") == 0
437 || strcmp(name,"siteinfo") == 0)
438 data->element = UNUSED;
// expat end-tag callback. When a revision element closes and the article is
// not being skipped, the accumulated row is written and the revision state
// is cleaned up for the next one. element is then set to UNUSED so that
// text appearing between tags is ignored by charhndl.
// NOTE(review): a line is missing between the cleanup call and the UNUSED
// assignment; presumably it closes the if, so the reset applies to every
// end tag. Confirm.
445 end(parseData *data, const char *name)
447 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
448 write_row(data); // crucial... :)
449 cleanup_revision(data); // also crucial
451 data->element = UNUSED; // sets our state to "not-in-useful"
452 } // thus avoiding unpleasant character data
453 // b/w tags (newlines etc.)
// Prints the usage text to standard error, using argv[0] as the program name.
// NOTE(review): the text advertises a -t option, but the option string that
// main passes to getopt is hr:sd, which has no t and does have r, s and d
// that are not described here. Confirm which flags are really accepted and
// bring the two into agreement.
456 void print_usage(char* argv[]) {
457 fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
458 fprintf(stderr, "\n");
459 fprintf(stderr, "options:\n");
460 fprintf(stderr, " -t print text and comments after each line of tab separated data\n");
461 fprintf(stderr, "\n");
462 fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
463 fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
464 fprintf(stderr, "\n");
465 fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
466 fprintf(stderr, "\n");
467 fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
// Program entry point: reads the command-line options, creates an expat
// parser wired to start, end and charhndl with a parseData instance as user
// data, then feeds standard input to the parser one buffer at a time.
// Rows are printed by the callbacks while XML_Parse runs.
472 main(int argc, char *argv[])
476 enum outtype output_type;
478 // in "simple" output, we don't print text and comments
479 output_type = SIMPLE;
// NOTE(review): the case labels of the option switch are not visible in this
// excerpt. The option string has no t although print_usage advertises -t;
// confirm which flag selects NORMAL output.
482 while ((c = getopt(argc, argv, "hr:sd")) != -1)
492 output_type = NORMAL;
500 if (dry_run) { // lets us print initialization options
501 printf("simple_output = %i\n", output_type);
// NOTE(review): dropstr is printed with %s; confirm it cannot be NULL here
502 printf("dropstr = %s\n", dropstr);
506 // create a new instance of the expat parser
// NOTE(review): no check of the returned parser is visible; confirm a NULL
// result is handled before the parser is used.
507 XML_Parser parser = XML_ParserCreate(NULL);
509 // initialize the user data struct which is passed to callback functions
511 // initialize the elements of the struct to default values
512 init_data(&data, dropstr, output_type);
515 // makes the parser pass "data" as the first argument to every callback
516 XML_SetUserData(parser, &data);
517 // sets start and end to be the element start and end handlers
// the handlers declare parseData* as their first parameter instead of the
// void* that expat uses, hence the casts
518 XML_SetElementHandler(parser, (void *) start, (void *) end);
519 // sets charhndl to be the callback for raw character data
520 XML_SetCharacterDataHandler(parser, (void *) charhndl);
527 // shovel data into the parser
530 // read into buf a bufferfull of data from standard input
531 size_t len = fread(buf, 1, sizeof(buf), stdin);
// a short read is taken to mean end of input
// NOTE(review): a read error also yields a short read and no ferror check is
// visible, so an I/O error would be treated as a normal end of input.
532 done = len < sizeof(buf); // checks if we've got the last bufferfull
534 // passes the buffer of data to the parser and checks for error
535 // (this is where the callbacks are invoked)
536 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
539 XML_ErrorString(XML_GetErrorCode(parser)),
540 (int) XML_GetCurrentLineNumber(parser));
546 XML_ParserFree(parser);