2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
// Size constants. The date and timestamp lengths follow the sample timestamp
// below: "2003-11-07" is 10 characters and the whole stamp is 20.
// NOTE(review): BUFFER_SIZE is presumably the size of the stdin read buffer
// used in main -- its use is not visible in this excerpt, confirm.
14 #define BUFFER_SIZE 80
15 // timestamp of the form 2003-11-07T00:43:23Z
16 #define DATE_LENGTH 10
18 #define TIMESTAMP_LENGTH 20
// Identifies the XML element whose character data is currently being
// collected. start() stores one of these in data->element; UNUSED makes
// charhndl() ignore the text.
21 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
22 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Where in the page structure the parser currently is (data->position).
// start() uses it to decide what an <id> tag refers to; SKIP suppresses
// collection and row output.
25 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output mode: in FULL mode charhndl() also stores comment and text,
// in SIMPLE mode they are not collected.
27 enum outtype { FULL, SIMPLE };
// element whose character data is currently being collected
46 enum elements element;
// output mode; copied in by init_data() from the value chosen in main()
48 enum outtype output_type;
53 /* free_data and clean_data
54 * Both take a pointer to the data struct and an integer {0,1} indicating
55 * whether the per-article title data should be handled as well.
56 * free_data releases the dynamically allocated strings; clean_data resets the fields.
// Resets the fields of data->rev (pointers to NULL, minor to false) and sets
// data->element to UNUSED. Only the pointers are overwritten here; nothing is
// freed, so memory must already have been released (see free_data).
59 clean_data(revisionData *data, int title)
// per-article fields
// NOTE(review): presumably only reset when `title` is non-zero -- the guard
// is not visible in this excerpt, confirm.
62 data->rev.title = NULL;
63 data->rev.articleid = NULL;
// per-revision fields
65 data->rev.revid = NULL;
66 data->rev.date = NULL;
67 data->rev.time = NULL;
68 data->rev.timestamp = NULL;
69 data->rev.anon = NULL;
70 data->rev.editor = NULL;
71 data->rev.editorid = NULL;
72 data->rev.minor = false;
73 data->rev.comment = NULL;
74 data->rev.text = NULL;
// after a reset no element is being collected
75 data->element = UNUSED;
// Frees the heap strings held in data->rev. The pointers are not set to NULL
// here, so they dangle until clean_data() resets them.
// NOTE(review): no free of rev.date, rev.time, rev.anon or rev.text is
// visible in this excerpt -- confirm they are released somewhere.
80 free_data(revisionData *data, int title)
83 //printf("freeing article\n");
// per-article strings
84 free(data->rev.title);
85 free(data->rev.articleid);
// per-revision strings
87 free(data->rev.revid);
90 free(data->rev.timestamp);
92 free(data->rev.editor);
93 free(data->rev.editorid);
94 free(data->rev.comment);
// Per-revision cleanup; end() calls this right after write_row() for every
// closed <revision>.
98 void cleanup_revision(revisionData *data) {
// Per-article cleanup; start() calls this when a new <title> element opens,
// i.e. before data for the next article is collected.
103 void cleanup_article(revisionData *data) {
// Puts a fresh revisionData into its initial state and records the requested
// output mode. Nothing is freed here, so this is meant for a struct that does
// not own any strings yet.
110 init_data(revisionData *data, outtype output_type)
112 clean_data(data, 1); // sets every element to null...
113 data->output_type = output_type;
116 /* for debugging only, prints out the state of the data struct
// Debug helper: dumps the parser state and the collected fields to stdout,
// one "name = value" line per field. rev.timestamp is not printed.
// NOTE(review): the string fields can be NULL (clean_data sets them so), and
// passing NULL to %s is undefined behaviour in standard C -- confirm this is
// only used where that is acceptable.
119 print_state(revisionData *data)
// enum values are printed as their integer codes
121 printf("element = %i\n", data->element);
122 printf("output_type = %i\n", data->output_type);
123 printf("title = %s\n", data->rev.title);
124 printf("articleid = %s\n", data->rev.articleid);
125 printf("revid = %s\n", data->rev.revid);
126 printf("date = %s\n", data->rev.date);
127 printf("time = %s\n", data->rev.time);
128 printf("anon = %s\n", data->rev.anon);
129 printf("editor = %s\n", data->rev.editor);
130 printf("editorid = %s\n", data->rev.editorid);
131 printf("minor = %s\n", (data->rev.minor ? "1" : "0"));
132 printf("comment = %s\n", data->rev.comment);
133 printf("text = %s\n", data->rev.text);
138 /* Write a header for the tab-separated output
143 // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
144 // printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
150 * write a line of tab-separated value formatted data to standard out
152 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
153 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
155 * it is called right before cleanup_revision() and cleanup_article()
// Prints the current revision as one row on stdout. Columns are separated by
// tabs, except date and time, which share a column separated by a space.
// Fields that are still NULL are printed as empty strings.
// NOTE(review): values are written verbatim; a tab or newline inside a field
// would break the row layout -- confirm whether escaping is needed.
158 write_row(revisionData *data)
161 // TODO: make it so you can specify fields to output
162 // note that date and time are separated by a space, to match postgres's
164 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
165 (data->rev.title != NULL) ? data->rev.title : "",
166 (data->rev.articleid != NULL) ? data->rev.articleid : "",
167 (data->rev.revid != NULL) ? data->rev.revid : "",
168 (data->rev.date != NULL) ? data->rev.date : "",
169 (data->rev.time != NULL) ? data->rev.time : "",
// "anon" column: derived from the editor name rather than from rev.anon;
// "0" when a username was collected, "1" when none was
170 (data->rev.editor != NULL) ? "0" : "1",
171 (data->rev.editor != NULL) ? data->rev.editor : "",
172 (data->rev.editorid != NULL) ? data->rev.editorid : "",
173 (data->rev.minor) ? "1" : "0");
// what follows the fixed columns depends on the output mode
174 switch (data->output_type)
// comment and text, again with NULL mapped to ""
181 (data->rev.comment != NULL) ? data->rev.comment : "",
182 (data->rev.text != NULL) ? data->rev.text : "");
189 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
191 char *timeinstamp = ×tamp[DATE_LENGTH+1];
192 strncpy(time_buffer, timeinstamp, TIME_LENGTH);
193 time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
// Copies the leading DATE_LENGTH characters of timestamp (the date part)
// into the caller-provided date_buffer and NUL-terminates it.
198 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
200 strncpy(date_buffer, timestamp, DATE_LENGTH);
// strncpy does not add a terminator when the source is longer than the limit
201 date_buffer[DATE_LENGTH] = '\0';
// Appends newstr to the heap string entry, growing the allocation as needed.
//
// entry:  heap-allocated, NUL-terminated string (as produced by cache() or a
//         previous append()), or NULL, which is treated as an empty string.
// newstr: NUL-terminated text to add; it is copied, not retained.
//
// Returns the buffer holding the concatenation. The buffer may have moved, so
// callers must continue with the returned pointer and must not use entry
// afterwards. If no memory is available the failure is reported on stderr and
// the program exits: callers assign the result straight back into the record
// and could neither keep the old data nor continue sensibly.
char
*append(char *entry, char *newstr)
{
    size_t old_len = (entry != NULL) ? strlen(entry) : 0;
    size_t add_len = strlen(newstr);
    char *newbuff = realloc(entry, old_len + add_len + 1);

    if (newbuff == NULL) {
        fprintf(stderr, "append: out of memory\n");
        exit(EXIT_FAILURE);
    }
    // the end of the old text is already known, so copy there directly;
    // the +1 brings the terminator along
    memcpy(newbuff + old_len, newstr, add_len + 1);
    return newbuff;
}
// Creates a fresh heap copy of newstr.
//
// entry:  not used; present so that cache() and append() can be called with
//         the same arguments.
// newstr: NUL-terminated text to copy.
//
// Returns a newly allocated string that the caller owns and must free(). If
// no memory is available the failure is reported on stderr and the program
// exits.
char
*cache(char *entry, char *newstr)
{
    size_t len = strlen(newstr) + 1;   // include space for the '\0'
    char *newbuff = malloc(len);

    (void) entry;
    if (newbuff == NULL) {
        fprintf(stderr, "cache: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memcpy(newbuff, newstr, len);
    return newbuff;
}
// Adds newstr to a field: the field's first piece is copied with cache(),
// later pieces are appended with append(). Callers assign the result back to
// the field, e.g. data->rev.title = store(data->rev.title, t).
// NOTE(review): the condition choosing between the two calls is not visible
// in this excerpt; presumably it tests entry for NULL -- confirm.
228 *store(char *entry, char *newstr)
232 newbuff = cache(entry, newstr);
234 newbuff = append(entry, newstr);
// Derives rev.date and rev.time from rev.timestamp. The caller (charhndl)
// only invokes this once the collected timestamp is exactly TIMESTAMP_LENGTH
// characters long, so both parts are fully available.
239 split_timestamp(revisionData *data)
241 char *t = data->rev.timestamp;
// stack buffers for the two pieces; store() makes the heap copies
242 char date_buffer[DATE_LENGTH+1];
243 char time_buffer[TIME_LENGTH+1];
244 datestr(t, date_buffer);
245 timestr(t, time_buffer);
// NOTE(review): store() appends when the field already holds data, so this
// relies on date/time having been reset since the previous revision -- confirm.
246 data->rev.date = store(data->rev.date, date_buffer);
247 data->rev.time = store(data->rev.time, time_buffer);
250 /* currently unused */
// Checks whether a string consists of whitespace only: the loop runs while
// the first character is whitespace, then the remainder is compared with "".
// NOTE(review): the loop body and the return statements are not visible in
// this excerpt; presumably the body advances `string` -- confirm.
252 is_whitespace(char *string) {
// NOTE(review): len is not used in any line visible here
253 int len = strlen(string);
// NOTE(review): isspace() takes an int that must be representable as
// unsigned char (or EOF); a plain char with the high bit set can be negative.
// The strlen() test is redundant, since '\0' is not whitespace.
254 while (isspace(string[0]) && strlen(string) > 0) {
// empty remainder: every character was whitespace
257 if (strcmp(string, "") == 0)
// Walks s up to its terminator with a read index i and a second index j.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// copies every character other than c down to position j, removing all
// occurrences of c from s in place -- confirm.
264 squeeze(char *s, int c) {
266 for (i = j = 0; s[i] != '\0'; i++)
// Scans s character by character. Only the first character of t is taken
// into account, so this is a single-character search, not a substring search.
// NOTE(review): the comparison and the return values are not visible in this
// excerpt -- confirm what is returned on a match.
273 contains(char *s, char *t)
275 char c = t[0]; //just get the first character of t
277 while (s[i] != '\0') {
// Expat character-data callback. vdata is the revisionData registered with
// XML_SetUserData; s points to len characters and is NOT NUL-terminated.
// The text is added to the field selected by data->element. Values go through
// store(), which appends, so a value delivered in several calls is
// reassembled piece by piece.
285 charhndl(void* vdata, const XML_Char* s, int len)
287 revisionData* data = (revisionData*) vdata;
// ignore text outside elements of interest and inside skipped articles
288 if (data->element != UNUSED && data->position != SKIP) {
// NOTE(review): the declaration of t and the copy from s are not visible in
// this excerpt; t must provide room for len+1 characters.
291 t[len] = '\0'; // makes t a well-formed string
292 switch (data->element) {
295 data->rev.title = store(data->rev.title, t);
296 // skip any articles with bad characters in their titles
300 // printf("articleid = %s\n", t);
301 data->rev.articleid = store(data->rev.articleid, t);
304 // printf("revid = %s\n", t);
305 data->rev.revid = store(data->rev.revid, t);
308 data->rev.timestamp = store(data->rev.timestamp, t);
// split only once the whole timestamp has arrived
309 if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
310 split_timestamp(data);
313 data->rev.editor = store(data->rev.editor, t);
317 //printf("editorid = %s\n", t);
318 data->rev.editorid = store(data->rev.editorid, t);
320 /* the following are implied or skipped:
322 printf("found minor element\n"); doesn't work
323 break; minor tag is just a tag
327 // printf("row: comment is %s\n", t);
// comment and text are only collected in FULL mode
328 if (data->output_type == FULL) {
329 data->rev.comment = store(data->rev.comment, t);
333 if (data->output_type == FULL) {
334 data->rev.text = store(data->rev.text, t);
// Expat start-tag callback. Maps the tag name to the element whose text
// should be collected next (data->element) and tracks the enclosing block
// (data->position). attr is not used in the lines visible here.
343 start(void* vdata, const XML_Char* name, const XML_Char** attr)
345 revisionData* data = (revisionData*) vdata;
// <title> marks a new article. It is tested before the SKIP check, so it
// also ends the skipping of a previous article.
347 if (strcmp(name,"title") == 0) {
348 cleanup_article(data); // cleans up data from last article
349 data->element = TITLE;
350 data->position = TITLE_BLOCK;
351 } else if (data->position != SKIP) {
352 if (strcmp(name,"revision") == 0) {
353 data->element = REVISION;
354 data->position = REVISION_BLOCK;
355 } else if (strcmp(name, "contributor") == 0) {
356 data->element = CONTRIBUTOR;
357 data->position = CONTRIBUTOR_BLOCK;
// the same <id> tag means article id, revision id or editor id, depending
// on the block it appears in
358 } else if (strcmp(name,"id") == 0)
359 switch (data->position) {
361 data->element = ARTICLEID;
364 data->element = REVID;
366 case CONTRIBUTOR_BLOCK:
367 data->element = EDITORID;
371 // minor tag has no character data, so we parse here
372 else if (strcmp(name,"minor") == 0) {
373 data->element = MINOR;
374 data->rev.minor = true;
376 else if (strcmp(name,"timestamp") == 0)
377 data->element = TIMESTAMP;
379 else if (strcmp(name, "username") == 0)
380 data->element = EDITOR;
// an <ip> value is collected into editorid; no editor name is set in that
// case, which is what write_row() reports as anon = 1
382 else if (strcmp(name,"ip") == 0)
383 data->element = EDITORID;
385 else if (strcmp(name,"comment") == 0)
386 data->element = COMMENT;
388 else if (strcmp(name,"text") == 0)
389 data->element = TEXT;
// structural tags whose own text is of no interest
391 else if (strcmp(name,"page") == 0
392 || strcmp(name,"mediawiki") == 0
393 || strcmp(name,"restrictions") == 0
394 || strcmp(name,"siteinfo") == 0)
395 data->element = UNUSED;
// Expat end-tag callback. A closing </revision> outside a skipped article
// completes one output row: the row is written first, then the per-revision
// data is cleaned up.
402 end(void* vdata, const XML_Char* name)
404 revisionData* data = (revisionData*) vdata;
405 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
406 write_row(data); // crucial... :)
407 cleanup_revision(data); // also crucial
// NOTE(review): the brace structure is incomplete in this excerpt, so it is
// not visible whether this reset runs for every end tag or only for
// </revision> -- confirm.
409 data->element = UNUSED; // sets our state to "not-in-useful"
410 } // thus avoiding unpleasant character data
411 // b/w tags (newlines etc.)
// Prints the usage text to stderr. argv[0] is shown as the program name;
// no other element of argv is read.
414 void print_usage(char* argv[]) {
415 fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
416 fprintf(stderr, "\n");
417 fprintf(stderr, "options:\n");
418 fprintf(stderr, " -t print text and comments after each line of tab separated data\n");
419 fprintf(stderr, "\n");
420 fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
421 fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
422 fprintf(stderr, "\n");
// column order of the default output
423 fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
424 fprintf(stderr, "\n");
425 fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
// Entry point: reads the option flags, sets up an Expat parser with the
// callbacks defined in this file, then feeds standard input to the parser
// buffer by buffer. Rows are printed by the callbacks during XML_Parse.
430 main(int argc, char *argv[])
433 enum outtype output_type;
435 // in "simple" output, we don't print text and comments
436 output_type = SIMPLE;
// accepted flags: -h and -t (see print_usage for -t)
439 while ((c = getopt(argc, argv, "ht")) != -1)
454 if (dry_run) { // lets us print initialization options
455 printf("simple_output = %i\n", output_type);
459 // create a new instance of the expat parser
// NOTE(review): XML_ParserCreate returns NULL when out of memory; no check
// is visible in this excerpt -- confirm.
460 XML_Parser parser = XML_ParserCreate(NULL);
462 // initialize the user data struct which is passed to callback functions
464 // initialize the elements of the struct to default values
465 init_data(&data, output_type);
468 // makes the parser pass "data" as the first argument to every callback
469 XML_SetUserData(parser, &data);
470 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
471 void (*endFnPtr)(void*, const XML_Char*) = end;
472 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
474 // sets start and end to be the element start and end handlers
475 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
476 // sets charhndl to be the callback for raw character data
477 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
484 // shovel data into the parser
487 // read into buf a bufferfull of data from standard input
488 size_t len = fread(buf, 1, sizeof(buf), stdin);
// a short read is taken to mean end of input
// NOTE(review): fread also returns a short count on a read error; no
// ferror() check is visible in this excerpt -- confirm.
489 done = len < sizeof(buf); // checks if we've got the last bufferfull
491 // passes the buffer of data to the parser and checks for error
492 // (this is where the callbacks are invoked)
493 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
// error text and the input line at which parsing failed
496 XML_ErrorString(XML_GetErrorCode(parser)),
497 (int) XML_GetCurrentLineNumber(parser));
503 XML_ParserFree(parser);