2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
14 // timestamp of the form 2003-11-07T00:43:23Z
15 #define DATE_LENGTH 10
17 #define TIMESTAMP_LENGTH 20
19 #define MEGABYTE 1048576
20 #define FIELD_BUFFER_SIZE 1024
21 // 2048 KB in bytes + 1
22 //#define TEXT_BUFFER_SIZE 2097153
23 //#define TEXT_BUFFER_SIZE 10485760
26 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
27 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
30 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
32 enum outtype { FULL, SIMPLE };
36 // pointers to once-allocated buffers
49 // track the current length of each field so that charhndl can append directly;
50 // taking strlen of the buffer for every chunk we append would make processing O(N^2)
52 size_t articleid_size;
56 size_t timestamp_size;
65 enum elements element;
67 enum outtype output_type;
72 /* free_data and clean_data
73 * Each takes a pointer to the data struct and an integer {0,1} indicating whether
74 * the title data is included as well.
75 * clean_data resets the stored fields; free_data frees the memory dynamically allocated to store data.
78 clean_data(revisionData *data, int title)
80 // reset title (if we are switching articles)
82 data->title[0] = '\0';
83 data->articleid[0] = '\0';
85 data->articleid_size = 0;
89 data->revid[0] = '\0';
92 data->timestamp[0] = '\0';
94 data->editor[0] = '\0';
95 data->editorid[0] = '\0';
96 data->comment[0] = '\0';
99 // reset length tracking
100 data->revid_size = 0;
103 data->timestamp_size = 0;
105 data->editor_size = 0;
106 data->editorid_size = 0;
107 data->comment_size = 0;
110 // reset flags and element type info
112 data->element = UNUSED;
118 free_data(revisionData *data, int title)
121 //printf("freeing article\n");
123 free(data->articleid);
128 free(data->timestamp);
131 free(data->editorid);
136 void cleanup_revision(revisionData *data) {
140 void cleanup_article(revisionData *data) {
146 init_data(revisionData *data, outtype output_type)
148 data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'?
149 data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
150 data->title = (char*) malloc(FIELD_BUFFER_SIZE);
151 data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
152 data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
153 data->date = (char*) malloc(FIELD_BUFFER_SIZE);
154 data->time = (char*) malloc(FIELD_BUFFER_SIZE);
155 data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
156 data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
157 data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
158 data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
161 // resets the data fields, null terminates strings, sets lengths
164 data->output_type = output_type;
167 /* for debugging only, prints out the state of the data struct
170 print_state(revisionData *data)
172 printf("element = %i\n", data->element);
173 printf("output_type = %i\n", data->output_type);
174 printf("title = %s\n", data->title);
175 printf("articleid = %s\n", data->articleid);
176 printf("revid = %s\n", data->revid);
177 printf("date = %s\n", data->date);
178 printf("time = %s\n", data->time);
179 printf("anon = %s\n", data->anon);
180 printf("editor = %s\n", data->editor);
181 printf("editorid = %s\n", data->editorid);
182 printf("minor = %s\n", (data->minor ? "1" : "0"));
183 printf("comment = %s\n", data->comment);
184 printf("text = %s\n", data->text);
189 /* Write a header for the tab-separated output
194 // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
195 // printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
201 * write a line of tab-separated data to standard out
203 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
204 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
206 * it is called right before cleanup_revision() and cleanup_article()
209 write_row(revisionData *data)
212 // TODO: make it so you can specify fields to output
213 // note that date and time are separated by a space, to match postgres's
215 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
221 (data->editor[0] != '\0') ? "0" : "1", // anon?
224 (data->minor) ? "1" : "0");
225 switch (data->output_type)
228 printf("\t%i\n", (unsigned int) strlen(data->text));
232 printf("\t%s\t%s\n", data->comment, data->text);
239 *append(char *entry, char *newstr)
243 len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
244 newbuff = (char*) realloc(entry, len);
245 strcat(newbuff, newstr);
250 *cache(char *entry, char *newstr)
254 len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
255 newbuff = (char*) malloc(len);
256 strcpy(newbuff,newstr);
262 *store(char *entry, char *newstr)
266 newbuff = cache(entry, newstr);
268 newbuff = append(entry, newstr);
273 split_timestamp(revisionData *data)
275 char *t = data->timestamp;
276 strncpy(data->date, data->timestamp, DATE_LENGTH);
277 char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
278 strncpy(data->time, timeinstamp, TIME_LENGTH);
281 /* currently unused */
283 is_whitespace(char *string) {
284 int len = strlen(string);
285 while (isspace(string[0]) && strlen(string) > 0) {
288 if (strcmp(string, "") == 0)
294 // like strncat, but the caller passes the current length of dest (dest_len) so it is not recomputed with strlen
296 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
298 //size_t dest_len = strlen(dest);
301 for (i = 0 ; i < n && src[i] != '\0' ; i++)
302 dest[dest_len + i] = src[i];
303 dest[dest_len + i] = '\0';
309 charhndl(void* vdata, const XML_Char* s, int len)
311 revisionData* data = (revisionData*) vdata;
312 if (data->element != UNUSED && data->position != SKIP) {
315 //t[len] = '\0'; // makes t a well-formed string
316 switch (data->element) {
318 // printf("buffer length = %i, text: %s\n", len, t);
319 strlcatn(data->text, s, data->text_size, len);
320 data->text_size += len;
323 strlcatn(data->comment, s, data->comment_size, len);
324 data->comment_size += len;
327 strlcatn(data->title, s, data->title_size, len);
328 data->title_size += len;
331 // printf("articleid = %s\n", t);
332 strlcatn(data->articleid, s, data->articleid_size, len);
333 data->articleid_size += len;
336 // printf("revid = %s\n", t);
337 strlcatn(data->revid, s, data->revid_size, len);
338 data->revid_size += len;
341 strlcatn(data->timestamp, s, data->timestamp_size, len);
342 data->timestamp_size += len;
343 if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
344 split_timestamp(data);
347 strlcatn(data->editor, s, data->editor_size, len);
348 data->editor_size += len;
351 //printf("editorid = %s\n", t);
352 strlcatn(data->editorid, s, data->editorid_size, len);
353 data->editorid_size += len;
355 /* the following are implied or skipped:
357 printf("found minor element\n"); doesn't work
358 break; minor tag is just a tag
367 start(void* vdata, const XML_Char* name, const XML_Char** attr)
369 revisionData* data = (revisionData*) vdata;
371 if (strcmp(name,"title") == 0) {
372 cleanup_article(data); // cleans up data from last article
373 data->element = TITLE;
374 data->position = TITLE_BLOCK;
375 } else if (data->position != SKIP) {
376 if (strcmp(name,"revision") == 0) {
377 data->element = REVISION;
378 data->position = REVISION_BLOCK;
379 } else if (strcmp(name, "contributor") == 0) {
380 data->element = CONTRIBUTOR;
381 data->position = CONTRIBUTOR_BLOCK;
382 } else if (strcmp(name,"id") == 0)
383 switch (data->position) {
385 data->element = ARTICLEID;
388 data->element = REVID;
390 case CONTRIBUTOR_BLOCK:
391 data->element = EDITORID;
395 // minor tag has no character data, so we parse here
396 else if (strcmp(name,"minor") == 0) {
397 data->element = MINOR;
400 else if (strcmp(name,"timestamp") == 0)
401 data->element = TIMESTAMP;
403 else if (strcmp(name, "username") == 0)
404 data->element = EDITOR;
406 else if (strcmp(name,"ip") == 0)
407 data->element = EDITORID;
409 else if (strcmp(name,"comment") == 0)
410 data->element = COMMENT;
412 else if (strcmp(name,"text") == 0)
413 data->element = TEXT;
415 else if (strcmp(name,"page") == 0
416 || strcmp(name,"mediawiki") == 0
417 || strcmp(name,"restrictions") == 0
418 || strcmp(name,"siteinfo") == 0)
419 data->element = UNUSED;
426 end(void* vdata, const XML_Char* name)
428 revisionData* data = (revisionData*) vdata;
429 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
430 write_row(data); // crucial... :)
431 cleanup_revision(data); // also crucial
433 data->element = UNUSED; // sets our state to "not-in-useful"
434 } // thus avoiding unpleasant character data
435 // b/w tags (newlines etc.)
438 void print_usage(char* argv[]) {
439 fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
440 fprintf(stderr, "\n");
441 fprintf(stderr, "options:\n");
442 fprintf(stderr, " -t print text and comments after each line of tab separated data\n");
443 fprintf(stderr, "\n");
444 fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
445 fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
446 fprintf(stderr, "\n");
447 fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n");
448 fprintf(stderr, "\n");
449 fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
454 main(int argc, char *argv[])
457 enum outtype output_type;
459 // in "simple" output, we don't print text and comments
460 output_type = SIMPLE;
463 while ((c = getopt(argc, argv, "ht")) != -1)
478 if (dry_run) { // lets us print initialization options
479 printf("simple_output = %i\n", output_type);
483 // create a new instance of the expat parser
484 XML_Parser parser = XML_ParserCreate("UTF-8");
486 // initialize the user data struct which is passed to callback functions
488 // initialize the elements of the struct to default values
489 init_data(&data, output_type);
492 // makes the parser pass "data" as the first argument to every callback
493 XML_SetUserData(parser, &data);
494 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
495 void (*endFnPtr)(void*, const XML_Char*) = end;
496 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
498 // sets start and end to be the element start and end handlers
499 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
500 // sets charhndl to be the callback for character data
501 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
506 // shovel data into the parser
509 // read into buf a bufferful of data from standard input
510 size_t len = fread(buf, 1, BUFSIZ, stdin);
511 done = len < BUFSIZ; // checks if we've got the last bufferful
513 // passes the buffer of data to the parser and checks for error
514 // (this is where the callbacks are invoked)
515 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
518 XML_ErrorString(XML_GetErrorCode(parser)),
519 (int) XML_GetCurrentLineNumber(parser));
525 XML_ParserFree(parser);