2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
16 #include "dtl/dtl.hpp"
// Character counts for the timestamp strings found in the dump.
21 // timestamp of the form 2003-11-07T00:43:23Z
// Length of the leading date part, e.g. "2003-11-07" (10 characters).
22 #define DATE_LENGTH 10
// Length of the whole timestamp, e.g. "2003-11-07T00:43:23Z" (20 characters).
24 #define TIMESTAMP_LENGTH 20
// 1024 * 1024 bytes; init_data() sizes the revision text buffer as a multiple of this.
26 #define MEGABYTE 1048576
// Size in bytes of each per-field buffer allocated by init_data().
27 #define FIELD_BUFFER_SIZE 1024
28 // 2048 KB in bytes + 1
29 //#define TEXT_BUFFER_SIZE 2097153
30 //#define TEXT_BUFFER_SIZE 10485760
// Identifies the XML element whose character data is currently being
// collected. start() assigns one of these to data->element; charhndl()
// ignores character data while the value is UNUSED.
33 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
34 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Which section of the document the parser is inside; stored in
// data->position. start() uses it to decide what an <id> element refers to,
// and the callbacks do nothing useful while it is SKIP.
37 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output mode. main() defaults to SIMPLE; with FULL, write_row() also prints
// the comment and text after each row.
39 enum outtype { FULL, SIMPLE };
43 // pointers to once-allocated buffers
// Tokens of the previously written revision's text. write_row() diffs the
// current revision's tokens against these and then stores the current ones.
55 vector<string> last_tokens;
57 // track string size of the elements, to prevent O(N^2) processing in charhndl
58 // when we have to take strlen for every character which we append to the buffer
60 size_t articleid_size;
64 size_t timestamp_size;
// Element whose character data charhndl() is currently appending.
73 enum elements element;
// FULL or SIMPLE; assigned by init_data().
75 enum outtype output_type;
80 /* free_data and clean_data
81 * Each takes a pointer to the data struct and an integer {0,1} indicating
82 * whether the title data needs to be cleared as well.
83 * clean_data resets the fields in place; free_data frees the memory dynamically allocated to store the data.
// Resets the fields of *data to empty strings and zero lengths so the
// buffers can be reused. Nothing is freed in the lines below.
//   data  - parse state shared by the expat callbacks.
//   title - flag {0,1}; per the header comment, non-zero when the title data
//           should be cleared as well (TODO confirm the exact condition).
86 clean_data(revisionData *data, int title)
88 // reset title (if we are switching articles)
90 data->title[0] = '\0';
91 data->articleid[0] = '\0';
93 data->articleid_size = 0;
// Per-revision fields: truncate each string by writing a NUL at index 0.
97 data->revid[0] = '\0';
100 data->timestamp[0] = '\0';
101 data->anon[0] = '\0';
102 data->editor[0] = '\0';
103 data->editorid[0] = '\0';
104 data->comment[0] = '\0';
105 data->text[0] = '\0';
107 // reset length tracking
// The *_size counters must stay in step with the strings above, because
// charhndl() appends at offset *_size instead of calling strlen().
108 data->revid_size = 0;
111 data->timestamp_size = 0;
113 data->editor_size = 0;
114 data->editorid_size = 0;
115 data->comment_size = 0;
118 // reset flags and element type info
// UNUSED makes charhndl() ignore character data until start() selects an element.
120 data->element = UNUSED;
// Releases buffers owned by *data (allocated with malloc in init_data()) and
// drops the cached token list.
//   data  - parse state whose buffers are released.
//   title - flag {0,1}; per the header comment, indicates whether the title
//           data is included (TODO confirm which frees it guards).
126 free_data(revisionData *data, int title)
129 //printf("freeing article\n");
131 free(data->articleid);
136 free(data->timestamp);
139 free(data->editorid);
// Empties the vector of tokens kept from the previous revision.
142 data->last_tokens.clear();
// Per-revision cleanup; end() calls this right after write_row() each time a
// </revision> element closes.
145 void cleanup_revision(revisionData *data) {
// Per-article cleanup; start() calls this when a <title> element opens, i.e.
// when a new article begins.
149 void cleanup_article(revisionData *data) {
// Forget the previous article's tokens so write_row() sees an empty
// last_tokens for the first revision of the new article.
151 data->last_tokens.clear();
// Allocates the string buffers of *data and records the output mode.
//   data        - parse state to initialise.
//   output_type - FULL or SIMPLE, stored in data->output_type.
// NOTE(review): none of the malloc() results is checked in the lines below,
// and charhndl() appends to these fixed-size buffers without a capacity
// check, so a field longer than its buffer would overflow it. Confirm
// whether input sizes are bounded elsewhere.
156 init_data(revisionData *data, outtype output_type)
158 data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'?
// Every other field gets a FIELD_BUFFER_SIZE-byte buffer.
159 data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
160 data->title = (char*) malloc(FIELD_BUFFER_SIZE);
161 data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
162 data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
163 data->date = (char*) malloc(FIELD_BUFFER_SIZE);
164 data->time = (char*) malloc(FIELD_BUFFER_SIZE);
165 data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
166 data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
167 data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
168 data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
171 // resets the data fields, null terminates strings, sets lengths
174 data->output_type = output_type;
177 /* for debugging only, prints out the state of the data struct
// Debugging aid: prints every field of *data to standard output, one
// "name = value" line per field.
180 print_state(revisionData *data)
// The two enum fields are printed as their integer values.
182 printf("element = %i\n", data->element);
183 printf("output_type = %i\n", data->output_type);
184 printf("title = %s\n", data->title);
185 printf("articleid = %s\n", data->articleid);
186 printf("revid = %s\n", data->revid);
187 printf("date = %s\n", data->date);
188 printf("time = %s\n", data->time);
189 printf("anon = %s\n", data->anon);
190 printf("editor = %s\n", data->editor);
191 printf("editorid = %s\n", data->editorid);
// minor is a flag; it is printed as "1" or "0".
192 printf("minor = %s\n", (data->minor ? "1" : "0"));
193 printf("comment = %s\n", data->comment);
194 printf("text = %s\n", data->text);
199 /* Write a header for the comma-separated output
204 // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
205 // printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
211 * write a line of tab-separated value formatted data to standard out
213 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
214 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
216 * it is called right before cleanup_revision() and cleanup_article()
// Emits one output row for the revision currently held in *data:
//   1. computes the MD5 digest of the revision text as a hex string,
//   2. splits the text into tokens at "." and at blank lines ("\n\n"),
//   3. diffs those tokens against data->last_tokens (the previous revision)
//      and collects the added and deleted tokens,
//   4. prints a tab-separated line to standard output, followed by the
//      comment and text when data->output_type is FULL.
219 write_row(revisionData *data)
224 md5_byte_t digest[16];
// Two hex characters per digest byte plus the terminating NUL.
225 char md5_hex_output[2 * 16 + 1];
227 md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
228 md5_finish(&state, digest);
230 for (di = 0; di < 16; ++di) {
231 sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
// Copy text_size bytes of the buffer into a std::string for tokenising.
234 string text = string(data->text, data->text_size);
235 vector<string> text_tokens;
236 size_t period_pos = 0;
237 size_t paragraph_pos = 0;
// NOTE(review): the loop condition needs BOTH searches to succeed, so
// tokenising stops as soon as either "." or "\n\n" can no longer be found,
// even if the other delimiter still occurs later in the text. Both searches
// also start at position + 1, so a delimiter at index 0 is never matched.
// This looks unintended - confirm against the expected token boundaries.
239 while ((period_pos = text.find(".", period_pos + 1)) != string::npos &&
240 (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) {
// Whichever delimiter comes first ends the current token.
241 if (paragraph_pos < period_pos) {
242 text_tokens.push_back(text.substr(start, paragraph_pos - start));
243 start = paragraph_pos;
245 text_tokens.push_back(text.substr(start, period_pos - start));
250 vector<string> additions;
251 vector<string> deletions;
// First revision seen since last_tokens was cleared: nothing to diff
// against yet, so just remember this revision's tokens.
253 if (data->last_tokens.empty()) {
254 data->last_tokens = text_tokens;
// Otherwise diff the previous revision's tokens against the current ones.
258 dtl::Diff< string, vector<string> > d(data->last_tokens, text_tokens);
259 //d.onOnlyEditDistance();
// Walk the edit script and sort its entries into additions and deletions.
262 vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
263 for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
264 switch (sit->second.type) {
// NOTE(review): the ADD/DEL lines below go to cout, i.e. the same standard
// output stream as the tab-separated rows, so they are interleaved with the
// data. Confirm whether this is leftover debug output.
266 cout << "ADD: \"" << sit->first << "\"" << endl;
267 additions.push_back(sit->first);
269 case dtl::SES_DELETE:
270 cout << "DEL: \"" << sit->first << "\"" << endl;
271 deletions.push_back(sit->first);
276 // apply regex to the diff
// The current tokens become the baseline for the next revision.
279 data->last_tokens = text_tokens;
283 // print line of tsv output
284 printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n",
290 (data->editor[0] != '\0') ? "0" : "1", // anon?
293 (data->minor) ? "1" : "0",
294 (unsigned int) data->text_size,
295 shannon_H(data->text, data->text_size),
297 (int) additions.size(),
298 (int) deletions.size()
// In FULL mode the comment and the revision text follow the row.
302 if (data->output_type == FULL) {
303 printf("comment:%s\ntext:\n%s\n", data->comment, data->text);
// Splits data->timestamp ("2003-11-07T00:43:23Z") into data->date (the first
// DATE_LENGTH characters) and data->time (TIME_LENGTH characters starting
// just after the 'T' separator).
// NOTE(review): strncpy() does not write a terminating NUL when the source
// has at least n characters, which is the case for the date copy from a
// full-length timestamp. No terminator is written in the lines below;
// confirm that date/time are terminated elsewhere before being printed.
309 split_timestamp(revisionData *data)
311 char *t = data->timestamp;
312 strncpy(data->date, data->timestamp, DATE_LENGTH);
// Skip the date and the single separator character that follows it.
313 char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
314 strncpy(data->time, timeinstamp, TIME_LENGTH);
317 // like strncat but with previously known length
// Appends at most n characters of src to dest, starting at offset dest_len,
// and NUL-terminates the result.
//   dest     - destination buffer; must have room for dest_len + n + 1 bytes.
//   src      - source characters; need not be NUL-terminated within n.
//   dest_len - current length of the string in dest, supplied by the caller
//              so that strlen(dest) is not recomputed on every append.
//   n        - maximum number of characters to copy.
// NOTE(review): the capacity of dest is not known here, so nothing prevents
// writing past the end of the buffer; callers must guarantee the space.
319 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
321 //size_t dest_len = strlen(dest);
// Copying stops early if a NUL is met in src before n characters.
324 for (i = 0 ; i < n && src[i] != '\0' ; i++)
325 dest[dest_len + i] = src[i];
326 dest[dest_len + i] = '\0';
// Expat character-data callback. Appends the len characters at s to the
// buffer selected by data->element and advances that buffer's length
// counter. One element's text may arrive over several calls, which is why
// the data is appended rather than assigned.
//   vdata - the revisionData* registered with XML_SetUserData().
//   s     - character data; not NUL-terminated.
//   len   - number of characters at s.
// NOTE(review): each *_size is advanced by len even though strlcatn() stops
// early at a NUL inside s, and no append checks the buffer capacity
// (FIELD_BUFFER_SIZE, or 4 MB for text). Confirm that inputs cannot exceed
// those sizes.
332 charhndl(void* vdata, const XML_Char* s, int len)
334 revisionData* data = (revisionData*) vdata;
// Ignore character data between elements of interest and in skipped blocks.
335 if (data->element != UNUSED && data->position != SKIP) {
338 //t[len] = '\0'; // makes t a well-formed string
339 switch (data->element) {
341 // printf("buffer length = %i, text: %s\n", len, t);
342 strlcatn(data->text, s, data->text_size, len);
343 data->text_size += len;
346 strlcatn(data->comment, s, data->comment_size, len);
347 data->comment_size += len;
350 strlcatn(data->title, s, data->title_size, len);
351 data->title_size += len;
354 // printf("articleid = %s\n", t);
355 strlcatn(data->articleid, s, data->articleid_size, len);
356 data->articleid_size += len;
359 // printf("revid = %s\n", t);
360 strlcatn(data->revid, s, data->revid_size, len);
361 data->revid_size += len;
364 strlcatn(data->timestamp, s, data->timestamp_size, len);
365 data->timestamp_size += len;
// Once the whole timestamp has arrived, derive the date and time fields.
366 if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
367 split_timestamp(data);
370 strlcatn(data->editor, s, data->editor_size, len);
371 data->editor_size += len;
374 //printf("editorid = %s\n", t);
375 strlcatn(data->editorid, s, data->editorid_size, len);
376 data->editorid_size += len;
378 /* the following are implied or skipped:
380 printf("found minor element\n"); doesn't work
381 break; minor tag is just a tag
// Expat start-element callback. Maps the element name to data->element (which
// buffer charhndl() fills next) and, for structural elements, updates
// data->position.
//   vdata - the revisionData* registered with XML_SetUserData().
//   name  - element name.
//   attr  - element attributes; not used in the lines below.
390 start(void* vdata, const XML_Char* name, const XML_Char** attr)
392 revisionData* data = (revisionData*) vdata;
// A <title> marks the start of a new article; it is handled even when the
// current position is SKIP.
394 if (strcmp(name,"title") == 0) {
395 cleanup_article(data); // cleans up data from last article
396 data->element = TITLE;
397 data->position = TITLE_BLOCK;
398 } else if (data->position != SKIP) {
399 if (strcmp(name,"revision") == 0) {
400 data->element = REVISION;
401 data->position = REVISION_BLOCK;
402 } else if (strcmp(name, "contributor") == 0) {
403 data->element = CONTRIBUTOR;
404 data->position = CONTRIBUTOR_BLOCK;
// <id> is ambiguous: what it identifies depends on the enclosing block.
405 } else if (strcmp(name,"id") == 0)
406 switch (data->position) {
408 data->element = ARTICLEID;
411 data->element = REVID;
413 case CONTRIBUTOR_BLOCK:
414 data->element = EDITORID;
418 // minor tag has no character data, so we parse here
419 else if (strcmp(name,"minor") == 0) {
420 data->element = MINOR;
423 else if (strcmp(name,"timestamp") == 0)
424 data->element = TIMESTAMP;
426 else if (strcmp(name, "username") == 0)
427 data->element = EDITOR;
// The text of an <ip> element is collected into the editorid buffer.
429 else if (strcmp(name,"ip") == 0)
430 data->element = EDITORID;
432 else if (strcmp(name,"comment") == 0)
433 data->element = COMMENT;
435 else if (strcmp(name,"text") == 0)
436 data->element = TEXT;
// Container elements carry no character data of interest.
438 else if (strcmp(name,"page") == 0
439 || strcmp(name,"mediawiki") == 0
440 || strcmp(name,"restrictions") == 0
441 || strcmp(name,"siteinfo") == 0)
442 data->element = UNUSED;
// Expat end-element callback. When a </revision> closes outside a skipped
// block, writes the collected revision as one output row and resets the
// per-revision state.
//   vdata - the revisionData* registered with XML_SetUserData().
//   name  - name of the element being closed.
449 end(void* vdata, const XML_Char* name)
451 revisionData* data = (revisionData*) vdata;
452 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
453 write_row(data); // crucial... :)
454 cleanup_revision(data); // also crucial
456 data->element = UNUSED; // sets our state to "not-in-useful"
457 } // thus avoiding unpleasant character data
458 // b/w tags (newlines etc.)
// Prints the command-line help text to standard error.
//   argv - the program's argument vector; argv[0] is used as the program name.
// NOTE(review): the column list printed below names 11 columns, while the
// format string in write_row() emits 13 tab-separated columns (two integer
// counts, the numbers of added and deleted tokens, follow the md5). Confirm
// and update the help text.
461 void print_usage(char* argv[]) {
462 fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
463 fprintf(stderr, "\n");
464 fprintf(stderr, "options:\n");
465 fprintf(stderr, " -t print text and comments after each line of tab separated data\n");
466 fprintf(stderr, "\n");
467 fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
468 fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
469 fprintf(stderr, "\n");
470 fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n");
471 fprintf(stderr, "\n");
472 fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
// Program entry point: parses the command-line options, sets up an expat
// parser with the callbacks start(), end() and charhndl(), then feeds
// standard input to the parser one BUFSIZ-sized buffer at a time. Output rows
// are produced by the callbacks during XML_Parse().
477 main(int argc, char *argv[])
480 enum outtype output_type;
482 // in "simple" output, we don't print text and comments
483 output_type = SIMPLE;
// Recognised option letters: -h and -t.
486 while ((c = getopt(argc, argv, "ht")) != -1)
501 if (dry_run) { // lets us print initialization options
502 printf("simple_output = %i\n", output_type);
506 // create a new instance of the expat parser
507 XML_Parser parser = XML_ParserCreate("UTF-8");
509 // initialize the user data struct which is passed to callback functions
511 // initialize the elements of the struct to default values
512 init_data(&data, output_type);
515 // makes the parser pass "data" as the first argument to every callback
516 XML_SetUserData(parser, &data);
// Function pointers with the exact callback signatures expat expects.
517 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
518 void (*endFnPtr)(void*, const XML_Char*) = end;
519 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
521 // sets start and end to be the element start and end handlers
522 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
523 // sets charhndl to be the callback for character data
524 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
529 // shovel data into the parser
532 // read into buf a bufferfull of data from standard input
533 size_t len = fread(buf, 1, BUFSIZ, stdin);
// A short read means end of input (or a read error); this is the final chunk.
534 done = len < BUFSIZ; // checks if we've got the last bufferfull
536 // passes the buffer of data to the parser and checks for error
537 // (this is where the callbacks are invoked)
538 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
541 XML_ErrorString(XML_GetErrorCode(parser)),
542 (int) XML_GetCurrentLineNumber(parser));
548 XML_ParserFree(parser);