/*
 * An XML parser for Wikipedia data dumps.
 * Converts XML files to tab-separated values files readable by spreadsheets
 * and statistical packages.
 */
15 #include "dtl/dtl.hpp"
// Sizes used when splitting timestamps and allocating the parse buffers.
23 // timestamp of the form 2003-11-07T00:43:23Z
// DATE_LENGTH covers the leading "2003-11-07" part of the example above.
24 #define DATE_LENGTH 10
// TIMESTAMP_LENGTH is the length of the whole example (date, 'T', time, 'Z').
26 #define TIMESTAMP_LENGTH 20
28 #define MEGABYTE 1048576
// allocation size of every fixed-size field buffer created in init_data
29 #define FIELD_BUFFER_SIZE 1024
31 // this can be changed at runtime if we encounter an article larger than 10mb
// (charhndl reallocs the text buffer and stores the new capacity here)
32 size_t text_buffer_size = 10 * MEGABYTE;
// Enumerators naming the XML element whose character data is currently being
// collected; start() assigns them to the `element` field and charhndl() switches on it.
// NOTE(review): the line opening this enum is not visible in this excerpt.
35 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
36 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Section of the page the parser is currently inside (stored in `position`);
// the callbacks ignore input while the value is SKIP.
39 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output mode: with FULL, write_row also prints the comment and text after each row.
41 enum outtype { FULL, SIMPLE };
// Members of the parse-state struct that the callbacks receive as revisionData*.
// NOTE(review): the struct header and several members are not visible in this excerpt.
45 // pointers to once-allocated buffers
// whitespace-separated tokens of the previous revision's text; write_row diffs against them
57 vector<string> last_text_tokens;
// title filters: write_row looks for at least one match against the page title
60 vector<pcrecpp::RE> title_regexes;
62 // regexes for checking with revisions
// names and patterns are pushed together in main, so index n of one matches index n of the other
63 vector<string> content_regex_names;
64 vector<pcrecpp::RE> content_regexes;
66 // regexes for looking within diffs
67 vector<string> diff_regex_names;
68 vector<pcrecpp::RE> diff_regexes;
// maps the hex MD5 of a revision's text to the id of the latest revision with that text
70 map<string, string> revision_md5; // used for detecting reversions
72 // track string size of the elements, to prevent O(N^2) processing in charhndl
73 // when we have to take strlen for every character which we append to the buffer
75 size_t articleid_size;
79 size_t timestamp_size;
// element whose character data is currently being collected (UNUSED between tags)
88 enum elements element;
// SIMPLE or FULL, set once in init_data
90 enum outtype output_type;
/* clean_data and free_data
 * Both take a pointer to the data struct and an integer {0,1} indicating
 * whether the title data should be handled as well.
 * clean_data resets the fields so the buffers can be reused; free_data
 * frees the memory dynamically allocated to store the data.
 */
// clean_data: empties the collected string fields (first byte set to NUL) and
// zeroes their tracked lengths so the same buffers can be reused.
// NOTE(review): `title` presumably guards the title/articleid reset below; the
// guarding statement and some of the resets are not visible in this excerpt -- confirm.
101 clean_data(revisionData *data, int title)
103 // reset title (if we are switching articles)
105 data->title[0] = '\0';
106 data->articleid[0] = '\0';
107 data->title_size = 0;
108 data->articleid_size = 0;
// per-revision fields: truncate each string in place
112 data->revid[0] = '\0';
113 data->date[0] = '\0';
114 data->time[0] = '\0';
115 data->timestamp[0] = '\0';
116 data->anon[0] = '\0';
117 data->editor[0] = '\0';
118 data->editorid[0] = '\0';
119 data->comment[0] = '\0';
120 data->text[0] = '\0';
122 // reset length tracking
123 data->revid_size = 0;
126 data->timestamp_size = 0;
128 data->editor_size = 0;
129 data->editorid_size = 0;
130 data->comment_size = 0;
133 // reset flags and element type info
// UNUSED makes charhndl ignore character data until start() selects an element again
135 data->element = UNUSED;
// free_data: releases heap buffers owned by the data struct and empties the
// stored token list.
// NOTE(review): only some of the free() calls are visible in this excerpt, and the
// use of `title` cannot be seen -- confirm every buffer from init_data is released.
141 free_data(revisionData *data, int title)
144 //printf("freeing article\n");
146 free(data->articleid);
151 free(data->timestamp);
154 free(data->editorid);
157 data->last_text_tokens.clear();
// cleanup_revision: per-revision cleanup; end() calls it right after write_row().
// NOTE(review): the body is not visible in this excerpt.
160 void cleanup_revision(revisionData *data) {
// cleanup_article: per-article cleanup; start() calls it when a <title> element opens.
164 void cleanup_article(revisionData *data) {
// forget the previous article's diff baseline and revert table so neither
// carries over into the next article
166 data->last_text_tokens.clear();
167 data->revision_md5.clear();
// init_data: allocates the buffers the callbacks append into and records the
// requested output type.
//   data        - parse state to initialise
//   output_type - SIMPLE or FULL, stored for write_row
// NOTE(review): none of the malloc results are checked in the visible code.
172 init_data(revisionData *data, outtype output_type)
// the article text gets the large, growable buffer (see text_buffer_size)
174 data->text = (char*) malloc(text_buffer_size);
// every other field gets a fixed FIELD_BUFFER_SIZE-byte buffer
175 data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
176 data->title = (char*) malloc(FIELD_BUFFER_SIZE);
177 data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
178 data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
179 data->date = (char*) malloc(FIELD_BUFFER_SIZE);
180 data->time = (char*) malloc(FIELD_BUFFER_SIZE);
181 data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
182 data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
183 data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
184 data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
187 // resets the data fields, null terminates strings, sets lengths
190 data->output_type = output_type;
/* for debugging only: prints out the state of the data struct */
// print_state: debugging aid that prints the fields of the data struct to
// standard output, one "name = value" line per field.
196 print_state(revisionData *data)
// the two enum fields are printed as their integer values
198 printf("element = %i\n", data->element);
199 printf("output_type = %i\n", data->output_type);
200 printf("title = %s\n", data->title);
201 printf("articleid = %s\n", data->articleid);
202 printf("revid = %s\n", data->revid);
203 printf("date = %s\n", data->date);
204 printf("time = %s\n", data->time);
205 printf("anon = %s\n", data->anon);
206 printf("editor = %s\n", data->editor);
207 printf("editorid = %s\n", data->editorid);
// the minor flag is rendered as "1" or "0"
208 printf("minor = %s\n", (data->minor ? "1" : "0"));
209 printf("comment = %s\n", data->comment);
210 printf("text = %s\n", data->text);
/*
 * write a line of tab-separated value formatted data to standard out
 * in the following column order:
 * title, articleid, revid, timestamp (date and time), anon, editor, editorid,
 * minor, text_size, text_md5, reversion, additions_size, deletions_size,
 * followed by one column per content regex and two (add/del) per diff regex
 *
 * it is called from end() right before cleanup_revision()
 */
// write_row: emits one output row for the revision currently held in `data`.
// Visible steps: hash the text for revert detection, tokenize it on whitespace,
// apply the title filter, run the content regexes, diff the tokens against the
// previous revision, run the diff regexes, then print the row.
// NOTE(review): several statements of this function are not visible in this excerpt.
225 write_row(revisionData *data)
// MD5 of the revision text, rendered below as 32 hex characters plus a NUL
230 md5_byte_t digest[16];
231 char md5_hex_output[2 * 16 + 1];
233 md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
234 md5_finish(&state, digest);
236 for (di = 0; di < 16; ++di) {
237 sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
// an earlier revision with the same hash means this revision reverts to it
241 map<string, string>::iterator prev_revision = data->revision_md5.find(md5_hex_output);
242 if (prev_revision != data->revision_md5.end()) {
243 reverted_to = prev_revision->second; // id of previous revision
// remember (or overwrite with) this revision as the latest one with this hash
245 data->revision_md5[md5_hex_output] = data->revid;
// split the text on whitespace; the tokens are the units handed to the diff
247 string text = string(data->text, data->text_size);
248 vector<string> text_tokens;
251 while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) {
252 //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl;
253 text_tokens.push_back(text.substr(start, pos - start));
258 // look to see if (a) we've passed in a list of /any/ title_regexes
259 // and (b) if all of the title_regex_matches match
260 // if (a) is true and (b) is not, we return
// NOTE(review): the loop below sets the flag on the first match, i.e. it tests
// "any" regex rather than "all" as the comment above says -- confirm intent.
261 bool any_title_regex_match = false;
262 if (!data->title_regexes.empty()) {
263 for (vector<pcrecpp::RE>::iterator r = data->title_regexes.begin(); r != data->title_regexes.end(); ++r) {
264 pcrecpp::RE& title_regex = *r;
265 if (title_regex.PartialMatch(data->title)) {
266 any_title_regex_match = true;
270 if (!any_title_regex_match) {
275 // search the content of the revision for a any of the regexes
// one boolean per content regex, in the same order as content_regex_names
276 vector<bool> content_regex_matches;
277 if (!data->content_regexes.empty()) {
278 for (vector<pcrecpp::RE>::iterator r = data->content_regexes.begin(); r != data->content_regexes.end(); ++r) {
279 pcrecpp::RE& content_regex = *r;
// NOTE(review): data->text is passed as a C string here, whereas the hash and
// tokenizer above use the explicit text_size
280 content_regex_matches.push_back(content_regex.PartialMatch(data->text));
284 //vector<string> additions;
285 //vector<string> deletions;
289 vector<bool> diff_regex_matches_adds;
290 vector<bool> diff_regex_matches_dels;
// with no previous revision to diff against, the whole text counts as added
292 if (data->last_text_tokens.empty()) {
293 additions = data->text;
// otherwise diff the previous tokens against the current ones
297 dtl::Diff< string, vector<string> > d(data->last_text_tokens, text_tokens);
298 //d.onOnlyEditDistance();
// walk the edit script, concatenating added and deleted tokens separately
301 vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
302 for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
303 switch (sit->second.type) {
305 //cout << "ADD: \"" << sit->first << "\"" << endl;
306 additions += sit->first;
308 case dtl::SES_DELETE:
309 //cout << "DEL: \"" << sit->first << "\"" << endl;
310 deletions += sit->first;
// diff regexes run only on non-empty additions/deletions; when a side is empty
// its match vector stays empty and the printing code below reports FALSE
316 if (!additions.empty()) {
317 //cout << "ADD: " << additions << endl;
318 for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
319 pcrecpp::RE& diff_regex = *r;
320 diff_regex_matches_adds.push_back(diff_regex.PartialMatch(additions));
324 if (!deletions.empty()) {
325 //cout << "DEL: " << deletions << endl;
326 for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
327 pcrecpp::RE& diff_regex = *r;
328 diff_regex_matches_dels.push_back(diff_regex.PartialMatch(deletions));
// the current tokens become the baseline for the next revision's diff
332 data->last_text_tokens = text_tokens;
335 // print line of tsv output
337 << data->title << "\t"
338 << data->articleid << "\t"
339 << data->revid << "\t"
341 << data->time << "\t"
// anon column: TRUE when no editor name was collected for this revision
342 << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t"
343 << data->editor << "\t"
344 << data->editorid << "\t"
345 << ((data->minor) ? "TRUE" : "FALSE") << "\t"
346 << (unsigned int) data->text_size << "\t"
347 << md5_hex_output << "\t"
348 << reverted_to << "\t"
349 << (int) additions.size() << "\t"
350 << (int) deletions.size();
// NOTE(review): `int n` is compared against an unsigned size() in both loops below
352 for (int n = 0; n < data->content_regex_names.size(); ++n) {
353 cout << "\t" << ((!content_regex_matches.empty()
354 && content_regex_matches.at(n)) ? "TRUE" : "FALSE");
// two columns per diff regex: match in additions, then match in deletions
357 for (int n = 0; n < data->diff_regex_names.size(); ++n) {
358 cout << "\t" << ((!diff_regex_matches_adds.empty() && diff_regex_matches_adds.at(n)) ? "TRUE" : "FALSE")
359 << "\t" << ((!diff_regex_matches_dels.empty() && diff_regex_matches_dels.at(n)) ? "TRUE" : "FALSE");
// FULL output appends the raw comment and text after the row
364 if (data->output_type == FULL) {
365 cout << "comment:" << data->comment << endl
366 << "text:" << endl << data->text << endl;
372 split_timestamp(revisionData *data)
374 char *t = data->timestamp;
375 strncpy(data->date, data->timestamp, DATE_LENGTH);
376 char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
377 strncpy(data->time, timeinstamp, TIME_LENGTH);
/*
 * strlcatn
 * Like strncat, but the caller supplies the current length of dest, so the
 * destination never has to be scanned for its terminator.
 *
 *   dest     - destination buffer; must have room for dest_len + n + 1 bytes
 *   src      - bytes to append (need not be NUL-terminated if it holds n bytes)
 *   dest_len - offset in dest at which the appended bytes are written
 *   n        - maximum number of bytes taken from src
 *
 * Copying stops early at a NUL in src. The result is always NUL-terminated.
 * Returns dest.
 */
char *
strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
{
    char *out = dest + dest_len;
    size_t i = 0;

    while (i < n && src[i] != '\0') {
        out[i] = src[i];
        ++i;
    }
    out[i] = '\0';

    return dest;
}
394 charhndl(void* vdata, const XML_Char* s, int len)
396 revisionData* data = (revisionData*) vdata;
398 if (data->element != UNUSED && data->position != SKIP) {
399 switch (data->element) {
401 // check if we'd overflow our buffer
402 bufsz = data->text_size + len;
403 if (bufsz + 1 > text_buffer_size) {
404 data->text = (char*) realloc(data->text, bufsz + 1);
405 text_buffer_size = bufsz + 1;
407 strlcatn(data->text, s, data->text_size, len);
408 data->text_size = bufsz;
411 strlcatn(data->comment, s, data->comment_size, len);
412 data->comment_size += len;
415 strlcatn(data->title, s, data->title_size, len);
416 data->title_size += len;
419 // printf("articleid = %s\n", t);
420 strlcatn(data->articleid, s, data->articleid_size, len);
421 data->articleid_size += len;
424 // printf("revid = %s\n", t);
425 strlcatn(data->revid, s, data->revid_size, len);
426 data->revid_size += len;
429 strlcatn(data->timestamp, s, data->timestamp_size, len);
430 data->timestamp_size += len;
431 if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
432 split_timestamp(data);
435 strlcatn(data->editor, s, data->editor_size, len);
436 data->editor_size += len;
439 //printf("editorid = %s\n", t);
440 strlcatn(data->editorid, s, data->editorid_size, len);
441 data->editorid_size += len;
443 /* the following are implied or skipped:
445 printf("found minor element\n"); doesn't work
446 break; minor tag is just a tag
// start: start-element callback registered with the XML parser.
//   vdata - the revisionData passed via XML_SetUserData
//   name  - name of the element being opened
//   attr  - element attributes (not used in the visible code)
// Records in data->element which field the following character data belongs to
// and, for structural tags, updates data->position.
// NOTE(review): some case labels and statements are not visible in this excerpt.
455 start(void* vdata, const XML_Char* name, const XML_Char** attr)
457 revisionData* data = (revisionData*) vdata;
// <title> is handled even while position is SKIP: it marks the start of a new article
459 if (strcmp(name,"title") == 0) {
460 cleanup_article(data); // cleans up data from last article
461 data->element = TITLE;
462 data->position = TITLE_BLOCK;
463 } else if (data->position != SKIP) {
464 if (strcmp(name,"revision") == 0) {
465 data->element = REVISION;
466 data->position = REVISION_BLOCK;
467 } else if (strcmp(name, "contributor") == 0) {
468 data->element = CONTRIBUTOR;
469 data->position = CONTRIBUTOR_BLOCK;
// <id> is ambiguous on its own: the enclosing block decides which id it is
470 } else if (strcmp(name,"id") == 0)
471 switch (data->position) {
473 data->element = ARTICLEID;
476 data->element = REVID;
478 case CONTRIBUTOR_BLOCK:
479 data->element = EDITORID;
483 // minor tag has no character data, so we parse here
484 else if (strcmp(name,"minor") == 0) {
485 data->element = MINOR;
488 else if (strcmp(name,"timestamp") == 0)
489 data->element = TIMESTAMP;
491 else if (strcmp(name, "username") == 0)
492 data->element = EDITOR;
// an <ip> value is collected into the same field as a contributor <id>
494 else if (strcmp(name,"ip") == 0)
495 data->element = EDITORID;
497 else if (strcmp(name,"comment") == 0)
498 data->element = COMMENT;
500 else if (strcmp(name,"text") == 0)
501 data->element = TEXT;
// container tags carry no data of their own
503 else if (strcmp(name,"page") == 0
504 || strcmp(name,"mediawiki") == 0
505 || strcmp(name,"restrictions") == 0
506 || strcmp(name,"siteinfo") == 0)
507 data->element = UNUSED;
514 end(void* vdata, const XML_Char* name)
516 revisionData* data = (revisionData*) vdata;
517 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
518 write_row(data); // crucial... :)
519 cleanup_revision(data); // also crucial
521 data->element = UNUSED; // sets our state to "not-in-useful"
522 } // thus avoiding unpleasant character data
523 // b/w tags (newlines etc.)
/*
 * print_usage
 * Writes the command-line help text to standard error.
 *
 *   argv - the program's argument vector; argv[0] is shown as the program name
 */
void print_usage(char* argv[]) {
    // a space separates the program name from "[options]"; without it the two
    // run together in the printed usage line
    std::cerr << "usage: <wikimedia dump xml> | " << argv[0] << " [options]\n"
              << "\n"
              << "options:\n"
              << " -v verbose mode prints text and comments after each line of tab separated data\n"
              << " -n name of the following regex for content (e.g. -n name -r \"...\")\n"
              << " -r regex to check against content of the revision\n"
              << " -N name of the following regex for diffs (e.g. -N name -R \"...\")\n"
              << " -R regex to check against diffs (i.e., additions and deletions)\n"
              << " -t parse revisions only from pages whose titles match regex(es)\n"
              << "\n"
              << "Takes a wikimedia data dump XML stream on standard in, and produces\n"
              << "a tab-separated stream of revisions on standard out:\n"
              << "\n"
              << "title, articleid, revid, timestamp, anon, editor, editorid, minor,\n"
              << "text_length, text_md5, reversion, additions_size, deletions_size\n"
              << ".... and additional fields for each regex executed against add/delete diffs\n"
              << "\n"
              << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank\n"
              << "unless the article is a revert to a previous revision, in which case, it\n"
              << "contains the revision ID of the revision which was reverted to.\n"
              << "\n"
              << "authors: Erik Garrison <erik@hypervolu.me>\n"
              << " Benjamin Mako Hill <mako@atdot.cc>\n";
}
// main: parses the command-line options, sets up the XML parser and its
// callbacks, prints the header row, then feeds standard input to the parser
// one buffer at a time.
// NOTE(review): the option case labels and several declarations are not visible
// in this excerpt.
554 main(int argc, char *argv[])
557 enum outtype output_type;
559 // in "simple" output, we don't print text and comments
560 output_type = SIMPLE;
// a regex name is held here until the regex option that follows it consumes it
562 string diff_regex_name;
563 string content_regex_name;
565 // the user data struct which is passed to callback functions
// NOTE(review): the option string below has no "N:" or "R:", although the usage
// text documents -N/-R and diff regexes are handled further down -- as written
// getopt would reject those options; confirm and extend the string.
568 while ((c = getopt(argc, argv, "hvn:r:t:")) != -1)
578 content_regex_name = optarg;
// each regex is stored together with the pending name (possibly empty), keeping
// the name and regex vectors index-aligned
581 data.content_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
582 data.content_regex_names.push_back(content_regex_name);
583 if (!content_regex_name.empty()) {
584 content_regex_name.clear();
588 diff_regex_name = optarg;
591 data.diff_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
592 data.diff_regex_names.push_back(diff_regex_name);
593 if (!diff_regex_name.empty()) {
594 diff_regex_name.clear();
602 data.title_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
606 if (dry_run) { // lets us print initialization options
607 printf("simple_output = %i\n", output_type);
611 // create a new instance of the expat parser
612 XML_Parser parser = XML_ParserCreate("UTF-8");
614 // initialize the elements of the struct to default values
615 init_data(&data, output_type);
618 // makes the parser pass "data" as the first argument to every callback
619 XML_SetUserData(parser, &data);
620 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
621 void (*endFnPtr)(void*, const XML_Char*) = end;
622 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
624 // sets start and end to be the element start and end handlers
625 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
626 // sets charhndl to be the callback for character data
627 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
// header row for the tab-separated output
634 cout << "title" << "\t"
635 << "articleid" << "\t"
641 << "editor_id" << "\t"
643 << "text_size" << "\t"
644 << "text_md5" << "\t"
645 << "reversion" << "\t"
646 << "additions_size" << "\t"
// one header column per content regex: its name, or "regex<n>" when unnamed
650 if (!data.content_regexes.empty()) {
651 for (vector<pcrecpp::RE>::iterator r = data.content_regexes.begin();
652 r != data.content_regexes.end(); ++r, ++n) {
653 if (data.content_regex_names.at(n).empty()) {
654 cout << "\t" << "regex" << n;
656 cout << "\t" << data.content_regex_names.at(n);
// two header columns per diff regex ("_add" and "_del"); unnamed ones use "regex_<n>"
661 if (!data.diff_regexes.empty()) {
662 for (vector<pcrecpp::RE>::iterator r = data.diff_regexes.begin(); r != data.diff_regexes.end(); ++r, ++n) {
663 if (data.diff_regex_names.at(n).empty()) {
664 cout << "\t" << "regex_" << n << "_add"
665 << "\t" << "regex_" << n << "_del";
667 cout << "\t" << data.diff_regex_names.at(n) << "_add"
668 << "\t" << data.diff_regex_names.at(n) << "_del";
675 // shovel data into the parser
678 // read into buf a bufferfull of data from standard input
679 size_t len = fread(buf, 1, BUFSIZ, stdin);
// a short read is treated as the final chunk
680 done = len < BUFSIZ; // checks if we've got the last bufferfull
682 // passes the buffer of data to the parser and checks for error
683 // (this is where the callbacks are invoked)
684 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
685 cerr << "XML ERROR: " << XML_ErrorString(XML_GetErrorCode(parser)) << " at line "
686 << (int) XML_GetCurrentLineNumber(parser) << endl;
692 XML_ParserFree(parser);