2 * An XML parser for Wikipedia Data dumps.
3 * Converts XML files to tab-separated values files readable by spreadsheets
4 * and statistical packages.
16 #include "dtl/dtl.hpp"
// Lengths of the fixed-format timestamp pieces. The sample below is 20
// characters long, and its leading date part ("2003-11-07") is 10.
24 // timestamp of the form 2003-11-07T00:43:23Z
25 #define DATE_LENGTH 10
27 #define TIMESTAMP_LENGTH 20
// Bytes in one mebibyte (2^20); init_data() sizes the text buffer as a
// multiple of this.
29 #define MEGABYTE 1048576
// Allocation size in bytes of each non-text field buffer made by init_data().
30 #define FIELD_BUFFER_SIZE 1024
// Earlier fixed text-buffer sizes, left disabled.
31 // 2048 KB in bytes + 1
32 //#define TEXT_BUFFER_SIZE 2097153
33 //#define TEXT_BUFFER_SIZE 10485760
// Kind of XML element currently open. start() sets it from the tag name and
// charhndl() switches on it to choose the buffer that receives character
// data; end() resets it to UNUSED, which makes charhndl() ignore the data.
36 TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
37 EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
// Section of the page being parsed. start() switches on it when it meets an
// <id> tag, because the same tag name carries different ids in different
// sections. While the position is SKIP, charhndl() captures nothing and
// end() writes no row.
40 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
// Output mode: with FULL, write_row() additionally prints the comment and
// text after the row; main() starts out with SIMPLE.
42 enum outtype { FULL, SIMPLE };
46 // pointers to once-allocated buffers
// Whitespace-delimited tokens of the previous revision's text; write_row()
// diffs the current tokens against these and then stores the current ones.
58 vector<string> last_text_tokens;
// Regexes given on the command line and their names; main() pushes to both
// vectors together, so entry i of one corresponds to entry i of the other.
59 vector<pcrecpp::RE> regexes;
60 vector<string> regex_names;
// Keyed by the hex MD5 of a revision's text; the value is that revision's id
// (see write_row()).
61 map<string, string> revision_md5; // used for detecting reversions
63 // track string size of the elements, to prevent O(N^2) processing in charhndl
64 // when we have to take strlen for every character which we append to the buffer
66 size_t articleid_size;
70 size_t timestamp_size;
// Element currently being read; written by start() and end(), read by charhndl().
79 enum elements element;
// Output mode, copied from init_data()'s output_type argument.
81 enum outtype output_type;
86 /* free_data and clean_data
87 * Takes a pointer to the data struct and an integer {0,1} indicating if the
88 * title data needs to be cleared as well.
89 * Also, frees memory dynamically allocated to store data.
// Empties the string buffers in *data in place (a NUL is written at index 0)
// and zeroes the tracked lengths, so the next revision starts from a clean
// state. No memory is released here.
//   data:  the shared parser state.
//   title: per the comment above this function, a {0,1} flag saying whether
//          the article-level title data is to be cleared as well.
92 clean_data(revisionData *data, int title)
94 // reset title (if we are switching articles)
96 data->title[0] = '\0';
97 data->articleid[0] = '\0';
99 data->articleid_size = 0;
// per-revision string fields
103 data->revid[0] = '\0';
104 data->date[0] = '\0';
105 data->time[0] = '\0';
106 data->timestamp[0] = '\0';
107 data->anon[0] = '\0';
108 data->editor[0] = '\0';
109 data->editorid[0] = '\0';
110 data->comment[0] = '\0';
111 data->text[0] = '\0';
113 // reset length tracking
// charhndl() appends at these offsets, so they must go back to 0 together
// with the buffers above.
114 data->revid_size = 0;
117 data->timestamp_size = 0;
119 data->editor_size = 0;
120 data->editorid_size = 0;
121 data->comment_size = 0;
124 // reset flags and element type info
// UNUSED makes charhndl() ignore character data until start() sees a known tag.
126 data->element = UNUSED;
// Releases field buffers of *data with free() and empties the stored
// previous-revision tokens.
//   data:  the shared parser state whose buffers are released.
//   title: {0,1} flag described in the comment above clean_data/free_data.
// NOTE(review): the freed pointers are not reset in the lines shown, so
// *data presumably must not be used again without init_data() - confirm.
132 free_data(revisionData *data, int title)
135 //printf("freeing article\n");
137 free(data->articleid);
142 free(data->timestamp);
145 free(data->editorid);
148 data->last_text_tokens.clear();
// Per-revision cleanup hook: end() calls this right after write_row() each
// time a <revision> element closes.
151 void cleanup_revision(revisionData *data) {
// Per-article cleanup hook: start() calls this when a <title> element opens.
// It drops the previous revision's tokens and the MD5 -> revision-id map.
// Both are read by write_row() for diffing and revert detection.
155 void cleanup_article(revisionData *data) {
157 data->last_text_tokens.clear();
158 data->revision_md5.clear();
// Allocates the string buffers of *data once and records the output mode.
//   data:        parser state to initialise.
//   output_type: stored in data->output_type and later read by write_row().
// The text buffer is 4 * MEGABYTE bytes; every other field buffer is
// FIELD_BUFFER_SIZE bytes.
// NOTE(review): none of the malloc() results is checked in the lines shown -
// confirm whether a NULL check exists elsewhere.
163 init_data(revisionData *data, outtype output_type)
165 data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'?
166 data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
167 data->title = (char*) malloc(FIELD_BUFFER_SIZE);
168 data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
169 data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
170 data->date = (char*) malloc(FIELD_BUFFER_SIZE);
171 data->time = (char*) malloc(FIELD_BUFFER_SIZE);
172 data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
173 data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
174 data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
175 data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
178 // resets the data fields, null terminates strings, sets lengths
181 data->output_type = output_type;
184 /* for debugging only, prints out the state of the data struct
// Debug helper: prints the fields of *data to standard output with printf(),
// one "name = value" line per field. The two enum fields are printed as
// integers and the minor flag as "1" or "0".
187 print_state(revisionData *data)
189 printf("element = %i\n", data->element);
190 printf("output_type = %i\n", data->output_type);
191 printf("title = %s\n", data->title);
192 printf("articleid = %s\n", data->articleid);
193 printf("revid = %s\n", data->revid);
194 printf("date = %s\n", data->date);
195 printf("time = %s\n", data->time);
196 printf("anon = %s\n", data->anon);
197 printf("editor = %s\n", data->editor);
198 printf("editorid = %s\n", data->editorid);
199 printf("minor = %s\n", (data->minor ? "1" : "0"));
200 printf("comment = %s\n", data->comment);
201 printf("text = %s\n", data->text);
208 * write a line of tab-separated value formatted data to standard out
210 * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
211 * (str) (int) (int) (str)(str)(bin)(str) (int) (bin) (str)
213 * it is called right before cleanup_revision() and cleanup_article()
// Writes the output row for the revision currently held in *data to standard
// output (cout). Besides the parsed fields, it derives the text's MD5, the id
// of an earlier revision with identical text, the sizes of the added and
// deleted text relative to the previous revision, and one pair of
// TRUE/FALSE regex columns per configured regex.
//   data: parser state for the revision that just ended; its
//         last_text_tokens and revision_md5 members are updated here.
216 write_row(revisionData *data)
// 16-byte MD5 digest of the text and its rendering as 32 hex digits + NUL.
221 md5_byte_t digest[16];
222 char md5_hex_output[2 * 16 + 1];
224 md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
225 md5_finish(&state, digest);
// each pass writes two hex digits (sprintf also writes a NUL after them)
227 for (di = 0; di < 16; ++di) {
228 sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
// Revert detection: if this digest was already recorded for the article,
// report the revision id stored for it. Then record this revision's id under
// the digest, replacing any earlier entry.
232 map<string, string>::iterator prev_revision = data->revision_md5.find(md5_hex_output);
233 if (prev_revision != data->revision_md5.end()) {
234 reverted_to = prev_revision->second; // id of previous revision
236 data->revision_md5[md5_hex_output] = data->revid;
// Split the text into tokens at spaces, newlines, tabs and carriage returns.
238 string text = string(data->text, data->text_size);
239 vector<string> text_tokens;
242 while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) {
243 //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl;
244 text_tokens.push_back(text.substr(start, pos - start));
249 //vector<string> additions;
250 //vector<string> deletions;
// One entry per regex, filled only when there is added (resp. deleted) text;
// the output loop below therefore checks empty() before indexing.
254 vector<bool> regex_matches_adds;
255 vector<bool> regex_matches_dels;
// Diff against the previous revision's tokens, if any were stored.
257 if (!data->last_text_tokens.empty()) {
260 dtl::Diff< string, vector<string> > d(data->last_text_tokens, text_tokens);
261 //d.onOnlyEditDistance();
// Walk the edit script, concatenating tokens into `additions` and `deletions`.
264 vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
265 for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
266 switch (sit->second.type) {
268 //cout << "ADD: \"" << sit->first << "\"" << endl;
269 additions += sit->first;
271 case dtl::SES_DELETE:
272 //cout << "DEL: \"" << sit->first << "\"" << endl;
273 deletions += sit->first;
// Run every regex over the concatenated added text...
278 if (!additions.empty()) {
279 //cout << "ADD: " << additions << endl;
280 for (vector<pcrecpp::RE>::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) {
281 pcrecpp::RE& regex = *r;
282 regex_matches_adds.push_back(regex.PartialMatch(additions));
// ...and over the concatenated deleted text.
286 if (!deletions.empty()) {
287 //cout << "DEL: " << deletions << endl;
288 for (vector<pcrecpp::RE>::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) {
289 pcrecpp::RE& regex = *r;
290 regex_matches_dels.push_back(regex.PartialMatch(deletions));
294 // apply regex to the diff
// remember this revision's tokens for the next call
298 data->last_text_tokens = text_tokens;
301 // print line of tsv output
303 << data->title << "\t"
304 << data->articleid << "\t"
305 << data->revid << "\t"
307 << data->time << "\t"
// anon column: TRUE exactly when no editor name was captured
308 << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t"
309 << data->editor << "\t"
310 << data->editorid << "\t"
311 << ((data->minor) ? "TRUE" : "FALSE") << "\t"
312 << (unsigned int) data->text_size << "\t"
313 << shannon_H(data->text, data->text_size) << "\t"
314 << md5_hex_output << "\t"
315 << reverted_to << "\t"
316 << (int) additions.size() << "\t"
317 << (int) deletions.size();
// Two columns (add, del) per regex; FALSE when there was nothing to match.
// NOTE(review): `n` is int but size() is unsigned - signed/unsigned comparison.
319 for (int n = 0; n < data->regex_names.size(); ++n) {
320 cout << "\t" << ((!regex_matches_adds.empty() && regex_matches_adds.at(n)) ? "TRUE" : "FALSE")
321 << "\t" << ((!regex_matches_dels.empty() && regex_matches_dels.at(n)) ? "TRUE" : "FALSE");
// FULL output mode: append the comment and the complete text after the row.
326 if (data->output_type == FULL) {
327 cout << "comment:" << data->comment << endl
328 << "text:" << endl << data->text << endl;
334 split_timestamp(revisionData *data)
336 char *t = data->timestamp;
337 strncpy(data->date, data->timestamp, DATE_LENGTH);
338 char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
339 strncpy(data->time, timeinstamp, TIME_LENGTH);
// strncat() variant for callers that already know the length of dest:
// copies characters from src to dest starting at offset dest_len, stopping
// after n characters or at the first NUL in src, then NUL-terminates dest.
// No strlen(dest) is taken, and no bounds check is made: dest must have room
// for dest_len + n + 1 bytes. Returns dest.
char *
strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
{
    char *out = dest + dest_len;
    const char *in = src;
    size_t remaining = n;

    while (remaining != 0 && *in != '\0') {
        *out++ = *in++;
        --remaining;
    }
    *out = '\0';

    return dest;
}
357 charhndl(void* vdata, const XML_Char* s, int len)
359 revisionData* data = (revisionData*) vdata;
360 if (data->element != UNUSED && data->position != SKIP) {
363 //t[len] = '\0'; // makes t a well-formed string
364 switch (data->element) {
366 // printf("buffer length = %i, text: %s\n", len, t);
367 strlcatn(data->text, s, data->text_size, len);
368 data->text_size += len;
371 strlcatn(data->comment, s, data->comment_size, len);
372 data->comment_size += len;
375 strlcatn(data->title, s, data->title_size, len);
376 data->title_size += len;
379 // printf("articleid = %s\n", t);
380 strlcatn(data->articleid, s, data->articleid_size, len);
381 data->articleid_size += len;
384 // printf("revid = %s\n", t);
385 strlcatn(data->revid, s, data->revid_size, len);
386 data->revid_size += len;
389 strlcatn(data->timestamp, s, data->timestamp_size, len);
390 data->timestamp_size += len;
391 if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
392 split_timestamp(data);
395 strlcatn(data->editor, s, data->editor_size, len);
396 data->editor_size += len;
399 //printf("editorid = %s\n", t);
400 strlcatn(data->editorid, s, data->editorid_size, len);
401 data->editorid_size += len;
403 /* the following are implied or skipped:
405 printf("found minor element\n"); doesn't work
406 break; minor tag is just a tag
// expat start-element callback (registered in main()). Translates the tag
// name into data->element, which charhndl() then uses to route character
// data, and keeps data->position in step with the section being parsed.
//   vdata: the revisionData passed to XML_SetUserData().
//   name:  tag name of the element that just opened.
//   attr:  attribute list supplied by expat.
415 start(void* vdata, const XML_Char* name, const XML_Char** attr)
417 revisionData* data = (revisionData*) vdata;
// <title> opens a new article: drop the previous article's diff/revert state.
// This branch runs even when the position is SKIP.
419 if (strcmp(name,"title") == 0) {
420 cleanup_article(data); // cleans up data from last article
421 data->element = TITLE;
422 data->position = TITLE_BLOCK;
423 } else if (data->position != SKIP) {
424 if (strcmp(name,"revision") == 0) {
425 data->element = REVISION;
426 data->position = REVISION_BLOCK;
427 } else if (strcmp(name, "contributor") == 0) {
428 data->element = CONTRIBUTOR;
429 data->position = CONTRIBUTOR_BLOCK;
// <id> is ambiguous by name alone; the current position decides which id
// field the following character data belongs to.
430 } else if (strcmp(name,"id") == 0)
431 switch (data->position) {
433 data->element = ARTICLEID;
436 data->element = REVID;
438 case CONTRIBUTOR_BLOCK:
439 data->element = EDITORID;
443 // minor tag has no character data, so we parse here
444 else if (strcmp(name,"minor") == 0) {
445 data->element = MINOR;
448 else if (strcmp(name,"timestamp") == 0)
449 data->element = TIMESTAMP;
451 else if (strcmp(name, "username") == 0)
452 data->element = EDITOR;
// An <ip> value is captured into the editorid buffer, the same buffer the
// contributor's <id> uses; the editor name stays empty, which write_row()
// reports as anon = TRUE.
454 else if (strcmp(name,"ip") == 0)
455 data->element = EDITORID;
457 else if (strcmp(name,"comment") == 0)
458 data->element = COMMENT;
460 else if (strcmp(name,"text") == 0)
461 data->element = TEXT;
// Structural tags whose character data is not captured.
463 else if (strcmp(name,"page") == 0
464 || strcmp(name,"mediawiki") == 0
465 || strcmp(name,"restrictions") == 0
466 || strcmp(name,"siteinfo") == 0)
467 data->element = UNUSED;
474 end(void* vdata, const XML_Char* name)
476 revisionData* data = (revisionData*) vdata;
477 if (strcmp(name, "revision") == 0 && data->position != SKIP) {
478 write_row(data); // crucial... :)
479 cleanup_revision(data); // also crucial
481 data->element = UNUSED; // sets our state to "not-in-useful"
482 } // thus avoiding unpleasant character data
483 // b/w tags (newlines etc.)
// Prints the command-line help text to standard error.
//   argv: the program's argument vector; only argv[0] (the program name) is
//         read.
//
// Fixes relative to the previous text:
//   - a space now separates the program name from "[options]";
//   - the example uses "-n", the option letter main() passes to getopt();
//   - the column list names the columns as they are written (separate date
//     and time columns, "editor_id", "text_size").
void print_usage(char* argv[]) {
    std::cerr
        << "usage: <wikimedia dump xml> | " << argv[0] << " [options]" << "\n"
        << "\n"
        << "options:" << "\n"
        << " -t print text and comments after each line of tab separated data" << "\n"
        << " -n name of the following regex (e.g. -n name -r \"...\")" << "\n"
        << " -r regex to check against additions and deletions" << "\n"
        << "\n"
        << "Takes a wikimedia data dump XML stream on standard in, and produces" << "\n"
        << "a tab-separated stream of revisions on standard out:" << "\n"
        << "\n"
        << "title, articleid, revid, date, time, anon, editor, editor_id, minor," << "\n"
        << "text_size, text_entropy, text_md5, reversion, additions_size, deletions_size" << "\n"
        << ".... and additional fields for each regex executed against add/delete diffs" << "\n"
        << "\n"
        << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << "\n"
        << "unless the article is a revert to a previous revision, in which case, it" << "\n"
        << "contains the revision ID of the revision which was reverted to." << "\n"
        << "\n"
        << "author: Erik Garrison <erik@hypervolu.me>" << "\n";
}
// Program entry point: parses the command-line options, sets up an expat
// parser whose callbacks are start(), end() and charhndl(), prints the
// header row, then feeds standard input to the parser one buffer at a time.
// Rows are produced by the callbacks during XML_Parse().
511 main(int argc, char *argv[])
514 enum outtype output_type;
516 // in "simple" output, we don't print text and comments
517 output_type = SIMPLE;
521 // the user data struct which is passed to callback functions
// Accepted options: -h, -t, and -n / -r, which each take an argument.
524 while ((c = getopt(argc, argv, "htn:r:")) != -1)
// A regex is compiled in UTF-8 mode and stored together with the name that
// is current at that moment, keeping the two vectors index-aligned.
537 data.regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
538 data.regex_names.push_back(regex_name);
539 if (!regex_name.empty()) {
549 if (dry_run) { // lets us print initialization options
550 printf("simple_output = %i\n", output_type);
554 // create a new instance of the expat parser
// NOTE(review): no NULL check on the returned parser is visible here - confirm.
555 XML_Parser parser = XML_ParserCreate("UTF-8");
557 // initialize the elements of the struct to default values
558 init_data(&data, output_type);
561 // makes the parser pass "data" as the first argument to every callback
562 XML_SetUserData(parser, &data);
563 void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
564 void (*endFnPtr)(void*, const XML_Char*) = end;
565 void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
567 // sets start and end to be the element start and end handlers
568 XML_SetElementHandler(parser, startFnPtr, endFnPtr);
569 // sets charhndl to be the callback for character data
570 XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
// Header row: column names in the order write_row() emits the values.
577 cout << "title" << "\t"
578 << "articleid" << "\t"
584 << "editor_id" << "\t"
586 << "text_size" << "\t"
587 << "text_entropy" << "\t"
588 << "text_md5" << "\t"
589 << "reversion" << "\t"
590 << "additions_size" << "\t"
// Two header columns per regex: <name>_add and <name>_del, falling back to
// regex_<index>_add / regex_<index>_del when the regex was given no name.
594 if (!data.regexes.empty()) {
595 for (vector<pcrecpp::RE>::iterator r = data.regexes.begin(); r != data.regexes.end(); ++r, ++n) {
596 if (data.regex_names.at(n).empty()) {
597 cout << "\t" << "regex_" << n << "_add"
598 << "\t" << "regex_" << n << "_del";
600 cout << "\t" << data.regex_names.at(n) << "_add"
601 << "\t" << data.regex_names.at(n) << "_del";
607 // shovel data into the parser
610 // read into buf a bufferfull of data from standard input
611 size_t len = fread(buf, 1, BUFSIZ, stdin);
// A short read is taken to mean end of input.
// NOTE(review): fread() also returns a short count on a read error, which is
// not distinguished here.
612 done = len < BUFSIZ; // checks if we've got the last bufferfull
614 // passes the buffer of data to the parser and checks for error
615 // (this is where the callbacks are invoked)
616 if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
617 cerr << XML_ErrorString(XML_GetErrorCode(parser)) << " at line "
618 << (int) XML_GetCurrentLineNumber(parser) << endl;
624 XML_ParserFree(parser);