projects.mako.cc - wikiq/blob - wikiq.cpp

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <iostream>
   9 #include <string.h>
  10 #include <ctype.h>
  11 #include <stdlib.h>
  12 #include "expat.h"
  13 #include <getopt.h>
  14 #include "disorder.h"
  15 #include "md5.h"
  16 #include "dtl/dtl.hpp"
  17 #include <vector>
  18 #include <map>
  19 #include <pcrecpp.h>
  20
  21
  22 using namespace std;
  23
  24 // timestamp of the form 2003-11-07T00:43:23Z
  25 #define DATE_LENGTH 10
  26 #define TIME_LENGTH 8
  27 #define TIMESTAMP_LENGTH 20
  28
  29 #define MEGABYTE 1048576
  30 #define FIELD_BUFFER_SIZE 1024
  31 // 2048 KB in bytes + 1
  32 //#define TEXT_BUFFER_SIZE 2097153
  33 //#define TEXT_BUFFER_SIZE 10485760
  34
  35 enum elements {
  36     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  37     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
  38 };
  39
  40 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  41
  42 enum outtype { FULL, SIMPLE };
  43
  44 typedef struct {
  45
  46     // pointers to once-allocated buffers
  47     char *title;
  48     char *articleid;
  49     char *revid;
  50     char *date;
  51     char *time;
  52     char *timestamp;
  53     char *anon;
  54     char *editor;
  55     char *editorid;
  56     char *comment;
  57     char *text;
  58     vector<string> last_text_tokens;
  59     vector<pcrecpp::RE> regexes;
  60     vector<string> regex_names;
  61     map<string, string> revision_md5; // used for detecting reversions
  62
  63     // track string size of the elements, to prevent O(N^2) processing in charhndl
  64     // when we have to take strlen for every character which we append to the buffer
  65     size_t title_size;
  66     size_t articleid_size;
  67     size_t revid_size;
  68     size_t date_size;
  69     size_t time_size;
  70     size_t timestamp_size;
  71     size_t anon_size;
  72     size_t editor_size;
  73     size_t editorid_size;
  74     size_t comment_size;
  75     size_t text_size;
  76
  77     bool minor;
  78
  79     enum elements element;
  80     enum block position;
  81     enum outtype output_type;
  82
  83 } revisionData;
  84
  85
  86 /* free_data and clean_data
  87  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  88  * title data needs to be cleared as well.
  89  * Also, frees memory dynamically allocated to store data.
  90  */
  91 static void
  92 clean_data(revisionData *data, int title)
  93 {
  94     // reset title (if we are switching articles)
  95     if (title) {
  96         data->title[0] = '\0';
  97         data->articleid[0] = '\0';
  98         data->title_size = 0;
  99         data->articleid_size = 0;
 100     }
 101
 102     // reset text fields
 103     data->revid[0] = '\0';
 104     data->date[0] = '\0';
 105     data->time[0] = '\0';
 106     data->timestamp[0] = '\0';
 107     data->anon[0] = '\0';
 108     data->editor[0] = '\0';
 109     data->editorid[0] = '\0';
 110     data->comment[0] = '\0';
 111     data->text[0] = '\0';
 112
 113     // reset length tracking
 114     data->revid_size = 0;
 115     data->date_size = 0;
 116     data->time_size = 0;
 117     data->timestamp_size = 0;
 118     data->anon_size = 0;
 119     data->editor_size = 0;
 120     data->editorid_size = 0;
 121     data->comment_size = 0;
 122     data->text_size = 0;
 123
 124     // reset flags and element type info
 125     data->minor = false;
 126     data->element = UNUSED;
 127
 128 }
 129
 130 // presently unused
 131 static void
 132 free_data(revisionData *data, int title)
 133 {
 134     if (title) {
 135         //printf("freeing article\n");
 136         free(data->title);
 137         free(data->articleid);
 138     }
 139     free(data->revid);
 140     free(data->date);
 141     free(data->time);
 142     free(data->timestamp);
 143     free(data->anon);
 144     free(data->editor);
 145     free(data->editorid);
 146     free(data->comment);
 147     free(data->text);
 148     data->last_text_tokens.clear();
 149 }
 150
 151 void cleanup_revision(revisionData *data) {
 152     clean_data(data, 0);
 153 }
 154
 155 void cleanup_article(revisionData *data) {
 156     clean_data(data, 1);
 157     data->last_text_tokens.clear();
 158     data->revision_md5.clear();
 159 }
 160
 161
 162 static void
 163 init_data(revisionData *data, outtype output_type)
 164 {
 165     data->text = (char*) malloc(4 * MEGABYTE);  // 2MB is the article length limit, 4MB is 'safe'?
 166     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 167     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 168     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 169     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 170     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 171     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 172     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 173     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 174     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 175     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 176     data->minor = false;
 177
 178     // resets the data fields, null terminates strings, sets lengths
 179     clean_data(data, 1);
 180
 181     data->output_type = output_type;
 182 }
 183
 184 /* for debugging only, prints out the state of the data struct
 185  */
 186 static void
 187 print_state(revisionData *data)
 188 {
 189     printf("element = %i\n", data->element);
 190     printf("output_type = %i\n", data->output_type);
 191     printf("title = %s\n", data->title);
 192     printf("articleid = %s\n", data->articleid);
 193     printf("revid = %s\n", data->revid);
 194     printf("date = %s\n", data->date);
 195     printf("time = %s\n", data->time);
 196     printf("anon = %s\n", data->anon);
 197     printf("editor = %s\n", data->editor);
 198     printf("editorid = %s\n", data->editorid);
 199     printf("minor = %s\n", (data->minor ? "1" : "0"));
 200     printf("comment = %s\n", data->comment);
 201     printf("text = %s\n", data->text);
 202     printf("\n");
 203
 204 }
 205
 206
 207 /*
 208  * write a line of comma-separated value formatted data to standard out
 209  * follows the form:
 210  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 211  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 212  *
 213  * it is called right before cleanup_revision() and cleanup_article()
 214  */
 215 static void
 216 write_row(revisionData *data)
 217 {
 218
 219     // get md5sum
 220     md5_state_t state;
 221     md5_byte_t digest[16];
 222     char md5_hex_output[2 * 16 + 1];
 223     md5_init(&state);
 224     md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
 225     md5_finish(&state, digest);
 226     int di;
 227     for (di = 0; di < 16; ++di) {
 228         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
 229     }
 230
 231     string reverted_to;
 232     map<string, string>::iterator prev_revision = data->revision_md5.find(md5_hex_output);
 233     if (prev_revision != data->revision_md5.end()) {
 234         reverted_to = prev_revision->second; // id of previous revision
 235     }
 236     data->revision_md5[md5_hex_output] = data->revid;
 237
 238     string text = string(data->text, data->text_size);
 239     vector<string> text_tokens;
 240     size_t pos = 0;
 241     size_t start = 0;
 242     while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) {
 243         //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl;
 244         text_tokens.push_back(text.substr(start, pos - start));
 245         start = pos;
 246         ++pos;
 247     }
 248
 249     //vector<string> additions;
 250     //vector<string> deletions;
 251     string additions;
 252     string deletions;
 253
 254     vector<bool> regex_matches_adds;
 255     vector<bool> regex_matches_dels;
 256
 257     if (!data->last_text_tokens.empty()) {
 258         // do the diff
 259
 260         dtl::Diff< string, vector<string> > d(data->last_text_tokens, text_tokens);
 261         //d.onOnlyEditDistance();
 262         d.compose();
 263
 264         vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
 265         for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
 266             switch (sit->second.type) {
 267             case dtl::SES_ADD:
 268                 //cout << "ADD: \"" << sit->first << "\"" << endl;
 269                 additions += sit->first;
 270                 break;
 271             case dtl::SES_DELETE:
 272                 //cout << "DEL: \"" << sit->first << "\"" << endl;
 273                 deletions += sit->first;
 274                 break;
 275             }
 276         }
 277
 278         if (!additions.empty()) {
 279             //cout << "ADD: " << additions << endl;
 280             for (vector<pcrecpp::RE>::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) {
 281                 pcrecpp::RE& regex = *r;
 282                 regex_matches_adds.push_back(regex.PartialMatch(additions));
 283             }
 284         }
 285
 286         if (!deletions.empty()) {
 287             //cout << "DEL: " << deletions << endl;
 288             for (vector<pcrecpp::RE>::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) {
 289                 pcrecpp::RE& regex = *r;
 290                 regex_matches_dels.push_back(regex.PartialMatch(deletions));
 291             }
 292         }
 293
 294         // apply regex to the diff
 295
 296     }
 297
 298     data->last_text_tokens = text_tokens;
 299
 300
 301     // print line of tsv output
 302     cout
 303         << data->title << "\t"
 304         << data->articleid << "\t"
 305         << data->revid << "\t"
 306         << data->date << " "
 307         << data->time << "\t"
 308         << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t"
 309         << data->editor << "\t"
 310         << data->editorid << "\t"
 311         << ((data->minor) ? "TRUE" : "FALSE") << "\t"
 312         << (unsigned int) data->text_size << "\t"
 313         << shannon_H(data->text, data->text_size) << "\t"
 314         << md5_hex_output << "\t"
 315         << reverted_to << "\t"
 316         << (int) additions.size() << "\t"
 317         << (int) deletions.size();
 318
 319     for (int n = 0; n < data->regex_names.size(); ++n) {
 320         cout << "\t" << ((!regex_matches_adds.empty() && regex_matches_adds.at(n)) ? "TRUE" : "FALSE")
 321              << "\t" << ((!regex_matches_dels.empty() && regex_matches_dels.at(n)) ? "TRUE" : "FALSE");
 322     }
 323     cout << endl;
 324
 325     //
 326     if (data->output_type == FULL) {
 327         cout << "comment:" << data->comment << endl
 328              << "text:" << endl << data->text << endl;
 329     }
 330
 331 }
 332
 333 void
 334 split_timestamp(revisionData *data)
 335 {
 336     char *t = data->timestamp;
 337     strncpy(data->date, data->timestamp, DATE_LENGTH);
 338     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 339     strncpy(data->time, timeinstamp, TIME_LENGTH);
 340 }
 341
 342 // like strncat but with previously known length
 343 char*
 344 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 345 {
 346    //size_t dest_len = strlen(dest);
 347    size_t i;
 348
 349    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 350        dest[dest_len + i] = src[i];
 351    dest[dest_len + i] = '\0';
 352
 353    return dest;
 354 }
 355
 356 static void
 357 charhndl(void* vdata, const XML_Char* s, int len)
 358 {
 359     revisionData* data = (revisionData*) vdata;
 360     if (data->element != UNUSED && data->position != SKIP) {
 361         //char t[len];
 362         //strncpy(t,s,len);
 363         //t[len] = '\0'; // makes t a well-formed string
 364         switch (data->element) {
 365             case TEXT:
 366                    // printf("buffer length = %i, text: %s\n", len, t);
 367                     strlcatn(data->text, s, data->text_size, len);
 368                     data->text_size += len;
 369                     break;
 370             case COMMENT:
 371                     strlcatn(data->comment, s, data->comment_size, len);
 372                     data->comment_size += len;
 373                     break;
 374             case TITLE:
 375                     strlcatn(data->title, s, data->title_size, len);
 376                     data->title_size += len;
 377                     break;
 378             case ARTICLEID:
 379                    // printf("articleid = %s\n", t);
 380                     strlcatn(data->articleid, s, data->articleid_size, len);
 381                     data->articleid_size += len;
 382                     break;
 383             case REVID:
 384                    // printf("revid = %s\n", t);
 385                     strlcatn(data->revid, s, data->revid_size, len);
 386                     data->revid_size += len;
 387                     break;
 388             case TIMESTAMP:
 389                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 390                     data->timestamp_size += len;
 391                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 392                         split_timestamp(data);
 393                     break;
 394             case EDITOR:
 395                     strlcatn(data->editor, s, data->editor_size, len);
 396                     data->editor_size += len;
 397                     break;
 398             case EDITORID:
 399                     //printf("editorid = %s\n", t);
 400                     strlcatn(data->editorid, s, data->editorid_size, len);
 401                     data->editorid_size += len;
 402                     break;
 403             /* the following are implied or skipped:
 404             case MINOR:
 405                     printf("found minor element\n");  doesn't work
 406                     break;                   minor tag is just a tag
 407             case UNUSED:
 408             */
 409             default: break;
 410         }
 411     }
 412 }
 413
 414 static void
 415 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 416 {
 417     revisionData* data = (revisionData*) vdata;
 418
 419     if (strcmp(name,"title") == 0) {
 420         cleanup_article(data); // cleans up data from last article
 421         data->element = TITLE;
 422         data->position = TITLE_BLOCK;
 423     } else if (data->position != SKIP) {
 424         if (strcmp(name,"revision") == 0) {
 425             data->element = REVISION;
 426             data->position = REVISION_BLOCK;
 427         } else if (strcmp(name, "contributor") == 0) {
 428             data->element = CONTRIBUTOR;
 429             data->position = CONTRIBUTOR_BLOCK;
 430         } else if (strcmp(name,"id") == 0)
 431             switch (data->position) {
 432                 case TITLE_BLOCK:
 433                     data->element = ARTICLEID;
 434                     break;
 435                 case REVISION_BLOCK:
 436                     data->element = REVID;
 437                     break;
 438                 case CONTRIBUTOR_BLOCK:
 439                     data->element = EDITORID;
 440                     break;
 441             }
 442
 443         // minor tag has no character data, so we parse here
 444         else if (strcmp(name,"minor") == 0) {
 445             data->element = MINOR;
 446             data->minor = true;
 447         }
 448         else if (strcmp(name,"timestamp") == 0)
 449             data->element = TIMESTAMP;
 450
 451         else if (strcmp(name, "username") == 0)
 452             data->element = EDITOR;
 453
 454         else if (strcmp(name,"ip") == 0)
 455             data->element = EDITORID;
 456
 457         else if (strcmp(name,"comment") == 0)
 458             data->element = COMMENT;
 459
 460         else if (strcmp(name,"text") == 0)
 461             data->element = TEXT;
 462
 463         else if (strcmp(name,"page") == 0
 464                 || strcmp(name,"mediawiki") == 0
 465                 || strcmp(name,"restrictions") == 0
 466                 || strcmp(name,"siteinfo") == 0)
 467             data->element = UNUSED;
 468     }
 469
 470 }
 471
 472
 473 static void
 474 end(void* vdata, const XML_Char* name)
 475 {
 476     revisionData* data = (revisionData*) vdata;
 477     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 478         write_row(data); // crucial... :)
 479         cleanup_revision(data);  // also crucial
 480     } else {
 481         data->element = UNUSED; // sets our state to "not-in-useful"
 482     }                           // thus avoiding unpleasant character data
 483                                 // b/w tags (newlines etc.)
 484 }
 485
 486 void print_usage(char* argv[]) {
 487     cerr << "usage: <wikimedia dump xml> | " << argv[0] << "[options]" << endl
 488          << endl
 489          << "options:" << endl
 490          << "  -t   print text and comments after each line of tab separated data" << endl
 491          << "  -n   name of the following regex (e.g. -N name -r \"...\")" << endl
 492          << "  -r   regex to check against additions and deletions" << endl
 493          << endl
 494          << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl
 495          << "a tab-separated stream of revisions on standard out:" << endl
 496          << endl
 497          << "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5" << endl
 498          << ".... and additional fields for each regex executed against add/delete diffs" << endl
 499          << endl
 500          << "author: Erik Garrison <erik@hypervolu.me>" << endl;
 501 }
 502
 503
 504 int
 505 main(int argc, char *argv[])
 506 {
 507
 508     enum outtype output_type;
 509     int dry_run = 0;
 510     // in "simple" output, we don't print text and comments
 511     output_type = SIMPLE;
 512     char c;
 513     string regex_name;
 514
 515     // the user data struct which is passed to callback functions
 516     revisionData data;
 517
 518     while ((c = getopt(argc, argv, "htn:r:")) != -1)
 519         switch (c)
 520         {
 521             case 'd':
 522                 dry_run = 1;
 523                 break;
 524             case 't':
 525                 output_type = FULL;
 526                 break;
 527             case 'n':
 528                 regex_name = optarg;
 529                 break;
 530             case 'r':
 531                 data.regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 532                 data.regex_names.push_back(regex_name);
 533                 if (!regex_name.empty()) {
 534                     regex_name.clear();
 535                 }
 536                 break;
 537             case 'h':
 538                 print_usage(argv);
 539                 exit(0);
 540                 break;
 541         }
 542
 543     if (dry_run) { // lets us print initialization options
 544         printf("simple_output = %i\n", output_type);
 545         exit(1);
 546     }
 547
 548     // create a new instance of the expat parser
 549     XML_Parser parser = XML_ParserCreate("UTF-8");
 550
 551     // initialize the elements of the struct to default values
 552     init_data(&data, output_type);
 553
 554
 555     // makes the parser pass "data" as the first argument to every callback
 556     XML_SetUserData(parser, &data);
 557     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 558     void (*endFnPtr)(void*, const XML_Char*) = end;
 559     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 560
 561     // sets start and end to be the element start and end handlers
 562     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 563     // sets charhndl to be the callback for character data
 564     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 565
 566     bool done;
 567     char buf[BUFSIZ];
 568
 569     // write header
 570
 571     cout << "title" << "\t"
 572         << "articleid" << "\t"
 573         << "revid" << "\t"
 574         << "date" << " "
 575         << "time" << "\t"
 576         << "anon" << "\t"
 577         << "editor" << "\t"
 578         << "editor_id" << "\t"
 579         << "minor" << "\t"
 580         << "text_size" << "\t"
 581         << "text_entropy" << "\t"
 582         << "text_md5" << "\t"
 583         << "reversion" << "\t"
 584         << "additions_size" << "\t"
 585         << "deletions_size";
 586
 587     int n = 0;
 588     if (!data.regexes.empty()) {
 589         for (vector<pcrecpp::RE>::iterator r = data.regexes.begin(); r != data.regexes.end(); ++r, ++n) {
 590             if (data.regex_names.at(n).empty()) {
 591                 cout << "\t" << "regex_" << n << "_add"
 592                      << "\t" << "regex_" << n << "_del";
 593             } else {
 594                 cout << "\t" << data.regex_names.at(n) << "_add"
 595                      << "\t" << data.regex_names.at(n) << "_del";
 596             }
 597         }
 598     }
 599     cout << endl;
 600
 601     // shovel data into the parser
 602     do {
 603
 604         // read into buf a bufferfull of data from standard input
 605         size_t len = fread(buf, 1, BUFSIZ, stdin);
 606         done = len < BUFSIZ; // checks if we've got the last bufferfull
 607
 608         // passes the buffer of data to the parser and checks for error
 609         //   (this is where the callbacks are invoked)
 610         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 611             cerr << XML_ErrorString(XML_GetErrorCode(parser)) << " at line "
 612                  << (int) XML_GetCurrentLineNumber(parser) << endl;
 613             return 1;
 614         }
 615     } while (!done);
 616
 617
 618     XML_ParserFree(parser);
 619
 620     return 0;
 621 }