projects.mako.cc - wikiq/blob - wikiq.cpp

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <iostream>
   9 #include <string.h>
  10 #include <ctype.h>
  11 #include <stdlib.h>
  12 #include "expat.h"
  13 #include <getopt.h>
  14 #include "disorder.h"
  15 #include "md5.h"
  16 #include "dtl/dtl.hpp"
  17 #include <vector>
  18 #include <map>
  19 #include <pcrecpp.h>
  20
  21
  22 using namespace std;
  23
  24 // timestamp of the form 2003-11-07T00:43:23Z
  25 #define DATE_LENGTH 10
  26 #define TIME_LENGTH 8
  27 #define TIMESTAMP_LENGTH 20
  28
  29 #define MEGABYTE 1048576
  30 #define FIELD_BUFFER_SIZE 1024
  31
  32 // this can be changed at runtime if we encounter an article larger than 10mb
  33 size_t text_buffer_size = 10 * MEGABYTE;
  34
  35 enum elements {
  36     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  37     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
  38 };
  39
  40 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  41
  42 enum outtype { FULL, SIMPLE };
  43
  44 typedef struct {
  45
  46     // pointers to once-allocated buffers
  47     char *title;
  48     char *articleid;
  49     char *revid;
  50     char *date;
  51     char *time;
  52     char *timestamp;
  53     char *anon;
  54     char *editor;
  55     char *editorid;
  56     char *comment;
  57     char *text;
  58     vector<string> last_text_tokens;
  59
  60     // title regexes
  61     vector<pcrecpp::RE> title_regexes;
  62
  63     // regexes for looking within diffs
  64     vector<string> diff_regex_names;
  65     vector<pcrecpp::RE> diff_regexes;
  66
  67     map<string, string> revision_md5; // used for detecting reversions
  68
  69     // track string size of the elements, to prevent O(N^2) processing in charhndl
  70     // when we have to take strlen for every character which we append to the buffer
  71     size_t title_size;
  72     size_t articleid_size;
  73     size_t revid_size;
  74     size_t date_size;
  75     size_t time_size;
  76     size_t timestamp_size;
  77     size_t anon_size;
  78     size_t editor_size;
  79     size_t editorid_size;
  80     size_t comment_size;
  81     size_t text_size;
  82
  83     bool minor;
  84
  85     enum elements element;
  86     enum block position;
  87     enum outtype output_type;
  88
  89 } revisionData;
  90
  91
  92 /* free_data and clean_data
  93  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  94  * title data needs to be cleared as well.
  95  * Also, frees memory dynamically allocated to store data.
  96  */
  97 static void
  98 clean_data(revisionData *data, int title)
  99 {
 100     // reset title (if we are switching articles)
 101     if (title) {
 102         data->title[0] = '\0';
 103         data->articleid[0] = '\0';
 104         data->title_size = 0;
 105         data->articleid_size = 0;
 106     }
 107
 108     // reset text fields
 109     data->revid[0] = '\0';
 110     data->date[0] = '\0';
 111     data->time[0] = '\0';
 112     data->timestamp[0] = '\0';
 113     data->anon[0] = '\0';
 114     data->editor[0] = '\0';
 115     data->editorid[0] = '\0';
 116     data->comment[0] = '\0';
 117     data->text[0] = '\0';
 118
 119     // reset length tracking
 120     data->revid_size = 0;
 121     data->date_size = 0;
 122     data->time_size = 0;
 123     data->timestamp_size = 0;
 124     data->anon_size = 0;
 125     data->editor_size = 0;
 126     data->editorid_size = 0;
 127     data->comment_size = 0;
 128     data->text_size = 0;
 129
 130     // reset flags and element type info
 131     data->minor = false;
 132     data->element = UNUSED;
 133
 134 }
 135
 136 // presently unused
 137 static void
 138 free_data(revisionData *data, int title)
 139 {
 140     if (title) {
 141         //printf("freeing article\n");
 142         free(data->title);
 143         free(data->articleid);
 144     }
 145     free(data->revid);
 146     free(data->date);
 147     free(data->time);
 148     free(data->timestamp);
 149     free(data->anon);
 150     free(data->editor);
 151     free(data->editorid);
 152     free(data->comment);
 153     free(data->text);
 154     data->last_text_tokens.clear();
 155 }
 156
 157 void cleanup_revision(revisionData *data) {
 158     clean_data(data, 0);
 159 }
 160
 161 void cleanup_article(revisionData *data) {
 162     clean_data(data, 1);
 163     data->last_text_tokens.clear();
 164     data->revision_md5.clear();
 165 }
 166
 167
 168 static void
 169 init_data(revisionData *data, outtype output_type)
 170 {
 171     data->text = (char*) malloc(text_buffer_size);
 172     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 173     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 174     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 175     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 176     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 177     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 178     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 179     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 180     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 181     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 182     data->minor = false;
 183
 184     // resets the data fields, null terminates strings, sets lengths
 185     clean_data(data, 1);
 186
 187     data->output_type = output_type;
 188 }
 189
 190 /* for debugging only, prints out the state of the data struct
 191  */
 192 static void
 193 print_state(revisionData *data)
 194 {
 195     printf("element = %i\n", data->element);
 196     printf("output_type = %i\n", data->output_type);
 197     printf("title = %s\n", data->title);
 198     printf("articleid = %s\n", data->articleid);
 199     printf("revid = %s\n", data->revid);
 200     printf("date = %s\n", data->date);
 201     printf("time = %s\n", data->time);
 202     printf("anon = %s\n", data->anon);
 203     printf("editor = %s\n", data->editor);
 204     printf("editorid = %s\n", data->editorid);
 205     printf("minor = %s\n", (data->minor ? "1" : "0"));
 206     printf("comment = %s\n", data->comment);
 207     printf("text = %s\n", data->text);
 208     printf("\n");
 209
 210 }
 211
 212
 213 /*
 214  * write a line of comma-separated value formatted data to standard out
 215  * follows the form:
 216  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 217  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 218  *
 219  * it is called right before cleanup_revision() and cleanup_article()
 220  */
 221 static void
 222 write_row(revisionData *data)
 223 {
 224
 225     // get md5sum
 226     md5_state_t state;
 227     md5_byte_t digest[16];
 228     char md5_hex_output[2 * 16 + 1];
 229     md5_init(&state);
 230     md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
 231     md5_finish(&state, digest);
 232     int di;
 233     for (di = 0; di < 16; ++di) {
 234         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
 235     }
 236
 237     string reverted_to;
 238     map<string, string>::iterator prev_revision = data->revision_md5.find(md5_hex_output);
 239     if (prev_revision != data->revision_md5.end()) {
 240         reverted_to = prev_revision->second; // id of previous revision
 241     }
 242     data->revision_md5[md5_hex_output] = data->revid;
 243
 244     string text = string(data->text, data->text_size);
 245     vector<string> text_tokens;
 246     size_t pos = 0;
 247     size_t start = 0;
 248     while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) {
 249         //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl;
 250         text_tokens.push_back(text.substr(start, pos - start));
 251         start = pos;
 252         ++pos;
 253     }
 254
 255     // look to see if (a) we've passed in a list of /any/ title_regexes
 256     // and (b) if all of the title_regex_matches match
 257     // if (a) is true and (b) is not, we return
 258     bool any_title_regex_match = false;
 259     if (!data->title_regexes.empty()) {
 260         for (vector<pcrecpp::RE>::iterator r = data->title_regexes.begin(); r != data->title_regexes.end(); ++r) {
 261             pcrecpp::RE& title_regex = *r;
 262             if (title_regex.PartialMatch(data->title)) {
 263                 any_title_regex_match = true;
 264                 break;
 265             }
 266         }
 267         if (!any_title_regex_match) {
 268             return;
 269         }
 270     }
 271
 272     //vector<string> additions;
 273     //vector<string> deletions;
 274     string additions;
 275     string deletions;
 276
 277     vector<bool> diff_regex_matches_adds;
 278     vector<bool> diff_regex_matches_dels;
 279
 280     if (data->last_text_tokens.empty()) {
 281         additions = data->text;
 282     } else {
 283         // do the diff
 284
 285         dtl::Diff< string, vector<string> > d(data->last_text_tokens, text_tokens);
 286         //d.onOnlyEditDistance();
 287         d.compose();
 288
 289         vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
 290         for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
 291             switch (sit->second.type) {
 292             case dtl::SES_ADD:
 293                 //cout << "ADD: \"" << sit->first << "\"" << endl;
 294                 additions += sit->first;
 295                 break;
 296             case dtl::SES_DELETE:
 297                 //cout << "DEL: \"" << sit->first << "\"" << endl;
 298                 deletions += sit->first;
 299                 break;
 300             }
 301         }
 302     }
 303
 304     if (!additions.empty()) {
 305         //cout << "ADD: " << additions << endl;
 306         for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
 307             pcrecpp::RE& diff_regex = *r;
 308             diff_regex_matches_adds.push_back(diff_regex.PartialMatch(additions));
 309         }
 310     }
 311
 312     if (!deletions.empty()) {
 313         //cout << "DEL: " << deletions << endl;
 314         for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
 315             pcrecpp::RE& diff_regex = *r;
 316             diff_regex_matches_dels.push_back(diff_regex.PartialMatch(deletions));
 317         }
 318     }
 319
 320     data->last_text_tokens = text_tokens;
 321
 322
 323     // print line of tsv output
 324     cout
 325         << data->title << "\t"
 326         << data->articleid << "\t"
 327         << data->revid << "\t"
 328         << data->date << " "
 329         << data->time << "\t"
 330         << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t"
 331         << data->editor << "\t"
 332         << data->editorid << "\t"
 333         << ((data->minor) ? "TRUE" : "FALSE") << "\t"
 334         << (unsigned int) data->text_size << "\t"
 335         << shannon_H(data->text, data->text_size) << "\t"
 336         << md5_hex_output << "\t"
 337         << reverted_to << "\t"
 338         << (int) additions.size() << "\t"
 339         << (int) deletions.size();
 340
 341     for (int n = 0; n < data->diff_regex_names.size(); ++n) {
 342         cout << "\t" << ((!diff_regex_matches_adds.empty() && diff_regex_matches_adds.at(n)) ? "TRUE" : "FALSE")
 343              << "\t" << ((!diff_regex_matches_dels.empty() && diff_regex_matches_dels.at(n)) ? "TRUE" : "FALSE");
 344     }
 345     cout << endl;
 346
 347     //
 348     if (data->output_type == FULL) {
 349         cout << "comment:" << data->comment << endl
 350              << "text:" << endl << data->text << endl;
 351     }
 352
 353 }
 354
 355 void
 356 split_timestamp(revisionData *data)
 357 {
 358     char *t = data->timestamp;
 359     strncpy(data->date, data->timestamp, DATE_LENGTH);
 360     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 361     strncpy(data->time, timeinstamp, TIME_LENGTH);
 362 }
 363
 364 // like strncat but with previously known length
 365 char*
 366 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 367 {
 368    size_t i;
 369
 370    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 371        dest[dest_len + i] = src[i];
 372    dest[dest_len + i] = '\0';
 373
 374    return dest;
 375 }
 376
 377 static void
 378 charhndl(void* vdata, const XML_Char* s, int len)
 379 {
 380     revisionData* data = (revisionData*) vdata;
 381     size_t bufsz;
 382     if (data->element != UNUSED && data->position != SKIP) {
 383         switch (data->element) {
 384             case TEXT:
 385                     // check if we'd overflow our buffer
 386                     bufsz = data->text_size + len;
 387                     if (bufsz + 1 > text_buffer_size) {
 388                         data->text = (char*) realloc(data->text, bufsz + 1);
 389                         text_buffer_size = bufsz + 1;
 390                     }
 391                     strlcatn(data->text, s, data->text_size, len);
 392                     data->text_size = bufsz;
 393                     break;
 394             case COMMENT:
 395                     strlcatn(data->comment, s, data->comment_size, len);
 396                     data->comment_size += len;
 397                     break;
 398             case TITLE:
 399                     strlcatn(data->title, s, data->title_size, len);
 400                     data->title_size += len;
 401                     break;
 402             case ARTICLEID:
 403                    // printf("articleid = %s\n", t);
 404                     strlcatn(data->articleid, s, data->articleid_size, len);
 405                     data->articleid_size += len;
 406                     break;
 407             case REVID:
 408                    // printf("revid = %s\n", t);
 409                     strlcatn(data->revid, s, data->revid_size, len);
 410                     data->revid_size += len;
 411                     break;
 412             case TIMESTAMP:
 413                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 414                     data->timestamp_size += len;
 415                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 416                         split_timestamp(data);
 417                     break;
 418             case EDITOR:
 419                     strlcatn(data->editor, s, data->editor_size, len);
 420                     data->editor_size += len;
 421                     break;
 422             case EDITORID:
 423                     //printf("editorid = %s\n", t);
 424                     strlcatn(data->editorid, s, data->editorid_size, len);
 425                     data->editorid_size += len;
 426                     break;
 427             /* the following are implied or skipped:
 428             case MINOR:
 429                     printf("found minor element\n");  doesn't work
 430                     break;                   minor tag is just a tag
 431             case UNUSED:
 432             */
 433             default: break;
 434         }
 435     }
 436 }
 437
 438 static void
 439 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 440 {
 441     revisionData* data = (revisionData*) vdata;
 442
 443     if (strcmp(name,"title") == 0) {
 444         cleanup_article(data); // cleans up data from last article
 445         data->element = TITLE;
 446         data->position = TITLE_BLOCK;
 447     } else if (data->position != SKIP) {
 448         if (strcmp(name,"revision") == 0) {
 449             data->element = REVISION;
 450             data->position = REVISION_BLOCK;
 451         } else if (strcmp(name, "contributor") == 0) {
 452             data->element = CONTRIBUTOR;
 453             data->position = CONTRIBUTOR_BLOCK;
 454         } else if (strcmp(name,"id") == 0)
 455             switch (data->position) {
 456                 case TITLE_BLOCK:
 457                     data->element = ARTICLEID;
 458                     break;
 459                 case REVISION_BLOCK:
 460                     data->element = REVID;
 461                     break;
 462                 case CONTRIBUTOR_BLOCK:
 463                     data->element = EDITORID;
 464                     break;
 465             }
 466
 467         // minor tag has no character data, so we parse here
 468         else if (strcmp(name,"minor") == 0) {
 469             data->element = MINOR;
 470             data->minor = true;
 471         }
 472         else if (strcmp(name,"timestamp") == 0)
 473             data->element = TIMESTAMP;
 474
 475         else if (strcmp(name, "username") == 0)
 476             data->element = EDITOR;
 477
 478         else if (strcmp(name,"ip") == 0)
 479             data->element = EDITORID;
 480
 481         else if (strcmp(name,"comment") == 0)
 482             data->element = COMMENT;
 483
 484         else if (strcmp(name,"text") == 0)
 485             data->element = TEXT;
 486
 487         else if (strcmp(name,"page") == 0
 488                 || strcmp(name,"mediawiki") == 0
 489                 || strcmp(name,"restrictions") == 0
 490                 || strcmp(name,"siteinfo") == 0)
 491             data->element = UNUSED;
 492     }
 493
 494 }
 495
 496
 497 static void
 498 end(void* vdata, const XML_Char* name)
 499 {
 500     revisionData* data = (revisionData*) vdata;
 501     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 502         write_row(data); // crucial... :)
 503         cleanup_revision(data);  // also crucial
 504     } else {
 505         data->element = UNUSED; // sets our state to "not-in-useful"
 506     }                           // thus avoiding unpleasant character data
 507                                 // b/w tags (newlines etc.)
 508 }
 509
 510 void print_usage(char* argv[]) {
 511     cerr << "usage: <wikimedia dump xml> | " << argv[0] << "[options]" << endl
 512          << endl
 513          << "options:" << endl
 514          << "  -v   verbose mode prints text and comments after each line of tab separated data" << endl
 515          << "  -N   name of the following regex for diffs (e.g. -N name -R \"...\")" << endl
 516          << "  -R   regex to check against diffs (i.e., additions and deletions)" << endl
 517          << "  -t   parse revisions only from pages whose titles match regex(es)" << endl
 518          << endl
 519          << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl
 520          << "a tab-separated stream of revisions on standard out:" << endl
 521          << endl
 522          << "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl
 523          << "text_length, text_entropy, text_md5, reversion, additions_size, deletions_size" << endl
 524          << ".... and additional fields for each regex executed against add/delete diffs" << endl
 525          << endl
 526          << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl
 527          << "unless the article is a revert to a previous revision, in which case, it" << endl
 528          << "contains the revision ID of the revision which was reverted to." << endl
 529          << endl
 530          << "author: Erik Garrison <erik@hypervolu.me>" << endl;
 531 }
 532
 533
 534 int
 535 main(int argc, char *argv[])
 536 {
 537
 538     enum outtype output_type;
 539     int dry_run = 0;
 540     // in "simple" output, we don't print text and comments
 541     output_type = SIMPLE;
 542     char c;
 543     string diff_regex_name;
 544
 545     // the user data struct which is passed to callback functions
 546     revisionData data;
 547
 548     while ((c = getopt(argc, argv, "hvn:r:t:")) != -1)
 549         switch (c)
 550         {
 551             case 'd':
 552                 dry_run = 1;
 553                 break;
 554             case 'v':
 555                 output_type = FULL;
 556                 break;
 557             case 'N':
 558                 diff_regex_name = optarg;
 559                 break;
 560             case 'R':
 561                 data.diff_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 562                 data.diff_regex_names.push_back(diff_regex_name);
 563                 if (!diff_regex_name.empty()) {
 564                     diff_regex_name.clear();
 565                 }
 566                 break;
 567             case 'h':
 568                 print_usage(argv);
 569                 exit(0);
 570                 break;
 571             case 't':
 572                 data.title_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 573                 break;
 574         }
 575
 576     if (dry_run) { // lets us print initialization options
 577         printf("simple_output = %i\n", output_type);
 578         exit(1);
 579     }
 580
 581     // create a new instance of the expat parser
 582     XML_Parser parser = XML_ParserCreate("UTF-8");
 583
 584     // initialize the elements of the struct to default values
 585     init_data(&data, output_type);
 586
 587
 588     // makes the parser pass "data" as the first argument to every callback
 589     XML_SetUserData(parser, &data);
 590     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 591     void (*endFnPtr)(void*, const XML_Char*) = end;
 592     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 593
 594     // sets start and end to be the element start and end handlers
 595     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 596     // sets charhndl to be the callback for character data
 597     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 598
 599     bool done;
 600     char buf[BUFSIZ];
 601
 602     // write header
 603
 604     cout << "title" << "\t"
 605         << "articleid" << "\t"
 606         << "revid" << "\t"
 607         << "date" << " "
 608         << "time" << "\t"
 609         << "anon" << "\t"
 610         << "editor" << "\t"
 611         << "editor_id" << "\t"
 612         << "minor" << "\t"
 613         << "text_size" << "\t"
 614         << "text_entropy" << "\t"
 615         << "text_md5" << "\t"
 616         << "reversion" << "\t"
 617         << "additions_size" << "\t"
 618         << "deletions_size";
 619
 620     int n = 0;
 621     if (!data.diff_regexes.empty()) {
 622         for (vector<pcrecpp::RE>::iterator r = data.diff_regexes.begin(); r != data.diff_regexes.end(); ++r, ++n) {
 623             if (data.diff_regex_names.at(n).empty()) {
 624                 cout << "\t" << "regex_" << n << "_add"
 625                      << "\t" << "regex_" << n << "_del";
 626             } else {
 627                 cout << "\t" << data.diff_regex_names.at(n) << "_add"
 628                      << "\t" << data.diff_regex_names.at(n) << "_del";
 629             }
 630         }
 631     }
 632     cout << endl;
 633
 634     // shovel data into the parser
 635     do {
 636
 637         // read into buf a bufferfull of data from standard input
 638         size_t len = fread(buf, 1, BUFSIZ, stdin);
 639         done = len < BUFSIZ; // checks if we've got the last bufferfull
 640
 641         // passes the buffer of data to the parser and checks for error
 642         //   (this is where the callbacks are invoked)
 643         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 644             cerr << "XML ERROR: " << XML_ErrorString(XML_GetErrorCode(parser)) << " at line "
 645                  << (int) XML_GetCurrentLineNumber(parser) << endl;
 646             return 1;
 647         }
 648     } while (!done);
 649
 650
 651     XML_ParserFree(parser);
 652
 653     return 0;
 654 }