projects.mako.cc - wikiq/blob - wikiq.cpp

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <iostream>
   9 #include <string.h>
  10 #include <ctype.h>
  11 #include <stdlib.h>
  12 #include "expat.h"
  13 #include <getopt.h>
  14 #include "disorder.h"
  15 #include "md5.h"
  16 #include "dtl/dtl.hpp"
  17 #include <vector>
  18
  19 using namespace std;
  20
  21 // timestamp of the form 2003-11-07T00:43:23Z
  22 #define DATE_LENGTH 10
  23 #define TIME_LENGTH 8
  24 #define TIMESTAMP_LENGTH 20
  25
  26 #define MEGABYTE 1048576
  27 #define FIELD_BUFFER_SIZE 1024
  28 // 2048 KB in bytes + 1
  29 //#define TEXT_BUFFER_SIZE 2097153
  30 //#define TEXT_BUFFER_SIZE 10485760
  31
  32 enum elements {
  33     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  34     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
  35 };
  36
  37 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  38
  39 enum outtype { FULL, SIMPLE };
  40
  41 typedef struct {
  42
  43     // pointers to once-allocated buffers
  44     char *title;
  45     char *articleid;
  46     char *revid;
  47     char *date;
  48     char *time;
  49     char *timestamp;
  50     char *anon;
  51     char *editor;
  52     char *editorid;
  53     char *comment;
  54     char *text;
  55     vector<string> last_tokens;
  56
  57     // track string size of the elements, to prevent O(N^2) processing in charhndl
  58     // when we have to take strlen for every character which we append to the buffer
  59     size_t title_size;
  60     size_t articleid_size;
  61     size_t revid_size;
  62     size_t date_size;
  63     size_t time_size;
  64     size_t timestamp_size;
  65     size_t anon_size;
  66     size_t editor_size;
  67     size_t editorid_size;
  68     size_t comment_size;
  69     size_t text_size;
  70
  71     bool minor;
  72
  73     enum elements element;
  74     enum block position;
  75     enum outtype output_type;
  76
  77 } revisionData;
  78
  79
  80 /* free_data and clean_data
  81  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  82  * title data needs to be cleared as well.
  83  * Also, frees memory dynamically allocated to store data.
  84  */
  85 static void
  86 clean_data(revisionData *data, int title)
  87 {
  88     // reset title (if we are switching articles)
  89     if (title) {
  90         data->title[0] = '\0';
  91         data->articleid[0] = '\0';
  92         data->title_size = 0;
  93         data->articleid_size = 0;
  94     }
  95
  96     // reset text fields
  97     data->revid[0] = '\0';
  98     data->date[0] = '\0';
  99     data->time[0] = '\0';
 100     data->timestamp[0] = '\0';
 101     data->anon[0] = '\0';
 102     data->editor[0] = '\0';
 103     data->editorid[0] = '\0';
 104     data->comment[0] = '\0';
 105     data->text[0] = '\0';
 106
 107     // reset length tracking
 108     data->revid_size = 0;
 109     data->date_size = 0;
 110     data->time_size = 0;
 111     data->timestamp_size = 0;
 112     data->anon_size = 0;
 113     data->editor_size = 0;
 114     data->editorid_size = 0;
 115     data->comment_size = 0;
 116     data->text_size = 0;
 117
 118     // reset flags and element type info
 119     data->minor = false;
 120     data->element = UNUSED;
 121
 122 }
 123
 124 // presently unused
 125 static void
 126 free_data(revisionData *data, int title)
 127 {
 128     if (title) {
 129         //printf("freeing article\n");
 130         free(data->title);
 131         free(data->articleid);
 132     }
 133     free(data->revid);
 134     free(data->date);
 135     free(data->time);
 136     free(data->timestamp);
 137     free(data->anon);
 138     free(data->editor);
 139     free(data->editorid);
 140     free(data->comment);
 141     free(data->text);
 142     data->last_tokens.clear();
 143 }
 144
 145 void cleanup_revision(revisionData *data) {
 146     clean_data(data, 0);
 147 }
 148
 149 void cleanup_article(revisionData *data) {
 150     clean_data(data, 1);
 151     data->last_tokens.clear();
 152 }
 153
 154
 155 static void
 156 init_data(revisionData *data, outtype output_type)
 157 {
 158     data->text = (char*) malloc(4 * MEGABYTE);  // 2MB is the article length limit, 4MB is 'safe'?
 159     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 160     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 161     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 162     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 163     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 164     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 165     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 166     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 167     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 168     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 169     data->minor = false;
 170
 171     // resets the data fields, null terminates strings, sets lengths
 172     clean_data(data, 1);
 173
 174     data->output_type = output_type;
 175 }
 176
 177 /* for debugging only, prints out the state of the data struct
 178  */
 179 static void
 180 print_state(revisionData *data)
 181 {
 182     printf("element = %i\n", data->element);
 183     printf("output_type = %i\n", data->output_type);
 184     printf("title = %s\n", data->title);
 185     printf("articleid = %s\n", data->articleid);
 186     printf("revid = %s\n", data->revid);
 187     printf("date = %s\n", data->date);
 188     printf("time = %s\n", data->time);
 189     printf("anon = %s\n", data->anon);
 190     printf("editor = %s\n", data->editor);
 191     printf("editorid = %s\n", data->editorid);
 192     printf("minor = %s\n", (data->minor ? "1" : "0"));
 193     printf("comment = %s\n", data->comment);
 194     printf("text = %s\n", data->text);
 195     printf("\n");
 196
 197 }
 198
 199 /* Write a header for the comma-separated output
 200  */
 201 static void
 202 write_header()
 203 {
 204  //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
 205 //    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
 206
 207 }
 208
 209
 210 /*
 211  * write a line of comma-separated value formatted data to standard out
 212  * follows the form:
 213  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 214  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 215  *
 216  * it is called right before cleanup_revision() and cleanup_article()
 217  */
 218 static void
 219 write_row(revisionData *data)
 220 {
 221
 222     // get md5sum
 223     md5_state_t state;
 224     md5_byte_t digest[16];
 225     char md5_hex_output[2 * 16 + 1];
 226     md5_init(&state);
 227     md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
 228     md5_finish(&state, digest);
 229     int di;
 230     for (di = 0; di < 16; ++di) {
 231         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
 232     }
 233
 234     string text = string(data->text, data->text_size);
 235     vector<string> text_tokens;
 236     size_t period_pos = 0;
 237     size_t paragraph_pos = 0;
 238     size_t start = 0;
 239     while ((period_pos = text.find(".", period_pos + 1)) != string::npos &&
 240             (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) {
 241         if (paragraph_pos < period_pos) {
 242             text_tokens.push_back(text.substr(start, paragraph_pos - start));
 243             start = paragraph_pos;
 244         } else {
 245             text_tokens.push_back(text.substr(start, period_pos - start));
 246             start = period_pos;
 247         }
 248     }
 249
 250     vector<string> additions;
 251     vector<string> deletions;
 252
 253     if (data->last_tokens.empty()) {
 254         data->last_tokens = text_tokens;
 255     } else {
 256         // do the diff
 257
 258         dtl::Diff< string, vector<string> > d(data->last_tokens, text_tokens);
 259         //d.onOnlyEditDistance();
 260         d.compose();
 261
 262         vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
 263         for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
 264             switch (sit->second.type) {
 265             case dtl::SES_ADD:
 266                 cout << "ADD: \"" << sit->first << "\"" << endl;
 267                 additions.push_back(sit->first);
 268                 break;
 269             case dtl::SES_DELETE:
 270                 cout << "DEL: \"" << sit->first << "\"" << endl;
 271                 deletions.push_back(sit->first);
 272                 break;
 273             }
 274         }
 275
 276         // apply regex to the diff
 277
 278
 279         data->last_tokens = text_tokens;
 280     }
 281
 282
 283     // print line of tsv output
 284     printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n",
 285         data->title,
 286         data->articleid,
 287         data->revid,
 288         data->date,
 289         data->time,
 290         (data->editor[0] != '\0') ? "0" : "1",  // anon?
 291         data->editor,
 292         data->editorid,
 293         (data->minor) ? "1" : "0",
 294         (unsigned int) data->text_size,
 295         shannon_H(data->text, data->text_size),
 296         md5_hex_output,
 297         (int) additions.size(),
 298         (int) deletions.size()
 299         );
 300
 301     //
 302     if (data->output_type == FULL) {
 303         printf("comment:%s\ntext:\n%s\n", data->comment, data->text);
 304     }
 305
 306 }
 307
 308 void
 309 split_timestamp(revisionData *data)
 310 {
 311     char *t = data->timestamp;
 312     strncpy(data->date, data->timestamp, DATE_LENGTH);
 313     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 314     strncpy(data->time, timeinstamp, TIME_LENGTH);
 315 }
 316
 317 // like strncat but with previously known length
 318 char*
 319 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 320 {
 321    //size_t dest_len = strlen(dest);
 322    size_t i;
 323
 324    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 325        dest[dest_len + i] = src[i];
 326    dest[dest_len + i] = '\0';
 327
 328    return dest;
 329 }
 330
 331 static void
 332 charhndl(void* vdata, const XML_Char* s, int len)
 333 {
 334     revisionData* data = (revisionData*) vdata;
 335     if (data->element != UNUSED && data->position != SKIP) {
 336         //char t[len];
 337         //strncpy(t,s,len);
 338         //t[len] = '\0'; // makes t a well-formed string
 339         switch (data->element) {
 340             case TEXT:
 341                    // printf("buffer length = %i, text: %s\n", len, t);
 342                     strlcatn(data->text, s, data->text_size, len);
 343                     data->text_size += len;
 344                     break;
 345             case COMMENT:
 346                     strlcatn(data->comment, s, data->comment_size, len);
 347                     data->comment_size += len;
 348                     break;
 349             case TITLE:
 350                     strlcatn(data->title, s, data->title_size, len);
 351                     data->title_size += len;
 352                     break;
 353             case ARTICLEID:
 354                    // printf("articleid = %s\n", t);
 355                     strlcatn(data->articleid, s, data->articleid_size, len);
 356                     data->articleid_size += len;
 357                     break;
 358             case REVID:
 359                    // printf("revid = %s\n", t);
 360                     strlcatn(data->revid, s, data->revid_size, len);
 361                     data->revid_size += len;
 362                     break;
 363             case TIMESTAMP:
 364                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 365                     data->timestamp_size += len;
 366                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 367                         split_timestamp(data);
 368                     break;
 369             case EDITOR:
 370                     strlcatn(data->editor, s, data->editor_size, len);
 371                     data->editor_size += len;
 372                     break;
 373             case EDITORID:
 374                     //printf("editorid = %s\n", t);
 375                     strlcatn(data->editorid, s, data->editorid_size, len);
 376                     data->editorid_size += len;
 377                     break;
 378             /* the following are implied or skipped:
 379             case MINOR:
 380                     printf("found minor element\n");  doesn't work
 381                     break;                   minor tag is just a tag
 382             case UNUSED:
 383             */
 384             default: break;
 385         }
 386     }
 387 }
 388
 389 static void
 390 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 391 {
 392     revisionData* data = (revisionData*) vdata;
 393
 394     if (strcmp(name,"title") == 0) {
 395         cleanup_article(data); // cleans up data from last article
 396         data->element = TITLE;
 397         data->position = TITLE_BLOCK;
 398     } else if (data->position != SKIP) {
 399         if (strcmp(name,"revision") == 0) {
 400             data->element = REVISION;
 401             data->position = REVISION_BLOCK;
 402         } else if (strcmp(name, "contributor") == 0) {
 403             data->element = CONTRIBUTOR;
 404             data->position = CONTRIBUTOR_BLOCK;
 405         } else if (strcmp(name,"id") == 0)
 406             switch (data->position) {
 407                 case TITLE_BLOCK:
 408                     data->element = ARTICLEID;
 409                     break;
 410                 case REVISION_BLOCK:
 411                     data->element = REVID;
 412                     break;
 413                 case CONTRIBUTOR_BLOCK:
 414                     data->element = EDITORID;
 415                     break;
 416             }
 417
 418         // minor tag has no character data, so we parse here
 419         else if (strcmp(name,"minor") == 0) {
 420             data->element = MINOR;
 421             data->minor = true;
 422         }
 423         else if (strcmp(name,"timestamp") == 0)
 424             data->element = TIMESTAMP;
 425
 426         else if (strcmp(name, "username") == 0)
 427             data->element = EDITOR;
 428
 429         else if (strcmp(name,"ip") == 0)
 430             data->element = EDITORID;
 431
 432         else if (strcmp(name,"comment") == 0)
 433             data->element = COMMENT;
 434
 435         else if (strcmp(name,"text") == 0)
 436             data->element = TEXT;
 437
 438         else if (strcmp(name,"page") == 0
 439                 || strcmp(name,"mediawiki") == 0
 440                 || strcmp(name,"restrictions") == 0
 441                 || strcmp(name,"siteinfo") == 0)
 442             data->element = UNUSED;
 443     }
 444
 445 }
 446
 447
 448 static void
 449 end(void* vdata, const XML_Char* name)
 450 {
 451     revisionData* data = (revisionData*) vdata;
 452     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 453         write_row(data); // crucial... :)
 454         cleanup_revision(data);  // also crucial
 455     } else {
 456         data->element = UNUSED; // sets our state to "not-in-useful"
 457     }                           // thus avoiding unpleasant character data
 458                                 // b/w tags (newlines etc.)
 459 }
 460
 461 void print_usage(char* argv[]) {
 462     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
 463     fprintf(stderr, "\n");
 464     fprintf(stderr, "options:\n");
 465     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
 466     fprintf(stderr, "\n");
 467     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
 468     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
 469     fprintf(stderr, "\n");
 470     fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n");
 471     fprintf(stderr, "\n");
 472     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 473 }
 474
 475
 476 int
 477 main(int argc, char *argv[])
 478 {
 479
 480     enum outtype output_type;
 481     int dry_run = 0;
 482     // in "simple" output, we don't print text and comments
 483     output_type = SIMPLE;
 484     char c;
 485
 486     while ((c = getopt(argc, argv, "ht")) != -1)
 487         switch (c)
 488         {
 489             case 'd':
 490                 dry_run = 1;
 491                 break;
 492             case 't':
 493                 output_type = FULL;
 494                 break;
 495             case 'h':
 496                 print_usage(argv);
 497                 exit(0);
 498                 break;
 499         }
 500
 501     if (dry_run) { // lets us print initialization options
 502         printf("simple_output = %i\n", output_type);
 503         exit(1);
 504     }
 505
 506     // create a new instance of the expat parser
 507     XML_Parser parser = XML_ParserCreate("UTF-8");
 508
 509     // initialize the user data struct which is passed to callback functions
 510     revisionData data;
 511     // initialize the elements of the struct to default values
 512     init_data(&data, output_type);
 513
 514
 515     // makes the parser pass "data" as the first argument to every callback
 516     XML_SetUserData(parser, &data);
 517     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 518     void (*endFnPtr)(void*, const XML_Char*) = end;
 519     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 520
 521     // sets start and end to be the element start and end handlers
 522     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 523     // sets charhndl to be the callback for character data
 524     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 525
 526     bool done;
 527     char buf[BUFSIZ];
 528
 529     // shovel data into the parser
 530     do {
 531
 532         // read into buf a bufferfull of data from standard input
 533         size_t len = fread(buf, 1, BUFSIZ, stdin);
 534         done = len < BUFSIZ; // checks if we've got the last bufferfull
 535
 536         // passes the buffer of data to the parser and checks for error
 537         //   (this is where the callbacks are invoked)
 538         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 539             fprintf(stderr,
 540                 "%s at line %d\n",
 541                 XML_ErrorString(XML_GetErrorCode(parser)),
 542                 (int) XML_GetCurrentLineNumber(parser));
 543             return 1;
 544         }
 545     } while (!done);
 546
 547
 548     XML_ParserFree(parser);
 549
 550     return 0;
 551 }