projects.mako.cc - wikiq/blob - wikiq.c

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9 #include <ctype.h>
  10 #include <stdlib.h>
  11 #include "expat.h"
  12 #include <getopt.h>
  13 #include "disorder.h"
  14 #include "md5.h"
  15
// Dump timestamps have the form 2003-11-07T00:43:23Z: a 10-character date,
// a 'T' separator, an 8-character time and a trailing 'Z' (20 characters).
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20

#define MEGABYTE 1048576
// size in bytes of the buffer allocated by init_data for every field except text
#define FIELD_BUFFER_SIZE 1024
// earlier sizes tried for the text buffer, no longer used:
// 2048 KB in bytes + 1
//#define TEXT_BUFFER_SIZE 2097153
//#define TEXT_BUFFER_SIZE 10485760
  26
// The XML element the parser is currently inside. start() sets it from the
// tag name and charhndl() uses it to pick the buffer that receives character
// data; UNUSED means character data is ignored.
enum elements {
    TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
    EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
};
  31
// Section of the dump the parser is in; start() uses it to decide whether an
// <id> tag is an article id, a revision id or an editor id. While the
// position is SKIP, start(), charhndl() and end() ignore their input.
enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  33
// Output mode: with FULL, write_row() also prints the comment and text after
// each tab-separated row; with SIMPLE only the row itself is printed.
enum outtype { FULL, SIMPLE };
  35
/* Parser state shared by all expat callbacks (passed as their user data).
 * It holds the fields of the revision currently being read plus the
 * bookkeeping needed to route character data to the right field.
 */
typedef struct {

    // pointers to once-allocated buffers (allocated in init_data, reused for
    // every revision; clean_data empties them without freeing)
    char *title;
    char *articleid;
    char *revid;
    char *date;       // date part of timestamp, filled by split_timestamp
    char *time;       // time part of timestamp, filled by split_timestamp
    char *timestamp;
    char *anon;
    char *editor;     // receives <username> character data
    char *editorid;   // receives <id> inside <contributor>, or <ip>
    char *comment;
    char *text;

    // track string size of the elements, to prevent O(N^2) processing in charhndl
    // when we have to take strlen for every character which we append to the buffer
    size_t title_size;
    size_t articleid_size;
    size_t revid_size;
    size_t date_size;
    size_t time_size;
    size_t timestamp_size;
    size_t anon_size;
    size_t editor_size;
    size_t editorid_size;
    size_t comment_size;
    size_t text_size;

    // set to true by start() when a <minor> tag is seen in the revision
    bool minor;

    enum elements element;        // element currently being parsed
    enum block position;          // section of the dump we are in
    enum outtype output_type;     // FULL or SIMPLE, fixed at init_data time

} revisionData;
  72
  73
  74 /* free_data and clean_data
  75  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  76  * title data needs to be cleared as well.
  77  * Also, frees memory dynamically allocated to store data.
  78  */
  79 static void
  80 clean_data(revisionData *data, int title)
  81 {
  82     // reset title (if we are switching articles)
  83     if (title) {
  84         data->title[0] = '\0';
  85         data->articleid[0] = '\0';
  86         data->title_size = 0;
  87         data->articleid_size = 0;
  88     }
  89
  90     // reset text fields
  91     data->revid[0] = '\0';
  92     data->date[0] = '\0';
  93     data->time[0] = '\0';
  94     data->timestamp[0] = '\0';
  95     data->anon[0] = '\0';
  96     data->editor[0] = '\0';
  97     data->editorid[0] = '\0';
  98     data->comment[0] = '\0';
  99     data->text[0] = '\0';
 100
 101     // reset length tracking
 102     data->revid_size = 0;
 103     data->date_size = 0;
 104     data->time_size = 0;
 105     data->timestamp_size = 0;
 106     data->anon_size = 0;
 107     data->editor_size = 0;
 108     data->editorid_size = 0;
 109     data->comment_size = 0;
 110     data->text_size = 0;
 111
 112     // reset flags and element type info
 113     data->minor = false;
 114     data->element = UNUSED;
 115
 116 }
 117
 118 // presently unused
 119 static void
 120 free_data(revisionData *data, int title)
 121 {
 122     if (title) {
 123         //printf("freeing article\n");
 124         free(data->title);
 125         free(data->articleid);
 126     }
 127     free(data->revid);
 128     free(data->date);
 129     free(data->time);
 130     free(data->timestamp);
 131     free(data->anon);
 132     free(data->editor);
 133     free(data->editorid);
 134     free(data->comment);
 135     free(data->text);
 136 }
 137
 138 void cleanup_revision(revisionData *data) {
 139     clean_data(data, 0);
 140 }
 141
 142 void cleanup_article(revisionData *data) {
 143     clean_data(data, 1);
 144 }
 145
 146
 147 static void
 148 init_data(revisionData *data, outtype output_type)
 149 {
 150     data->text = (char*) malloc(4 * MEGABYTE);  // 2MB is the article length limit, 4MB is 'safe'?
 151     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 152     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 153     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 154     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 155     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 156     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 157     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 158     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 159     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 160     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 161     data->minor = false;
 162
 163     // resets the data fields, null terminates strings, sets lengths
 164     clean_data(data, 1);
 165
 166     data->output_type = output_type;
 167 }
 168
 169 /* for debugging only, prints out the state of the data struct
 170  */
 171 static void
 172 print_state(revisionData *data)
 173 {
 174     printf("element = %i\n", data->element);
 175     printf("output_type = %i\n", data->output_type);
 176     printf("title = %s\n", data->title);
 177     printf("articleid = %s\n", data->articleid);
 178     printf("revid = %s\n", data->revid);
 179     printf("date = %s\n", data->date);
 180     printf("time = %s\n", data->time);
 181     printf("anon = %s\n", data->anon);
 182     printf("editor = %s\n", data->editor);
 183     printf("editorid = %s\n", data->editorid);
 184     printf("minor = %s\n", (data->minor ? "1" : "0"));
 185     printf("comment = %s\n", data->comment);
 186     printf("text = %s\n", data->text);
 187     printf("\n");
 188
 189 }
 190
 191 /* Write a header for the comma-separated output
 192  */
 193 static void
 194 write_header()
 195 {
 196  //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
 197 //    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
 198
 199 }
 200
 201
 202 /*
 203  * write a line of comma-separated value formatted data to standard out
 204  * follows the form:
 205  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 206  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 207  *
 208  * it is called right before cleanup_revision() and cleanup_article()
 209  */
 210 static void
 211 write_row(revisionData *data)
 212 {
 213
 214     // get md5sum
 215     md5_state_t state;
 216     md5_byte_t digest[16];
 217     char md5_hex_output[2 * 16 + 1];
 218     md5_init(&state);
 219     md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
 220     md5_finish(&state, digest);
 221     int di;
 222     for (di = 0; di < 16; ++di) {
 223         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
 224     }
 225
 226     // print line of tsv output
 227     printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n",
 228         data->title,
 229         data->articleid,
 230         data->revid,
 231         data->date,
 232         data->time,
 233         (data->editor[0] != '\0') ? "0" : "1",  // anon?
 234         data->editor,
 235         data->editorid,
 236         (data->minor) ? "1" : "0",
 237         (unsigned int) data->text_size,
 238         shannon_H(data->text, data->text_size),
 239         md5_hex_output
 240         );
 241
 242     //
 243     if (data->output_type == FULL) {
 244         printf("comment:%s\ntext:\n%s\n", data->comment, data->text);
 245     }
 246
 247 }
 248
 249 void
 250 split_timestamp(revisionData *data)
 251 {
 252     char *t = data->timestamp;
 253     strncpy(data->date, data->timestamp, DATE_LENGTH);
 254     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 255     strncpy(data->time, timeinstamp, TIME_LENGTH);
 256 }
 257
 258 // like strncat but with previously known length
 259 char*
 260 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 261 {
 262    //size_t dest_len = strlen(dest);
 263    size_t i;
 264
 265    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 266        dest[dest_len + i] = src[i];
 267    dest[dest_len + i] = '\0';
 268
 269    return dest;
 270 }
 271
 272 static void
 273 charhndl(void* vdata, const XML_Char* s, int len)
 274 {
 275     revisionData* data = (revisionData*) vdata;
 276     if (data->element != UNUSED && data->position != SKIP) {
 277         //char t[len];
 278         //strncpy(t,s,len);
 279         //t[len] = '\0'; // makes t a well-formed string
 280         switch (data->element) {
 281             case TEXT:
 282                    // printf("buffer length = %i, text: %s\n", len, t);
 283                     strlcatn(data->text, s, data->text_size, len);
 284                     data->text_size += len;
 285                     break;
 286             case COMMENT:
 287                     strlcatn(data->comment, s, data->comment_size, len);
 288                     data->comment_size += len;
 289                     break;
 290             case TITLE:
 291                     strlcatn(data->title, s, data->title_size, len);
 292                     data->title_size += len;
 293                     break;
 294             case ARTICLEID:
 295                    // printf("articleid = %s\n", t);
 296                     strlcatn(data->articleid, s, data->articleid_size, len);
 297                     data->articleid_size += len;
 298                     break;
 299             case REVID:
 300                    // printf("revid = %s\n", t);
 301                     strlcatn(data->revid, s, data->revid_size, len);
 302                     data->revid_size += len;
 303                     break;
 304             case TIMESTAMP:
 305                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 306                     data->timestamp_size += len;
 307                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 308                         split_timestamp(data);
 309                     break;
 310             case EDITOR:
 311                     strlcatn(data->editor, s, data->editor_size, len);
 312                     data->editor_size += len;
 313                     break;
 314             case EDITORID:
 315                     //printf("editorid = %s\n", t);
 316                     strlcatn(data->editorid, s, data->editorid_size, len);
 317                     data->editorid_size += len;
 318                     break;
 319             /* the following are implied or skipped:
 320             case MINOR:
 321                     printf("found minor element\n");  doesn't work
 322                     break;                   minor tag is just a tag
 323             case UNUSED:
 324             */
 325             default: break;
 326         }
 327     }
 328 }
 329
 330 static void
 331 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 332 {
 333     revisionData* data = (revisionData*) vdata;
 334
 335     if (strcmp(name,"title") == 0) {
 336         cleanup_article(data); // cleans up data from last article
 337         data->element = TITLE;
 338         data->position = TITLE_BLOCK;
 339     } else if (data->position != SKIP) {
 340         if (strcmp(name,"revision") == 0) {
 341             data->element = REVISION;
 342             data->position = REVISION_BLOCK;
 343         } else if (strcmp(name, "contributor") == 0) {
 344             data->element = CONTRIBUTOR;
 345             data->position = CONTRIBUTOR_BLOCK;
 346         } else if (strcmp(name,"id") == 0)
 347             switch (data->position) {
 348                 case TITLE_BLOCK:
 349                     data->element = ARTICLEID;
 350                     break;
 351                 case REVISION_BLOCK:
 352                     data->element = REVID;
 353                     break;
 354                 case CONTRIBUTOR_BLOCK:
 355                     data->element = EDITORID;
 356                     break;
 357             }
 358
 359         // minor tag has no character data, so we parse here
 360         else if (strcmp(name,"minor") == 0) {
 361             data->element = MINOR;
 362             data->minor = true;
 363         }
 364         else if (strcmp(name,"timestamp") == 0)
 365             data->element = TIMESTAMP;
 366
 367         else if (strcmp(name, "username") == 0)
 368             data->element = EDITOR;
 369
 370         else if (strcmp(name,"ip") == 0)
 371             data->element = EDITORID;
 372
 373         else if (strcmp(name,"comment") == 0)
 374             data->element = COMMENT;
 375
 376         else if (strcmp(name,"text") == 0)
 377             data->element = TEXT;
 378
 379         else if (strcmp(name,"page") == 0
 380                 || strcmp(name,"mediawiki") == 0
 381                 || strcmp(name,"restrictions") == 0
 382                 || strcmp(name,"siteinfo") == 0)
 383             data->element = UNUSED;
 384     }
 385
 386 }
 387
 388
 389 static void
 390 end(void* vdata, const XML_Char* name)
 391 {
 392     revisionData* data = (revisionData*) vdata;
 393     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 394         write_row(data); // crucial... :)
 395         cleanup_revision(data);  // also crucial
 396     } else {
 397         data->element = UNUSED; // sets our state to "not-in-useful"
 398     }                           // thus avoiding unpleasant character data
 399                                 // b/w tags (newlines etc.)
 400 }
 401
 402 void print_usage(char* argv[]) {
 403     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
 404     fprintf(stderr, "\n");
 405     fprintf(stderr, "options:\n");
 406     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
 407     fprintf(stderr, "\n");
 408     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
 409     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
 410     fprintf(stderr, "\n");
 411     fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n");
 412     fprintf(stderr, "\n");
 413     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 414 }
 415
 416
 417 int
 418 main(int argc, char *argv[])
 419 {
 420
 421     enum outtype output_type;
 422     int dry_run = 0;
 423     // in "simple" output, we don't print text and comments
 424     output_type = SIMPLE;
 425     char c;
 426
 427     while ((c = getopt(argc, argv, "ht")) != -1)
 428         switch (c)
 429         {
 430             case 'd':
 431                 dry_run = 1;
 432                 break;
 433             case 't':
 434                 output_type = FULL;
 435                 break;
 436             case 'h':
 437                 print_usage(argv);
 438                 exit(0);
 439                 break;
 440         }
 441
 442     if (dry_run) { // lets us print initialization options
 443         printf("simple_output = %i\n", output_type);
 444         exit(1);
 445     }
 446
 447     // create a new instance of the expat parser
 448     XML_Parser parser = XML_ParserCreate("UTF-8");
 449
 450     // initialize the user data struct which is passed to callback functions
 451     revisionData data;
 452     // initialize the elements of the struct to default values
 453     init_data(&data, output_type);
 454
 455
 456     // makes the parser pass "data" as the first argument to every callback
 457     XML_SetUserData(parser, &data);
 458     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 459     void (*endFnPtr)(void*, const XML_Char*) = end;
 460     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 461
 462     // sets start and end to be the element start and end handlers
 463     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 464     // sets charhndl to be the callback for character data
 465     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 466
 467     bool done;
 468     char buf[BUFSIZ];
 469
 470     // shovel data into the parser
 471     do {
 472
 473         // read into buf a bufferfull of data from standard input
 474         size_t len = fread(buf, 1, BUFSIZ, stdin);
 475         done = len < BUFSIZ; // checks if we've got the last bufferfull
 476
 477         // passes the buffer of data to the parser and checks for error
 478         //   (this is where the callbacks are invoked)
 479         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 480             fprintf(stderr,
 481                 "%s at line %d\n",
 482                 XML_ErrorString(XML_GetErrorCode(parser)),
 483                 (int) XML_GetCurrentLineNumber(parser));
 484             return 1;
 485         }
 486     } while (!done);
 487
 488
 489     XML_ParserFree(parser);
 490
 491     return 0;
 492 }