projects.mako.cc - wikiq/blob - wikiq.c

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <string.h>
   9 #include <ctype.h>
  10 #include <stdlib.h>
  11 #include "expat.h"
  12 #include <getopt.h>
  13
  14 #define BUFFER_SIZE 80 /* not referenced anywhere in the visible source */
  15 // timestamp of the form 2003-11-07T00:43:23Z
  16 #define DATE_LENGTH 10 /* characters in the date part, e.g. 2003-11-07 */
  17 #define TIME_LENGTH 8 /* characters in the time part, e.g. 00:43:23 */
  18 #define TIMESTAMP_LENGTH 20 /* characters in the whole timestamp, including the T and the Z */
  19
  20 enum elements { /* the XML element whose character data is currently being read */
  21     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  22     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT /* UNUSED: charhndl ignores the character data */
  23 };
  24
  25 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP }; /* enclosing region of the dump; while SKIP, nothing is stored or written until the next <title> */
  26
  27 enum outtype { NORMAL, SIMPLE }; /* NORMAL rows also carry comment and text; SIMPLE rows leave them out */
  28
  29 typedef struct { /* parser state; main registers it as the expat user data */
  30
  31     struct { /* strings collected for the revision currently being read */
  32         char *title; /* character data of <title> */
  33         char *articleid; /* character data of an <id> seen in the title block */
  34         char *revid; /* character data of an <id> seen in the revision block */
  35         char *date; /* first DATE_LENGTH characters of timestamp */
  36         char *time; /* TIME_LENGTH characters of timestamp, starting after the date and its separator */
  37         char *timestamp; /* character data of <timestamp> */
  38         char *anon; /* only ever reset to NULL in the visible source; write_row derives anon from editor */
  39         char *editor; /* character data of <username>; NULL makes write_row report the edit as anonymous */
  40         char *editorid; /* character data of an <id> in the contributor block, or of <ip> */
  41         char *minor; /* "1" once a <minor> tag has been seen, otherwise NULL */
  42         char *comment; /* character data of <comment> */
  43         char *text; /* character data of <text> */
  44     } rev;
  45
  46     char *dropstr; /* a title containing its first character switches position to SKIP */
  47     enum elements element; /* element whose character data is being collected */
  48     enum block position; /* enclosing block; decides which field an <id> fills */
  49     enum outtype output_type; /* selects the row layout printed by write_row */
  50
  51 } revisionData;
  52
  53
  54 /* free_data and clean_data
  55  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  56  * title data needs to be cleared as well.
  57  * Also, frees memory dynamically allocated to store data.
  58  */
  59 static void
  60 clean_data(revisionData *data, int title)
  61 {
  62     if (title) {
  63         data->rev.title = NULL;
  64         data->rev.articleid = NULL;
  65     }
  66     data->rev.revid = NULL;
  67     data->rev.date = NULL;
  68     data->rev.time = NULL;
  69     data->rev.timestamp = NULL;
  70     data->rev.anon = NULL;
  71     data->rev.editor = NULL;
  72     data->rev.editorid = NULL;
  73     data->rev.minor = NULL;
  74     data->rev.comment = NULL;
  75     data->rev.text = NULL;
  76     data->element = UNUSED;
  77     //data->position =
  78 }
  79
  80 static void
  81 free_data(revisionData *data, int title)
  82 {
  83     if (title) {
  84         //printf("freeing article\n");
  85         free(data->rev.title);
  86         free(data->rev.articleid);
  87     }
  88     free(data->rev.revid);
  89     free(data->rev.date);
  90     free(data->rev.time);
  91     free(data->rev.timestamp);
  92     free(data->rev.anon);
  93     free(data->rev.editor);
  94     free(data->rev.editorid);
  95     free(data->rev.minor);
  96     free(data->rev.comment);
  97     free(data->rev.text);
  98 }
  99
 100 cleanup_revision(revisionData *data) {
 101     free_data(data, 0);
 102     clean_data(data, 0);
 103 }
 104
 105 cleanup_article(revisionData *data) {
 106     free_data(data, 1);
 107     clean_data(data, 1);
 108 }
 109
 110
 111 static void
 112 init_data(revisionData *data, char *dropstr, int output_type)
 113 {
 114     clean_data(data, 1); // sets every element to null...
 115     data->dropstr = dropstr;
 116     data->output_type = output_type;
 117 }
 118
 119 /* for debugging only, prints out the state of the data struct
 120  */
 121 static void
 122 print_state(revisionData *data)
 123 {
 124     printf("element = %i\n", data->element);
 125     printf("output_type = %i\n", data->output_type);
 126     printf("title = %s\n", data->rev.title);
 127     printf("articleid = %s\n", data->rev.articleid);
 128     printf("revid = %s\n", data->rev.revid);
 129     printf("date = %s\n", data->rev.date);
 130     printf("time = %s\n", data->rev.time);
 131     printf("anon = %s\n", data->rev.anon);
 132     printf("editor = %s\n", data->rev.editor);
 133     printf("editorid = %s\n", data->rev.editorid);
 134     printf("minor = %s\n", data->rev.minor);
 135     printf("comment = %s\n", data->rev.comment);
 136     printf("text = %s\n", data->rev.text);
 137     printf("\n");
 138
 139 }
 140
 /* Write a header for the tab-separated output
  * Intentionally prints nothing at present, so the output stream consists of
  * data rows only. The columns of a row are, in order:
  * title, articleid, revid, date time, anon, editor, editorid, minor
  */
 static void
 write_header(void)
 {
 }
 150
 151
 152 /*
 153  * write a line of comma-separated value formatted data to standard out
 154  * follows the form:
 155  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 156  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 157  *
 158  * it is called right before cleanup_revision() and cleanup_article()
 159  */
 160 static void
 161 write_row(revisionData *data)
 162 {
 163     // define temporary variables to hold output values:
 164     char *title, *articleid;
 165     char *revid, *date, *time, *anon, *editor, *editorid;
 166     char *minor, *comment;
 167     char *text;
 168     // perform some simple logic to obtain correct output values
 169
 170     if (data->rev.minor == NULL)
 171         minor = "0";
 172     else minor = data->rev.minor;
 173
 174     if (data->rev.editor == NULL)
 175         anon = "1";
 176     else anon = "0";
 177
 178     if (data->rev.title ==  NULL)
 179         title = "";
 180     else title = data->rev.title;
 181
 182     if (data->rev.articleid == NULL)
 183         articleid = "";
 184     else articleid = data->rev.articleid;
 185
 186     if (data->rev.revid == NULL)
 187         revid = "";
 188     else revid = data->rev.revid;
 189
 190     if (data->rev.date == NULL)
 191         date = "";
 192     else date = data->rev.date;
 193
 194     if (data->rev.time == NULL)
 195         time = "";
 196     else time = data->rev.time;
 197
 198     if (data->rev.editor == NULL)
 199         editor = "";
 200     else editor = data->rev.editor;
 201
 202     if (data->rev.editorid == NULL)
 203         editorid = "";
 204     else editorid = data->rev.editorid;
 205
 206     if (data->rev.text == NULL)
 207         text = "";
 208     else text = data->rev.text;
 209
 210
 211     if (data->rev.comment == NULL)
 212         comment = "";
 213     else comment = data->rev.comment;
 214
 215
 216     // TODO: make it so you can specify fields to output
 217     // note that date and time are separated by a space, to match postgres's
 218     // timestamp format
 219     switch (data->output_type)
 220     {
 221         case NORMAL:
 222             printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%s\t%s\n",
 223                 title,articleid,revid,date,time,anon,editor,editorid,minor,comment,text);
 224             break;
 225         case SIMPLE:
 226             printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\n",
 227                 title,articleid,revid,date,time,anon,editor,editorid,minor);
 228             break;
 229     }
 230
 231 }
 232
 233 static char
 234 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
 235 {
 236     char *timeinstamp = &timestamp[DATE_LENGTH+1];
 237     strncpy(time_buffer, timeinstamp, TIME_LENGTH);
 238     time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
 239 }
 240
 241
 242 static char
 243 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
 244 {
 245     strncpy(date_buffer, timestamp, DATE_LENGTH);
 246     date_buffer[DATE_LENGTH] = '\0';
 247 }
 248
 /* append
  * Grows the heap string `entry` and copies `new` onto its end.
  * `entry` must be a non-NULL, NUL-terminated string obtained from
  * malloc/realloc; it must not be used after the call. Returns the (possibly
  * moved) combined string, which the caller owns.
  * Running out of memory is fatal: a message is printed and the program
  * exits, rather than passing a NULL pointer on to the copy.
  */
 char
 *append(char *entry, char *new)
 {
     size_t old_len = strlen(entry);
     size_t add_len = strlen(new);
     char *grown = realloc(entry, old_len + add_len + 1);

     if (grown == NULL) {
         fprintf(stderr, "out of memory\n");
         exit(EXIT_FAILURE);
     }
     // the end of the old text is already known, so copy straight there
     // (terminator included) instead of letting strcat search for it again
     memcpy(grown + old_len, new, add_len + 1);
     return grown;
 }
 259
 /* cache
  * Returns a freshly allocated copy of `new`, owned by the caller.
  * `entry` is accepted only so the signature matches append(); it is not
  * read.
  * Running out of memory is fatal: a message is printed and the program
  * exits, rather than copying into a NULL pointer.
  */
 char
 *cache(char *entry, char *new)
 {
     size_t size = strlen(new) + 1; // include space for the '\0' !
     char *copy = malloc(size);

     (void) entry;
     if (copy == NULL) {
         fprintf(stderr, "out of memory\n");
         exit(EXIT_FAILURE);
     }
     memcpy(copy, new, size);
     return copy;
 }
 271
 /* store
  * Adds `new` to a collected field: a field that is still NULL gets a fresh
  * copy of `new`, an existing one has `new` appended to it.
  * Returns the string the field should now point to.
  */
 char
 *store(char *entry, char *new)
 {
     return entry == NULL ? cache(entry, new) : append(entry, new);
 }
 282
 283 void
 284 split_timestamp(revisionData *data)
 285 {
 286     char *t = data->rev.timestamp;
 287     char date_buffer[DATE_LENGTH+1];
 288     char time_buffer[TIME_LENGTH+1];
 289     datestr(t, date_buffer);
 290     timestr(t, time_buffer);
 291     data->rev.date = store(data->rev.date, date_buffer);
 292     data->rev.time = store(data->rev.time, time_buffer);
 293 }
 294
 /* currently unused
  * Returns 1 when the string is empty or consists only of characters that
  * isspace() accepts, and 0 otherwise.
  */
 static int
 is_whitespace(char *string) {
     // isspace() is only defined for unsigned char values (and EOF), so a
     // plain char that may be negative has to be converted first
     while (*string != '\0' && isspace((unsigned char) *string))
         string++;
     return *string == '\0';
 }
 307
 /* squeeze
  * Deletes every occurrence of the character c from s, in place, closing
  * the gaps so the remaining characters keep their order.
  */
 static void
 squeeze(char *s, int c) {
     char *dst = s;
     const char *src;

     for (src = s; *src != '\0'; src++) {
         if (*src == c)
             continue;
         *dst++ = *src;
     }
     *dst = '\0';
 }
 316
 /* contains
  * Returns 1 when the FIRST character of t occurs somewhere in s, and 0
  * otherwise. The remaining characters of t are not looked at.
  * An empty t never matches; the original fell off the end of the function
  * in that case (and whenever nothing matched), leaving the result
  * undefined.
  */
 int
 contains(char *s, char *t)
 {
     char wanted = t[0]; //just get the first character of t

     // strchr would "find" the terminator, so an empty t is answered here
     if (wanted == '\0')
         return 0;
     return strchr(s, wanted) != NULL;
 }
 328
 329 static void
 330 charhndl(revisionData *data, char *s, int len)
 331 {
 332     if (data->element != UNUSED && data->position != SKIP) {
 333         char t[len];
 334         strncpy(t,s,len);
 335         t[len] = '\0'; // makes t a well-formed string
 336         switch (data->element) {
 337             case TITLE:
 338                 {
 339                     data->rev.title = store(data->rev.title, t);
 340                     // skip any articles with bad characters in their titles
 341                     if (contains(t, data->dropstr)) {
 342                         data->position = SKIP;
 343                         //printf("found a baddie\n");
 344                     }
 345                     break;
 346                 }
 347             case ARTICLEID:
 348                    // printf("articleid = %s\n", t);
 349                     data->rev.articleid = store(data->rev.articleid, t);
 350                     break;
 351             case REVID:
 352                    // printf("revid = %s\n", t);
 353                     data->rev.revid = store(data->rev.revid, t);
 354                     break;
 355             case TIMESTAMP:
 356                     data->rev.timestamp = store(data->rev.timestamp, t);
 357                     if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
 358                         split_timestamp(data);
 359                     break;
 360             case EDITOR: {
 361                     data->rev.editor = store(data->rev.editor, t);
 362                     break;
 363                     }
 364             case EDITORID:
 365                     //printf("editorid = %s\n", t);
 366                     data->rev.editorid = store(data->rev.editorid, t);
 367                     break;
 368             /* the following are implied or skipped:
 369             case MINOR:
 370                     printf("found minor element\n");  doesn't work
 371                     break;                   minor tag is just a tag
 372             case UNUSED:
 373             */
 374             case COMMENT:
 375                    // printf("row: comment is %s\n", t);
 376                     data->rev.comment = store(data->rev.comment, t);
 377                     break;
 378             case TEXT:
 379                    data->rev.text = store(data->rev.text, t);
 380                    break;
 381             default: break;
 382         }
 383     }
 384 }
 385
 386 static void
 387 start(revisionData *data, const char *name, const char **attr)
 388 {
 389
 390     if (strcmp(name,"title") == 0) {
 391         cleanup_article(data); // cleans up data from last article
 392         data->element = TITLE;
 393         data->position = TITLE_BLOCK;
 394     } else if (data->position != SKIP) {
 395         if (strcmp(name,"revision") == 0) {
 396             data->element = REVISION;
 397             data->position = REVISION_BLOCK;
 398         } else if (strcmp(name, "contributor") == 0) {
 399             data->element = CONTRIBUTOR;
 400             data->position = CONTRIBUTOR_BLOCK;
 401         } else if (strcmp(name,"id") == 0)
 402             switch (data->position) {
 403                 case TITLE_BLOCK:
 404                     data->element = ARTICLEID;
 405                     break;
 406                 case REVISION_BLOCK:
 407                     data->element = REVID;
 408                     break;
 409                 case CONTRIBUTOR_BLOCK:
 410                     data->element = EDITORID;
 411                     break;
 412             }
 413
 414         // minor tag has no character data, so we parse here
 415         else if (strcmp(name,"minor") == 0) {
 416             data->element = MINOR;
 417             data->rev.minor = store(data->rev.minor, "1");
 418         }
 419         else if (strcmp(name,"timestamp") == 0)
 420             data->element = TIMESTAMP;
 421
 422         else if (strcmp(name, "username") == 0)
 423             data->element = EDITOR;
 424
 425         else if (strcmp(name,"ip") == 0)
 426             data->element = EDITORID;
 427
 428         else if (strcmp(name,"comment") == 0)
 429             data->element = COMMENT;
 430
 431         else if (strcmp(name,"text") == 0)
 432             data->element = TEXT;
 433
 434         else if (strcmp(name,"page") == 0
 435                 || strcmp(name,"mediawiki") == 0
 436                 || strcmp(name,"restrictions") == 0
 437                 || strcmp(name,"siteinfo") == 0)
 438             data->element = UNUSED;
 439     }
 440
 441 }
 442
 443
 444 static void
 445 end(revisionData *data, const char *name)
 446 {
 447     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 448         write_row(data); // crucial... :)
 449         cleanup_revision(data);  // also crucial
 450     } else {
 451         data->element = UNUSED; // sets our state to "not-in-useful"
 452     }                           // thus avoiding unpleasant character data
 453                                 // b/w tags (newlines etc.)
 454 }
 455
 /* print_usage
  * Writes the command-line help to standard error, using argv[0] as the
  * program name. Lists every option that main() accepts.
  */
 void print_usage(char* argv[]) {
     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
     fprintf(stderr, "  -s   simple output without text and comments (the default)\n");
     fprintf(stderr, "  -r <str>  skip articles whose title contains the first character of <str>\n");
     fprintf(stderr, "  -d   print the parsed options and exit without reading any input\n");
     fprintf(stderr, "  -h   print this help and exit\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
     fprintf(stderr, "\n");
     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 }
 469
 470
 471 int
 472 main(int argc, char *argv[])
 473 {
 474
 475     char *dropstr = "";
 476     enum outtype output_type;
 477     int dry_run = 0;
 478     // in "simple" output, we don't print text and comments
 479     output_type = SIMPLE;
 480     char c;
 481
 482     while ((c = getopt(argc, argv, "hr:sd")) != -1)
 483         switch (c)
 484         {
 485             case 'r':
 486                 dropstr = optarg;
 487                 break;
 488             case 'd':
 489                 dry_run = 1;
 490                 break;
 491             case 't':
 492                 output_type = NORMAL;
 493                 break;
 494             case 'h':
 495                 print_usage(argv);
 496                 exit(0);
 497                 break;
 498         }
 499
 500     if (dry_run) { // lets us print initialization options
 501         printf("simple_output = %i\n", output_type);
 502         printf("dropstr = %s\n", dropstr);
 503         exit(1);
 504     }
 505
 506     // create a new instance of the expat parser
 507     XML_Parser parser = XML_ParserCreate(NULL);
 508
 509     // initialize the user data struct which is passed to callback functions
 510     revisionData data;
 511     // initialize the elements of the struct to default values
 512     init_data(&data, dropstr, output_type);
 513
 514
 515     // makes the parser pass "data" as the first argument to every callback
 516     XML_SetUserData(parser, &data);
 517     // sets start and end to be the element start and end handlers
 518     XML_SetElementHandler(parser, (void *) start, (void *) end);
 519     // sets charhndl to be the callback for raw character data
 520     XML_SetCharacterDataHandler(parser, (void *) charhndl);
 521
 522     int done;
 523     char buf[BUFSIZ];
 524
 525     write_header();
 526
 527     // shovel data into the parser
 528     do {
 529
 530         // read into buf a bufferfull of data from standard input
 531         size_t len = fread(buf, 1, sizeof(buf), stdin);
 532         done = len < sizeof(buf); // checks if we've got the last bufferfull
 533
 534         // passes the buffer of data to the parser and checks for error
 535         //   (this is where the callbacks are invoked)
 536         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 537             fprintf(stderr,
 538                 "%s at line %d\n",
 539                 XML_ErrorString(XML_GetErrorCode(parser)),
 540                 (int) XML_GetCurrentLineNumber(parser));
 541             return 1;
 542         }
 543     } while (!done);
 544
 545
 546     XML_ParserFree(parser);
 547
 548     return 0;
 549 }