projects.mako.cc - wikiq/blob - wikiq.c

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <getopt.h>

#include "expat.h"
  13
// small scratch-buffer size; not referenced by the code shown in this file
#define BUFFER_SIZE 80
// timestamp of the form 2003-11-07T00:43:23Z
// DATE_LENGTH covers "2003-11-07", TIME_LENGTH covers "00:43:23" and
// TIMESTAMP_LENGTH is the whole string including the 'T' and 'Z'
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20

// 2048 KB in bytes + 1
// (capacity of revisionData.text; the extra byte holds the terminating '\0')
#define TEXT_BUFFER_SIZE 2097153
// not referenced by the code shown in this file
#define FIELD_BUFFER_SIZE 1024
  23
/* Which XML element the parser is currently inside.  charhndl() uses this to
 * decide which field incoming character data belongs to; UNUSED means the
 * character data is ignored.
 */
enum elements {
    TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
    EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
};
  28
/* Enclosing section of the document.  start() uses it to decide whether an
 * <id> tag is an article id, a revision id or an editor id.  While the
 * position is SKIP, character data and every start tag except <title> are
 * ignored; nothing in the code shown here ever sets SKIP.
 */
enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  30
/* Output mode: FULL appends the comment and the revision text to each row,
 * SIMPLE appends only the length of the revision text (see write_row()).
 */
enum outtype { FULL, SIMPLE };
  32
/* Parser state shared by all expat callbacks: the fields collected for the
 * revision currently being read, plus where in the document the parser is.
 * The char* fields are heap strings built up by store() and released by
 * free_data().
 */
typedef struct {

    char *title;        // article-level; released only by cleanup_article()
    char *articleid;    // article-level; released only by cleanup_article()
    char *revid;
    char *date;         // filled by split_timestamp() from timestamp
    char *time;         // filled by split_timestamp() from timestamp
    char *timestamp;
    char *anon;         // freed and reset, but never assigned in the code shown here
    char *editor;       // from <username>; NULL makes write_row() flag the row as anonymous
    char *editorid;     // from <id> inside <contributor>, or from <ip>
    bool minor;         // set to true when a <minor> tag is seen
    char *comment;
    char text[TEXT_BUFFER_SIZE]; // revision text, accumulated in place by charhndl()

    enum elements element;      // element whose character data is being read
    enum block position;        // enclosing section; decides what an <id> means
    enum outtype output_type;   // selects the trailing columns in write_row()

} revisionData;
  53
  54
  55 /* free_data and clean_data
  56  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  57  * title data needs to be cleared as well.
  58  * Also, frees memory dynamically allocated to store data.
  59  */
  60 static void
  61 clean_data(revisionData *data, int title)
  62 {
  63     if (title) {
  64         data->title = NULL;
  65         data->articleid = NULL;
  66     }
  67     data->revid = NULL;
  68     data->date = NULL;
  69     data->time = NULL;
  70     data->timestamp = NULL;
  71     data->anon = NULL;
  72     data->editor = NULL;
  73     data->editorid = NULL;
  74     data->minor = false;
  75     data->comment = NULL;
  76     //data->text = NULL;
  77     data->element = UNUSED;
  78     //data->position =
  79 }
  80
  81 static void
  82 free_data(revisionData *data, int title)
  83 {
  84     if (title) {
  85         //printf("freeing article\n");
  86         free(data->title);
  87         free(data->articleid);
  88     }
  89     free(data->revid);
  90     free(data->date);
  91     free(data->time);
  92     free(data->timestamp);
  93     free(data->anon);
  94     free(data->editor);
  95     free(data->editorid);
  96     free(data->comment);
  97     //free(data->text);
  98     data->text[0] = '\0';
  99 }
 100
 101 void cleanup_revision(revisionData *data) {
 102     free_data(data, 0);
 103     clean_data(data, 0);
 104 }
 105
 106 void cleanup_article(revisionData *data) {
 107     free_data(data, 1);
 108     clean_data(data, 1);
 109 }
 110
 111
 112 static void
 113 init_data(revisionData *data, outtype output_type)
 114 {
 115     clean_data(data, 1); // sets every element to null...
 116     data->output_type = output_type;
 117 }
 118
 119 /* for debugging only, prints out the state of the data struct
 120  */
 121 static void
 122 print_state(revisionData *data)
 123 {
 124     printf("element = %i\n", data->element);
 125     printf("output_type = %i\n", data->output_type);
 126     printf("title = %s\n", data->title);
 127     printf("articleid = %s\n", data->articleid);
 128     printf("revid = %s\n", data->revid);
 129     printf("date = %s\n", data->date);
 130     printf("time = %s\n", data->time);
 131     printf("anon = %s\n", data->anon);
 132     printf("editor = %s\n", data->editor);
 133     printf("editorid = %s\n", data->editorid);
 134     printf("minor = %s\n", (data->minor ? "1" : "0"));
 135     printf("comment = %s\n", data->comment);
 136     printf("text = %s\n", data->text);
 137     printf("\n");
 138
 139 }
 140
 141 /* Write a header for the comma-separated output
 142  */
 143 static void
 144 write_header()
 145 {
 146  //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
 147 //    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
 148
 149 }
 150
 151
 152 /*
 153  * write a line of comma-separated value formatted data to standard out
 154  * follows the form:
 155  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 156  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 157  *
 158  * it is called right before cleanup_revision() and cleanup_article()
 159  */
 160 static void
 161 write_row(revisionData *data)
 162 {
 163
 164     // TODO: make it so you can specify fields to output
 165     // note that date and time are separated by a space, to match postgres's
 166     // timestamp format
 167     printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
 168         (data->title != NULL) ? data->title : "",
 169         (data->articleid != NULL) ? data->articleid : "",
 170         (data->revid != NULL) ? data->revid : "",
 171         (data->date != NULL) ? data->date : "",
 172         (data->time != NULL) ? data->time : "",
 173         (data->editor != NULL) ? "0" : "1",
 174         (data->editor != NULL) ? data->editor : "",
 175         (data->editorid != NULL) ? data->editorid  : "",
 176         (data->minor) ? "1" : "0");
 177     switch (data->output_type)
 178     {
 179         case SIMPLE:
 180             printf("\t%i\n", (unsigned int) strlen(data->text));
 181             break;
 182         case FULL:
 183             printf("\t%s\t%s\n", data->comment, data->text);
 184             break;
 185     }
 186
 187 }
 188
 189 void
 190 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
 191 {
 192     char *timeinstamp = &timestamp[DATE_LENGTH+1];
 193     strncpy(time_buffer, timeinstamp, TIME_LENGTH);
 194     time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
 195 }
 196
 197
 198 void
 199 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
 200 {
 201     strncpy(date_buffer, timestamp, DATE_LENGTH);
 202     date_buffer[DATE_LENGTH] = '\0';
 203 }
 204
 205 char
 206 *append(char *entry, char *newstr)
 207 {
 208     char *newbuff;
 209     int len;
 210     len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
 211     newbuff = (char*) realloc(entry, len);
 212     strcat(newbuff, newstr);
 213     return newbuff;
 214 }
 215
 216 char
 217 *cache(char *entry, char *newstr)
 218 {
 219     char *newbuff;
 220     int len;
 221     len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
 222     newbuff = (char*) malloc(len);
 223     strcpy(newbuff,newstr);
 224     return newbuff;
 225
 226 }
 227
 228 char
 229 *store(char *entry, char *newstr)
 230 {
 231     char *newbuff;
 232     if (entry == NULL)
 233         newbuff = cache(entry, newstr);
 234     else
 235         newbuff = append(entry, newstr);
 236     return newbuff;
 237 }
 238
 239 void
 240 split_timestamp(revisionData *data)
 241 {
 242     char *t = data->timestamp;
 243     char date_buffer[DATE_LENGTH+1];
 244     char time_buffer[TIME_LENGTH+1];
 245     datestr(t, date_buffer);
 246     timestr(t, time_buffer);
 247     data->date = store(data->date, date_buffer);
 248     data->time = store(data->time, time_buffer);
 249 }
 250
 251 /* currently unused */
 252 static int
 253 is_whitespace(char *string) {
 254     int len = strlen(string);
 255     while (isspace(string[0]) && strlen(string) > 0) {
 256         string++;
 257     }
 258     if (strcmp(string, "") == 0)
 259         return 1;
 260     else
 261         return 0;
 262 }
 263
 264 static void
 265 charhndl(void* vdata, const XML_Char* s, int len)
 266 {
 267     revisionData* data = (revisionData*) vdata;
 268     if (data->element != UNUSED && data->position != SKIP) {
 269         char t[len];
 270         strncpy(t,s,len);
 271         t[len] = '\0'; // makes t a well-formed string
 272         switch (data->element) {
 273             case TITLE:
 274                 {
 275                     data->title = store(data->title, t);
 276                     // skip any articles with bad characters in their titles
 277                     break;
 278                 }
 279             case ARTICLEID:
 280                    // printf("articleid = %s\n", t);
 281                     data->articleid = store(data->articleid, t);
 282                     break;
 283             case REVID:
 284                    // printf("revid = %s\n", t);
 285                     data->revid = store(data->revid, t);
 286                     break;
 287             case TIMESTAMP:
 288                     data->timestamp = store(data->timestamp, t);
 289                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 290                         split_timestamp(data);
 291                     break;
 292             case EDITOR: {
 293                     data->editor = store(data->editor, t);
 294                     break;
 295                     }
 296             case EDITORID:
 297                     //printf("editorid = %s\n", t);
 298                     data->editorid = store(data->editorid, t);
 299                     break;
 300             /* the following are implied or skipped:
 301             case MINOR:
 302                     printf("found minor element\n");  doesn't work
 303                     break;                   minor tag is just a tag
 304             case UNUSED:
 305             */
 306             case COMMENT:
 307                    // printf("row: comment is %s\n", t);
 308                     //if (data->output_type == FULL) {
 309                         data->comment = store(data->comment, t);
 310                     //}
 311                     break;
 312             case TEXT:
 313                     //if (data->output_type == FULL) {
 314                         //data->text = store(data->text, t);
 315                         //
 316                     strcat(data->text, t);
 317                     //}
 318                    break;
 319             default: break;
 320         }
 321     }
 322 }
 323
 324 static void
 325 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 326 {
 327     revisionData* data = (revisionData*) vdata;
 328
 329     if (strcmp(name,"title") == 0) {
 330         cleanup_article(data); // cleans up data from last article
 331         data->element = TITLE;
 332         data->position = TITLE_BLOCK;
 333     } else if (data->position != SKIP) {
 334         if (strcmp(name,"revision") == 0) {
 335             data->element = REVISION;
 336             data->position = REVISION_BLOCK;
 337         } else if (strcmp(name, "contributor") == 0) {
 338             data->element = CONTRIBUTOR;
 339             data->position = CONTRIBUTOR_BLOCK;
 340         } else if (strcmp(name,"id") == 0)
 341             switch (data->position) {
 342                 case TITLE_BLOCK:
 343                     data->element = ARTICLEID;
 344                     break;
 345                 case REVISION_BLOCK:
 346                     data->element = REVID;
 347                     break;
 348                 case CONTRIBUTOR_BLOCK:
 349                     data->element = EDITORID;
 350                     break;
 351             }
 352
 353         // minor tag has no character data, so we parse here
 354         else if (strcmp(name,"minor") == 0) {
 355             data->element = MINOR;
 356             data->minor = true;
 357         }
 358         else if (strcmp(name,"timestamp") == 0)
 359             data->element = TIMESTAMP;
 360
 361         else if (strcmp(name, "username") == 0)
 362             data->element = EDITOR;
 363
 364         else if (strcmp(name,"ip") == 0)
 365             data->element = EDITORID;
 366
 367         else if (strcmp(name,"comment") == 0)
 368             data->element = COMMENT;
 369
 370         else if (strcmp(name,"text") == 0)
 371             data->element = TEXT;
 372
 373         else if (strcmp(name,"page") == 0
 374                 || strcmp(name,"mediawiki") == 0
 375                 || strcmp(name,"restrictions") == 0
 376                 || strcmp(name,"siteinfo") == 0)
 377             data->element = UNUSED;
 378     }
 379
 380 }
 381
 382
 383 static void
 384 end(void* vdata, const XML_Char* name)
 385 {
 386     revisionData* data = (revisionData*) vdata;
 387     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 388         write_row(data); // crucial... :)
 389         cleanup_revision(data);  // also crucial
 390     } else {
 391         data->element = UNUSED; // sets our state to "not-in-useful"
 392     }                           // thus avoiding unpleasant character data
 393                                 // b/w tags (newlines etc.)
 394 }
 395
 396 void print_usage(char* argv[]) {
 397     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
 398     fprintf(stderr, "\n");
 399     fprintf(stderr, "options:\n");
 400     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
 401     fprintf(stderr, "\n");
 402     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
 403     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
 404     fprintf(stderr, "\n");
 405     fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
 406     fprintf(stderr, "\n");
 407     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 408 }
 409
 410
 411 int
 412 main(int argc, char *argv[])
 413 {
 414
 415     enum outtype output_type;
 416     int dry_run = 0;
 417     // in "simple" output, we don't print text and comments
 418     output_type = SIMPLE;
 419     char c;
 420
 421     while ((c = getopt(argc, argv, "ht")) != -1)
 422         switch (c)
 423         {
 424             case 'd':
 425                 dry_run = 1;
 426                 break;
 427             case 't':
 428                 output_type = FULL;
 429                 break;
 430             case 'h':
 431                 print_usage(argv);
 432                 exit(0);
 433                 break;
 434         }
 435
 436     if (dry_run) { // lets us print initialization options
 437         printf("simple_output = %i\n", output_type);
 438         exit(1);
 439     }
 440
 441     // create a new instance of the expat parser
 442     XML_Parser parser = XML_ParserCreate(NULL);
 443
 444     // initialize the user data struct which is passed to callback functions
 445     revisionData data;
 446     // initialize the elements of the struct to default values
 447     init_data(&data, output_type);
 448
 449
 450     // makes the parser pass "data" as the first argument to every callback
 451     XML_SetUserData(parser, &data);
 452     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 453     void (*endFnPtr)(void*, const XML_Char*) = end;
 454     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 455
 456     // sets start and end to be the element start and end handlers
 457     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 458     // sets charhndl to be the callback for raw character data
 459     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 460
 461     int done;
 462     char buf[BUFSIZ];
 463
 464     write_header();
 465
 466     // shovel data into the parser
 467     do {
 468
 469         // read into buf a bufferfull of data from standard input
 470         size_t len = fread(buf, 1, sizeof(buf), stdin);
 471         done = len < sizeof(buf); // checks if we've got the last bufferfull
 472
 473         // passes the buffer of data to the parser and checks for error
 474         //   (this is where the callbacks are invoked)
 475         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 476             fprintf(stderr,
 477                 "%s at line %d\n",
 478                 XML_ErrorString(XML_GetErrorCode(parser)),
 479                 (int) XML_GetCurrentLineNumber(parser));
 480             return 1;
 481         }
 482     } while (!done);
 483
 484
 485     XML_ParserFree(parser);
 486
 487     return 0;
 488 }