projects.mako.cc - wikiq/blob - wikiq.c

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
#include <ctype.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "expat.h"
  13
  14 // timestamp of the form 2003-11-07T00:43:23Z
  15 #define DATE_LENGTH 10
  16 #define TIME_LENGTH 8
  17 #define TIMESTAMP_LENGTH 20
  18
  19 #define MEGABYTE 1048576
  20 #define FIELD_BUFFER_SIZE 1024
  21 // 2048 KB in bytes + 1
  22 //#define TEXT_BUFFER_SIZE 2097153
  23 //#define TEXT_BUFFER_SIZE 10485760
  24
  25 enum elements {
  26     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  27     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
  28 };
  29
  30 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  31
  32 enum outtype { FULL, SIMPLE };
  33
  34 typedef struct {
  35
  36     // pointers to once-allocated buffers
  37     char *title;
  38     char *articleid;
  39     char *revid;
  40     char *date;
  41     char *time;
  42     char *timestamp;
  43     char *anon;
  44     char *editor;
  45     char *editorid;
  46     char *comment;
  47     char *text;
  48
  49     // track string size of the elements, to prevent O(N^2) processing in charhndl
  50     // when we have to take strlen for every character which we append to the buffer
  51     size_t title_size;
  52     size_t articleid_size;
  53     size_t revid_size;
  54     size_t date_size;
  55     size_t time_size;
  56     size_t timestamp_size;
  57     size_t anon_size;
  58     size_t editor_size;
  59     size_t editorid_size;
  60     size_t comment_size;
  61     size_t text_size;
  62
  63     bool minor;
  64
  65     enum elements element;
  66     enum block position;
  67     enum outtype output_type;
  68
  69 } revisionData;
  70
  71
  72 /* free_data and clean_data
  73  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  74  * title data needs to be cleared as well.
  75  * Also, frees memory dynamically allocated to store data.
  76  */
  77 static void
  78 clean_data(revisionData *data, int title)
  79 {
  80     // reset title (if we are switching articles)
  81     if (title) {
  82         data->title[0] = '\0';
  83         data->articleid[0] = '\0';
  84         data->title_size = 0;
  85         data->articleid_size = 0;
  86     }
  87
  88     // reset text fields
  89     data->revid[0] = '\0';
  90     data->date[0] = '\0';
  91     data->time[0] = '\0';
  92     data->timestamp[0] = '\0';
  93     data->anon[0] = '\0';
  94     data->editor[0] = '\0';
  95     data->editorid[0] = '\0';
  96     data->comment[0] = '\0';
  97     data->text[0] = '\0';
  98
  99     // reset length tracking
 100     data->revid_size = 0;
 101     data->date_size = 0;
 102     data->time_size = 0;
 103     data->timestamp_size = 0;
 104     data->anon_size = 0;
 105     data->editor_size = 0;
 106     data->editorid_size = 0;
 107     data->comment_size = 0;
 108     data->text_size = 0;
 109
 110     // reset flags and element type info
 111     data->minor = false;
 112     data->element = UNUSED;
 113
 114 }
 115
 116 // presently unused
 117 static void
 118 free_data(revisionData *data, int title)
 119 {
 120     if (title) {
 121         //printf("freeing article\n");
 122         free(data->title);
 123         free(data->articleid);
 124     }
 125     free(data->revid);
 126     free(data->date);
 127     free(data->time);
 128     free(data->timestamp);
 129     free(data->anon);
 130     free(data->editor);
 131     free(data->editorid);
 132     free(data->comment);
 133     free(data->text);
 134 }
 135
 136 void cleanup_revision(revisionData *data) {
 137     clean_data(data, 0);
 138 }
 139
 140 void cleanup_article(revisionData *data) {
 141     clean_data(data, 1);
 142 }
 143
 144
 145 static void
 146 init_data(revisionData *data, outtype output_type)
 147 {
 148     data->text = (char*) malloc(4 * MEGABYTE);  // 2MB is the article length limit, 4MB is 'safe'?
 149     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 150     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 151     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 152     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 153     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 154     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 155     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 156     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 157     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 158     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 159     data->minor = false;
 160
 161     // resets the data fields, null terminates strings, sets lengths
 162     clean_data(data, 1);
 163
 164     data->output_type = output_type;
 165 }
 166
 167 /* for debugging only, prints out the state of the data struct
 168  */
 169 static void
 170 print_state(revisionData *data)
 171 {
 172     printf("element = %i\n", data->element);
 173     printf("output_type = %i\n", data->output_type);
 174     printf("title = %s\n", data->title);
 175     printf("articleid = %s\n", data->articleid);
 176     printf("revid = %s\n", data->revid);
 177     printf("date = %s\n", data->date);
 178     printf("time = %s\n", data->time);
 179     printf("anon = %s\n", data->anon);
 180     printf("editor = %s\n", data->editor);
 181     printf("editorid = %s\n", data->editorid);
 182     printf("minor = %s\n", (data->minor ? "1" : "0"));
 183     printf("comment = %s\n", data->comment);
 184     printf("text = %s\n", data->text);
 185     printf("\n");
 186
 187 }
 188
 189 /* Write a header for the comma-separated output
 190  */
 191 static void
 192 write_header()
 193 {
 194  //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
 195 //    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
 196
 197 }
 198
 199
 200 /*
 201  * write a line of comma-separated value formatted data to standard out
 202  * follows the form:
 203  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 204  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 205  *
 206  * it is called right before cleanup_revision() and cleanup_article()
 207  */
 208 static void
 209 write_row(revisionData *data)
 210 {
 211
 212     // TODO: make it so you can specify fields to output
 213     // note that date and time are separated by a space, to match postgres's
 214     // timestamp format
 215     printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
 216         data->title,
 217         data->articleid,
 218         data->revid,
 219         data->date,
 220         data->time,
 221         (data->editor[0] != '\0') ? "0" : "1",  // anon?
 222         data->editor,
 223         data->editorid,
 224         (data->minor) ? "1" : "0");
 225     switch (data->output_type)
 226     {
 227         case SIMPLE:
 228             printf("\t%i\n", (unsigned int) strlen(data->text));
 229             //printf("\n");
 230             break;
 231         case FULL:
 232             printf("\t%s\t%s\n", data->comment, data->text);
 233             break;
 234     }
 235
 236 }
 237
 238 char
 239 *append(char *entry, char *newstr)
 240 {
 241     char *newbuff;
 242     int len;
 243     len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
 244     newbuff = (char*) realloc(entry, len);
 245     strcat(newbuff, newstr);
 246     return newbuff;
 247 }
 248
 249 char
 250 *cache(char *entry, char *newstr)
 251 {
 252     char *newbuff;
 253     int len;
 254     len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
 255     newbuff = (char*) malloc(len);
 256     strcpy(newbuff,newstr);
 257     return newbuff;
 258
 259 }
 260
 261 char
 262 *store(char *entry, char *newstr)
 263 {
 264     char *newbuff;
 265     if (entry == NULL)
 266         newbuff = cache(entry, newstr);
 267     else
 268         newbuff = append(entry, newstr);
 269     return newbuff;
 270 }
 271
 272 void
 273 split_timestamp(revisionData *data)
 274 {
 275     char *t = data->timestamp;
 276     strncpy(data->date, data->timestamp, DATE_LENGTH);
 277     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 278     strncpy(data->time, timeinstamp, TIME_LENGTH);
 279 }
 280
 281 /* currently unused */
 282 static int
 283 is_whitespace(char *string) {
 284     int len = strlen(string);
 285     while (isspace(string[0]) && strlen(string) > 0) {
 286         string++;
 287     }
 288     if (strcmp(string, "") == 0)
 289         return 1;
 290     else
 291         return 0;
 292 }
 293
 294 // like strncat but with previously known length
 295 char*
 296 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 297 {
 298    //size_t dest_len = strlen(dest);
 299    size_t i;
 300
 301    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 302        dest[dest_len + i] = src[i];
 303    dest[dest_len + i] = '\0';
 304
 305    return dest;
 306 }
 307
 308 static void
 309 charhndl(void* vdata, const XML_Char* s, int len)
 310 {
 311     revisionData* data = (revisionData*) vdata;
 312     if (data->element != UNUSED && data->position != SKIP) {
 313         //char t[len];
 314         //strncpy(t,s,len);
 315         //t[len] = '\0'; // makes t a well-formed string
 316         switch (data->element) {
 317             case TEXT:
 318                    // printf("buffer length = %i, text: %s\n", len, t);
 319                     strlcatn(data->text, s, data->text_size, len);
 320                     data->text_size += len;
 321                     break;
 322             case COMMENT:
 323                     strlcatn(data->comment, s, data->comment_size, len);
 324                     data->comment_size += len;
 325                     break;
 326             case TITLE:
 327                     strlcatn(data->title, s, data->title_size, len);
 328                     data->title_size += len;
 329                     break;
 330             case ARTICLEID:
 331                    // printf("articleid = %s\n", t);
 332                     strlcatn(data->articleid, s, data->articleid_size, len);
 333                     data->articleid_size += len;
 334                     break;
 335             case REVID:
 336                    // printf("revid = %s\n", t);
 337                     strlcatn(data->revid, s, data->revid_size, len);
 338                     data->revid_size += len;
 339                     break;
 340             case TIMESTAMP:
 341                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 342                     data->timestamp_size += len;
 343                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 344                         split_timestamp(data);
 345                     break;
 346             case EDITOR:
 347                     strlcatn(data->editor, s, data->editor_size, len);
 348                     data->editor_size += len;
 349                     break;
 350             case EDITORID:
 351                     //printf("editorid = %s\n", t);
 352                     strlcatn(data->editorid, s, data->editorid_size, len);
 353                     data->editorid_size += len;
 354                     break;
 355             /* the following are implied or skipped:
 356             case MINOR:
 357                     printf("found minor element\n");  doesn't work
 358                     break;                   minor tag is just a tag
 359             case UNUSED:
 360             */
 361             default: break;
 362         }
 363     }
 364 }
 365
 366 static void
 367 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 368 {
 369     revisionData* data = (revisionData*) vdata;
 370
 371     if (strcmp(name,"title") == 0) {
 372         cleanup_article(data); // cleans up data from last article
 373         data->element = TITLE;
 374         data->position = TITLE_BLOCK;
 375     } else if (data->position != SKIP) {
 376         if (strcmp(name,"revision") == 0) {
 377             data->element = REVISION;
 378             data->position = REVISION_BLOCK;
 379         } else if (strcmp(name, "contributor") == 0) {
 380             data->element = CONTRIBUTOR;
 381             data->position = CONTRIBUTOR_BLOCK;
 382         } else if (strcmp(name,"id") == 0)
 383             switch (data->position) {
 384                 case TITLE_BLOCK:
 385                     data->element = ARTICLEID;
 386                     break;
 387                 case REVISION_BLOCK:
 388                     data->element = REVID;
 389                     break;
 390                 case CONTRIBUTOR_BLOCK:
 391                     data->element = EDITORID;
 392                     break;
 393             }
 394
 395         // minor tag has no character data, so we parse here
 396         else if (strcmp(name,"minor") == 0) {
 397             data->element = MINOR;
 398             data->minor = true;
 399         }
 400         else if (strcmp(name,"timestamp") == 0)
 401             data->element = TIMESTAMP;
 402
 403         else if (strcmp(name, "username") == 0)
 404             data->element = EDITOR;
 405
 406         else if (strcmp(name,"ip") == 0)
 407             data->element = EDITORID;
 408
 409         else if (strcmp(name,"comment") == 0)
 410             data->element = COMMENT;
 411
 412         else if (strcmp(name,"text") == 0)
 413             data->element = TEXT;
 414
 415         else if (strcmp(name,"page") == 0
 416                 || strcmp(name,"mediawiki") == 0
 417                 || strcmp(name,"restrictions") == 0
 418                 || strcmp(name,"siteinfo") == 0)
 419             data->element = UNUSED;
 420     }
 421
 422 }
 423
 424
 425 static void
 426 end(void* vdata, const XML_Char* name)
 427 {
 428     revisionData* data = (revisionData*) vdata;
 429     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 430         write_row(data); // crucial... :)
 431         cleanup_revision(data);  // also crucial
 432     } else {
 433         data->element = UNUSED; // sets our state to "not-in-useful"
 434     }                           // thus avoiding unpleasant character data
 435                                 // b/w tags (newlines etc.)
 436 }
 437
 438 void print_usage(char* argv[]) {
 439     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
 440     fprintf(stderr, "\n");
 441     fprintf(stderr, "options:\n");
 442     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
 443     fprintf(stderr, "\n");
 444     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
 445     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
 446     fprintf(stderr, "\n");
 447     fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor, revlength\n");
 448     fprintf(stderr, "\n");
 449     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 450 }
 451
 452
 453 int
 454 main(int argc, char *argv[])
 455 {
 456
 457     enum outtype output_type;
 458     int dry_run = 0;
 459     // in "simple" output, we don't print text and comments
 460     output_type = SIMPLE;
 461     char c;
 462
 463     while ((c = getopt(argc, argv, "ht")) != -1)
 464         switch (c)
 465         {
 466             case 'd':
 467                 dry_run = 1;
 468                 break;
 469             case 't':
 470                 output_type = FULL;
 471                 break;
 472             case 'h':
 473                 print_usage(argv);
 474                 exit(0);
 475                 break;
 476         }
 477
 478     if (dry_run) { // lets us print initialization options
 479         printf("simple_output = %i\n", output_type);
 480         exit(1);
 481     }
 482
 483     // create a new instance of the expat parser
 484     XML_Parser parser = XML_ParserCreate("UTF-8");
 485
 486     // initialize the user data struct which is passed to callback functions
 487     revisionData data;
 488     // initialize the elements of the struct to default values
 489     init_data(&data, output_type);
 490
 491
 492     // makes the parser pass "data" as the first argument to every callback
 493     XML_SetUserData(parser, &data);
 494     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 495     void (*endFnPtr)(void*, const XML_Char*) = end;
 496     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 497
 498     // sets start and end to be the element start and end handlers
 499     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 500     // sets charhndl to be the callback for character data
 501     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 502
 503     bool done;
 504     char buf[BUFSIZ];
 505
 506     // shovel data into the parser
 507     do {
 508
 509         // read into buf a bufferfull of data from standard input
 510         size_t len = fread(buf, 1, BUFSIZ, stdin);
 511         done = len < BUFSIZ; // checks if we've got the last bufferfull
 512
 513         // passes the buffer of data to the parser and checks for error
 514         //   (this is where the callbacks are invoked)
 515         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 516             fprintf(stderr,
 517                 "%s at line %d\n",
 518                 XML_ErrorString(XML_GetErrorCode(parser)),
 519                 (int) XML_GetCurrentLineNumber(parser));
 520             return 1;
 521         }
 522     } while (!done);
 523
 524
 525     XML_ParserFree(parser);
 526
 527     return 0;
 528 }