projects.mako.cc - wikiq/blob - wikiq.c

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
#include <ctype.h>
#include <getopt.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "expat.h"
  13
// NOTE(review): BUFFER_SIZE is not referenced anywhere in the visible code.
#define BUFFER_SIZE 80
// Timestamps have the form 2003-11-07T00:43:23Z: a DATE_LENGTH-character
// date, a 'T', a TIME_LENGTH-character time and a trailing 'Z', which makes
// TIMESTAMP_LENGTH characters in total (string terminator not counted).
#define DATE_LENGTH 10
#define TIME_LENGTH 8
#define TIMESTAMP_LENGTH 20
  19
/* Identifies the XML element whose character data is currently being read.
 * charhndl() ignores character data while the value is UNUSED.
 */
enum elements {
    TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
    EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
};
  24
/* Section of a page the parser is currently inside.  start() uses it to
 * decide what an <id> element refers to; while it is SKIP the callbacks
 * ignore everything except a new <title>.
 */
enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  26
/* Output format used by write_row(): FULL appends the comment and text
 * columns to every row, SIMPLE leaves them out.
 */
enum outtype { FULL, SIMPLE };
  28
/* Parser state handed to every expat callback as the user-data pointer. */
typedef struct {

    /* Fields of the revision currently being assembled.  The char* members
     * are either NULL (unset) or heap strings built up by store(). */
    struct {
        char *title;
        char *articleid;
        char *revid;
        char *date;       // date part of timestamp, set by split_timestamp()
        char *time;       // time part of timestamp, set by split_timestamp()
        char *timestamp;  // raw <timestamp> text
        char *anon;       // never assigned in the visible code; write_row()
                          // derives the anon column from editor instead
        char *editor;     // <username> text
        char *editorid;   // contributor <id> text, or <ip> text
        bool minor;       // true once a <minor> element has been seen
        char *comment;
        char *text;
    } rev;

    enum elements element;     // element whose character data is being read
    enum block position;       // enclosing section, disambiguates <id>
    enum outtype output_type;  // selects the columns write_row() prints

} revisionData;
  51
  52
  53 /* free_data and clean_data
  54  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  55  * title data needs to be cleared as well.
  56  * Also, frees memory dynamically allocated to store data.
  57  */
  58 static void
  59 clean_data(revisionData *data, int title)
  60 {
  61     if (title) {
  62         data->rev.title = NULL;
  63         data->rev.articleid = NULL;
  64     }
  65     data->rev.revid = NULL;
  66     data->rev.date = NULL;
  67     data->rev.time = NULL;
  68     data->rev.timestamp = NULL;
  69     data->rev.anon = NULL;
  70     data->rev.editor = NULL;
  71     data->rev.editorid = NULL;
  72     data->rev.minor = false;
  73     data->rev.comment = NULL;
  74     data->rev.text = NULL;
  75     data->element = UNUSED;
  76     //data->position =
  77 }
  78
  79 static void
  80 free_data(revisionData *data, int title)
  81 {
  82     if (title) {
  83         //printf("freeing article\n");
  84         free(data->rev.title);
  85         free(data->rev.articleid);
  86     }
  87     free(data->rev.revid);
  88     free(data->rev.date);
  89     free(data->rev.time);
  90     free(data->rev.timestamp);
  91     free(data->rev.anon);
  92     free(data->rev.editor);
  93     free(data->rev.editorid);
  94     free(data->rev.comment);
  95     free(data->rev.text);
  96 }
  97
  98 void cleanup_revision(revisionData *data) {
  99     free_data(data, 0);
 100     clean_data(data, 0);
 101 }
 102
 103 void cleanup_article(revisionData *data) {
 104     free_data(data, 1);
 105     clean_data(data, 1);
 106 }
 107
 108
 109 static void
 110 init_data(revisionData *data, outtype output_type)
 111 {
 112     clean_data(data, 1); // sets every element to null...
 113     data->output_type = output_type;
 114 }
 115
 116 /* for debugging only, prints out the state of the data struct
 117  */
 118 static void
 119 print_state(revisionData *data)
 120 {
 121     printf("element = %i\n", data->element);
 122     printf("output_type = %i\n", data->output_type);
 123     printf("title = %s\n", data->rev.title);
 124     printf("articleid = %s\n", data->rev.articleid);
 125     printf("revid = %s\n", data->rev.revid);
 126     printf("date = %s\n", data->rev.date);
 127     printf("time = %s\n", data->rev.time);
 128     printf("anon = %s\n", data->rev.anon);
 129     printf("editor = %s\n", data->rev.editor);
 130     printf("editorid = %s\n", data->rev.editorid);
 131     printf("minor = %s\n", (data->rev.minor ? "1" : "0"));
 132     printf("comment = %s\n", data->rev.comment);
 133     printf("text = %s\n", data->rev.text);
 134     printf("\n");
 135
 136 }
 137
 138 /* Write a header for the comma-separated output
 139  */
 140 static void
 141 write_header()
 142 {
 143  //   printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n");
 144 //    printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n");
 145
 146 }
 147
 148
 149 /*
 150  * write a line of comma-separated value formatted data to standard out
 151  * follows the form:
 152  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 153  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 154  *
 155  * it is called right before cleanup_revision() and cleanup_article()
 156  */
 157 static void
 158 write_row(revisionData *data)
 159 {
 160
 161     // TODO: make it so you can specify fields to output
 162     // note that date and time are separated by a space, to match postgres's
 163     // timestamp format
 164     printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s",
 165         (data->rev.title != NULL) ? data->rev.title : "",
 166         (data->rev.articleid != NULL) ? data->rev.articleid : "",
 167         (data->rev.revid != NULL) ? data->rev.revid : "",
 168         (data->rev.date != NULL) ? data->rev.date : "",
 169         (data->rev.time != NULL) ? data->rev.time : "",
 170         (data->rev.editor != NULL) ? "0" : "1",
 171         (data->rev.editor != NULL) ? data->rev.editor : "",
 172         (data->rev.editorid != NULL) ? data->rev.editorid  : "",
 173         (data->rev.minor) ? "1" : "0");
 174     switch (data->output_type)
 175     {
 176         case SIMPLE:
 177             printf("\n");
 178             break;
 179         case FULL:
 180             printf("\t%s\t%s\n",
 181                 (data->rev.comment != NULL) ? data->rev.comment : "",
 182                 (data->rev.text != NULL) ? data->rev.text : "");
 183             break;
 184     }
 185
 186 }
 187
 188 static char
 189 *timestr(char *timestamp, char time_buffer[TIME_LENGTH+1])
 190 {
 191     char *timeinstamp = &timestamp[DATE_LENGTH+1];
 192     strncpy(time_buffer, timeinstamp, TIME_LENGTH);
 193     time_buffer[TIME_LENGTH] = '\0'; // makes it a well-formed string
 194 }
 195
 196
 197 static char
 198 *datestr(char *timestamp, char date_buffer[DATE_LENGTH+1])
 199 {
 200     strncpy(date_buffer, timestamp, DATE_LENGTH);
 201     date_buffer[DATE_LENGTH] = '\0';
 202 }
 203
 204 char
 205 *append(char *entry, char *newstr)
 206 {
 207     char *newbuff;
 208     int len;
 209     len = (strlen(entry)+strlen(newstr))*sizeof(char) + 1;
 210     newbuff = (char*) realloc(entry, len);
 211     strcat(newbuff, newstr);
 212     return newbuff;
 213 }
 214
 215 char
 216 *cache(char *entry, char *newstr)
 217 {
 218     char *newbuff;
 219     int len;
 220     len = strlen(newstr)*sizeof(char) + 1; // include space for the '\0' !
 221     newbuff = (char*) malloc(len);
 222     strcpy(newbuff,newstr);
 223     return newbuff;
 224
 225 }
 226
 227 char
 228 *store(char *entry, char *newstr)
 229 {
 230     char *newbuff;
 231     if (entry == NULL)
 232         newbuff = cache(entry, newstr);
 233     else
 234         newbuff = append(entry, newstr);
 235     return newbuff;
 236 }
 237
 238 void
 239 split_timestamp(revisionData *data)
 240 {
 241     char *t = data->rev.timestamp;
 242     char date_buffer[DATE_LENGTH+1];
 243     char time_buffer[TIME_LENGTH+1];
 244     datestr(t, date_buffer);
 245     timestr(t, time_buffer);
 246     data->rev.date = store(data->rev.date, date_buffer);
 247     data->rev.time = store(data->rev.time, time_buffer);
 248 }
 249
 250 /* currently unused */
 251 static int
 252 is_whitespace(char *string) {
 253     int len = strlen(string);
 254     while (isspace(string[0]) && strlen(string) > 0) {
 255         string++;
 256     }
 257     if (strcmp(string, "") == 0)
 258         return 1;
 259     else
 260         return 0;
 261 }
 262
 263 static void
 264 squeeze(char *s, int c) {
 265     int i, j;
 266     for (i = j = 0; s[i] != '\0'; i++)
 267         if (s[i] != c)
 268             s[j++] = s[i];
 269     s[j] = '\0';
 270 }
 271
 272 int
 273 contains(char *s, char *t)
 274 {
 275     char c = t[0]; //just get the first character of t
 276     int i = 0;
 277     while (s[i] != '\0') {
 278         if (s[i] == c)
 279             return 1;
 280         i++;
 281     }
 282 }
 283
 284 static void
 285 charhndl(void* vdata, const XML_Char* s, int len)
 286 {
 287     revisionData* data = (revisionData*) vdata;
 288     if (data->element != UNUSED && data->position != SKIP) {
 289         char t[len];
 290         strncpy(t,s,len);
 291         t[len] = '\0'; // makes t a well-formed string
 292         switch (data->element) {
 293             case TITLE:
 294                 {
 295                     data->rev.title = store(data->rev.title, t);
 296                     // skip any articles with bad characters in their titles
 297                     break;
 298                 }
 299             case ARTICLEID:
 300                    // printf("articleid = %s\n", t);
 301                     data->rev.articleid = store(data->rev.articleid, t);
 302                     break;
 303             case REVID:
 304                    // printf("revid = %s\n", t);
 305                     data->rev.revid = store(data->rev.revid, t);
 306                     break;
 307             case TIMESTAMP:
 308                     data->rev.timestamp = store(data->rev.timestamp, t);
 309                     if (strlen(data->rev.timestamp) == TIMESTAMP_LENGTH)
 310                         split_timestamp(data);
 311                     break;
 312             case EDITOR: {
 313                     data->rev.editor = store(data->rev.editor, t);
 314                     break;
 315                     }
 316             case EDITORID:
 317                     //printf("editorid = %s\n", t);
 318                     data->rev.editorid = store(data->rev.editorid, t);
 319                     break;
 320             /* the following are implied or skipped:
 321             case MINOR:
 322                     printf("found minor element\n");  doesn't work
 323                     break;                   minor tag is just a tag
 324             case UNUSED:
 325             */
 326             case COMMENT:
 327                    // printf("row: comment is %s\n", t);
 328                     data->rev.comment = store(data->rev.comment, t);
 329                     break;
 330             case TEXT:
 331                    data->rev.text = store(data->rev.text, t);
 332                    break;
 333             default: break;
 334         }
 335     }
 336 }
 337
 338 static void
 339 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 340 {
 341     revisionData* data = (revisionData*) vdata;
 342
 343     if (strcmp(name,"title") == 0) {
 344         cleanup_article(data); // cleans up data from last article
 345         data->element = TITLE;
 346         data->position = TITLE_BLOCK;
 347     } else if (data->position != SKIP) {
 348         if (strcmp(name,"revision") == 0) {
 349             data->element = REVISION;
 350             data->position = REVISION_BLOCK;
 351         } else if (strcmp(name, "contributor") == 0) {
 352             data->element = CONTRIBUTOR;
 353             data->position = CONTRIBUTOR_BLOCK;
 354         } else if (strcmp(name,"id") == 0)
 355             switch (data->position) {
 356                 case TITLE_BLOCK:
 357                     data->element = ARTICLEID;
 358                     break;
 359                 case REVISION_BLOCK:
 360                     data->element = REVID;
 361                     break;
 362                 case CONTRIBUTOR_BLOCK:
 363                     data->element = EDITORID;
 364                     break;
 365             }
 366
 367         // minor tag has no character data, so we parse here
 368         else if (strcmp(name,"minor") == 0) {
 369             data->element = MINOR;
 370             data->rev.minor = true;
 371         }
 372         else if (strcmp(name,"timestamp") == 0)
 373             data->element = TIMESTAMP;
 374
 375         else if (strcmp(name, "username") == 0)
 376             data->element = EDITOR;
 377
 378         else if (strcmp(name,"ip") == 0)
 379             data->element = EDITORID;
 380
 381         else if (strcmp(name,"comment") == 0)
 382             data->element = COMMENT;
 383
 384         else if (strcmp(name,"text") == 0)
 385             data->element = TEXT;
 386
 387         else if (strcmp(name,"page") == 0
 388                 || strcmp(name,"mediawiki") == 0
 389                 || strcmp(name,"restrictions") == 0
 390                 || strcmp(name,"siteinfo") == 0)
 391             data->element = UNUSED;
 392     }
 393
 394 }
 395
 396
 397 static void
 398 end(void* vdata, const XML_Char* name)
 399 {
 400     revisionData* data = (revisionData*) vdata;
 401     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 402         write_row(data); // crucial... :)
 403         cleanup_revision(data);  // also crucial
 404     } else {
 405         data->element = UNUSED; // sets our state to "not-in-useful"
 406     }                           // thus avoiding unpleasant character data
 407                                 // b/w tags (newlines etc.)
 408 }
 409
 410 void print_usage(char* argv[]) {
 411     fprintf(stderr, "usage: <wikimedia dump xml> | %s [options]\n", argv[0]);
 412     fprintf(stderr, "\n");
 413     fprintf(stderr, "options:\n");
 414     fprintf(stderr, "  -t   print text and comments after each line of tab separated data\n");
 415     fprintf(stderr, "\n");
 416     fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n");
 417     fprintf(stderr, "a tab-separated stream of revisions on standard out:\n");
 418     fprintf(stderr, "\n");
 419     fprintf(stderr, "title, articleid, revid, date, time, anon, editor, editorid, minor\n");
 420     fprintf(stderr, "\n");
 421     fprintf(stderr, "author: Erik Garrison <erik@hypervolu.me>\n");
 422 }
 423
 424
 425 int
 426 main(int argc, char *argv[])
 427 {
 428
 429     enum outtype output_type;
 430     int dry_run = 0;
 431     // in "simple" output, we don't print text and comments
 432     output_type = SIMPLE;
 433     char c;
 434
 435     while ((c = getopt(argc, argv, "hsd")) != -1)
 436         switch (c)
 437         {
 438             case 'd':
 439                 dry_run = 1;
 440                 break;
 441             case 't':
 442                 output_type = FULL;
 443                 break;
 444             case 'h':
 445                 print_usage(argv);
 446                 exit(0);
 447                 break;
 448         }
 449
 450     if (dry_run) { // lets us print initialization options
 451         printf("simple_output = %i\n", output_type);
 452         exit(1);
 453     }
 454
 455     // create a new instance of the expat parser
 456     XML_Parser parser = XML_ParserCreate(NULL);
 457
 458     // initialize the user data struct which is passed to callback functions
 459     revisionData data;
 460     // initialize the elements of the struct to default values
 461     init_data(&data, output_type);
 462
 463
 464     // makes the parser pass "data" as the first argument to every callback
 465     XML_SetUserData(parser, &data);
 466     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 467     void (*endFnPtr)(void*, const XML_Char*) = end;
 468     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 469
 470     // sets start and end to be the element start and end handlers
 471     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 472     // sets charhndl to be the callback for raw character data
 473     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 474
 475     int done;
 476     char buf[BUFSIZ];
 477
 478     write_header();
 479
 480     // shovel data into the parser
 481     do {
 482
 483         // read into buf a bufferfull of data from standard input
 484         size_t len = fread(buf, 1, sizeof(buf), stdin);
 485         done = len < sizeof(buf); // checks if we've got the last bufferfull
 486
 487         // passes the buffer of data to the parser and checks for error
 488         //   (this is where the callbacks are invoked)
 489         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 490             fprintf(stderr,
 491                 "%s at line %d\n",
 492                 XML_ErrorString(XML_GetErrorCode(parser)),
 493                 (int) XML_GetCurrentLineNumber(parser));
 494             return 1;
 495         }
 496     } while (!done);
 497
 498
 499     XML_ParserFree(parser);
 500
 501     return 0;
 502 }