projects.mako.cc - wikiq/blob - wikiq.cpp

   1 /*
   2  * An XML parser for Wikipedia Data dumps.
   3  * Converts XML files to tab-separated values files readable by spreadsheets
   4  * and statistical packages.
   5  */
   6
   7 #include <stdio.h>
   8 #include <iostream>
   9 #include <string.h>
  10 #include <ctype.h>
  11 #include <stdlib.h>
  12 #include "expat.h"
  13 #include <getopt.h>
  14 #include "md5.h"
  15 #include "dtl/dtl.hpp"
  16 #include <vector>
  17 #include <map>
  18 #include <pcrecpp.h>
  19
  20
  21 using namespace std;
  22
  23 // timestamp of the form 2003-11-07T00:43:23Z
  24 #define DATE_LENGTH 10
  25 #define TIME_LENGTH 8
  26 #define TIMESTAMP_LENGTH 20
  27
  28 #define MEGABYTE 1048576
  29 #define FIELD_BUFFER_SIZE 1024
  30
  31 // this can be changed at runtime if we encounter an article larger than 10mb
  32 size_t text_buffer_size = 10 * MEGABYTE;
  33
  34 enum elements {
  35     TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR,
  36     EDITOR, EDITORID, MINOR, COMMENT, UNUSED, TEXT
  37 };
  38
  39 enum block { TITLE_BLOCK, REVISION_BLOCK, CONTRIBUTOR_BLOCK, SKIP };
  40
  41 enum outtype { FULL, SIMPLE };
  42
  43 typedef struct {
  44
  45     // pointers to once-allocated buffers
  46     char *title;
  47     char *articleid;
  48     char *revid;
  49     char *date;
  50     char *time;
  51     char *timestamp;
  52     char *anon;
  53     char *editor;
  54     char *editorid;
  55     char *comment;
  56     char *text;
  57     vector<string> last_text_tokens;
  58
  59     // title regexes
  60     vector<pcrecpp::RE> title_regexes;
  61
  62     // regexes for checking with revisions
  63     vector<string> content_regex_names;
  64     vector<pcrecpp::RE> content_regexes;
  65
  66     // regexes for looking within diffs
  67     vector<string> diff_regex_names;
  68     vector<pcrecpp::RE> diff_regexes;
  69
  70     map<string, string> revision_md5; // used for detecting reversions
  71
  72     // track string size of the elements, to prevent O(N^2) processing in charhndl
  73     // when we have to take strlen for every character which we append to the buffer
  74     size_t title_size;
  75     size_t articleid_size;
  76     size_t revid_size;
  77     size_t date_size;
  78     size_t time_size;
  79     size_t timestamp_size;
  80     size_t anon_size;
  81     size_t editor_size;
  82     size_t editorid_size;
  83     size_t comment_size;
  84     size_t text_size;
  85
  86     bool minor;
  87
  88     enum elements element;
  89     enum block position;
  90     enum outtype output_type;
  91
  92 } revisionData;
  93
  94
  95 /* free_data and clean_data
  96  * Takes a pointer to the data struct and an integer {0,1} indicating if the
  97  * title data needs to be cleared as well.
  98  * Also, frees memory dynamically allocated to store data.
  99  */
 100 static void
 101 clean_data(revisionData *data, int title)
 102 {
 103     // reset title (if we are switching articles)
 104     if (title) {
 105         data->title[0] = '\0';
 106         data->articleid[0] = '\0';
 107         data->title_size = 0;
 108         data->articleid_size = 0;
 109     }
 110
 111     // reset text fields
 112     data->revid[0] = '\0';
 113     data->date[0] = '\0';
 114     data->time[0] = '\0';
 115     data->timestamp[0] = '\0';
 116     data->anon[0] = '\0';
 117     data->editor[0] = '\0';
 118     data->editorid[0] = '\0';
 119     data->comment[0] = '\0';
 120     data->text[0] = '\0';
 121
 122     // reset length tracking
 123     data->revid_size = 0;
 124     data->date_size = 0;
 125     data->time_size = 0;
 126     data->timestamp_size = 0;
 127     data->anon_size = 0;
 128     data->editor_size = 0;
 129     data->editorid_size = 0;
 130     data->comment_size = 0;
 131     data->text_size = 0;
 132
 133     // reset flags and element type info
 134     data->minor = false;
 135     data->element = UNUSED;
 136
 137 }
 138
 139 // presently unused
 140 static void
 141 free_data(revisionData *data, int title)
 142 {
 143     if (title) {
 144         //printf("freeing article\n");
 145         free(data->title);
 146         free(data->articleid);
 147     }
 148     free(data->revid);
 149     free(data->date);
 150     free(data->time);
 151     free(data->timestamp);
 152     free(data->anon);
 153     free(data->editor);
 154     free(data->editorid);
 155     free(data->comment);
 156     free(data->text);
 157     data->last_text_tokens.clear();
 158 }
 159
 160 void cleanup_revision(revisionData *data) {
 161     clean_data(data, 0);
 162 }
 163
 164 void cleanup_article(revisionData *data) {
 165     clean_data(data, 1);
 166     data->last_text_tokens.clear();
 167     data->revision_md5.clear();
 168 }
 169
 170
 171 static void
 172 init_data(revisionData *data, outtype output_type)
 173 {
 174     data->text = (char*) malloc(text_buffer_size);
 175     data->comment = (char*) malloc(FIELD_BUFFER_SIZE);
 176     data->title = (char*) malloc(FIELD_BUFFER_SIZE);
 177     data->articleid = (char*) malloc(FIELD_BUFFER_SIZE);
 178     data->revid = (char*) malloc(FIELD_BUFFER_SIZE);
 179     data->date = (char*) malloc(FIELD_BUFFER_SIZE);
 180     data->time = (char*) malloc(FIELD_BUFFER_SIZE);
 181     data->timestamp = (char*) malloc(FIELD_BUFFER_SIZE);
 182     data->anon = (char*) malloc(FIELD_BUFFER_SIZE);
 183     data->editor = (char*) malloc(FIELD_BUFFER_SIZE);
 184     data->editorid = (char*) malloc(FIELD_BUFFER_SIZE);
 185     data->minor = false;
 186
 187     // resets the data fields, null terminates strings, sets lengths
 188     clean_data(data, 1);
 189
 190     data->output_type = output_type;
 191 }
 192
 193 /* for debugging only, prints out the state of the data struct
 194  */
 195 static void
 196 print_state(revisionData *data)
 197 {
 198     printf("element = %i\n", data->element);
 199     printf("output_type = %i\n", data->output_type);
 200     printf("title = %s\n", data->title);
 201     printf("articleid = %s\n", data->articleid);
 202     printf("revid = %s\n", data->revid);
 203     printf("date = %s\n", data->date);
 204     printf("time = %s\n", data->time);
 205     printf("anon = %s\n", data->anon);
 206     printf("editor = %s\n", data->editor);
 207     printf("editorid = %s\n", data->editorid);
 208     printf("minor = %s\n", (data->minor ? "1" : "0"));
 209     printf("comment = %s\n", data->comment);
 210     printf("text = %s\n", data->text);
 211     printf("\n");
 212
 213 }
 214
 215
 216 /*
 217  * write a line of comma-separated value formatted data to standard out
 218  * follows the form:
 219  * title,articleid,revid,date,time,anon,editor,editorid,minor,comment
 220  * (str)  (int)    (int) (str)(str)(bin)(str)   (int)   (bin) (str)
 221  *
 222  * it is called right before cleanup_revision() and cleanup_article()
 223  */
 224 static void
 225 write_row(revisionData *data)
 226 {
 227
 228     // get md5sum
 229     md5_state_t state;
 230     md5_byte_t digest[16];
 231     char md5_hex_output[2 * 16 + 1];
 232     md5_init(&state);
 233     md5_append(&state, (const md5_byte_t *)data->text, data->text_size);
 234     md5_finish(&state, digest);
 235     int di;
 236     for (di = 0; di < 16; ++di) {
 237         sprintf(md5_hex_output + di * 2, "%02x", digest[di]);
 238     }
 239
 240     string reverted_to;
 241     map<string, string>::iterator prev_revision = data->revision_md5.find(md5_hex_output);
 242     if (prev_revision != data->revision_md5.end()) {
 243         reverted_to = prev_revision->second; // id of previous revision
 244     }
 245     data->revision_md5[md5_hex_output] = data->revid;
 246
 247     string text = string(data->text, data->text_size);
 248     vector<string> text_tokens;
 249     size_t pos = 0;
 250     size_t start = 0;
 251     while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) {
 252         //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl;
 253         text_tokens.push_back(text.substr(start, pos - start));
 254         start = pos;
 255         ++pos;
 256     }
 257
 258     // look to see if (a) we've passed in a list of /any/ title_regexes
 259     // and (b) if all of the title_regex_matches match
 260     // if (a) is true and (b) is not, we return
 261     bool any_title_regex_match = false;
 262     if (!data->title_regexes.empty()) {
 263         for (vector<pcrecpp::RE>::iterator r = data->title_regexes.begin(); r != data->title_regexes.end(); ++r) {
 264             pcrecpp::RE& title_regex = *r;
 265             if (title_regex.PartialMatch(data->title)) {
 266                 any_title_regex_match = true;
 267                 break;
 268             }
 269         }
 270         if (!any_title_regex_match) {
 271             return;
 272         }
 273     }
 274
 275     // search the content of the revision for a any of the regexes
 276     vector<bool> content_regex_matches;
 277     if (!data->content_regexes.empty()) {
 278         for (vector<pcrecpp::RE>::iterator r = data->content_regexes.begin(); r != data->content_regexes.end(); ++r) {
 279             pcrecpp::RE& content_regex = *r;
 280             content_regex_matches.push_back(content_regex.PartialMatch(data->text));
 281         }
 282     }
 283
 284     //vector<string> additions;
 285     //vector<string> deletions;
 286     string additions;
 287     string deletions;
 288
 289     vector<bool> diff_regex_matches_adds;
 290     vector<bool> diff_regex_matches_dels;
 291
 292     if (data->last_text_tokens.empty()) {
 293         additions = data->text;
 294     } else {
 295         // do the diff
 296
 297         dtl::Diff< string, vector<string> > d(data->last_text_tokens, text_tokens);
 298         //d.onOnlyEditDistance();
 299         d.compose();
 300
 301         vector<pair<string, dtl::elemInfo> > ses_v = d.getSes().getSequence();
 302         for (vector<pair<string, dtl::elemInfo> >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) {
 303             switch (sit->second.type) {
 304             case dtl::SES_ADD:
 305                 //cout << "ADD: \"" << sit->first << "\"" << endl;
 306                 additions += sit->first;
 307                 break;
 308             case dtl::SES_DELETE:
 309                 //cout << "DEL: \"" << sit->first << "\"" << endl;
 310                 deletions += sit->first;
 311                 break;
 312             }
 313         }
 314     }
 315
 316     if (!additions.empty()) {
 317         //cout << "ADD: " << additions << endl;
 318         for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
 319             pcrecpp::RE& diff_regex = *r;
 320             diff_regex_matches_adds.push_back(diff_regex.PartialMatch(additions));
 321         }
 322     }
 323
 324     if (!deletions.empty()) {
 325         //cout << "DEL: " << deletions << endl;
 326         for (vector<pcrecpp::RE>::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) {
 327             pcrecpp::RE& diff_regex = *r;
 328             diff_regex_matches_dels.push_back(diff_regex.PartialMatch(deletions));
 329         }
 330     }
 331
 332     data->last_text_tokens = text_tokens;
 333
 334
 335     // print line of tsv output
 336     cout
 337         << data->title << "\t"
 338         << data->articleid << "\t"
 339         << data->revid << "\t"
 340         << data->date << " "
 341         << data->time << "\t"
 342         << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t"
 343         << data->editor << "\t"
 344         << data->editorid << "\t"
 345         << ((data->minor) ? "TRUE" : "FALSE") << "\t"
 346         << (unsigned int) data->text_size << "\t"
 347         << md5_hex_output << "\t"
 348         << reverted_to << "\t"
 349         << (int) additions.size() << "\t"
 350         << (int) deletions.size();
 351
 352     for (int n = 0; n < data->content_regex_names.size(); ++n) {
 353         cout << "\t" << ((!content_regex_matches.empty()
 354                           && content_regex_matches.at(n)) ? "TRUE" : "FALSE");
 355     }
 356
 357     for (int n = 0; n < data->diff_regex_names.size(); ++n) {
 358         cout << "\t" << ((!diff_regex_matches_adds.empty() && diff_regex_matches_adds.at(n)) ? "TRUE" : "FALSE")
 359              << "\t" << ((!diff_regex_matches_dels.empty() && diff_regex_matches_dels.at(n)) ? "TRUE" : "FALSE");
 360     }
 361     cout << endl;
 362
 363     //
 364     if (data->output_type == FULL) {
 365         cout << "comment:" << data->comment << endl
 366              << "text:" << endl << data->text << endl;
 367     }
 368
 369 }
 370
 371 void
 372 split_timestamp(revisionData *data)
 373 {
 374     char *t = data->timestamp;
 375     strncpy(data->date, data->timestamp, DATE_LENGTH);
 376     char *timeinstamp = &data->timestamp[DATE_LENGTH+1];
 377     strncpy(data->time, timeinstamp, TIME_LENGTH);
 378 }
 379
 380 // like strncat but with previously known length
 381 char*
 382 strlcatn(char *dest, const char *src, size_t dest_len, size_t n)
 383 {
 384    size_t i;
 385
 386    for (i = 0 ; i < n && src[i] != '\0' ; i++)
 387        dest[dest_len + i] = src[i];
 388    dest[dest_len + i] = '\0';
 389
 390    return dest;
 391 }
 392
 393 static void
 394 charhndl(void* vdata, const XML_Char* s, int len)
 395 {
 396     revisionData* data = (revisionData*) vdata;
 397     size_t bufsz;
 398     if (data->element != UNUSED && data->position != SKIP) {
 399         switch (data->element) {
 400             case TEXT:
 401                     // check if we'd overflow our buffer
 402                     bufsz = data->text_size + len;
 403                     if (bufsz + 1 > text_buffer_size) {
 404                         data->text = (char*) realloc(data->text, bufsz + 1);
 405                         text_buffer_size = bufsz + 1;
 406                     }
 407                     strlcatn(data->text, s, data->text_size, len);
 408                     data->text_size = bufsz;
 409                     break;
 410             case COMMENT:
 411                     strlcatn(data->comment, s, data->comment_size, len);
 412                     data->comment_size += len;
 413                     break;
 414             case TITLE:
 415                     strlcatn(data->title, s, data->title_size, len);
 416                     data->title_size += len;
 417                     break;
 418             case ARTICLEID:
 419                    // printf("articleid = %s\n", t);
 420                     strlcatn(data->articleid, s, data->articleid_size, len);
 421                     data->articleid_size += len;
 422                     break;
 423             case REVID:
 424                    // printf("revid = %s\n", t);
 425                     strlcatn(data->revid, s, data->revid_size, len);
 426                     data->revid_size += len;
 427                     break;
 428             case TIMESTAMP:
 429                     strlcatn(data->timestamp, s, data->timestamp_size, len);
 430                     data->timestamp_size += len;
 431                     if (strlen(data->timestamp) == TIMESTAMP_LENGTH)
 432                         split_timestamp(data);
 433                     break;
 434             case EDITOR:
 435                     strlcatn(data->editor, s, data->editor_size, len);
 436                     data->editor_size += len;
 437                     break;
 438             case EDITORID:
 439                     //printf("editorid = %s\n", t);
 440                     strlcatn(data->editorid, s, data->editorid_size, len);
 441                     data->editorid_size += len;
 442                     break;
 443             /* the following are implied or skipped:
 444             case MINOR:
 445                     printf("found minor element\n");  doesn't work
 446                     break;                   minor tag is just a tag
 447             case UNUSED:
 448             */
 449             default: break;
 450         }
 451     }
 452 }
 453
 454 static void
 455 start(void* vdata, const XML_Char* name, const XML_Char** attr)
 456 {
 457     revisionData* data = (revisionData*) vdata;
 458
 459     if (strcmp(name,"title") == 0) {
 460         cleanup_article(data); // cleans up data from last article
 461         data->element = TITLE;
 462         data->position = TITLE_BLOCK;
 463     } else if (data->position != SKIP) {
 464         if (strcmp(name,"revision") == 0) {
 465             data->element = REVISION;
 466             data->position = REVISION_BLOCK;
 467         } else if (strcmp(name, "contributor") == 0) {
 468             data->element = CONTRIBUTOR;
 469             data->position = CONTRIBUTOR_BLOCK;
 470         } else if (strcmp(name,"id") == 0)
 471             switch (data->position) {
 472                 case TITLE_BLOCK:
 473                     data->element = ARTICLEID;
 474                     break;
 475                 case REVISION_BLOCK:
 476                     data->element = REVID;
 477                     break;
 478                 case CONTRIBUTOR_BLOCK:
 479                     data->element = EDITORID;
 480                     break;
 481             }
 482
 483         // minor tag has no character data, so we parse here
 484         else if (strcmp(name,"minor") == 0) {
 485             data->element = MINOR;
 486             data->minor = true;
 487         }
 488         else if (strcmp(name,"timestamp") == 0)
 489             data->element = TIMESTAMP;
 490
 491         else if (strcmp(name, "username") == 0)
 492             data->element = EDITOR;
 493
 494         else if (strcmp(name,"ip") == 0)
 495             data->element = EDITORID;
 496
 497         else if (strcmp(name,"comment") == 0)
 498             data->element = COMMENT;
 499
 500         else if (strcmp(name,"text") == 0)
 501             data->element = TEXT;
 502
 503         else if (strcmp(name,"page") == 0
 504                 || strcmp(name,"mediawiki") == 0
 505                 || strcmp(name,"restrictions") == 0
 506                 || strcmp(name,"siteinfo") == 0)
 507             data->element = UNUSED;
 508     }
 509
 510 }
 511
 512
 513 static void
 514 end(void* vdata, const XML_Char* name)
 515 {
 516     revisionData* data = (revisionData*) vdata;
 517     if (strcmp(name, "revision") == 0 && data->position != SKIP) {
 518         write_row(data); // crucial... :)
 519         cleanup_revision(data);  // also crucial
 520     } else {
 521         data->element = UNUSED; // sets our state to "not-in-useful"
 522     }                           // thus avoiding unpleasant character data
 523                                 // b/w tags (newlines etc.)
 524 }
 525
 526 void print_usage(char* argv[]) {
 527     cerr << "usage: <wikimedia dump xml> | " << argv[0] << "[options]" << endl
 528          << endl
 529          << "options:" << endl
 530          << "  -v   verbose mode prints text and comments after each line of tab separated data" << endl
 531          << "  -n   name of the following regex for contet (e.g. -n name -r \"...\")" << endl
 532          << "  -r   regex to check against content of the revision" << endl
 533          << "  -N   name of the following regex for diffs (e.g. -N name -R \"...\")" << endl
 534          << "  -R   regex to check against diffs (i.e., additions and deletions)" << endl
 535          << "  -t   parse revisions only from pages whose titles match regex(es)" << endl
 536          << endl
 537          << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl
 538          << "a tab-separated stream of revisions on standard out:" << endl
 539          << endl
 540          << "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl
 541          << "text_length, text_md5, reversion, additions_size, deletions_size" << endl
 542          << ".... and additional fields for each regex executed against add/delete diffs" << endl
 543          << endl
 544          << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl
 545          << "unless the article is a revert to a previous revision, in which case, it" << endl
 546          << "contains the revision ID of the revision which was reverted to." << endl
 547          << endl
 548          << "authors: Erik Garrison <erik@hypervolu.me>" << endl;
 549          << "         Benjamin Mako Hill <mako@atdot.cc>" << endl;
 550 }
 551
 552
 553 int
 554 main(int argc, char *argv[])
 555 {
 556
 557     enum outtype output_type;
 558     int dry_run = 0;
 559     // in "simple" output, we don't print text and comments
 560     output_type = SIMPLE;
 561     char c;
 562     string diff_regex_name;
 563     string content_regex_name;
 564
 565     // the user data struct which is passed to callback functions
 566     revisionData data;
 567
 568     while ((c = getopt(argc, argv, "hvn:r:t:")) != -1)
 569         switch (c)
 570         {
 571             case 'd':
 572                 dry_run = 1;
 573                 break;
 574             case 'v':
 575                 output_type = FULL;
 576                 break;
 577             case 'n':
 578                 content_regex_name = optarg;
 579                 break;
 580             case 'r':
 581                 data.content_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 582                 data.content_regex_names.push_back(content_regex_name);
 583                 if (!content_regex_name.empty()) {
 584                     content_regex_name.clear();
 585                 }
 586                 break;
 587             case 'N':
 588                 diff_regex_name = optarg;
 589                 break;
 590             case 'R':
 591                 data.diff_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 592                 data.diff_regex_names.push_back(diff_regex_name);
 593                 if (!diff_regex_name.empty()) {
 594                     diff_regex_name.clear();
 595                 }
 596                 break;
 597             case 'h':
 598                 print_usage(argv);
 599                 exit(0);
 600                 break;
 601             case 't':
 602                 data.title_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8()));
 603                 break;
 604         }
 605
 606     if (dry_run) { // lets us print initialization options
 607         printf("simple_output = %i\n", output_type);
 608         exit(1);
 609     }
 610
 611     // create a new instance of the expat parser
 612     XML_Parser parser = XML_ParserCreate("UTF-8");
 613
 614     // initialize the elements of the struct to default values
 615     init_data(&data, output_type);
 616
 617
 618     // makes the parser pass "data" as the first argument to every callback
 619     XML_SetUserData(parser, &data);
 620     void (*startFnPtr)(void*, const XML_Char*, const XML_Char**) = start;
 621     void (*endFnPtr)(void*, const XML_Char*) = end;
 622     void (*charHandlerFnPtr)(void*, const XML_Char*, int) = charhndl;
 623
 624     // sets start and end to be the element start and end handlers
 625     XML_SetElementHandler(parser, startFnPtr, endFnPtr);
 626     // sets charhndl to be the callback for character data
 627     XML_SetCharacterDataHandler(parser, charHandlerFnPtr);
 628
 629     bool done;
 630     char buf[BUFSIZ];
 631
 632     // write header
 633
 634     cout << "title" << "\t"
 635         << "articleid" << "\t"
 636         << "revid" << "\t"
 637         << "date" << "_"
 638         << "time" << "\t"
 639         << "anon" << "\t"
 640         << "editor" << "\t"
 641         << "editor_id" << "\t"
 642         << "minor" << "\t"
 643         << "text_size" << "\t"
 644         << "text_md5" << "\t"
 645         << "reversion" << "\t"
 646         << "additions_size" << "\t"
 647         << "deletions_size";
 648
 649     int n = 0;
 650     if (!data.content_regexes.empty()) {
 651         for (vector<pcrecpp::RE>::iterator r = data.content_regexes.begin();
 652              r != data.content_regexes.end(); ++r, ++n) {
 653             if (data.content_regex_names.at(n).empty()) {
 654                 cout << "\t" << "regex" << n;
 655             } else {
 656                 cout << "\t" << data.content_regex_names.at(n);
 657             }
 658         }
 659     }
 660
 661     if (!data.diff_regexes.empty()) {
 662         for (vector<pcrecpp::RE>::iterator r = data.diff_regexes.begin(); r != data.diff_regexes.end(); ++r, ++n) {
 663             if (data.diff_regex_names.at(n).empty()) {
 664                 cout << "\t" << "regex_" << n << "_add"
 665                      << "\t" << "regex_" << n << "_del";
 666             } else {
 667                 cout << "\t" << data.diff_regex_names.at(n) << "_add"
 668                      << "\t" << data.diff_regex_names.at(n) << "_del";
 669             }
 670         }
 671     }
 672
 673     cout << endl;
 674
 675     // shovel data into the parser
 676     do {
 677
 678         // read into buf a bufferfull of data from standard input
 679         size_t len = fread(buf, 1, BUFSIZ, stdin);
 680         done = len < BUFSIZ; // checks if we've got the last bufferfull
 681
 682         // passes the buffer of data to the parser and checks for error
 683         //   (this is where the callbacks are invoked)
 684         if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) {
 685             cerr << "XML ERROR: " << XML_ErrorString(XML_GetErrorCode(parser)) << " at line "
 686                  << (int) XML_GetCurrentLineNumber(parser) << endl;
 687             return 1;
 688         }
 689     } while (!done);
 690
 691
 692     XML_ParserFree(parser);
 693
 694     return 0;
 695 }