X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/26c05eec96c89590ddf4d516ae0cf9fc6bdbec82..59363fd9d5c67c2ec370d436c329508c4b7a6f54:/wikiq.cpp?ds=sidebyside diff --git a/wikiq.cpp b/wikiq.cpp index 144e563..da5e7b7 100644 --- a/wikiq.cpp +++ b/wikiq.cpp @@ -15,6 +15,9 @@ #include "md5.h" #include "dtl/dtl.hpp" #include +#include +#include + using namespace std; @@ -25,9 +28,9 @@ using namespace std; #define MEGABYTE 1048576 #define FIELD_BUFFER_SIZE 1024 -// 2048 KB in bytes + 1 -//#define TEXT_BUFFER_SIZE 2097153 -//#define TEXT_BUFFER_SIZE 10485760 + +// this can be changed at runtime if we encounter an article larger than 10mb +size_t text_buffer_size = 10 * MEGABYTE; enum elements { TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, @@ -52,7 +55,10 @@ typedef struct { char *editorid; char *comment; char *text; - vector last_tokens; + vector last_text_tokens; + vector regexes; + vector regex_names; + map revision_md5; // used for detecting reversions // track string size of the elements, to prevent O(N^2) processing in charhndl // when we have to take strlen for every character which we append to the buffer @@ -139,7 +145,7 @@ free_data(revisionData *data, int title) free(data->editorid); free(data->comment); free(data->text); - data->last_tokens.clear(); + data->last_text_tokens.clear(); } void cleanup_revision(revisionData *data) { @@ -148,14 +154,15 @@ void cleanup_revision(revisionData *data) { void cleanup_article(revisionData *data) { clean_data(data, 1); - data->last_tokens.clear(); + data->last_text_tokens.clear(); + data->revision_md5.clear(); } static void init_data(revisionData *data, outtype output_type) { - data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'? + data->text = (char*) malloc(text_buffer_size); data->comment = (char*) malloc(FIELD_BUFFER_SIZE); data->title = (char*) malloc(FIELD_BUFFER_SIZE); data->articleid = (char*) malloc(FIELD_BUFFER_SIZE); @@ -196,16 +203,6 @@ print_state(revisionData *data) } -/* Write a header for the comma-separated output - */ -static void -write_header() -{ - // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n"); -// printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n"); - -} - /* * write a line of comma-separated value formatted data to standard out @@ -231,31 +228,38 @@ write_row(revisionData *data) sprintf(md5_hex_output + di * 2, "%02x", digest[di]); } + string reverted_to; + map::iterator prev_revision = data->revision_md5.find(md5_hex_output); + if (prev_revision != data->revision_md5.end()) { + reverted_to = prev_revision->second; // id of previous revision + } + data->revision_md5[md5_hex_output] = data->revid; + string text = string(data->text, data->text_size); vector text_tokens; - size_t period_pos = 0; - size_t paragraph_pos = 0; + size_t pos = 0; size_t start = 0; - while ((period_pos = text.find(".", period_pos + 1)) != string::npos && - (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) { - if (paragraph_pos < period_pos) { - text_tokens.push_back(text.substr(start, paragraph_pos - start)); - start = paragraph_pos; - } else { - text_tokens.push_back(text.substr(start, period_pos - start)); - start = period_pos; - } + while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) { + //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl; + text_tokens.push_back(text.substr(start, pos - start)); + start = pos; + ++pos; } - vector additions; - vector deletions; + //vector additions; + //vector deletions; + string additions; + string deletions; - if (data->last_tokens.empty()) { - data->last_tokens = text_tokens; + vector regex_matches_adds; + vector regex_matches_dels; + + if (data->last_text_tokens.empty()) { + additions = data->text; } else { // do the diff - dtl::Diff< string, vector > d(data->last_tokens, text_tokens); + dtl::Diff< string, vector > d(data->last_text_tokens, text_tokens); //d.onOnlyEditDistance(); d.compose(); @@ -263,44 +267,64 @@ write_row(revisionData *data) for (vector >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) { switch (sit->second.type) { case dtl::SES_ADD: - cout << "ADD: \"" << sit->first << "\"" << endl; - additions.push_back(sit->first); + //cout << "ADD: \"" << sit->first << "\"" << endl; + additions += sit->first; break; case dtl::SES_DELETE: - cout << "DEL: \"" << sit->first << "\"" << endl; - deletions.push_back(sit->first); + //cout << "DEL: \"" << sit->first << "\"" << endl; + deletions += sit->first; break; } } + } + + if (!additions.empty()) { + //cout << "ADD: " << additions << endl; + for (vector::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) { + pcrecpp::RE& regex = *r; + regex_matches_adds.push_back(regex.PartialMatch(additions)); + } + } - // apply regex to the diff - - - data->last_tokens = text_tokens; + if (!deletions.empty()) { + //cout << "DEL: " << deletions << endl; + for (vector::iterator r = data->regexes.begin(); r != data->regexes.end(); ++r) { + pcrecpp::RE& regex = *r; + regex_matches_dels.push_back(regex.PartialMatch(deletions)); + } } + data->last_text_tokens = text_tokens; + // print line of tsv output - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n", - data->title, - data->articleid, - data->revid, - data->date, - data->time, - (data->editor[0] != '\0') ? "0" : "1", // anon? - data->editor, - data->editorid, - (data->minor) ? "1" : "0", - (unsigned int) data->text_size, - shannon_H(data->text, data->text_size), - md5_hex_output, - (int) additions.size(), - (int) deletions.size() - ); + cout + << data->title << "\t" + << data->articleid << "\t" + << data->revid << "\t" + << data->date << " " + << data->time << "\t" + << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t" + << data->editor << "\t" + << data->editorid << "\t" + << ((data->minor) ? "TRUE" : "FALSE") << "\t" + << (unsigned int) data->text_size << "\t" + << shannon_H(data->text, data->text_size) << "\t" + << md5_hex_output << "\t" + << reverted_to << "\t" + << (int) additions.size() << "\t" + << (int) deletions.size(); + + for (int n = 0; n < data->regex_names.size(); ++n) { + cout << "\t" << ((!regex_matches_adds.empty() && regex_matches_adds.at(n)) ? "TRUE" : "FALSE") + << "\t" << ((!regex_matches_dels.empty() && regex_matches_dels.at(n)) ? "TRUE" : "FALSE"); + } + cout << endl; // if (data->output_type == FULL) { - printf("comment:%s\ntext:\n%s\n", data->comment, data->text); + cout << "comment:" << data->comment << endl + << "text:" << endl << data->text << endl; } } @@ -318,7 +342,6 @@ split_timestamp(revisionData *data) char* strlcatn(char *dest, const char *src, size_t dest_len, size_t n) { - //size_t dest_len = strlen(dest); size_t i; for (i = 0 ; i < n && src[i] != '\0' ; i++) @@ -332,15 +355,18 @@ static void charhndl(void* vdata, const XML_Char* s, int len) { revisionData* data = (revisionData*) vdata; + size_t bufsz; if (data->element != UNUSED && data->position != SKIP) { - //char t[len]; - //strncpy(t,s,len); - //t[len] = '\0'; // makes t a well-formed string switch (data->element) { case TEXT: - // printf("buffer length = %i, text: %s\n", len, t); + // check if we'd overflow our buffer + bufsz = data->text_size + len; + if (bufsz + 1 > text_buffer_size) { + data->text = (char*) realloc(data->text, bufsz + 1); + text_buffer_size = bufsz + 1; + } strlcatn(data->text, s, data->text_size, len); - data->text_size += len; + data->text_size = bufsz; break; case COMMENT: strlcatn(data->comment, s, data->comment_size, len); @@ -459,17 +485,25 @@ end(void* vdata, const XML_Char* name) } void print_usage(char* argv[]) { - fprintf(stderr, "usage: | %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -t print text and comments after each line of tab separated data\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); - fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "author: Erik Garrison \n"); + cerr << "usage: | " << argv[0] << "[options]" << endl + << endl + << "options:" << endl + << " -t print text and comments after each line of tab separated data" << endl + << " -n name of the following regex (e.g. -n name -r \"...\")" << endl + << " -r regex to check against additions and deletions" << endl + << endl + << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl + << "a tab-separated stream of revisions on standard out:" << endl + << endl + << "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl + << "text_length, text_entropy, text_md5, reversion, additions_size, deletions_size" << endl + << ".... and additional fields for each regex executed against add/delete diffs" << endl + << endl + << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl + << "unless the article is a revert to a previous revision, in which case, it" << endl + << "contains the revision ID of the revision which was reverted to." << endl + << endl + << "author: Erik Garrison " << endl; } @@ -482,8 +516,12 @@ main(int argc, char *argv[]) // in "simple" output, we don't print text and comments output_type = SIMPLE; char c; + string regex_name; - while ((c = getopt(argc, argv, "ht")) != -1) + // the user data struct which is passed to callback functions + revisionData data; + + while ((c = getopt(argc, argv, "htn:r:")) != -1) switch (c) { case 'd': @@ -492,6 +530,16 @@ main(int argc, char *argv[]) case 't': output_type = FULL; break; + case 'n': + regex_name = optarg; + break; + case 'r': + data.regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); + data.regex_names.push_back(regex_name); + if (!regex_name.empty()) { + regex_name.clear(); + } + break; case 'h': print_usage(argv); exit(0); @@ -506,8 +554,6 @@ main(int argc, char *argv[]) // create a new instance of the expat parser XML_Parser parser = XML_ParserCreate("UTF-8"); - // initialize the user data struct which is passed to callback functions - revisionData data; // initialize the elements of the struct to default values init_data(&data, output_type); @@ -525,6 +571,38 @@ main(int argc, char *argv[]) bool done; char buf[BUFSIZ]; + + // write header + + cout << "title" << "\t" + << "articleid" << "\t" + << "revid" << "\t" + << "date" << " " + << "time" << "\t" + << "anon" << "\t" + << "editor" << "\t" + << "editor_id" << "\t" + << "minor" << "\t" + << "text_size" << "\t" + << "text_entropy" << "\t" + << "text_md5" << "\t" + << "reversion" << "\t" + << "additions_size" << "\t" + << "deletions_size"; + + int n = 0; + if (!data.regexes.empty()) { + for (vector::iterator r = data.regexes.begin(); r != data.regexes.end(); ++r, ++n) { + if (data.regex_names.at(n).empty()) { + cout << "\t" << "regex_" << n << "_add" + << "\t" << "regex_" << n << "_del"; + } else { + cout << "\t" << data.regex_names.at(n) << "_add" + << "\t" << data.regex_names.at(n) << "_del"; + } + } + } + cout << endl; // shovel data into the parser do { @@ -536,10 +614,8 @@ main(int argc, char *argv[]) // passes the buffer of data to the parser and checks for error // (this is where the callbacks are invoked) if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) { - fprintf(stderr, - "%s at line %d\n", - XML_ErrorString(XML_GetErrorCode(parser)), - (int) XML_GetCurrentLineNumber(parser)); + cerr << "XML ERROR: " << XML_ErrorString(XML_GetErrorCode(parser)) << " at line " + << (int) XML_GetCurrentLineNumber(parser) << endl; return 1; } } while (!done);