X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/26c05eec96c89590ddf4d516ae0cf9fc6bdbec82..21341e70f9d509aeaaf0cb6a39caf56d704dc838:/wikiq.cpp diff --git a/wikiq.cpp b/wikiq.cpp index 144e563..ad9ac48 100644 --- a/wikiq.cpp +++ b/wikiq.cpp @@ -11,10 +11,12 @@ #include #include "expat.h" #include -#include "disorder.h" #include "md5.h" #include "dtl/dtl.hpp" #include +#include +#include + using namespace std; @@ -25,9 +27,9 @@ using namespace std; #define MEGABYTE 1048576 #define FIELD_BUFFER_SIZE 1024 -// 2048 KB in bytes + 1 -//#define TEXT_BUFFER_SIZE 2097153 -//#define TEXT_BUFFER_SIZE 10485760 + +// this can be changed at runtime if we encounter an article larger than 10mb +size_t text_buffer_size = 10 * MEGABYTE; enum elements { TITLE, ARTICLEID, REVISION, REVID, TIMESTAMP, CONTRIBUTOR, @@ -52,7 +54,20 @@ typedef struct { char *editorid; char *comment; char *text; - vector last_tokens; + vector last_text_tokens; + + // title regexes + vector title_regexes; + + // regexes for checking with revisions + vector content_regex_names; + vector content_regexes; + + // regexes for looking within diffs + vector diff_regex_names; + vector diff_regexes; + + map revision_md5; // used for detecting reversions // track string size of the elements, to prevent O(N^2) processing in charhndl // when we have to take strlen for every character which we append to the buffer @@ -139,7 +154,7 @@ free_data(revisionData *data, int title) free(data->editorid); free(data->comment); free(data->text); - data->last_tokens.clear(); + data->last_text_tokens.clear(); } void cleanup_revision(revisionData *data) { @@ -148,14 +163,15 @@ void cleanup_revision(revisionData *data) { void cleanup_article(revisionData *data) { clean_data(data, 1); - data->last_tokens.clear(); + data->last_text_tokens.clear(); + data->revision_md5.clear(); } static void init_data(revisionData *data, outtype output_type) { - data->text = (char*) malloc(4 * MEGABYTE); // 2MB is the article length limit, 4MB is 'safe'? + data->text = (char*) malloc(text_buffer_size); data->comment = (char*) malloc(FIELD_BUFFER_SIZE); data->title = (char*) malloc(FIELD_BUFFER_SIZE); data->articleid = (char*) malloc(FIELD_BUFFER_SIZE); @@ -196,16 +212,6 @@ print_state(revisionData *data) } -/* Write a header for the comma-separated output - */ -static void -write_header() -{ - // printf("title, articleid, revid, date, time, anon, editor, editorid, minor, comment\n"); -// printf("title\tarticleid\trevid\tdate time\tanon\teditor\teditorid\tminor\n"); - -} - /* * write a line of comma-separated value formatted data to standard out @@ -231,31 +237,64 @@ write_row(revisionData *data) sprintf(md5_hex_output + di * 2, "%02x", digest[di]); } + string reverted_to; + map::iterator prev_revision = data->revision_md5.find(md5_hex_output); + if (prev_revision != data->revision_md5.end()) { + reverted_to = prev_revision->second; // id of previous revision + } + data->revision_md5[md5_hex_output] = data->revid; + string text = string(data->text, data->text_size); vector text_tokens; - size_t period_pos = 0; - size_t paragraph_pos = 0; + size_t pos = 0; size_t start = 0; - while ((period_pos = text.find(".", period_pos + 1)) != string::npos && - (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) { - if (paragraph_pos < period_pos) { - text_tokens.push_back(text.substr(start, paragraph_pos - start)); - start = paragraph_pos; - } else { - text_tokens.push_back(text.substr(start, period_pos - start)); - start = period_pos; + while ((pos = text.find_first_of(" \n\t\r", pos)) != string::npos) { + //cout << "\"\"\"" << text.substr(start, pos - start) << "\"\"\"" << endl; + text_tokens.push_back(text.substr(start, pos - start)); + start = pos; + ++pos; + } + + // look to see if (a) we've passed in a list of /any/ title_regexes + // and (b) if all of the title_regex_matches match + // if (a) is true and (b) is not, we return + bool any_title_regex_match = false; + if (!data->title_regexes.empty()) { + for (vector::iterator r = data->title_regexes.begin(); r != data->title_regexes.end(); ++r) { + pcrecpp::RE& title_regex = *r; + if (title_regex.PartialMatch(data->title)) { + any_title_regex_match = true; + break; + } + } + if (!any_title_regex_match) { + return; } } - vector additions; - vector deletions; + // search the content of the revision for a any of the regexes + vector content_regex_matches; + if (!data->content_regexes.empty()) { + for (vector::iterator r = data->content_regexes.begin(); r != data->content_regexes.end(); ++r) { + pcrecpp::RE& content_regex = *r; + content_regex_matches.push_back(content_regex.PartialMatch(data->text)); + } + } + + //vector additions; + //vector deletions; + string additions; + string deletions; + + vector diff_regex_matches_adds; + vector diff_regex_matches_dels; - if (data->last_tokens.empty()) { - data->last_tokens = text_tokens; + if (data->last_text_tokens.empty()) { + additions = data->text; } else { // do the diff - dtl::Diff< string, vector > d(data->last_tokens, text_tokens); + dtl::Diff< string, vector > d(data->last_text_tokens, text_tokens); //d.onOnlyEditDistance(); d.compose(); @@ -263,44 +302,68 @@ write_row(revisionData *data) for (vector >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) { switch (sit->second.type) { case dtl::SES_ADD: - cout << "ADD: \"" << sit->first << "\"" << endl; - additions.push_back(sit->first); + //cout << "ADD: \"" << sit->first << "\"" << endl; + additions += sit->first; break; case dtl::SES_DELETE: - cout << "DEL: \"" << sit->first << "\"" << endl; - deletions.push_back(sit->first); + //cout << "DEL: \"" << sit->first << "\"" << endl; + deletions += sit->first; break; } } + } + + if (!additions.empty()) { + //cout << "ADD: " << additions << endl; + for (vector::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) { + pcrecpp::RE& diff_regex = *r; + diff_regex_matches_adds.push_back(diff_regex.PartialMatch(additions)); + } + } - // apply regex to the diff - - - data->last_tokens = text_tokens; + if (!deletions.empty()) { + //cout << "DEL: " << deletions << endl; + for (vector::iterator r = data->diff_regexes.begin(); r != data->diff_regexes.end(); ++r) { + pcrecpp::RE& diff_regex = *r; + diff_regex_matches_dels.push_back(diff_regex.PartialMatch(deletions)); + } } + data->last_text_tokens = text_tokens; + // print line of tsv output - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n", - data->title, - data->articleid, - data->revid, - data->date, - data->time, - (data->editor[0] != '\0') ? "0" : "1", // anon? - data->editor, - data->editorid, - (data->minor) ? "1" : "0", - (unsigned int) data->text_size, - shannon_H(data->text, data->text_size), - md5_hex_output, - (int) additions.size(), - (int) deletions.size() - ); + cout + << data->title << "\t" + << data->articleid << "\t" + << data->revid << "\t" + << data->date << " " + << data->time << "\t" + << ((data->editor[0] != '\0') ? "FALSE" : "TRUE") << "\t" + << data->editor << "\t" + << data->editorid << "\t" + << ((data->minor) ? "TRUE" : "FALSE") << "\t" + << (unsigned int) data->text_size << "\t" + << md5_hex_output << "\t" + << reverted_to << "\t" + << (int) additions.size() << "\t" + << (int) deletions.size(); + + for (int n = 0; n < data->content_regex_names.size(); ++n) { + cout << "\t" << ((!content_regex_matches.empty() + && content_regex_matches.at(n)) ? "TRUE" : "FALSE"); + } + + for (int n = 0; n < data->diff_regex_names.size(); ++n) { + cout << "\t" << ((!diff_regex_matches_adds.empty() && diff_regex_matches_adds.at(n)) ? "TRUE" : "FALSE") + << "\t" << ((!diff_regex_matches_dels.empty() && diff_regex_matches_dels.at(n)) ? "TRUE" : "FALSE"); + } + cout << endl; // if (data->output_type == FULL) { - printf("comment:%s\ntext:\n%s\n", data->comment, data->text); + cout << "comment:" << data->comment << endl + << "text:" << endl << data->text << endl; } } @@ -318,7 +381,6 @@ split_timestamp(revisionData *data) char* strlcatn(char *dest, const char *src, size_t dest_len, size_t n) { - //size_t dest_len = strlen(dest); size_t i; for (i = 0 ; i < n && src[i] != '\0' ; i++) @@ -332,15 +394,18 @@ static void charhndl(void* vdata, const XML_Char* s, int len) { revisionData* data = (revisionData*) vdata; + size_t bufsz; if (data->element != UNUSED && data->position != SKIP) { - //char t[len]; - //strncpy(t,s,len); - //t[len] = '\0'; // makes t a well-formed string switch (data->element) { case TEXT: - // printf("buffer length = %i, text: %s\n", len, t); + // check if we'd overflow our buffer + bufsz = data->text_size + len; + if (bufsz + 1 > text_buffer_size) { + data->text = (char*) realloc(data->text, bufsz + 1); + text_buffer_size = bufsz + 1; + } strlcatn(data->text, s, data->text_size, len); - data->text_size += len; + data->text_size = bufsz; break; case COMMENT: strlcatn(data->comment, s, data->comment_size, len); @@ -459,17 +524,29 @@ end(void* vdata, const XML_Char* name) } void print_usage(char* argv[]) { - fprintf(stderr, "usage: | %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -t print text and comments after each line of tab separated data\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Takes a wikimedia data dump XML stream on standard in, and produces\n"); - fprintf(stderr, "a tab-separated stream of revisions on standard out:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "title, articleid, revid, timestamp, anon, editor, editorid, minor, revlength, reventropy, revmd5\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "author: Erik Garrison \n"); + cerr << "usage: | " << argv[0] << "[options]" << endl + << endl + << "options:" << endl + << " -v verbose mode prints text and comments after each line of tab separated data" << endl + << " -n name of the following regex for content (e.g. -n name -r \"...\")" << endl + << " -r regex to check against content of the revision" << endl + << " -N name of the following regex for diffs (e.g. -N name -R \"...\")" << endl + << " -R regex to check against diffs (i.e., additions and deletions)" << endl + << " -t parse revisions only from pages whose titles match regex(es)" << endl + << endl + << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl + << "a tab-separated stream of revisions on standard out:" << endl + << endl + << "title, articleid, revid, timestamp, anon, editor, editorid, minor," << endl + << "text_length, text_md5, reversion, additions_size, deletions_size" << endl + << ".... and additional fields for each regex executed against add/delete diffs" << endl + << endl + << "Boolean fields are TRUE/FALSE except in the case of reversion, which is blank" << endl + << "unless the article is a revert to a previous revision, in which case, it" << endl + << "contains the revision ID of the revision which was reverted to." << endl + << endl + << "authors: Erik Garrison " << endl + << " Benjamin Mako Hill " << endl; } @@ -482,20 +559,48 @@ main(int argc, char *argv[]) // in "simple" output, we don't print text and comments output_type = SIMPLE; char c; + string diff_regex_name; + string content_regex_name; + + // the user data struct which is passed to callback functions + revisionData data; - while ((c = getopt(argc, argv, "ht")) != -1) + while ((c = getopt(argc, argv, "hvn:r:t:")) != -1) switch (c) { case 'd': dry_run = 1; break; - case 't': + case 'v': output_type = FULL; break; + case 'n': + content_regex_name = optarg; + break; + case 'r': + data.content_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); + data.content_regex_names.push_back(content_regex_name); + if (!content_regex_name.empty()) { + content_regex_name.clear(); + } + break; + case 'N': + diff_regex_name = optarg; + break; + case 'R': + data.diff_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); + data.diff_regex_names.push_back(diff_regex_name); + if (!diff_regex_name.empty()) { + diff_regex_name.clear(); + } + break; case 'h': print_usage(argv); exit(0); break; + case 't': + data.title_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); + break; } if (dry_run) { // lets us print initialization options @@ -506,8 +611,6 @@ main(int argc, char *argv[]) // create a new instance of the expat parser XML_Parser parser = XML_ParserCreate("UTF-8"); - // initialize the user data struct which is passed to callback functions - revisionData data; // initialize the elements of the struct to default values init_data(&data, output_type); @@ -525,6 +628,49 @@ main(int argc, char *argv[]) bool done; char buf[BUFSIZ]; + + // write header + + cout << "title" << "\t" + << "articleid" << "\t" + << "revid" << "\t" + << "date" << "_" + << "time" << "\t" + << "anon" << "\t" + << "editor" << "\t" + << "editor_id" << "\t" + << "minor" << "\t" + << "text_size" << "\t" + << "text_md5" << "\t" + << "reversion" << "\t" + << "additions_size" << "\t" + << "deletions_size"; + + int n = 0; + if (!data.content_regexes.empty()) { + for (vector::iterator r = data.content_regexes.begin(); + r != data.content_regexes.end(); ++r, ++n) { + if (data.content_regex_names.at(n).empty()) { + cout << "\t" << "regex" << n; + } else { + cout << "\t" << data.content_regex_names.at(n); + } + } + } + + if (!data.diff_regexes.empty()) { + for (vector::iterator r = data.diff_regexes.begin(); r != data.diff_regexes.end(); ++r, ++n) { + if (data.diff_regex_names.at(n).empty()) { + cout << "\t" << "regex_" << n << "_add" + << "\t" << "regex_" << n << "_del"; + } else { + cout << "\t" << data.diff_regex_names.at(n) << "_add" + << "\t" << data.diff_regex_names.at(n) << "_del"; + } + } + } + + cout << endl; // shovel data into the parser do { @@ -536,10 +682,8 @@ main(int argc, char *argv[]) // passes the buffer of data to the parser and checks for error // (this is where the callbacks are invoked) if (XML_Parse(parser, buf, len, done) == XML_STATUS_ERROR) { - fprintf(stderr, - "%s at line %d\n", - XML_ErrorString(XML_GetErrorCode(parser)), - (int) XML_GetCurrentLineNumber(parser)); + cerr << "XML ERROR: " << XML_ErrorString(XML_GetErrorCode(parser)) << " at line " + << (int) XML_GetCurrentLineNumber(parser) << endl; return 1; } } while (!done);