X-Git-Url: https://projects.mako.cc/source/wikiq/blobdiff_plain/a8bbe4703654e6d6131368c0c42995f8933b3dce:/wikiq.c..26c05eec96c89590ddf4d516ae0cf9fc6bdbec82:/wikiq.cpp diff --git a/wikiq.c b/wikiq.cpp similarity index 88% rename from wikiq.c rename to wikiq.cpp index a8cf97d..144e563 100644 --- a/wikiq.c +++ b/wikiq.cpp @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -12,6 +13,10 @@ #include #include "disorder.h" #include "md5.h" +#include "dtl/dtl.hpp" +#include + +using namespace std; // timestamp of the form 2003-11-07T00:43:23Z #define DATE_LENGTH 10 @@ -47,6 +52,7 @@ typedef struct { char *editorid; char *comment; char *text; + vector last_tokens; // track string size of the elements, to prevent O(N^2) processing in charhndl // when we have to take strlen for every character which we append to the buffer @@ -133,6 +139,7 @@ free_data(revisionData *data, int title) free(data->editorid); free(data->comment); free(data->text); + data->last_tokens.clear(); } void cleanup_revision(revisionData *data) { @@ -141,6 +148,7 @@ void cleanup_revision(revisionData *data) { void cleanup_article(revisionData *data) { clean_data(data, 1); + data->last_tokens.clear(); } @@ -223,8 +231,57 @@ write_row(revisionData *data) sprintf(md5_hex_output + di * 2, "%02x", digest[di]); } + string text = string(data->text, data->text_size); + vector text_tokens; + size_t period_pos = 0; + size_t paragraph_pos = 0; + size_t start = 0; + while ((period_pos = text.find(".", period_pos + 1)) != string::npos && + (paragraph_pos = text.find("\n\n", paragraph_pos + 1)) != string::npos) { + if (paragraph_pos < period_pos) { + text_tokens.push_back(text.substr(start, paragraph_pos - start)); + start = paragraph_pos; + } else { + text_tokens.push_back(text.substr(start, period_pos - start)); + start = period_pos; + } + } + + vector additions; + vector deletions; + + if (data->last_tokens.empty()) { + data->last_tokens = text_tokens; + } else { + // do the diff + + dtl::Diff< string, vector > d(data->last_tokens, text_tokens); + //d.onOnlyEditDistance(); + d.compose(); + + vector > ses_v = d.getSes().getSequence(); + for (vector >::iterator sit=ses_v.begin(); sit!=ses_v.end(); ++sit) { + switch (sit->second.type) { + case dtl::SES_ADD: + cout << "ADD: \"" << sit->first << "\"" << endl; + additions.push_back(sit->first); + break; + case dtl::SES_DELETE: + cout << "DEL: \"" << sit->first << "\"" << endl; + deletions.push_back(sit->first); + break; + } + } + + // apply regex to the diff + + + data->last_tokens = text_tokens; + } + + // print line of tsv output - printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\n", + printf("%s\t%s\t%s\t%s %s\t%s\t%s\t%s\t%s\t%i\t%f\t%s\t%i\t%i\n", data->title, data->articleid, data->revid, @@ -236,7 +293,9 @@ write_row(revisionData *data) (data->minor) ? "1" : "0", (unsigned int) data->text_size, shannon_H(data->text, data->text_size), - md5_hex_output + md5_hex_output, + (int) additions.size(), + (int) deletions.size() ); //