From b1d72e4b54fe293ecd442c6dedc0d91b3b2519b1 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Fri, 11 Apr 2014 15:05:36 -0700 Subject: [PATCH] renamed variables for title regex The variables used to be called wp_namespace_* which was confusing because the software is not only useful for Wikipedia and is not only useful for matching namespaces. --- wikiq.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/wikiq.cpp b/wikiq.cpp index e8ca4d2..7123047 100644 --- a/wikiq.cpp +++ b/wikiq.cpp @@ -57,7 +57,7 @@ typedef struct { char *text; vector last_text_tokens; vector regexes; - vector wp_namespace_res; + vector title_regexes; vector regex_names; map revision_md5; // used for detecting reversions @@ -247,18 +247,19 @@ write_row(revisionData *data) ++pos; } - // skip this if the wp_namespace is not in the proscribed list of - // namespaces - bool wp_namespace_found = false; - if (!data->wp_namespace_res.empty()) { - for (vector::iterator r = data->wp_namespace_res.begin(); r != data->wp_namespace_res.end(); ++r) { - pcrecpp::RE& wp_namespace_re = *r; - if (wp_namespace_re.PartialMatch(data->title)) { - wp_namespace_found = true; + // look to see if (a) we've passed in a list of /any/ title_regexes + // and (b) if at least one of the title_regexes matches the title + // if (a) is true and (b) is not, we return + bool any_title_regex_match = false; + if (!data->title_regexes.empty()) { + for (vector::iterator r = data->title_regexes.begin(); r != data->title_regexes.end(); ++r) { + pcrecpp::RE& title_regex = *r; + if (title_regex.PartialMatch(data->title)) { + any_title_regex_match = true; break; } } - if (!wp_namespace_found) { + if (!any_title_regex_match) { return; } } @@ -508,7 +509,7 @@ void print_usage(char* argv[]) { << " -v verbose mode prints text and comments after each line of tab separated data" << endl << " -n name of the following regex (e.g. 
-n name -r \"...\")" << endl << " -r regex to check against additions and deletions" << endl - << " -t regex(es) to check title against as a way of limiting output to specific namespaces" << endl + << " -t parse revisions only from pages whose titles match regex(es)" << endl << endl << "Takes a wikimedia data dump XML stream on standard in, and produces" << endl << "a tab-separated stream of revisions on standard out:" << endl @@ -563,7 +564,7 @@ main(int argc, char *argv[]) exit(0); break; case 't': - data.wp_namespace_res.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); + data.title_regexes.push_back(pcrecpp::RE(optarg, pcrecpp::UTF8())); break; } -- 2.30.2