From: Benjamin Mako Hill Date: Fri, 29 May 2015 06:20:57 +0000 (-0700) Subject: initial import from shared repository into new public repository X-Git-Url: https://projects.mako.cc/source/protection-tools/commitdiff_plain/40337071f4f25378fcf2d4f47199e9f9e9bf3a85?ds=sidebyside initial import from shared repository into new public repository --- 40337071f4f25378fcf2d4f47199e9f9e9bf3a85 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..20655e9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +docs/index.html +enwiki_201501-protection_spells-v1.RData +enwiki_201501-protection_spells-v1.tsv.bz2 +enwiki-20150112-page.csv +enwiki-20150112-page.sql.gz +enwiki-20150112-page_restrictions.csv +enwiki-20150112-page_restrictions.sql.gz +enwiki-20150112-pages-logging.xml.gz +output-deletions.tsv +output-moves.tsv +output-protections.tsv diff --git a/02-mysqldump_to_csv.py b/02-mysqldump_to_csv.py new file mode 100755 index 0000000..a3a4a9d --- /dev/null +++ b/02-mysqldump_to_csv.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +import fileinput +import csv +import sys +import re + +# This prevents prematurely closed pipes from raising +# an exception in Python +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE, SIG_DFL) + +def is_insert(line): + """ + Returns true if the line begins a SQL insert statement. + """ + return line.startswith('INSERT INTO') or False + + +def get_values(line): + """ + Returns the portion of an INSERT statement containing values + """ + return line.partition('` VALUES ')[2].strip() + + +def values_sanity_check(values): + """ + Ensures that values from the INSERT statement meet basic checks. 
+ """ + assert values + assert values[0] == '(' + assert values[-2:] == ');' + # Assertions have not been raised + return True + +def clean_quotes_for_fread(col): + col = re.sub('\t', ' ', col) + if col.startswith('"'): + return('"' + col + '"') + else: + return(col) + +def parse_values(values, outfile): + """ + Given a file handle and the raw values from a MySQL INSERT + statement, write the equivalent CSV to the file + """ + values = values.rstrip(");") + values = values.lstrip("(") + + reader = csv.reader(values.split("),("), delimiter=',', + doublequote=False, + escapechar='\\', + quotechar="'", + strict=True) + + for reader_row in reader: + print("\t".join([clean_quotes_for_fread(col) for col in reader_row])) + +def main(): + """ + Parse arguments and start the program + """ + # Iterate over all lines in all files + # listed in sys.argv[1:] + # or stdin if no args given. + try: + for line in fileinput.input(): + # Look for an INSERT statement and parse it. + if is_insert(line): + values = get_values(line.strip().rstrip()) + if values_sanity_check(values): + parse_values(values, sys.stdout) + except KeyboardInterrupt: + sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/03-parse_mw_eventlog.py b/03-parse_mw_eventlog.py new file mode 100755 index 0000000..957bb49 --- /dev/null +++ b/03-parse_mw_eventlog.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 + +import re +import sys +import os.path + +from xml.sax import handler, make_parser +from xml.sax.saxutils import XMLFilterBase +from collections import defaultdict +import calendar + +class WikiLogItem(object): + """ + Holds data related to one element parsed from the dump + """ + __slots__ = ( + 'id', + 'action', + 'type', + 'timestamp', + 'logtitle', + 'comment', + 'params', + 'username', + 'userid', + 'contrib_deleted' + ) + def __init__(self): + for attr in self.__slots__: + setattr(self, attr, '') + + def __str__(self): + return repr({k: getattr(self, k) for k in self.__slots__}) + +class 
text_normalize_filter(XMLFilterBase): + """ + SAX filter to ensure that contiguous texts nodes are merged into one + That hopefully speeds up the parsing process a lot, specially when + reading revisions with long text + + Recipe by Uche Ogbuji, James Kew and Peter Cogolo Retrieved from: Python + Cookbook, 2nd ed., by Alex Martelli, Anna Martelli Ravenscroft, and + David Ascher (O'Reillly Media, 2005) 0-596-00797-3 + """ + def __init__(self, upstream, downstream): + XMLFilterBase.__init__(self, upstream) + self._downstream=downstream + self._accumulator=[] + def _complete_text_node(self): + if self._accumulator: + self._downstream.characters(''.join(self._accumulator)) + self._accumulator=[] + def characters(self, text): + self._accumulator.append(text) + def ignorableWhiteSpace(self, ws): + self._accumulator.append(text) + +def _wrap_complete(method_name): + def method(self, *a, **k): + self._complete_text_node() + getattr(self._downstream, method_name)(*a, **k) + method.__name__= method_name + setattr(text_normalize_filter, method_name, method) + +for n in '''startElement endElement'''.split(): + _wrap_complete(n) + +class WikiDumpHandler(handler.ContentHandler): + """ + A ContentHandler designed to pull out page ids, titles and text from + Wiki pages. These are assembled into WikiLogItem objects and sent off to + the supplied callback. 
+ """ + def __init__(self, logItemCallBack=None): + handler.ContentHandler.__init__(self) + self.currentTag = '' + self.insideContribTag = False + self.logItemCallBack = logItemCallBack + self.logItemsProcessed = 0 + + def startElement(self, name, attrs): + self.currentTag = name + if (name == 'logitem'): + # add a log item + self.currentLogItem = WikiLogItem() + elif (name == 'contributor'): + # when we're in revision, ignore ids + self.insideContribTag = True + if 'deleted' in attrs: + self.currentLogItem.contrib_deleted = True + else: + self.currentLogItem.contrib_deleted = False + + def endElement(self, name): + if (name == 'logitem'): + if self.logItemCallBack is not None: + self.logItemCallBack(self.currentLogItem) + self.logItemsProcessed += 1 + elif (name == 'contributor'): + # we've finished the revision section + self.insideContribTag = False + self.currentTag = '' + + def characters(self, content): + if (self.currentTag == 'id' and not self.insideContribTag): + self.currentLogItem.id = content + elif (self.currentTag == 'id' and self.insideContribTag): + self.currentLogItem.userid = content + elif (self.currentTag == 'username' and self.insideContribTag): + self.currentLogItem.username = content + elif (self.currentTag == 'action'): + self.currentLogItem.action = content + elif (self.currentTag == 'type'): + self.currentLogItem.type = content + elif (self.currentTag == 'logtitle'): + self.currentLogItem.logtitle = content + elif (self.currentTag == 'timestamp'): + self.currentLogItem.timestamp = content + elif (self.currentTag == 'comment'): + self.currentLogItem.comment = content + elif (self.currentTag == 'params'): + self.currentLogItem.params = content + +class logExporter: + def __init__(self, input, output_base="output"): + self.input_file = input + self.move_log = open(output_base + "-moves.tsv", "w") + self.prot_log = open(output_base + "-protections.tsv", "w") + self.del_log = open(output_base + "-deletions.tsv", "w") + + self.prot_titles = 
defaultdict(None) + + self.cal_dict = {v: k for k,v in enumerate(calendar.month_name)} + self.r_param_string = re.compile(r'\[(?P<right>\w+)=(?P<group>\w+)\] \((?P<period>.*?)\)+') + self.r_expir_string = re.compile(r'expires (?P<hour>\d{2}):(?P<min>\d{2}), (?P<day>\d+) (?P<month>\w+) (?P<year>\d{4})') + + # this marks whether we have moved into late 2008 + # when material is being recorded consistently + self.in_window = False + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.move_log.close() + self.prot_log.close() + self.del_log.close() + + def __flush(self): + self.move_log.flush() + self.prot_log.flush() + self.del_log.flush() + sys.stdout.flush() + sys.stderr.flush() + + def __clean_timestamp(self, timestamp): + timestamp = timestamp.replace("T", " ") + timestamp = timestamp.replace("Z", "") + + return timestamp + + def __clean_logItem(self, logItem): + logItem.comment = re.sub(r'\s', r' ', logItem.comment) + logItem.params = re.sub(r'\s', r' ', logItem.params) + + # add userid and username, but only if it's not deleted + if logItem.contrib_deleted: + logItem.userid = "" + logItem.username = "" + else: + logItem.username = re.sub(r'\s', r' ', logItem.username) + + logItem.timestamp = self.__clean_timestamp(logItem.timestamp) + + return logItem + + def printDelete(self, logItem): + logItem = self.__clean_logItem(logItem) + + output = [logItem.id, '"' + logItem.logtitle + '"', + logItem.action, logItem.timestamp] + + print("\t".join(output), file=self.del_log) + + def printMove(self, logItem): + logItem = self.__clean_logItem(logItem) + + output = [logItem.id, logItem.timestamp, + '"' + logItem.params + '"', # old location + '"' + logItem.logtitle + '"'] # new location + print("\t".join(output), file=self.move_log) + + # add the title to the list of titles + self.prot_titles[logItem.logtitle] = None + + def printProtect(self, logItem): + logItem = self.__clean_logItem(logItem) + + param_string = logItem.params + rights = {} + + for m in 
self.r_param_string.finditer(param_string): + right = m.group("right") + group = m.group("group") + raw_period = m.group("period") + + if not re.search("indefinite", raw_period): + m2 = self.r_expir_string.match(raw_period) + period_nums = [int(x) for x in [m2.group("year"), + self.cal_dict[m2.group("month")], + m2.group("day"), + m2.group("hour"), + m2.group("min")]] + period_nums = tuple(period_nums) + period = "%d-%02d-%02d %02d:%02d:00" % period_nums + else: + period = "" + + rights[right] = (group, period) + + output = [logItem.id, '"' + logItem.logtitle + '"', + logItem.action, logItem.timestamp] + + for right in rights: + group, expir = rights[right] + print("\t".join(output + [right, group, expir]), + file=self.prot_log) + + # add the title to the list of titles + self.prot_titles[logItem.logtitle] = None + + def printUnprotect(self, logItem): + logItem = self.__clean_logItem(logItem) + + output = [logItem.id, '"' + logItem.logtitle + '"', + logItem.action, logItem.timestamp, + '', '', ''] + print("\t".join(output), file=self.prot_log) + + # remove the current title from the list of titles + self.prot_titles.pop(logItem.logtitle, None) + + def conditionallyPrint(self, logItem): + # print deletions only if we've seen a protection event + if logItem.type == 'delete' \ + and logItem.logtitle in self.prot_titles: + self.printDelete(logItem) + + elif logItem.type == "protect": + if logItem.action == "move_prot": + self.printMove(logItem) + + elif logItem.action == "protect" \ + or logItem.action == "modify": + + # this limits it to only things after 2008 when this + # data started being stored in params + if not logItem.params: + return + else: + self.in_window = True + self.printProtect(logItem) + + elif logItem.action == "unprotect": + if self.in_window: self.printUnprotect(logItem) + + else: + # this is some kind of error so we'll print the article and + # return + print(logItem, file=sys.stderr) + + +def parseWithCallback(incoming_data, callback): + parser = 
make_parser() + parser.setFeature(handler.feature_namespaces, 0) + + # apply the text_normalize_filter + wdh = WikiDumpHandler(logItemCallBack=callback) + filter_handler = text_normalize_filter(parser, wdh) + + filter_handler.parse(incoming_data) + +if __name__ == "__main__": + """ + When called as script, argv[1] is assumed to be a filename and we + simply print pages found. If it's missing, we just use sys.stdin + instead. + """ + + if len(sys.argv) > 1: + input_file = open(sys.argv[1], "r") + output_base = re.sub(r'\.\w+$', '', os.path.basename(sys.argv[1])) + else: + input_file = sys.stdin + output_base = "output" + + with logExporter(input_file, output_base) as exporter: + parseWithCallback(input_file, exporter.conditionallyPrint) + diff --git a/04-import_merge_data.R b/04-import_merge_data.R new file mode 100644 index 0000000..cd15adc --- /dev/null +++ b/04-import_merge_data.R @@ -0,0 +1,118 @@ +page.restrictions.filename <- "enwiki-20150112-page_restrictions.csv" +page.metadata.filename <- "enwiki-20150112-page.csv" +log.filename <- "output-protections.tsv" +moves.filename <- "output-moves.tsv" +dels.filename <- "output-deletions.tsv" + +setwd("~/protection") +Sys.setenv(tz = "UTC") +library(parallel) +library(data.table) + +## 0. load namespace data +wp.ns <- read.csv("wikipedia_namespaces.csv", + header=TRUE, stringsAsFactors=FALSE) +# drop aliases completely +wp.ns <- wp.ns[!wp.ns$alias,] +wp.ns$alias <- NULL + +setDT(wp.ns) +setnames(wp.ns, c("ns.num", "ns.name")) +wp.ns[, ns.name := gsub(' ', '_', ns.name)] +setkey(wp.ns, "ns.num") + +## 1. 
load the protection snapshot data from the parsed SQL file +final.state <- fread(page.restrictions.filename, + header=FALSE, na.strings=NULL, + colClasses=list(character=6)) + +setnames(final.state, c("page.id", "type", "level", + "cascade", "user.id", "expiry", + "id")) + +# drop change several column types +final.state[,cascade := as.logical(cascade)] +final.state[,type := as.factor(type)] +final.state[,level := as.factor(level)] + +final.state[,c("user.id", "id") := NULL] + +# NULL expiry seem to be just infinite because we've confirmed that several of +# these pages are protected even if their expiry is null +final.state$expiry[final.state$expiry == "infinity"] <- NA +final.state$expiry[final.state$expiry == "NULL"] <- NA + +final.state$expiry <- as.POSIXct(final.state$expiry, + format="%Y%m%d%H%M%s", tz="UTC") + +# print the range +range(final.state$expiry, na.rm=TRUE) + +setkey(final.state, "page.id") + +# 2. load up the page information for the id/title mapping +page.info <- fread(page.metadata.filename, + header=FALSE, stringsAsFactors=FALSE, na.strings=NULL, + select=1:3, showProgress=TRUE) + +setnames(page.info, c("page.id", "ns.num", "title")) + +# merge namespace data into the data.table +setkey(page.info, "ns.num") +page.info <- wp.ns[page.info] +page.info[ns.num != 0, title := paste(ns.name, title, sep=":")] +page.info[,c("ns.num", "ns.name") := NULL] + +# merge the page titles onto the final state data +setkey(page.info, "page.id") +final.state <- page.info[final.state] + +# some pageids are missing (pages deleted?) so we'll drop those +table(is.na(final.state$title)) +final.state <- final.state[!is.na(final.state$title),] + +# 3. 
load the page log data from TSV file +log <- fread(log.filename, header=FALSE, na.strings="") + +setnames(log, c("id", "title", "log.type", "log.time", + "type", "level", "expiry")) + +log[, log.type := as.factor(log.type)] +log[, type := as.factor(type)] +log[, level := as.factor(level)] +log[, log.time := as.POSIXct(log.time, tz="UTC")] +log[, expiry := as.POSIXct(expiry, tz="UTC")] +log[, title := gsub(' ', '_', title)] + +# clean up the log file by dropping some invalid data (e.g., log entries that +# expire before they are blocked. these are all done within the minute and seem +# to be a mediawiki bug (e.g., "EEE") +log <- log[is.na(log$expiry) | !log$expiry < log$log.time,] + +# 4. load in the move data from TSV +moves <- fread(moves.filename, sep="\t", header=FALSE) +setnames(moves, c("id", "log.time", "from.title", "to.title")) + +# drop id column +moves[, from.title := gsub(' ', '_', from.title)] +moves[, to.title := gsub(' ', '_', to.title)] +moves[, log.time := as.POSIXct(log.time, tz="UTC")] + +# 5. 
load in deletion data from the TSV file +dels <- fread(dels.filename, header=FALSE, na.strings="NA") +setnames(dels, c("id", "title", "log.type", "log.time")) + +# drop id column +dels[, log.time := as.POSIXct(log.time, tz="UTC")] +dels[, title := gsub(' ', '_', title)] + +# because revision deletion does not affect protection and restoration does +# not restore restrictions, we can safely limit this to only delete evetns +# and drop this columns +dels <- dels[dels$log.type == "delete",] +dels[, log.type := NULL] + +# save temporary state +save(log, moves, dels, final.state, file="processed_log_and_sql_data.RData") + + diff --git a/05-generate_spells.R b/05-generate_spells.R new file mode 100644 index 0000000..8356273 --- /dev/null +++ b/05-generate_spells.R @@ -0,0 +1,360 @@ +Sys.setenv(tz = "utc") +library(parallel) +library(data.table) + +dump.creation.time <- as.POSIXct("2015-01-12 00:00:00 utc") + +r <- list() + +## step 2: merge moves and deletions into an expanded log +################################################################# +load("processed_log_and_sql_data.RData") + +r[["num.final.state.orig"]] <- nrow(final.state) +r[["num.log"]] <- nrow(log) +r[["num.moves"]] <- nrow(log) +r[["num.dels"]] <- nrow(dels) +r[["dump.creation.time"]] <- dump.creation.time + +events <- log[,list(title, id)] +events[, type := 'protect'] + +tmp <- moves[,list(from.title, id)] +tmp[, type := 'move'] +setnames(tmp, "from.title", "title") + +tmp2 <- dels[, list(title, id)] +tmp2[, type := 'delete'] + +events <- rbindlist(list(events, tmp, tmp2)) +rm(tmp, tmp2) + +# convert type to a factor and sort +events[, type := as.factor(type)] + +# dropping missing titles +events <- events[!is.na(events$title),] + +setkey(events, title) + +# return true if next event is move +get.next.event <- function (move.item) { + destination <- move.item[,to.title] + log.id <- move.item[,id] + + x <- events[destination,] + x <- x[x$id > log.id,] + + # return the first item in the event list + # 
this will return a one row data.table of all NA if it's missing... + return(x[sort.list(x$id)[1],]) +} + +# BOOKMARK: BM-D +## turn/expand moves into a protection and unprotection events +############################################################### +# build list of uninterupted moves; was end = end of y^ +build.move.chains <- function (move.chain){ + last.move <- move.chain[nrow(move.chain),] + next.event <- get.next.event(last.move) + + if (!all(is.na(next.event))) { + if (next.event[, type] == "move") { + move.chain <- rbindlist(list(move.chain, moves.tmp[J(next.event[,id]),])) + build.move.chains(move.chain) + } else { + return(list(move.chain, FALSE)) + } + } else { + return(list(move.chain, TRUE)) + } +} + +setkey(moves, id) +moves.tmp <- moves + +moves.result = list() +while (nrow(moves.tmp) > 0) { + move <- moves.tmp[1,] + + rv <- build.move.chains(move) + move.chain <- rv[[1]] + ends.with.move <- rv[[2]] + + for (i in seq(1, nrow(move.chain))) { + moves.tmp <- moves.tmp[!J(move.chain[i, id]),] + } + + moves.result[[length(moves.result)+1]] <- list(move.chain, ends.with.move) + +} + +explode.move.chain <- function (chain, rights) { + log.unprot <- data.table(id=chain$id, title=chain$from.title, + log.type="unprotect", log.time=chain$log.time, + type="move", level=NA, expiry=NA) + log.prot <- data.table(id=chain$id, title=chain$to.title, + log.type="protect", log.time=chain$log.time) + + # if, and only if, there are no previous events, we'll add right censored event + if (!any(events[chain[1,from.title],id] < chain[1, id])) { + log.prot <- rbind(log.prot, + data.table(id=NA, title=chain[1,from.title], log.type="protect", + log.time=NA)) + } + + # copy data.table once per right + log.prot <- cbind(log.prot[rep(seq_len(nrow(log.prot)), nrow(rights)),], + rights[unlist(lapply(seq_len(nrow(rights)), + function (i) {rep(i, nrow(log.prot))})),]) + + # put together the unprotect and protect events and return them + return(rbind(log.unprot, log.prot)) +} + +# 
check in the final state data +explode.with.final.state.data <- function (chain) { + final.title <- chain[nrow(chain), to.title] + + # this will be set to a list of rights, or all NA if they are missing + rights <- final.state[final.title, list(type, level, expiry)] + + # return entries for the log + explode.move.chain(chain, rights) +} + +setkey(final.state, title) + +moves.results.moveterm <- moves.result[sapply(moves.result, function (x) {x[[2]]})] +moves.results.moveterm <- lapply(moves.results.moveterm, function (x) {x[[1]]}) + +log.moveterm <- rbindlist(mclapply(moves.results.moveterm, explode.with.final.state.data)) + +moves.results.otherterm <- moves.result[!sapply(moves.result, function (x) {x[[2]]})] +moves.results.otherterm <- lapply(moves.results.otherterm, function (x) {x[[1]]}) + +log.otherterm <- rbindlist(mclapply(moves.results.otherterm, + function (x) { explode.move.chain(x, data.table(type=NA, level=NA, expiry=NA)) })) + +expanded.log <- rbind(log, log.moveterm, log.otherterm, + data.table(id=dels$id, title=dels$title, log.type="unprotect", + log.time=dels$log.time, type="delete", level=NA, expiry=NA)) + +setkey(expanded.log, title) + +save(expanded.log, file="expanded_log.RData") +# load("expanded_log.RData"); load("processed_log_and_sql_data.RData") + +### GENERATE SPELLS +generate.spells <- function (page.title, d) { + x <- d[page.title,] + setkey(x, id) + + spells <- data.table() + tmp.spells <- data.table() + prev.mod <- FALSE + for (i in seq_len(nrow(x))) { + row <- as.list(x[i,]) + + # if it's the first time, through and we're seeing an uprot, create a l-cens event + if (i == 1 && row[["log.type"]] == "unprotect") { + tmp.spells <- data.table(title=row[["title"]], type=NA, level=NA, start=NA, end=NA) + } + + # first, see if any of the previous tmp.spells expired naturally + if (nrow(tmp.spells) > 0 && + nrow(tmp.spells[!is.na(tmp.spells$end),]) > 0 && + nrow(tmp.spells[tmp.spells$end < row[["log.time"]],]) > 0) { + # if they did, add them 
to spells and drop them from tmp.spells + spells <- rbind(spells, tmp.spells[!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]],]) + tmp.spells <- tmp.spells[!(!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]]),] + } + + # otherwise, see if the prevoius spell was a protect/modify and ended a + # spell by omission + if (prev.mod && !is.na(prev.id) && prev.id != row[["id"]]) { + unlisted.types <- tmp.spells[,type][!tmp.spells[,type] %in% x[x$id == row[["id"]], type]] + tmp.tmp.spells <- tmp.spells[tmp.spells$type %in% unlisted.types,] + tmp.tmp.spells$end <- x[x$id == row[["id"]],log.time] + spells <- rbind(spells, tmp.tmp.spells) + + tmp.spells <- tmp.spells[!tmp.spells$type %in% unlisted.types,] + } + + # if we are adding a new protection event + if (row[["log.type"]] %in% c("protect", "modify")) { + prev.mod <- TRUE; prev.id <- row[["id"]] + # if there is an active spell that conflicts with the current bit, + # end them with the spells start time + if (nrow(tmp.spells) > 0) { + # we could be missing previous data on type in which case we want to replace + # and add to the spells + if (all(is.na(tmp.spells$type))) { + tmp.spells$end <- row[["log.time"]] + spells <- rbind(spells, tmp.spells) + tmp.spells <- data.table() + } else if (is.na(row[["type"]])) { + next + } else { + conflict <- tmp.spells$type == row[["type"]] + # if it's the same rights, we update the expiry date and continue + if (any(conflict) && tmp.spells[conflict,level] == row[["level"]]) { + tmp.spells$end[conflict] <- row[["expiry"]] + next + } + # otherwise, we end the spell + tmp.spells$end[conflict] <- row[["log.time"]] + spells <- rbind(spells, tmp.spells[conflict,]) + tmp.spells <- tmp.spells[!conflict,] + } + } + + # add the new spell to the list of active spells + tmp.spells <- rbind(tmp.spells, + data.table(title=row[["title"]], type=row[["type"]], level=row[["level"]], + start=row[["log.time"]], end=row[["expiry"]])) + + # if it's an unprotection event and we're sitting 
on existing events + } else if (row[["log.type"]] == "unprotect" && nrow(tmp.spells) > 0) { + prev.mod <- FALSE; prev.id <- row[["id"]] + # end /all/ active spells and add them to spells + tmp.spells$end <- row[["log.time"]] + spells <- rbind(spells, tmp.spells) + tmp.spells <- data.table() + } + } + # if this is the final time through, add any active spells + if (nrow(tmp.spells) > 0) { + spells <- rbind(spells, tmp.spells) + } + tmp.spells <- data.table() + if (any(is.na(spells$title))) { print(page.title) } + return(spells) +} + +page.titles <- unique(expanded.log$title) +page.titles <- page.titles[!is.na(page.titles)] +page.titles <- page.titles[page.titles != ""] +#spells <- rbindlist(lapply(page.titles, generate.spells, expanded.log)) +spells <- rbindlist(mclapply(page.titles, generate.spells, expanded.log)) + +save(spells, file="spells-nofinal.RData") + +# load("spells-nofinal.RData"); load("processed_log_and_sql_data.RData") + +# remove cascacading data from final.state data to allow for merging later +final.state[, cascade := NULL] + +# set any ongoing spells and final state data ongoing at the point of data +# collection to right censored +spells <- spells[spells$end > dump.creation.time, end := NA] +final.state <- final.state[final.state$expiry > dump.creation.time, expiry := NA] + +# drop log entries from outside our data collection window +spells <- spells[spells$start < dump.creation.time,] + +# drop any spells other than edit, move or upload for which we have no final +# state data; and any final state data along the same lines +spells <- spells[spells$type %in% c("edit", "move", "upload"),] +final.state <- final.state[final.state$type %in% c("edit", "move", "upload"),] + +# TODO/FIX? 
handle the two extra NA dropped here +spells <- spells[!is.na(spells$title),] + +r[["num.spells.orig"]] <- nrow(spells) + +# we're now going to load on any final.state data for missing spells +setkey(final.state, title, type) +setkey(spells, title, type) + +## merge final.state data with spells +####################################################################### +# there are several situations we need to take into account: +# 1. final.state for pages for which we have no record: we create right/left +# censored spells for these +# 2. we have an open spells. final.state data that matches data we have in log +# data. we can discard this final state data +# 3. we have an open spell and final state data, but they disagree. +# 4. final state data for spells our log data suggests should be closed: add right/ +# 5. log data suggests is should be open + +# 1. final.state for pages for which we have no record: we create right/left +# censored spells for these. most likely, these were spells started before +# 2008. as a result, we grab final.state data for those spells +missing.spells <- final.state[is.na(spells[final.state, level, mult="first"]),] +missing.spells <- missing.spells[!missing.spells$type == "aft",] +missing.spells[, start := NA] +missing.spells[, end := NA] +missing.spells[, page.id := NULL] +missing.spells[, expiry := NULL] + +# drop these missing spells from the final.state +final.state <- final.state[!missing.spells[,list(title, type)]] + +# print the number of missing spells +r[["num.created.from.final.state"]] <- nrow(missing.spells) +r[["num.created.from.final.state.pages"]] <- length(unique(missing.spells$title)) + +# to answer 2+ we first need create a dataset of spells from the log that are +# open at the time of data.collection. these will either be because they have +# an infinite expiry or because they were ongoing at the time of +# data.collection. 
+open.spells <- spells[is.na(spells$end),] +setkey(open.spells, start) + +setkey(open.spells, title, type) +setkey(final.state, title, type) + +# BOOKMARK: BK-A +# look for spells that both final.state and log suggests are open but there is +# disagreement on +tmp <- open.spells[final.state, nomatch=0] +tmp <- tmp[as.character(tmp$level) != as.character(tmp$i.level), list(title, type)] + +# handchecking each of the examples suggests that these are all due a lag in +# time between the creation of the SQL final.state data and the log. we can +# simply remove these from the final state data +r[["num.dropped.level.nomatch"]] <- nrow(tmp) + +# as a result, we can simply drop these form the final state +# an alternate approach would be to set these to NA as in: +# spells[tmp, level := as.character(NA), mult="last"] +final.state <- final.state[!tmp,] + +# now, we have to take the list of open spells and find the subset of the final +# state that does not match +open.spells[, expiry := end] +setkey(open.spells, title, type, level, expiry) +setkey(final.state, title, type, level, expiry) + +# BOOKMARK: BK-C +final.state.missing <- open.spells[!final.state,] +# BOOKMARK: BK-B +open.spells.missing <- final.state[!open.spells,] + +r[["num.final.state.missing"]] <- nrow(final.state.missing) +r[["num.spell.missing"]] <- nrow(open.spells.missing) + +# spells <- rbind(spells, missing.spells, missing.spells2) +spells <- rbind(spells, missing.spells) +setkey(spells, title) + +save(spells, file="spells.RData") + +# save several other computationally intensive datasets + +# load page info data +page.metadata.filename <- "enwiki-20150112-page.csv" +page.info <- fread(page.metadata.filename, + header=FALSE, stringsAsFactors=FALSE, na.strings=NULL, + select=1:3, showProgress=TRUE) + +setnames(page.info, c("page.id", "ns.num", "title")) + +r[["num.pages"]] <- nrow(page.info) +r[["num.pages.main"]] <- table(page.info$ns.num)[["0"]] + +## save and go home 
+########################################################## +save(r, file="sweave_data_spellgen.RData") + diff --git a/GPL-3 b/GPL-3 new file mode 100644 index 0000000..94a9ed0 --- /dev/null +++ b/GPL-3 @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. 
You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. 
+ + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. 
Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>. 
diff --git a/LICENSES b/LICENSES new file mode 100644 index 0000000..71d44ad --- /dev/null +++ b/LICENSES @@ -0,0 +1,42 @@ +Copyright: + + 2015 Benjamin Mako Hill http://mako.cc/academic/ + +Unless otherwise described in this file, this software is distributed as free software. +With the exception of files listed under one of the headings below, this software +is all distributed under the GNU General Public License version 3 or any later +version. + +A full copy of the license is available in the file GPL-3 included +alongside this licensing file. + +mysqldump_to_csv.py +------------------------------ + +This license applies to the file: + 02-mysqldump_to_csv.py + +Documentation on this software and its license can be found at: + https://github.com/jamesmishra/mysqldump-to-csv + +License: + The MIT License (MIT) + Copyright (c) 2014 James Mishra + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
diff --git a/README b/README new file mode 100644 index 0000000..5370398 --- /dev/null +++ b/README @@ -0,0 +1,160 @@ +Page Protection: Another Missing Dimension of Wikipedia Research +------------------------------------------------------------------------- + +| Author: Benjamin Mako Hill and Aaron Shaw +| Homepage: http://communitydata.cc/wiki-protection/ +| Code: http://projects.mako.cc/source/?p=protection-tools +| License: GNU GPLv3 or any later version +| Description: + + Tools to generate a protection spells dataset from "raw" MediaWiki + XML dumps like those published by the Wikimedia Foundation. + +General Information +---------------------- + +Page protection log data is very uneven and changed its format many +times. For several years before 2008-09, data on the specific rights +were recorded in comment fields but this was subject to incorrect +parsing because people sometimes added additional information (or +tried to reproduce the format "by hand" as well). + +This code is limited to extracting and parsing only the most reliable +information which means only protection data between 2008-09 and the +point of data collection. + +There are two sources of page protection data: + +- A log file that contains log events including protections, moves, + deletions, blocks, etc. + +- A snapshot from the Wikipedia MySQL database of protected pages at the + point that the database (and log) was exported. + +As mentioned above, our log data begins only in 2008-09 which means we +have data on protection spells that is both right censored (i.e., +ongoing protection events) and left censored (i.e., protection spells +that were ongoing in 2008-09). Because protection data was not +recorded reliably over Wikipedia's history, we believe that avoiding +censoring (either left or right) is not technically possible given the +data sources the WMF has published.
+ + <A> <B> + |---------|----------------|-----> +2004 2008 2015 + |--Our Dataset---| + +Because we don't have perfect data for the beginning of any window, +it means that events that happened before the window begins are invisible to us unless they are +ongoing at the point of data collection. When our dataset starts in +2008 (as it does in the dataset we produce here where the log format +stabilizes), this means that there are two conditions where data will +be missing (refer to the timeline figure above): + +a) If a page is protected in <A> and is deleted in <B> we will have no + indication that the page was protected at all and no page + protection spells will be included in the database. + +b) If a page is protected in <A> and its protection status is changed + at any time during period <B> (i.e., either by adjusting the + rights, updating the expiration time, or unprotecting the page) we + will know that the page was protected during the period from the + beginning of <B> until that change but we will not know the specific rights + associated with the protection. We will have complete data on the + protection status of that page from that change onward. + + +Running the Software +----------------------------- + +0. Install dependencies +=========================== + +- Python 3 +- GNU R +- `data.table` R package available on CRAN + +1. Download Dumps +========================== + +First, download a dump of MediaWiki log actions. WMF distributes these +for English Wikipedia at http://dumps.wikimedia.org. For example, the +latest version when we wrote this document was this file: + +https://dumps.wikimedia.org/enwiki/20150112/enwiki-20150112-pages-logging.xml.gz + +You will also need a table that includes the status of protected pages +at the point that data was created in the database. WMF distributes +these from http://dumps.wikimedia.org as well.
For example, the latest +version is here: + +https://dumps.wikimedia.org/enwiki/20150112/enwiki-20150112-page_restrictions.sql.gz + +Finally, because the table above maps only page ID numbers to +restriction events, we'll need a mapping of page IDs to page titles +which is contained in a table like this one (the latest at the time of +writing): + +https://dumps.wikimedia.org/enwiki/20150112/enwiki-20150112-page.sql.gz + +2. Parse SQL Tables +============================ + +An important first step is parsing the two SQL dump files into CSV +format. We can use the `02-mysqldump_to_csv.py` script to do this like: + +$ zcat enwiki-20150112-page_restrictions.sql.gz | ./02-mysqldump_to_csv.py \ + > enwiki-20150112-page_restrictions.csv + +$ zcat enwiki-20150112-page.sql.gz | ./02-mysqldump_to_csv.py \ + > enwiki-20150112-page.csv + +The first file is small and should be very quick. The second file is +larger but should still take only several minutes. + +The script is adapted from: https://github.com/jamesmishra/mysqldump-to-csv + +3. Parse Log File +================================= + +The log file that contains changes over time is much larger and will +take several hours to parse for English Wikipedia even on a very fast +machine. The page log can be parsed using the script like: + +$ zcat enwiki-20150112-pages-logging.xml.gz | ./03-parse_mw_eventlog.py + +This will produce several TSV files of the log file in several different +formats: + + output-moves.tsv + output-protections.tsv + output-deletions.tsv + +If you pass an XML filename to 03-parse_mw_eventlog.py, these files +will not be called output but something based on the root of the +filename. + +4. Import Data Into R +============================= + +Import all of the data that we've created into a series of RData +files. You'll need to first edit the file `04-import_data.R` so that +the input files (all defined at the very top of the script) match the +files that you've downloaded and created.
+ +Once you've done that, you can run the R script which will load and +process these: + +$ R --no-save < 04-import_data.R + +This will output an RData file called `processed_log_and_sql_data.RData`. + +5. Generate Spells +============================= + +Finally, we run the command that reads in all the prepared datasets and +generates the spells dataset: + +$ R --no-save < 05-generate_spells.R + +This will generate the final page protection dataset: `spells.RData` diff --git a/docs/README b/docs/README new file mode 120000 index 0000000..59a23c4 --- /dev/null +++ b/docs/README @@ -0,0 +1 @@ +../README \ No newline at end of file diff --git a/docs/biology_screenshot.png b/docs/biology_screenshot.png new file mode 100644 index 0000000..80f1c0a Binary files /dev/null and b/docs/biology_screenshot.png differ diff --git a/docs/homepage.rst b/docs/homepage.rst new file mode 100644 index 0000000..4ac1618 --- /dev/null +++ b/docs/homepage.rst @@ -0,0 +1,142 @@ +Page Protection Software and Dataset +================================================================== + +.. figure:: biology_screenshot.png + :align: right + :figwidth: 526px + + Example of the English Wikipedia article on Biology which has been + protected for long periods of time. Note the "View Source" button + instead of "Edit" and the small lock signaling that the page is + protected. + +**Page protection** is a `feature of MediaWiki software`__ that allows +administrators to restrict contributions to particular pages. For +example, a page can be “protected” so that only administrators or +logged-in editors with a history of good editing can edit, move, or +create it. + +__ https://www.mediawiki.org/wiki/Help:Protected_pages + +Protection might involve “full protection” where a page can only be +edited by administrators (i.e., “sysops”) or “semi-protection” where a +page can only be edited by accounts with a history of good edits +(i.e., “autoconfirmed” users).
+ +Although largely hidden, page protection profoundly shapes activity on +the site. For example, page protection is an important tool used to +manage access and participation in situations where vandalism or +interpersonal conflict can threaten to undermine content quality. +While protection affects only a small portion of pages in English +Wikipedia, many of the most highly viewed pages are protected. For +example, the “Main Page” in English Wikipedia has been protected since +February, 2006 and all Featured Articles are protected at the time +they appear on the site’s main page. Millions of viewers may never +edit Wikipedia because they never see an edit button. + +Despite its widespread and influential nature, very little +quantitative research on Wikipedia has taken page protection into +account systematically. This page contains software and data to help +Wikipedia researchers do exactly this in their work. + +Because a page's protection status can change over time, the snapshots +of page protection data stored by Wikimedia and `published by +the Wikimedia Foundation as dumps`__ are incomplete. As a result, taking +protection into account involves looking at several different sources +of data. + +__ http://dumps.wikimedia.org/ + +Much more detail can be found in our paper (currently under review) +`Page Protection: Another Missing Dimension of Wikipedia +Research`__. If you use this software or these data, we would +appreciate it if you cite the paper: + + *Hill, Benjamin Mako and Aaron Shaw. "Page Protection: Another + Missing Dimension of Wikipedia Research." Working Paper. 2015.* + +__ FORTHCOMING + +Page Protection Software +============================= + +Building page protection data is a multi-step and labor intensive +process. We have `publicly released software in Python and R to do +these two steps`__ under the `GNU GPL version 3`__.
The software is +designed for people already comfortable with working with MediaWiki +XML dumps and the tools and software necessary to do this. + +__ http://projects.mako.cc/source/?p=protection-tools +__ http://www.gnu.org/licenses/gpl-3.0.html + +You can download the software from our git repository like:: + + git clone git://projects.mako.cc/protection-tools + +Detailed documentation on how to use the software is available in `our +README file`__. + +__ README + +Page Protection Data +========================= + +.. figure:: protections_over_time.png + :align: right + :figwidth: 432px + + Count of pages protected from editing in English Wikipedia over + time for all pages and for the article namespace only. + +In `our paper`__, we present an analysis of page protection data from +English Wikipedia in the dump created in January 2015. You can +download `the dump files we used`__ from `the Wikimedia Foundation +dataset archive`__ and at the URLs detailed in the README__. Because +generating these dumps can be computationally intense, we have +published the output of the software above run on this dump. + +You can download the dataset in the following formats: + +- `RData`__ — Suitable for use in `GNU R`__ +- `bzip2 compressed tab separated values`__ — Suitable for use + in other languages and statistical packages. + +__ FORTHCOMING +__ README +__ http://dumps.wikimedia.org/enwiki/20150112/ +__ http://dumps.wikimedia.org/ +__ enwiki_201501-protection_spells-v1.RData +__ http://www.r-project.org/ +__ enwiki_201501-protection_spells-v1.tsv.bz2 + + +More Information +================== + +For details about the dataset, why it is important, and for examples on +how it can be used to come to better findings in Wikipedia research, +please read `the companion paper`__. + +__ FORTHCOMING + +If you notice issues or bugs in our data or `code`__, contact `Benjamin +Mako Hill`__ or `Aaron Shaw`__.
+ +__ http://projects.mako.cc/source/?p=protection-tools +__ http://mako.cc/contact/ +__ http://aaronshaw.org/ + +Patches and improvements are welcome! Details on `how to produce and send +a patch using git are online`__. + +__ http://projects.mako.cc/source/ + +---- + +ⓒ Copyright `Benjamin Mako Hill`__ and `Aaron Shaw`__ :: `Creative Commons BY-SA`__ :: Updated: Thu Jul 3 13:22:29 PDT 2014 + +__ http://mako.cc/academic/ +__ http://aaronshaw.org/ +__ http://creativecommons.org/licenses/by-sa/4.0/ + +.. LocalWords: png figwidth px autoconfirmed diff --git a/docs/protections_over_time.png b/docs/protections_over_time.png new file mode 100644 index 0000000..2f90846 Binary files /dev/null and b/docs/protections_over_time.png differ