page.restrictions.filename <- "enwiki-20150112-page_restrictions.csv"
page.metadata.filename <- "enwiki-20150112-page.csv"
log.filename <- "output-protections.tsv"
moves.filename <- "output-moves.tsv"
dels.filename <- "output-deletions.tsv"

setwd("~/protection")
# work in UTC throughout (note: the environment variable is upper-case TZ)
Sys.setenv(TZ = "UTC")

library(parallel)
library(data.table)

## 0. load namespace data
wp.ns <- read.csv("wikipedia_namespaces.csv", header=TRUE,
                  stringsAsFactors=FALSE)

# drop aliases completely
wp.ns <- wp.ns[!wp.ns$alias,]
wp.ns$alias <- NULL

setDT(wp.ns)
setnames(wp.ns, c("ns.num", "ns.name"))
wp.ns[, ns.name := gsub(' ', '_', ns.name)]
setkey(wp.ns, "ns.num")

## 1. load the protection snapshot data from the parsed SQL file
final.state <- fread(page.restrictions.filename, header=FALSE,
                     na.strings=NULL, colClasses=list(character=6))
setnames(final.state, c("page.id", "type", "level", "cascade", "user.id",
                        "expiry", "id"))

# change several column types and drop the unused columns
final.state[, cascade := as.logical(cascade)]
final.state[, type := as.factor(type)]
final.state[, level := as.factor(level)]
final.state[, c("user.id", "id") := NULL]

# NULL expiry values seem to mean infinite protection: we've confirmed that
# several of these pages are still protected even though their expiry is NULL
final.state$expiry[final.state$expiry == "infinity"] <- NA
final.state$expiry[final.state$expiry == "NULL"] <- NA
# MediaWiki timestamps are 14-digit YYYYMMDDHHMMSS, so use %S (not %s)
final.state$expiry <- as.POSIXct(final.state$expiry, format="%Y%m%d%H%M%S",
                                 tz="UTC")

# print the range
range(final.state$expiry, na.rm=TRUE)

setkey(final.state, "page.id")

## 2. load the page information for the id/title mapping
page.info <- fread(page.metadata.filename, header=FALSE,
                   stringsAsFactors=FALSE, na.strings=NULL, select=1:3,
                   showProgress=TRUE)
setnames(page.info, c("page.id", "ns.num", "title"))

# merge namespace data into the data.table and build full titles by
# prefixing the namespace name onto every title outside the main namespace
setkey(page.info, "ns.num")
page.info <- wp.ns[page.info]
page.info[ns.num != 0, title := paste(ns.name, title, sep=":")]
page.info[, c("ns.num", "ns.name") := NULL]

# merge the page titles onto the final state data
setkey(page.info, "page.id")
final.state <- page.info[final.state]

# some page ids have no title (pages deleted?) so we'll drop those rows
table(is.na(final.state$title))
final.state <- final.state[!is.na(final.state$title),]

## 3. load the page log data from the TSV file
log <- fread(log.filename, header=FALSE, na.strings="")
setnames(log, c("id", "title", "log.type", "log.time", "type", "level",
                "expiry"))
log[, log.type := as.factor(log.type)]
log[, type := as.factor(type)]
log[, level := as.factor(level)]
log[, log.time := as.POSIXct(log.time, tz="UTC")]
log[, expiry := as.POSIXct(expiry, tz="UTC")]
log[, title := gsub(' ', '_', title)]

# clean up the log file by dropping invalid entries (e.g., protections whose
# expiry precedes the time they were applied). these all occur within the
# same minute and seem to be a MediaWiki bug (e.g., "EEE")
log <- log[is.na(log$expiry) | !log$expiry < log$log.time,]

## 4. load in the move data from the TSV file
moves <- fread(moves.filename, sep="\t", header=FALSE)
setnames(moves, c("id", "log.time", "from.title", "to.title"))

# drop the id column
moves[, id := NULL]
moves[, from.title := gsub(' ', '_', from.title)]
moves[, to.title := gsub(' ', '_', to.title)]
moves[, log.time := as.POSIXct(log.time, tz="UTC")]
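## optional sanity check (a sketch, not part of the original pipeline):
## as.POSIXct() silently returns NA on unparseable input, so confirm that no
## protection-log or move timestamps were lost during parsing before we rely
## on them below
stopifnot(!any(is.na(log$log.time)), !any(is.na(moves$log.time)))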
## 5. load in the deletion data from the TSV file
dels <- fread(dels.filename, header=FALSE, na.strings="NA")
setnames(dels, c("id", "title", "log.type", "log.time"))

# drop the id column
dels[, id := NULL]
dels[, log.time := as.POSIXct(log.time, tz="UTC")]
dels[, title := gsub(' ', '_', title)]

# because revision deletion does not affect protection and restoration does
# not restore restrictions, we can safely limit this to delete events only
# and then drop the log.type column
dels <- dels[dels$log.type == "delete",]
dels[, log.type := NULL]

# save temporary state
save(log, moves, dels, final.state, file="processed_log_and_sql_data.RData")
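## optional round-trip check (a sketch, assuming memory for a second copy of
## the tables): load the saved image into a fresh environment and confirm
## that all four tables were written out intact
chk <- new.env()
load("processed_log_and_sql_data.RData", envir=chk)
stopifnot(identical(sort(ls(chk)), c("dels", "final.state", "log", "moves")))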