Sys.setenv(tz = "utc") library(parallel) library(data.table) dump.creation.time <- as.POSIXct("2015-01-12 00:00:00 utc") r <- list() ## step 2: merge moves and deletions into an expanded log ################################################################# load("processed_log_and_sql_data.RData") r[["num.final.state.orig"]] <- nrow(final.state) r[["num.log"]] <- nrow(log) r[["num.moves"]] <- nrow(log) r[["num.dels"]] <- nrow(dels) r[["dump.creation.time"]] <- dump.creation.time events <- log[,list(title, id)] events[, type := 'protect'] tmp <- moves[,list(from.title, id)] tmp[, type := 'move'] setnames(tmp, "from.title", "title") tmp2 <- dels[, list(title, id)] tmp2[, type := 'delete'] events <- rbindlist(list(events, tmp, tmp2)) rm(tmp, tmp2) # convert type to a factor and sort events[, type := as.factor(type)] # dropping missing titles events <- events[!is.na(events$title),] setkey(events, title) # return true if next event is move get.next.event <- function (move.item) { destination <- move.item[,to.title] log.id <- move.item[,id] x <- events[destination,] x <- x[x$id > log.id,] # return the first item in the event list # this will return a one row data.table of all NA if it's missing... return(x[sort.list(x$id)[1],]) } # BOOKMARK: BM-D ## turn/expand moves into a protection and unprotection events ############################################################### # build list of uninterupted moves; was end = end of y^ build.move.chains <- function (move.chain){ last.move <- move.chain[nrow(move.chain),] next.event <- get.next.event(last.move) if (!all(is.na(next.event))) { if (next.event[, type] == "move") { move.chain <- rbindlist(list(move.chain, moves.tmp[J(next.event[,id]),])) build.move.chains(move.chain) } else { return(list(move.chain, FALSE)) } } else { return(list(move.chain, TRUE)) } } setkey(moves, id) moves.tmp <- moves moves.result = list() while (nrow(moves.tmp) > 0) { move <- moves.tmp[1,] rv <- build.move.chains(move) move.chain <- rv[[1]] ends.with.move <- rv[[2]] for (i in seq(1, nrow(move.chain))) { moves.tmp <- moves.tmp[!J(move.chain[i, id]),] } moves.result[[length(moves.result)+1]] <- list(move.chain, ends.with.move) } explode.move.chain <- function (chain, rights) { log.unprot <- data.table(id=chain$id, title=chain$from.title, log.type="unprotect", log.time=chain$log.time, type="move", level=NA, expiry=NA) log.prot <- data.table(id=chain$id, title=chain$to.title, log.type="protect", log.time=chain$log.time) # if, and only if, there are no previous events, we'll add right censored event if (!any(events[chain[1,from.title],id] < chain[1, id])) { log.prot <- rbind(log.prot, data.table(id=NA, title=chain[1,from.title], log.type="protect", log.time=NA)) } # copy data.table once per right log.prot <- cbind(log.prot[rep(seq_len(nrow(log.prot)), nrow(rights)),], rights[unlist(lapply(seq_len(nrow(rights)), function (i) {rep(i, nrow(log.prot))})),]) # put together the unprotect and protect events and return them return(rbind(log.unprot, log.prot)) } # check in the final state data explode.with.final.state.data <- function (chain) { final.title <- chain[nrow(chain), to.title] # this will be set to a list of rights, or all NA if they are missing rights <- final.state[final.title, list(type, level, expiry)] # return entries for the log explode.move.chain(chain, rights) } setkey(final.state, title) moves.results.moveterm <- moves.result[sapply(moves.result, function (x) {x[[2]]})] moves.results.moveterm <- lapply(moves.results.moveterm, function (x) {x[[1]]}) log.moveterm <- rbindlist(mclapply(moves.results.moveterm, explode.with.final.state.data)) moves.results.otherterm <- moves.result[!sapply(moves.result, function (x) {x[[2]]})] moves.results.otherterm <- lapply(moves.results.otherterm, function (x) {x[[1]]}) log.otherterm <- rbindlist(mclapply(moves.results.otherterm, function (x) { explode.move.chain(x, data.table(type=NA, level=NA, expiry=NA)) })) expanded.log <- rbind(log, log.moveterm, log.otherterm, data.table(id=dels$id, title=dels$title, log.type="unprotect", log.time=dels$log.time, type="delete", level=NA, expiry=NA)) setkey(expanded.log, title) save(expanded.log, file="expanded_log.RData") # load("expanded_log.RData"); load("processed_log_and_sql_data.RData") ### GENERATE SPELLS generate.spells <- function (page.title, d) { x <- d[page.title,] setkey(x, id) spells <- data.table() tmp.spells <- data.table() prev.mod <- FALSE for (i in seq_len(nrow(x))) { row <- as.list(x[i,]) # if it's the first time, through and we're seeing an uprot, create a l-cens event if (i == 1 && row[["log.type"]] == "unprotect") { tmp.spells <- data.table(title=row[["title"]], type=NA, level=NA, start=NA, end=NA) } # first, see if any of the previous tmp.spells expired naturally if (nrow(tmp.spells) > 0 && nrow(tmp.spells[!is.na(tmp.spells$end),]) > 0 && nrow(tmp.spells[tmp.spells$end < row[["log.time"]],]) > 0) { # if they did, add them to spells and drop them from tmp.spells spells <- rbind(spells, tmp.spells[!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]],]) tmp.spells <- tmp.spells[!(!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]]),] } # otherwise, see if the prevoius spell was a protect/modify and ended a # spell by omission if (prev.mod && !is.na(prev.id) && prev.id != row[["id"]]) { unlisted.types <- tmp.spells[,type][!tmp.spells[,type] %in% x[x$id == row[["id"]], type]] tmp.tmp.spells <- tmp.spells[tmp.spells$type %in% unlisted.types,] tmp.tmp.spells$end <- x[x$id == row[["id"]],log.time] spells <- rbind(spells, tmp.tmp.spells) tmp.spells <- tmp.spells[!tmp.spells$type %in% unlisted.types,] } # if we are adding a new protection event if (row[["log.type"]] %in% c("protect", "modify")) { prev.mod <- TRUE; prev.id <- row[["id"]] # if there is an active spell that conflicts with the current bit, # end them with the spells start time if (nrow(tmp.spells) > 0) { # we could be missing previous data on type in which case we want to replace # and add to the spells if (all(is.na(tmp.spells$type))) { tmp.spells$end <- row[["log.time"]] spells <- rbind(spells, tmp.spells) tmp.spells <- data.table() } else if (is.na(row[["type"]])) { next } else { conflict <- tmp.spells$type == row[["type"]] # if it's the same rights, we update the expiry date and continue if (any(conflict) && tmp.spells[conflict,level] == row[["level"]]) { tmp.spells$end[conflict] <- row[["expiry"]] next } # otherwise, we end the spell tmp.spells$end[conflict] <- row[["log.time"]] spells <- rbind(spells, tmp.spells[conflict,]) tmp.spells <- tmp.spells[!conflict,] } } # add the new spell to the list of active spells tmp.spells <- rbind(tmp.spells, data.table(title=row[["title"]], type=row[["type"]], level=row[["level"]], start=row[["log.time"]], end=row[["expiry"]])) # if it's an unprotection event and we're sitting on existing events } else if (row[["log.type"]] == "unprotect" && nrow(tmp.spells) > 0) { prev.mod <- FALSE; prev.id <- row[["id"]] # end /all/ active spells and add them to spells tmp.spells$end <- row[["log.time"]] spells <- rbind(spells, tmp.spells) tmp.spells <- data.table() } } # if this is the final time through, add any active spells if (nrow(tmp.spells) > 0) { spells <- rbind(spells, tmp.spells) } tmp.spells <- data.table() if (any(is.na(spells$title))) { print(page.title) } return(spells) } page.titles <- unique(expanded.log$title) page.titles <- page.titles[!is.na(page.titles)] page.titles <- page.titles[page.titles != ""] #spells <- rbindlist(lapply(page.titles, generate.spells, expanded.log)) spells <- rbindlist(mclapply(page.titles, generate.spells, expanded.log)) save(spells, file="spells-nofinal.RData") # load("spells-nofinal.RData"); load("processed_log_and_sql_data.RData") # remove cascacading data from final.state data to allow for merging later final.state[, cascade := NULL] # set any ongoing spells and final state data ongoing at the point of data # collection to right censored spells <- spells[spells$end > dump.creation.time, end := NA] final.state <- final.state[final.state$expiry > dump.creation.time, expiry := NA] # drop log entries from outside our data collection window spells <- spells[spells$start < dump.creation.time,] # drop any spells other than edit, move or upload for which we have no final # state data; and any final state data along the same lines spells <- spells[spells$type %in% c("edit", "move", "upload"),] final.state <- final.state[final.state$type %in% c("edit", "move", "upload"),] # TODO/FIX? handle the two extra NA dropped here spells <- spells[!is.na(spells$title),] r[["num.spells.orig"]] <- nrow(spells) # we're now going to load on any final.state data for missing spells setkey(final.state, title, type) setkey(spells, title, type) ## merge final.state data with spells ####################################################################### # there are several situations we need to take into account: # 1. final.state for pages for which we have no record: we create right/left # censored spells for these # 2. we have an open spells. final.state data that matches data we have in log # data. we can discard this final state data # 3. we have an open spell and final state data, but they disagree. # 4. final state data for spells our log data suggests should be closed: add right/ # 5. log data suggests is should be open # 1. final.state for pages for which we have no record: we create right/left # censored spells for these. most likely, these were spells started before # 2008. as a result, we grab final.state data for those spells missing.spells <- final.state[is.na(spells[final.state, level, mult="first"]),] missing.spells <- missing.spells[!missing.spells$type == "aft",] missing.spells[, start := NA] missing.spells[, end := NA] missing.spells[, page.id := NULL] missing.spells[, expiry := NULL] # drop these missing spells from the final.state final.state <- final.state[!missing.spells[,list(title, type)]] # print the number of missing spells r[["num.created.from.final.state"]] <- nrow(missing.spells) r[["num.created.from.final.state.pages"]] <- length(unique(missing.spells$title)) # to answer 2+ we first need create a dataset of spells from the log that are # open at the time of data.collection. these will either be because they have # an infinite expiry or because they were ongoing at the time of # data.collection. open.spells <- spells[is.na(spells$end),] setkey(open.spells, start) setkey(open.spells, title, type) setkey(final.state, title, type) # BOOKMARK: BK-A # look for spells that both final.state and log suggests are open but there is # disagreement on tmp <- open.spells[final.state, nomatch=0] tmp <- tmp[as.character(tmp$level) != as.character(tmp$i.level), list(title, type)] # handchecking each of the examples suggests that these are all due a lag in # time between the creation of the SQL final.state data and the log. we can # simply remove these from the final state data r[["num.dropped.level.nomatch"]] <- nrow(tmp) # as a result, we can simply drop these form the final state # an alternate approach would be to set these to NA as in: # spells[tmp, level := as.character(NA), mult="last"] final.state <- final.state[!tmp,] # now, we have to take the list of open spells and find the subset of the final # state that does not match open.spells[, expiry := end] setkey(open.spells, title, type, level, expiry) setkey(final.state, title, type, level, expiry) # BOOKMARK: BK-C final.state.missing <- open.spells[!final.state,] # BOOKMARK: BK-B open.spells.missing <- final.state[!open.spells,] r[["num.final.state.missing"]] <- nrow(final.state.missing) r[["num.spell.missing"]] <- nrow(open.spells.missing) # spells <- rbind(spells, missing.spells, missing.spells2) spells <- rbind(spells, missing.spells) setkey(spells, title) save(spells, file="spells.RData")