5 dump.creation.time <- as.POSIXct("2015-01-12 00:00:00 utc")
9 ## step 2: merge moves and deletions into an expanded log
10 #################################################################
11 load("processed_log_and_sql_data.RData")
13 r[["num.final.state.orig"]] <- nrow(final.state)
14 r[["num.log"]] <- nrow(log)
15 r[["num.moves"]] <- nrow(log)
16 r[["num.dels"]] <- nrow(dels)
17 r[["dump.creation.time"]] <- dump.creation.time
19 events <- log[,list(title, id)]
20 events[, type := 'protect']
22 tmp <- moves[,list(from.title, id)]
24 setnames(tmp, "from.title", "title")
26 tmp2 <- dels[, list(title, id)]
27 tmp2[, type := 'delete']
29 events <- rbindlist(list(events, tmp, tmp2))
32 # convert type to a factor and sort
33 events[, type := as.factor(type)]
35 # dropping missing titles
36 events <- events[!is.na(events$title),]
40 # return true if next event is move
41 get.next.event <- function (move.item) {
42 destination <- move.item[,to.title]
43 log.id <- move.item[,id]
45 x <- events[destination,]
46 x <- x[x$id > log.id,]
48 # return the first item in the event list
49 # this will return a one row data.table of all NA if it's missing...
50 return(x[sort.list(x$id)[1],])
54 ## turn/expand moves into a protection and unprotection events
55 ###############################################################
56 # build list of uninterupted moves; was end = end of y^
57 build.move.chains <- function (move.chain){
58 last.move <- move.chain[nrow(move.chain),]
59 next.event <- get.next.event(last.move)
61 if (!all(is.na(next.event))) {
62 if (next.event[, type] == "move") {
63 move.chain <- rbindlist(list(move.chain, moves.tmp[J(next.event[,id]),]))
64 build.move.chains(move.chain)
66 return(list(move.chain, FALSE))
69 return(list(move.chain, TRUE))
77 while (nrow(moves.tmp) > 0) {
80 rv <- build.move.chains(move)
82 ends.with.move <- rv[[2]]
84 for (i in seq(1, nrow(move.chain))) {
85 moves.tmp <- moves.tmp[!J(move.chain[i, id]),]
88 moves.result[[length(moves.result)+1]] <- list(move.chain, ends.with.move)
92 explode.move.chain <- function (chain, rights) {
93 log.unprot <- data.table(id=chain$id, title=chain$from.title,
94 log.type="unprotect", log.time=chain$log.time,
95 type="move", level=NA, expiry=NA)
96 log.prot <- data.table(id=chain$id, title=chain$to.title,
97 log.type="protect", log.time=chain$log.time)
99 # if, and only if, there are no previous events, we'll add right censored event
100 if (!any(events[chain[1,from.title],id] < chain[1, id])) {
101 log.prot <- rbind(log.prot,
102 data.table(id=NA, title=chain[1,from.title], log.type="protect",
106 # copy data.table once per right
107 log.prot <- cbind(log.prot[rep(seq_len(nrow(log.prot)), nrow(rights)),],
108 rights[unlist(lapply(seq_len(nrow(rights)),
109 function (i) {rep(i, nrow(log.prot))})),])
111 # put together the unprotect and protect events and return them
112 return(rbind(log.unprot, log.prot))
115 # check in the final state data
116 explode.with.final.state.data <- function (chain) {
117 final.title <- chain[nrow(chain), to.title]
119 # this will be set to a list of rights, or all NA if they are missing
120 rights <- final.state[final.title, list(type, level, expiry)]
122 # return entries for the log
123 explode.move.chain(chain, rights)
126 setkey(final.state, title)
128 moves.results.moveterm <- moves.result[sapply(moves.result, function (x) {x[[2]]})]
129 moves.results.moveterm <- lapply(moves.results.moveterm, function (x) {x[[1]]})
131 log.moveterm <- rbindlist(mclapply(moves.results.moveterm, explode.with.final.state.data))
133 moves.results.otherterm <- moves.result[!sapply(moves.result, function (x) {x[[2]]})]
134 moves.results.otherterm <- lapply(moves.results.otherterm, function (x) {x[[1]]})
136 log.otherterm <- rbindlist(mclapply(moves.results.otherterm,
137 function (x) { explode.move.chain(x, data.table(type=NA, level=NA, expiry=NA)) }))
139 expanded.log <- rbind(log, log.moveterm, log.otherterm,
140 data.table(id=dels$id, title=dels$title, log.type="unprotect",
141 log.time=dels$log.time, type="delete", level=NA, expiry=NA))
143 setkey(expanded.log, title)
145 save(expanded.log, file="expanded_log.RData")
146 # load("expanded_log.RData"); load("processed_log_and_sql_data.RData")
149 generate.spells <- function (page.title, d) {
153 spells <- data.table()
154 tmp.spells <- data.table()
156 for (i in seq_len(nrow(x))) {
157 row <- as.list(x[i,])
159 # if it's the first time, through and we're seeing an uprot, create a l-cens event
160 if (i == 1 && row[["log.type"]] == "unprotect") {
161 tmp.spells <- data.table(title=row[["title"]], type=NA, level=NA, start=NA, end=NA)
164 # first, see if any of the previous tmp.spells expired naturally
165 if (nrow(tmp.spells) > 0 &&
166 nrow(tmp.spells[!is.na(tmp.spells$end),]) > 0 &&
167 nrow(tmp.spells[tmp.spells$end < row[["log.time"]],]) > 0) {
168 # if they did, add them to spells and drop them from tmp.spells
169 spells <- rbind(spells, tmp.spells[!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]],])
170 tmp.spells <- tmp.spells[!(!is.na(tmp.spells$end) & tmp.spells$end < row[["log.time"]]),]
173 # otherwise, see if the prevoius spell was a protect/modify and ended a
175 if (prev.mod && !is.na(prev.id) && prev.id != row[["id"]]) {
176 unlisted.types <- tmp.spells[,type][!tmp.spells[,type] %in% x[x$id == row[["id"]], type]]
177 tmp.tmp.spells <- tmp.spells[tmp.spells$type %in% unlisted.types,]
178 tmp.tmp.spells$end <- x[x$id == row[["id"]],log.time]
179 spells <- rbind(spells, tmp.tmp.spells)
181 tmp.spells <- tmp.spells[!tmp.spells$type %in% unlisted.types,]
184 # if we are adding a new protection event
185 if (row[["log.type"]] %in% c("protect", "modify")) {
186 prev.mod <- TRUE; prev.id <- row[["id"]]
187 # if there is an active spell that conflicts with the current bit,
188 # end them with the spells start time
189 if (nrow(tmp.spells) > 0) {
190 # we could be missing previous data on type in which case we want to replace
191 # and add to the spells
192 if (all(is.na(tmp.spells$type))) {
193 tmp.spells$end <- row[["log.time"]]
194 spells <- rbind(spells, tmp.spells)
195 tmp.spells <- data.table()
196 } else if (is.na(row[["type"]])) {
199 conflict <- tmp.spells$type == row[["type"]]
200 # if it's the same rights, we update the expiry date and continue
201 if (any(conflict) && tmp.spells[conflict,level] == row[["level"]]) {
202 tmp.spells$end[conflict] <- row[["expiry"]]
205 # otherwise, we end the spell
206 tmp.spells$end[conflict] <- row[["log.time"]]
207 spells <- rbind(spells, tmp.spells[conflict,])
208 tmp.spells <- tmp.spells[!conflict,]
212 # add the new spell to the list of active spells
213 tmp.spells <- rbind(tmp.spells,
214 data.table(title=row[["title"]], type=row[["type"]], level=row[["level"]],
215 start=row[["log.time"]], end=row[["expiry"]]))
217 # if it's an unprotection event and we're sitting on existing events
218 } else if (row[["log.type"]] == "unprotect" && nrow(tmp.spells) > 0) {
219 prev.mod <- FALSE; prev.id <- row[["id"]]
220 # end /all/ active spells and add them to spells
221 tmp.spells$end <- row[["log.time"]]
222 spells <- rbind(spells, tmp.spells)
223 tmp.spells <- data.table()
226 # if this is the final time through, add any active spells
227 if (nrow(tmp.spells) > 0) {
228 spells <- rbind(spells, tmp.spells)
230 tmp.spells <- data.table()
231 if (any(is.na(spells$title))) { print(page.title) }
235 page.titles <- unique(expanded.log$title)
236 page.titles <- page.titles[!is.na(page.titles)]
237 page.titles <- page.titles[page.titles != ""]
238 #spells <- rbindlist(lapply(page.titles, generate.spells, expanded.log))
239 spells <- rbindlist(mclapply(page.titles, generate.spells, expanded.log))
241 save(spells, file="spells-nofinal.RData")
243 # load("spells-nofinal.RData"); load("processed_log_and_sql_data.RData")
245 # remove cascacading data from final.state data to allow for merging later
246 final.state[, cascade := NULL]
248 # set any ongoing spells and final state data ongoing at the point of data
249 # collection to right censored
250 spells <- spells[spells$end > dump.creation.time, end := NA]
251 final.state <- final.state[final.state$expiry > dump.creation.time, expiry := NA]
253 # drop log entries from outside our data collection window
254 spells <- spells[spells$start < dump.creation.time,]
256 # drop any spells other than edit, move or upload for which we have no final
257 # state data; and any final state data along the same lines
258 spells <- spells[spells$type %in% c("edit", "move", "upload"),]
259 final.state <- final.state[final.state$type %in% c("edit", "move", "upload"),]
261 # TODO/FIX? handle the two extra NA dropped here
262 spells <- spells[!is.na(spells$title),]
264 r[["num.spells.orig"]] <- nrow(spells)
266 # we're now going to load on any final.state data for missing spells
267 setkey(final.state, title, type)
268 setkey(spells, title, type)
270 ## merge final.state data with spells
271 #######################################################################
272 # there are several situations we need to take into account:
273 # 1. final.state for pages for which we have no record: we create right/left
274 # censored spells for these
275 # 2. we have an open spells. final.state data that matches data we have in log
276 # data. we can discard this final state data
277 # 3. we have an open spell and final state data, but they disagree.
278 # 4. final state data for spells our log data suggests should be closed: add right/
279 # 5. log data suggests is should be open
281 # 1. final.state for pages for which we have no record: we create right/left
282 # censored spells for these. most likely, these were spells started before
283 # 2008. as a result, we grab final.state data for those spells
284 missing.spells <- final.state[is.na(spells[final.state, level, mult="first"]),]
285 missing.spells <- missing.spells[!missing.spells$type == "aft",]
286 missing.spells[, start := NA]
287 missing.spells[, end := NA]
288 missing.spells[, page.id := NULL]
289 missing.spells[, expiry := NULL]
291 # drop these missing spells from the final.state
292 final.state <- final.state[!missing.spells[,list(title, type)]]
294 # print the number of missing spells
295 r[["num.created.from.final.state"]] <- nrow(missing.spells)
296 r[["num.created.from.final.state.pages"]] <- length(unique(missing.spells$title))
298 # to answer 2+ we first need create a dataset of spells from the log that are
299 # open at the time of data.collection. these will either be because they have
300 # an infinite expiry or because they were ongoing at the time of
302 open.spells <- spells[is.na(spells$end),]
303 setkey(open.spells, start)
305 setkey(open.spells, title, type)
306 setkey(final.state, title, type)
309 # look for spells that both final.state and log suggests are open but there is
311 tmp <- open.spells[final.state, nomatch=0]
312 tmp <- tmp[as.character(tmp$level) != as.character(tmp$i.level), list(title, type)]
314 # handchecking each of the examples suggests that these are all due a lag in
315 # time between the creation of the SQL final.state data and the log. we can
316 # simply remove these from the final state data
317 r[["num.dropped.level.nomatch"]] <- nrow(tmp)
319 # as a result, we can simply drop these form the final state
320 # an alternate approach would be to set these to NA as in:
321 # spells[tmp, level := as.character(NA), mult="last"]
322 final.state <- final.state[!tmp,]
324 # now, we have to take the list of open spells and find the subset of the final
325 # state that does not match
326 open.spells[, expiry := end]
327 setkey(open.spells, title, type, level, expiry)
328 setkey(final.state, title, type, level, expiry)
331 final.state.missing <- open.spells[!final.state,]
333 open.spells.missing <- final.state[!open.spells,]
335 r[["num.final.state.missing"]] <- nrow(final.state.missing)
336 r[["num.spell.missing"]] <- nrow(open.spells.missing)
338 # spells <- rbind(spells, missing.spells, missing.spells2)
339 spells <- rbind(spells, missing.spells)
340 setkey(spells, title)
342 save(spells, file="spells.RData")
344 # save several other computationally intensive datasets
346 # load page info data
347 page.metadata.filename <- "enwiki-20150112-page.csv"
348 page.info <- fread(page.metadata.filename,
349 header=FALSE, stringsAsFactors=FALSE, na.strings=NULL,
350 select=1:3, showProgress=TRUE)
352 setnames(page.info, c("page.id", "ns.num", "title"))
354 r[["num.pages"]] <- nrow(page.info)
355 r[["num.pages.main"]] <- table(page.info$ns.num)[["0"]]
358 ##########################################################
359 save(r, file="sweave_data_spellgen.RData")