-# options(cores = 20)
-# options(mc.cores = 20)
-# Sys.setenv(GOTO_NUM_THREADS=05)
-
# Build the redirect "spells" for a single page.
#
# Arguments:
#   page  value looked up in `d` via `d[page, mult="all"]`
#   d     table of revisions supporting that lookup (filename.to.spells
#         passes a data.table keyed on "page.title"); the columns read here
#         are page.id, page.title, timestamp, redirect and target
#
# Returns a data.frame with columns page.id, start, end, page.title, target:
# one row per transition into a redirect state (or to a different target).
# `start` is the timestamp of that transition and `end` is the timestamp of
# the next transition, or NA when there is none.  `end` always has the same
# class (and time zone) as `timestamp`, so per-page results rbind cleanly.
generate.spells <- function (page, d) {

    # all revisions of this page, oldest first
    x <- d[page, mult="all"]
    x <- as.data.frame(x)
    x <- x[sort.list(x$timestamp),]

    # transform the target because there are some differences that don't matter
    x$target <- gsub('_', ' ', x$target)                               # underscores -> spaces
    x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)  # upper-case first letter
    x$target <- gsub('\\#.*$', '', x$target)                           # drop "#..." suffix

    # lagged copies of redirect/target; the first revision is compared against
    # "not a redirect, no target".  Indexing with seq_len() keeps this valid
    # for zero-row and one-row inputs without a special case.
    n <- nrow(x)
    x$redirect.prev <- c(FALSE, x$redirect)[seq_len(n)]
    x$target.prev <- c(NA, x$target)[seq_len(n)]

    # get a list of transitions: the redirect flag flipped, or both targets
    # are known and differ
    is.transition <- x$redirect != x$redirect.prev |
        ((!is.na(x$target) & !is.na(x$target.prev)) &
         x$target != x$target.prev)
    x <- x[is.transition,]

    # each spell ends where the next transition starts.  Indexing one past
    # the end yields NA of the same class as `timestamp`, so the last spell
    # gets a typed NA instead of a logical one.
    n <- nrow(x)
    x$end <- x$timestamp[seq_len(n) + 1L]

    # only spells that are redirects are reported
    x <- x[x$redirect == TRUE,]

    # relabel the columns
    x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
    colnames(x) <- c("page.id", "start", "end", "page.title", "target")

    return(x)
}
-
# Read a bzip2-compressed, tab-delimited revision dump and return the
# redirect spells of every page that was ever a redirect.
#
# Arguments:
#   filename  path to the .bz2 file; it is decompressed by piping through
#             the external `bzcat` command
#
# Returns the row-bound result of generate.spells() over all redirected
# pages (NULL when no page in the file has a redirect revision).
#
# Relies on data.table (as.data.table, setkey) being attached by the caller.
filename.to.spells <- function (filename) {
    # quote the path so spaces and shell metacharacters reach bzcat intact;
    # expand "~" first because the shell will not expand it inside quotes
    con <- pipe(paste("bzcat", shQuote(path.expand(filename))))

    # the first line is skipped and columns are named by position below;
    # quote="" keeps quote characters in titles as literal text
    d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
                    encoding="UTF-8", quote="")

    colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp",
                     "deleted", "redirect", "target")

    d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00")

    # ignore deleted revisions
    d <- d[!d$deleted,]

    # only pages with at least one redirect revision can produce a spell
    redirected.pages <- unique(d$page.title[d$redirect])

    # convert to data.table keyed on title so per-page lookups are fast
    d <- as.data.table(d)
    setkey(d, "page.title")

    redirect.spells <- do.call("rbind", lapply(redirected.pages, generate.spells, d))

    return(redirect.spells)
}