X-Git-Url: https://projects.mako.cc/source/redirect-tools/blobdiff_plain/6afdb261da3e6b7fe8d122614f052644556a3095..94bd17bbb5059ab82da990a6488cbe6eb3d5c42d:/redirect_tools.R?ds=sidebyside diff --git a/redirect_tools.R b/redirect_tools.R new file mode 100644 index 0000000..00c9801 --- /dev/null +++ b/redirect_tools.R @@ -0,0 +1,79 @@ +# EDIT ME: Configuration options are in the stanza below +####################################################################### +redirect.data.dir <- "output/redir" +spells.data.dir <- "output/spells" +output.data.dir <- "output" + +# functions and code used elsewhere +####################################################################### +Sys.setenv(TZ = "UTC") + +library(data.table) + +# options(cores = 20) +# options(mc.cores = 20) +# Sys.setenv(GOTO_NUM_THREADS=05) + +generate.spells <- function (page, d) { + + x <- d[page,mult="all"] + x <- as.data.frame(x) + x <- x[sort.list(x$timestamp),] + + # transform the target because there are some differences that don't matter + x$target <- gsub('_', ' ', x$target) + x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE) + x$target <- gsub('\\#.*$', '', x$target) + + if (dim(x)[1] > 1) { + x$redirect.prev <- c(FALSE, x$redirect[1:(length(x$redirect)-1)]) + x$target.prev <- c(NA, x$target[1:(length(x$redirect)-1)]) + } else { + x$redirect.prev <- FALSE + x$target.prev <- NA + } + + # get a list of transitions + x <- x[x$redirect != x$redirect.prev | + ((!is.na(x$target) & !is.na(x$target.prev)) & + x$target != x$target.prev),] + + # if there is only one transition it stays that way + if (dim(x)[1] > 1) { + x$end <- c(x$timestamp[2:dim(x)[1]], NA) + } else { + x$end <- NA + } + + x <- x[x$redirect == TRUE,] + + # relabel the columsn + x <- x[,c("page.id", "timestamp", "end", "page.title", "target")] + colnames(x) <- c("page.id", "start", "end", "page.title", "target") + + return(x) +} + +filename.to.spells <- function (filename) { + con <- pipe(paste("bzcat", filename)) + + d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1, + encoding="UTF-8", quote="") + + colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp", + "deleted", "redirect", "target") + + d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00") + + d <- d[!d$deleted,] + + redirected.pages <- unique(d$page.title[d$redirect]) + + # convert to data.table + d <- as.data.table(d) + setkey(d, "page.title") + + redirect.spells <- rbindlist(lapply(redirected.pages, generate.spells, d)) + + return(redirect.spells) +}