X-Git-Url: https://projects.mako.cc/source/redirect-tools/blobdiff_plain/6afdb261da3e6b7fe8d122614f052644556a3095..14977b34053a82232ac5aa1323e3ec894af627a0:/02-generate_spells.R

diff --git a/02-generate_spells.R b/02-generate_spells.R
index d279962..c2a2188 100644
--- a/02-generate_spells.R
+++ b/02-generate_spells.R
@@ -1,85 +1,20 @@
-Sys.setenv(TZ = "UTC")
+source("redirect_tools.R")
 
-library(data.table)
+cur.dir <- getwd()
 
-# options(cores = 20)
-# options(mc.cores = 20)
-# Sys.setenv(GOTO_NUM_THREADS=05)
-
-generate.spells <- function (page, d) {
-
-    x <- d[page,mult="all"]
-    x <- as.data.frame(x)
-    x <- x[sort.list(x$timestamp),]
-
-    # transform the target because there are some differences that don't matter
-    x$target <- gsub('_', ' ', x$target)
-    x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)
-    x$target <- gsub('\\#.*$', '', x$target)
-
-    if (dim(x)[1] > 1) {
-        x$redirect.prev <- c(FALSE, x$redirect[1:(length(x$redirect)-1)])
-        x$target.prev <- c(NA, x$target[1:(length(x$redirect)-1)])
-    } else {
-        x$redirect.prev <- FALSE
-        x$target.prev <- NA
-    }
-
-    # get a list of transitions
-    x <- x[x$redirect != x$redirect.prev |
-           ((!is.na(x$target) & !is.na(x$target.prev)) &
-            x$target != x$target.prev),]
-
-    # if there is only one transition it stays that way
-    if (dim(x)[1] > 1) {
-        x$end <- c(x$timestamp[2:dim(x)[1]], NA)
-    } else {
-        x$end <- NA
-    }
-
-    x <- x[x$redirect == TRUE,]
-
-    # relabel the columsn
-    x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
-    colnames(x) <- c("page.id", "start", "end", "page.title", "target")
-
-    return(x)
-}
-
-filename.to.spells <- function (filename) {
-    con <- pipe(paste("bzcat", filename))
-
-    d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
-                    encoding="UTF-8", quote="")
-
-    colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp",
-                     "deleted", "redirect", "target")
-
-    d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00")
-
-    d <- d[!d$deleted,]
-
-    redirected.pages <- unique(d$page.title[d$redirect])
-
-    # convert to data.table
-    d <- as.data.table(d)
-    setkey(d, "page.title")
-
-    redirect.spells <- do.call("rbind", lapply(redirected.pages, generate.spells, d))
-
-    return(redirect.spells)
-}
+# pass every file in redirect.data.dir to filename.to.spells()
+setwd(redirect.data.dir)
+redirect.spells <- filename.to.spells(list.files())
+setwd(cur.dir)
+setwd(spells.data.dir)
+save(redirect.spells, file="redirect_spells.RData")
 
-# save the run number
-run <- as.numeric(commandArgs(TRUE)) + 1
-run.string <- sprintf("%03d", run)
+# write.csv() ignores sep (with a warning), so use write.table() for the TSV
+write.table(redirect.spells, file="redirect_spells.tsv", sep="\t",
+            qmethod="double", fileEncoding="UTF-8", row.names=FALSE)
 
-setwd("/nfs/home/B/bhill/data/wp-enwiki-redir")
-redirect.spells <- filename.to.spells(list.files()[run])
+library(foreign)
+write.dta(redirect.spells, file="redirect_spells.dta")
 
-setwd("/nfs/home/B/bhill/data/wp-enwiki-redir-spells")
-save(redirect.spells, file=paste("redirect_spells-", run.string, ".RData", sep=""))
 
-# debug code
-# filename <- "wp_edits_redir_070.tsv.bz2"
 