tweaks based on testing on Aaron's system. mostly grammar/syntax clarifications
[redirect-tools] / 02-generate_spells.R
index d27996289dcfee2c1a7e9c0965f51d00df359afe..c2a21884025d363a126903e2a6c7ff91ed9fb3a3 100644 (file)
@@ -1,85 +1,19 @@
-Sys.setenv(TZ = "UTC")
+source("redirect_tools.R")
 
-library(data.table)
+cur.dir <- getwd()
 
-# options(cores = 20)
-# options(mc.cores = 20)
-# Sys.setenv(GOTO_NUM_THREADS=05)
-
-generate.spells <- function (page, d) {
-
-    x <- d[page,mult="all"]
-    x <- as.data.frame(x)
-    x <- x[sort.list(x$timestamp),]
-
-    # transform the target because there are some differences that don't matter
-    x$target <- gsub('_', ' ', x$target)
-    x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)
-    x$target <- gsub('\\#.*$', '', x$target)
-
-    if (dim(x)[1] > 1) {
-        x$redirect.prev <- c(FALSE, x$redirect[1:(length(x$redirect)-1)])
-        x$target.prev <- c(NA, x$target[1:(length(x$redirect)-1)])
-    } else {
-        x$redirect.prev <- FALSE
-        x$target.prev <- NA
-    }
-    
-    # get a list of transitions
-    x <- x[x$redirect != x$redirect.prev |
-           ((!is.na(x$target) & !is.na(x$target.prev)) &
-            x$target != x$target.prev),]
-
-   # if there is only one transition it stays that way
-    if (dim(x)[1] > 1) {
-        x$end <- c(x$timestamp[2:dim(x)[1]], NA)
-    } else {
-        x$end <- NA
-    }
-
-    x <- x[x$redirect == TRUE,]
-
-    # relabel the columsn
-    x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
-    colnames(x) <- c("page.id", "start", "end", "page.title", "target")
-    
-    return(x)
-}
-
-filename.to.spells <- function (filename) {
-    con <- pipe(paste("bzcat", filename))
-
-    d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
-                    encoding="UTF-8", quote="")
-
-    colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp",
-                     "deleted", "redirect", "target")
-
-    d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00")
-    
-    d <- d[!d$deleted,]
-
-    redirected.pages <- unique(d$page.title[d$redirect])
-
-    # convert to data.table
-    d <- as.data.table(d)
-    setkey(d, "page.title")
-
-    redirect.spells <- do.call("rbind", lapply(redirected.pages, generate.spells, d))
-
-    return(redirect.spells)
-}
+# build redirect spells from the files in the redirect data directory
+setwd(redirect.data.dir)
+redirect.spells <- filename.to.spells(list.files())
 
+setwd(cur.dir)
+setwd(spells.data.dir)
+save(redirect.spells, file="redirect_spells.RData")
 
-# save the run number
-run <- as.numeric(commandArgs(TRUE)) + 1
-run.string <- sprintf("%03d", run)
+write.table(redirect.spells, file="redirect_spells.tsv", sep="\t",  # write.csv would ignore sep
+            fileEncoding="UTF-8", row.names=FALSE)
 
-setwd("/nfs/home/B/bhill/data/wp-enwiki-redir")
-redirect.spells <- filename.to.spells(list.files()[run])
+library(foreign)
+write.dta(redirect.spells, file="redirect_spells.dta")
 
-setwd("/nfs/home/B/bhill/data/wp-enwiki-redir-spells")
-save(redirect.spells, file=paste("redirect_spells-", run.string, ".RData", sep=""))
 
-# debug code
-# filename <- "wp_edits_redir_070.tsv.bz2"

Benjamin Mako Hill || Want to submit a patch?