--- /dev/null
+#!/usr/bin/env python27
+
+from wmf import dump
+import sys
+import re
+
+dumpIterator = dump.Iterator(sys.stdin)
+
+print(u"\t".join(["page.id", "revision.id", "page.title", "timestamp", "deleted", "redirect", "target"]))
+
+for page in dumpIterator.readPages():
+ #Do things with a page
+ #like extract it's title: page.getTitle()
+ #or it's ID: page.getId()
+
+ for revision in page.readRevisions():
+ rev_data = []
+
+ rev_data.append(unicode(page.getId()))
+ rev_data.append(unicode(revision.getId()))
+ rev_data.append(unicode(page.getTitle()))
+ rev_data.append(unicode(revision.getTimestamp()))
+
+ text = revision.getText()
+
+ if text == None:
+ rev_data.append("TRUE") # revision was deleted
+ rev_data.append("NA") # redirect bool = unknown
+ rev_data.append("NA") # redirect target missing
+ else:
+ rev_data.append("FALSE") # revision was not deleted
+ match = re.match(r"^#redirect \[\[(.*)\]\]", text, re.IGNORECASE)
+ if match:
+ target = match.group(1)
+ rev_data.append("TRUE") # redirect bool = TRUE
+ rev_data.append(target) # redirect target
+ else:
+ rev_data.append("FALSE") # redirect bool = FALSE
+ rev_data.append("NA") # redirect target missing
+
+ print(u"\t".join(rev_data).encode("utf-8"))
--- /dev/null
+# Force UTC so the POSIXct timestamp conversion below is independent of
+# the machine's local timezone.
+Sys.setenv(TZ = "UTC")
+
+library(data.table)
+
+# parallelism knobs, currently disabled
+# options(cores = 20)
+# options(mc.cores = 20)
+# Sys.setenv(GOTO_NUM_THREADS=05)
+
+# Build redirect "spells" -- the intervals during which a given page was a
+# redirect -- from its revision history.
+#   page: a single page title, used as the key lookup into `d`
+#   d:    data.table of revisions keyed on "page.title", with columns
+#         page.id, timestamp, redirect (logical), target, ...
+# Returns a data.frame with columns page.id, start, end, page.title, target;
+# `end` is NA for a spell that is still open at the last revision.
+generate.spells <- function (page, d) {
+
+ # all revisions of this page, in chronological order
+ x <- d[page,mult="all"]
+ x <- as.data.frame(x)
+ x <- x[sort.list(x$timestamp),]
+
+ # transform the target because there are some differences that don't matter:
+ # underscores vs. spaces, capitalization of the first character, and
+ # anything after a '#' (stripped by the third gsub)
+ x$target <- gsub('_', ' ', x$target)
+ x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)
+ x$target <- gsub('\\#.*$', '', x$target)
+
+ # lag the redirect flag and target by one revision so each row can be
+ # compared against the previous state of the page
+ if (dim(x)[1] > 1) {
+ x$redirect.prev <- c(FALSE, x$redirect[1:(length(x$redirect)-1)])
+ x$target.prev <- c(NA, x$target[1:(length(x$redirect)-1)])
+ } else {
+ x$redirect.prev <- FALSE
+ x$target.prev <- NA
+ }
+
+ # get a list of transitions: rows where the redirect flag flipped, or where
+ # both targets are known and the target changed (NA targets never match)
+ x <- x[x$redirect != x$redirect.prev |
+ ((!is.na(x$target) & !is.na(x$target.prev)) &
+ x$target != x$target.prev),]
+
+ # each spell ends when the next transition begins;
+ # if there is only one transition it stays that way (open-ended)
+ if (dim(x)[1] > 1) {
+ x$end <- c(x$timestamp[2:dim(x)[1]], NA)
+ } else {
+ x$end <- NA
+ }
+
+ # keep only the spells during which the page WAS a redirect
+ x <- x[x$redirect == TRUE,]
+
+ # relabel the columns: timestamp of the transition becomes the spell start
+ x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
+ colnames(x) <- c("page.id", "start", "end", "page.title", "target")
+
+ return(x)
+}
+
+# Read one bzip2-compressed TSV (as produced by the step-1 python
+# extractor) and return a data.frame of all redirect spells it contains.
+filename.to.spells <- function (filename) {
+ # stream-decompress; skip=1 drops the header row written by the
+ # extractor, quote="" because page titles may contain quote characters
+ con <- pipe(paste("bzcat", filename))
+ edits <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
+ encoding="UTF-8", quote="")
+ colnames(edits) <- c("page.id", "revision.id", "page.title", "timestamp",
+ "deleted", "redirect", "target")
+
+ # unix epoch seconds -> POSIXct (UTC)
+ edits$timestamp <- as.POSIXct(edits$timestamp, tz="UTC", origin="1970-01-01 00:00:00")
+
+ # deleted revisions carry no text, so drop them
+ edits <- edits[!edits$deleted,]
+
+ # only pages that were ever a redirect can produce spells
+ redirected.pages <- unique(edits$page.title[edits$redirect])
+
+ # key on the title so generate.spells() can look pages up quickly
+ edits <- as.data.table(edits)
+ setkey(edits, "page.title")
+
+ do.call("rbind", lapply(redirected.pages, generate.spells, edits))
+}
+
+
+# save the run number: Condor passes the 0-based process number as the
+# single trailing argument; take only the first trailing arg so extra
+# arguments cannot silently turn `run` into a vector
+run <- as.numeric(commandArgs(trailingOnly = TRUE)[1]) + 1
+run.string <- sprintf("%03d", run)  # zero-padded, matches output file naming
+
+# each run processes the run-th compressed TSV in the input directory
+setwd("/nfs/home/B/bhill/data/wp-enwiki-redir")
+redirect.spells <- filename.to.spells(list.files()[run])
+
+setwd("/nfs/home/B/bhill/data/wp-enwiki-redir-spells")
+save(redirect.spells, file=paste("redirect_spells-", run.string, ".RData", sep=""))
+
+# debug code
+# filename <- "wp_edits_redir_070.tsv.bz2"
--- /dev/null
+# #1: redirect spells
+# Combine the per-run RData files (each holding one data.frame of redirect
+# spells) into a single data.frame and save it.
+setwd("~/data/wp-enwiki-redir-spells/")
+redirect.spells <- do.call("rbind", lapply(list.files(), function (x) {
+ # load() returns the names of the restored objects; fetch the first one
+ # so we do not depend on the saved variable being named redirect.spells
+ get(load(x)[1])
+}))
+
+setwd("~/data/rdata")
+save(redirect.spells, file="redirect_spells.RData")
+
--- /dev/null
+Step 1: Flag Redirects in Revisions
+====================================
+
+Dependencies:
+
+- Python 2.7
+- Wikimedia Utilities (https://bitbucket.org/halfak/wikimedia-utilities)
+
+ Input: 7z compressed Wikimedia XML Dump files
+Output: bzip compressed TSV files (one file per input file; one line per revision)
+
+Run the file `01-extract_redirects.py` to build a dataset of revisions or edits
+that marks every revision as either containing a redirect, or not.
+
+The script `01-extract_redirects.py` takes a MediaWiki dump file on STDIN and
+outputs a TSV file on STDOUT of the following form:
+
+> page.id revision.id page.title timestamp deleted redirect target
+> 1935456 17563584 Mikhail Alekseevich Lavrentiev 1116962833 FALSE FALSE NA
+> 1935456 22034930 Mikhail Alekseevich Lavrentiev 1125245577 FALSE TRUE Mikhail Lavrentyev
+
+In this case, the first revision of the article "Mikhail Alekseevich
+Lavrentiev" was not a redirect but the second is a redirect to "Mikhail
+Lavrentyev".
+
+Because the full history dumps from the WMF foundation are split into many
+files, it can be appropriate to parse these dumps in parallel. Although the
+specific ways you choose to do this will vary by system, we've included
+examples of the scripts we used with Condor on the Harvard/MIT Data Center
+(HMDC) in the "examples/" directory. They will not work without modification
+for your computing environment but they will give you an idea of where you
+might want to start.
+
+Step 2: Generate spells
+====================================
+
+Dependencies:
+
+- GNU R
+- data.table (http://cran.r-project.org/web/packages/data.table/)
+
+ Input: bzip compressed TSV files
+Output: RData files containing data.frame of redirect spells named
+ `redirect.spells` (one file per input file)
+
+The file `02-generate_spells.R` contains an R function `generate.spells()` that
+takes a data frame of edit data as created in step 1 and a list of page titles,
+and creates a list of redirect spells for those pages.
+
+It also contains a function `filename.to.spells()` which takes the filename of
+a bzip compressed file of the form created in step 1 and outputs a full list of
+redirect spells.
+
+In its current form, the R code is designed to be run on the HMDC cluster using
+Condor using the scripts prefixed with "02" in the examples directory. These
+scripts can be modified to work in different configurations.
+
+Step 3: Assemble Redirects Data
+====================================
+
+Dependencies:
+
+- GNU R
+
+ Input: RData files containing data.frames of redirect spells named
+ `redirect.spells`
+Output: A combined RData file that contains all redirect spells
+
+The file `03-assemble_redirect_spells.R` contains R code that will read in all
+of the separate RData files, assembles the many smaller dataframes into a
+single data.frame, and then saves that unified data.frame into a single RData
+file.
+
--- /dev/null
+#!/bin/bash -x
+
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+NUM_RUNS=$(find ${DATA_DIR}/wp-enwiki-xml -name '*7z' |wc -l)
+
+condor_submit_util -x ./extract_redirects_wrapper.sh -i /dev/null -a '$(Process)' -n ${NUM_RUNS}
--- /dev/null
+#!/bin/bash
+
+# Condor wrapper: $1 is the 0-based process number.  Select the matching
+# 7z dump file, extract redirect flags, and write a bzip2-compressed TSV.
+RUN=$(( $1 + 1 ))              # 1-based index into the file list
+INDEX=$(printf "%03d" "$RUN")  # zero-padded for the output file name
+
+CUR_DIR="/nfs/home/B/bhill/condor_jobs/extract_redirects-20140412"
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+# sed -n "${RUN}p" prints only the RUN-th line of the file listing
+INPUT_FILE=$(find "${DATA_DIR}/wp-enwiki-xml" -name '*7z' | sed -n "${RUN}p")
+OUTPUT_FILE="${DATA_DIR}/wp-enwiki-redir/wp_edits_redir_${INDEX}.tsv.bz2"
+
+# decompress to stdout | flag redirects | recompress
+7za x -so "${INPUT_FILE}" | /usr/local/bin/python2.7 "${CUR_DIR}/extract_redirects.py" | bzip2 -c - > "${OUTPUT_FILE}"
--- /dev/null
+#!/bin/bash -x
+
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+NUM_RUNS=$(find ${DATA_DIR}/wp-enwiki-redir -name '*tsv.bz2' |wc -l)
+
+condor_submit_util -i redir-edits_to_spells.R -a '--no-restore --no-save --args $(Process)' -n ${NUM_RUNS}
+