initial version of code committed to git
authorBenjamin Mako Hill <mako@atdot.cc>
Tue, 24 Jun 2014 16:31:02 +0000 (12:31 -0400)
committerBenjamin Mako Hill <mako@atdot.cc>
Tue, 24 Jun 2014 16:31:02 +0000 (12:31 -0400)
01-extract_redirects.py [new file with mode: 0755]
02-generate_spells.R [new file with mode: 0644]
03-assemble_redirect_spells.R [new file with mode: 0644]
README [new file with mode: 0644]
example/01-condor_invoke_script.sh [new file with mode: 0755]
example/01-extract_redirects_wrapper.sh [new file with mode: 0755]
example/02-condor_invoke_script.sh [new file with mode: 0755]

diff --git a/01-extract_redirects.py b/01-extract_redirects.py
new file mode 100755 (executable)
index 0000000..1690941
--- /dev/null
@@ -0,0 +1,41 @@
+#!/usr/bin/env python27
+
+# Reads a MediaWiki XML dump on STDIN and writes one tab-separated line per
+# revision to STDOUT, recording whether the revision text was deleted,
+# whether the revision is a redirect and, if so, the redirect target.
+
+from wmf import dump
+import sys
+import re
+
+# Used with .match(), so the pattern is anchored at the start of the text.
+# - whitespace (and an optional colon) between "#redirect" and "[[" is
+#   optional, rather than exactly one space
+# - the target capture is non-greedy so that it stops at the first "]]"
+#   instead of running on to the last "]]" found on the same line
+# - an optional "|label" part is matched but kept out of the target
+REDIRECT_RE = re.compile(r"#redirect\s*:?\s*\[\[(.*?)(?:\|.*?)?\]\]",
+                         re.IGNORECASE)
+
+dumpIterator = dump.Iterator(sys.stdin)
+
+# header row; the column order must match the order of the appends below
+print(u"\t".join(["page.id", "revision.id", "page.title", "timestamp", "deleted", "redirect", "target"]))
+
+for page in dumpIterator.readPages():
+   # page-level values (page.getId(), page.getTitle()) are repeated on
+   # every revision line of the page
+
+   for revision in page.readRevisions():
+      rev_data = []
+
+      rev_data.append(unicode(page.getId()))
+      rev_data.append(unicode(revision.getId()))
+      rev_data.append(unicode(page.getTitle()))
+      rev_data.append(unicode(revision.getTimestamp()))
+
+      text = revision.getText()
+
+      if text is None:
+          rev_data.append("TRUE") # revision was deleted
+          rev_data.append("NA") # redirect bool = unknown
+          rev_data.append("NA") # redirect target missing
+      else:
+          rev_data.append("FALSE") # revision was not deleted
+          match = REDIRECT_RE.match(text)
+          if match:
+              # surrounding blanks inside the brackets are not part of the title
+              target = match.group(1).strip()
+              rev_data.append("TRUE") # redirect bool = TRUE
+              rev_data.append(target) # redirect target
+          else:
+              rev_data.append("FALSE") # redirect bool = FALSE
+              rev_data.append("NA") # redirect target missing
+
+      print(u"\t".join(rev_data).encode("utf-8"))
diff --git a/02-generate_spells.R b/02-generate_spells.R
new file mode 100644 (file)
index 0000000..d279962
--- /dev/null
@@ -0,0 +1,85 @@
+Sys.setenv(TZ = "UTC")
+
+library(data.table)
+
+# options(cores = 20)
+# options(mc.cores = 20)
+# Sys.setenv(GOTO_NUM_THREADS=05)
+
+# Build the redirect spells (periods during which a page was a redirect to
+# one particular target) for a single page.
+#
+# Args:
+#   page: title of the page to process (character scalar)
+#   d:    data.table of revisions keyed on "page.title" with at least the
+#         columns page.id, page.title, timestamp, redirect and target
+#
+# Returns:
+#   A data.frame with one row per spell and the columns page.id, start, end,
+#   page.title and target. `end` is NA for a spell that is still open at the
+#   last revision. A page without any redirect revision gives zero rows.
+generate.spells <- function (page, d) {
+
+    # nomatch=0L: a title that is not in `d` gives zero rows rather than a
+    # single all-NA row
+    x <- d[page, mult="all", nomatch=0L]
+    x <- as.data.frame(x)
+    x <- x[sort.list(x$timestamp),]
+
+    # transform the target because there are some differences that don't matter
+    x$target <- gsub('_', ' ', x$target)                               # "_" vs " "
+    x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)  # case of first letter
+    x$target <- gsub('\\#.*$', '', x$target)                           # "#section" suffix
+
+    # state of the previous revision; the first revision is compared against
+    # "not a redirect". Indexing with seq_len() also covers 0 and 1 rows.
+    n <- nrow(x)
+    x$redirect.prev <- c(FALSE, x$redirect)[seq_len(n)]
+    x$target.prev <- c(NA, x$target)[seq_len(n)]
+
+    # get a list of transitions: the redirect flag flipped, or the page stayed
+    # a redirect but now points somewhere else
+    flag.changed <- x$redirect != x$redirect.prev
+    target.changed <- !is.na(x$target) & !is.na(x$target.prev) &
+        x$target != x$target.prev
+    x <- x[flag.changed | target.changed,]
+
+    # each transition lasts until the next one; the last one stays open.
+    # c() on the timestamps keeps `end` a POSIXct even when it is all NA, so
+    # the column type does not change when spells of many pages are rbind-ed.
+    x$end <- c(x$timestamp[-1], NA)[seq_len(nrow(x))]
+
+    x <- x[x$redirect == TRUE,]
+
+    # relabel the columns
+    x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
+    colnames(x) <- c("page.id", "start", "end", "page.title", "target")
+
+    return(x)
+}
+
+# Read one bzip2-compressed TSV file as written by step 1 and return the
+# redirect spells of every page in it that was a redirect at least once.
+#
+# Args:
+#   filename: path of the .tsv.bz2 file
+#
+# Returns:
+#   A data.frame of spells (see generate.spells()), or NULL when the file
+#   contains no redirect at all.
+filename.to.spells <- function (filename) {
+    # quote the name so blanks or shell metacharacters cannot alter the command
+    con <- pipe(paste("bzcat", shQuote(filename)))
+
+    # read every field as plain text with no NA strings: a page or a redirect
+    # target that is really called "NA" must not be turned into a missing value
+    d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
+                    encoding="UTF-8", quote="", colClasses="character",
+                    na.strings=character(0))
+
+    colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp",
+                     "deleted", "redirect", "target")
+
+    # only these columns use "NA" as a missing-value marker or need a type
+    for (column in c("page.id", "revision.id", "timestamp", "deleted", "redirect")) {
+        d[[column]] <- type.convert(d[[column]], na.strings="NA", as.is=TRUE)
+    }
+
+    # step 1 writes the marker "NA" as target of every non-redirect revision
+    d$target[!(d$redirect %in% TRUE)] <- NA
+
+    d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00")
+
+    # the text of deleted revisions is unknown, so they carry no redirect state
+    d <- d[!d$deleted,]
+
+    redirected.pages <- unique(d$page.title[d$redirect])
+
+    # convert to data.table
+    d <- as.data.table(d)
+    setkey(d, "page.title")
+
+    redirect.spells <- do.call("rbind", lapply(redirected.pages, generate.spells, d))
+
+    return(redirect.spells)
+}
+
+
+# the only trailing argument is the process number passed in by Condor;
+# adding one turns it into an index into the list of input files
+run <- 1 + as.numeric(commandArgs(trailingOnly=TRUE))
+run.string <- sprintf("%03d", run)
+
+# pick the input file of this run from the directory of step 1 output
+setwd("/nfs/home/B/bhill/data/wp-enwiki-redir")
+input.files <- list.files()
+redirect.spells <- filename.to.spells(input.files[run])
+
+# store the spells of this run under a name carrying the run number
+setwd("/nfs/home/B/bhill/data/wp-enwiki-redir-spells")
+output.file <- paste0("redirect_spells-", run.string, ".RData")
+save(redirect.spells, file=output.file)
+
+# debug code
+# filename <- "wp_edits_redir_070.tsv.bz2"
diff --git a/03-assemble_redirect_spells.R b/03-assemble_redirect_spells.R
new file mode 100644 (file)
index 0000000..49f545b
--- /dev/null
@@ -0,0 +1,7 @@
+# #1: redirect spells
+# every file in this directory is loaded and must define `redirect.spells`
+setwd("~/data/wp-enwiki-redir-spells/")
+
+load.spells <- function (rdata.file) {
+    # load() restores the objects into the environment of this call, so the
+    # data frame read here never collides with the combined one built below
+    load(rdata.file)
+    redirect.spells
+}
+
+spells.per.file <- lapply(list.files(), load.spells)
+redirect.spells <- do.call("rbind", spells.per.file)
+
+# store the combined data frame as a single file
+setwd("~/data/rdata")
+save(redirect.spells, file="redirect_spells.RData")
+
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..486bc5e
--- /dev/null
+++ b/README
@@ -0,0 +1,73 @@
+Step 1: Flag Redirects in Revisions
+====================================
+
+Dependencies: 
+
+- Python 2.7
+- Wikimedia Utilities (https://bitbucket.org/halfak/wikimedia-utilities)
+
+ Input: 7z compressed Wikimedia XML Dump files
+Output: bzip compressed TSV files (one file per input file; one line per revision)
+
+Run the file `01-extract_redirects.py` to build a dataset of revisions or edits
+that marks every revision as either containing a redirect or not.
+
+The script `01-extract_redirects.py` takes a MediaWiki dump file on STDIN and
+outputs a TSV file on STDOUT of the following form:
+
+> page.id revision.id   page.title      timestamp       deleted redirect  target
+> 1935456 17563584        Mikhail Alekseevich Lavrentiev  1116962833      FALSE   FALSE   NA
+> 1935456 22034930        Mikhail Alekseevich Lavrentiev  1125245577      FALSE   TRUE    Mikhail Lavrentyev
+
+In this case, the first revision of the article "Mikhail Alekseevich
+Lavrentiev" was not a redirect but the second is a redirect to "Mikhail
+Lavrentyev".
+
+Because the full history dumps from the Wikimedia Foundation are split into
+many files, it can be appropriate to parse these dumps in parallel. Although
+the specific ways you choose to do this will vary by system, we've included
+examples of the scripts we used with Condor on the Harvard/MIT Data Center
+(HMDC) in the "example/" directory. They will not work without modification
+for your computing environment but they will give you an idea of where you
+might want to start.
+
+Step 2: Generate spells
+====================================
+
+Dependencies:
+
+- GNU R
+- data.table (http://cran.r-project.org/web/packages/data.table/)
+
+ Input: bzip compressed TSV files 
+Output: RData files containing data.frame of redirect spells named
+       `redirect.spells` (one file per input file)
+
+The file `02-generate_spells.R` contains an R function `generate.spells()` that
+takes a page title and a data.table of edit data as created in step 1 (keyed
+on page title) and which returns a data frame of redirect spells for that page.
+
+It also contains a function `filename.to.spells()` which takes the filename of
+a bzip compressed file of the form created in step 1 and outputs a full list of
+redirect spells.
+
+In its current form, the R code is designed to be run on the HMDC cluster using
+Condor using the scripts prefixed with "02" in the example directory. These
+scripts can be modified to work in different configurations.
+
+Step 3: Assemble Redirects Data
+====================================
+
+Dependencies:
+
+- GNU R
+
+ Input: RData files containing data.frame of redirect spells named
+       `redirect.spells`
+Output: A combined RData file that contains all redirect spells
+
+The file `03-assemble_redirect_spells.R` contains R code that will read in all
+of the separate RData files, assemble the many smaller data.frames into a
+single data.frame, and then save that unified data.frame into a single RData
+file.
+
diff --git a/example/01-condor_invoke_script.sh b/example/01-condor_invoke_script.sh
new file mode 100755 (executable)
index 0000000..9ee8351
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/bash -x
+
+# Queue one Condor job per 7z dump file found below the data directory.
+
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+# number of 7z files = number of jobs to queue
+NUM_RUNS=$(find ${DATA_DIR}/wp-enwiki-xml -name '*7z' |wc -l)
+
+# '$(Process)' is single-quoted so that the shell passes it on literally;
+# presumably Condor then replaces it with the number of each job.
+# NOTE(review): the wrapper is committed as 01-extract_redirects_wrapper.sh,
+# not ./extract_redirects_wrapper.sh; adjust the path before running.
+condor_submit_util -x ./extract_redirects_wrapper.sh -i /dev/null -a '$(Process)' -n ${NUM_RUNS}
diff --git a/example/01-extract_redirects_wrapper.sh b/example/01-extract_redirects_wrapper.sh
new file mode 100755 (executable)
index 0000000..e953c61
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Run the redirect extraction on one 7z dump file.
+#
+# Usage: 01-extract_redirects_wrapper.sh PROCESS_NUMBER
+#
+# PROCESS_NUMBER N selects the (N+1)th dump file in sorted order. The result
+# is written to wp_edits_redir_<N+1, three digits>.tsv.bz2.
+
+# without pipefail only the exit status of the final bzip2 is reported, so a
+# failing 7za or python stage would leave a truncated file behind unnoticed
+set -o pipefail
+
+RUN=$(expr "$1" + 1)
+INDEX=$(printf "%03d" "$RUN")
+
+CUR_DIR="/nfs/home/B/bhill/condor_jobs/extract_redirects-20140412"
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+# find lists files in no guaranteed order; every job runs its own find, so
+# the list is sorted (byte order, independent of the locale) to make sure
+# that all jobs agree on which file is number ${RUN}
+INPUT_FILE=$(find "${DATA_DIR}/wp-enwiki-xml" -name '*7z' | LC_ALL=C sort | sed -n "${RUN}p")
+OUTPUT_FILE="${DATA_DIR}/wp-enwiki-redir/wp_edits_redir_${INDEX}.tsv.bz2"
+
+# print material out
+7za x -so "${INPUT_FILE}" | /usr/local/bin/python2.7 "${CUR_DIR}/extract_redirects.py" | bzip2 -c - > "${OUTPUT_FILE}"
diff --git a/example/02-condor_invoke_script.sh b/example/02-condor_invoke_script.sh
new file mode 100755 (executable)
index 0000000..6ff7644
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/bash -x
+
+# Queue one Condor job per bzip2-compressed TSV file written by step 1.
+
+DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
+# number of TSV files = number of jobs to queue
+NUM_RUNS=$(find ${DATA_DIR}/wp-enwiki-redir -name '*tsv.bz2' |wc -l)
+
+# '$(Process)' is single-quoted so that the shell passes it on literally; it
+# follows --args, which is where the R script reads its run number from.
+# NOTE(review): the R script is committed as 02-generate_spells.R, not
+# redir-edits_to_spells.R; adjust the name before running.
+condor_submit_util -i redir-edits_to_spells.R -a '--no-restore --no-save --args $(Process)' -n ${NUM_RUNS}
+

Benjamin Mako Hill || Want to submit a patch?