#!/usr/bin/env python27
"""Flag redirect revisions in a MediaWiki XML dump.

File: 01-extract_redirects.py (initial version committed by Benjamin Mako
Hill, 2014-06-24, commit 6afdb261da3e6b7fe8d122614f052644556a3095).

Reads a MediaWiki XML dump on standard input and writes one tab-separated,
UTF-8 encoded line per revision to standard output, preceded by a header
line. The columns are listed in ``COLUMNS``.

This is Python 2 code: it relies on the ``unicode`` builtin and on the
``wmf`` (Wikimedia Utilities) package.
"""

import re
import sys

# Column names of the emitted TSV, in output order.
COLUMNS = ["page.id", "revision.id", "page.title", "timestamp",
           "deleted", "redirect", "target"]

# Matches a redirect directive at the very start of the revision text.
# - whitespace and an optional colon are allowed between the keyword and
#   the link, so "#REDIRECT[[X]]" and "#REDIRECT: [[X]]" are recognised;
# - the target is matched non-greedily so that a later "]]" on the same
#   line (e.g. a trailing template or link) is not swallowed;
# - a "|label" part of the link is not part of the target.
REDIRECT_RE = re.compile(r"#redirect\s*:?\s*\[\[(.*?)(?:\|.*?)?\]\]",
                         re.IGNORECASE)


def redirect_target(text):
    """Return the redirect target named in *text*, or None.

    *text* is the wikitext of one revision. None is returned when the text
    does not start with a redirect directive. An empty link (``[[]]``)
    yields the empty string.
    """
    match = REDIRECT_RE.match(text)
    if match is None:
        return None
    return match.group(1)


def revision_fields(page, revision):
    """Return the list of output fields for one revision of *page*.

    The fields follow the order of ``COLUMNS``. A revision whose text is
    None is reported as deleted, with "NA" for both the redirect flag and
    the target.
    """
    fields = [
        unicode(page.getId()),
        unicode(revision.getId()),
        unicode(page.getTitle()),
        unicode(revision.getTimestamp()),
    ]

    text = revision.getText()
    if text is None:
        # revision was deleted: redirect flag unknown, target missing
        fields.extend(["TRUE", "NA", "NA"])
        return fields

    fields.append("FALSE")  # revision was not deleted
    target = redirect_target(text)
    if target is None:
        fields.extend(["FALSE", "NA"])  # not a redirect, no target
    else:
        fields.extend(["TRUE", target])  # redirect and its target
    return fields


def main():
    """Convert the dump on stdin into TSV rows on stdout."""
    # Imported here so the helpers above can be loaded without the
    # third-party wmf package being installed.
    from wmf import dump

    print(u"\t".join(COLUMNS))

    for page in dump.Iterator(sys.stdin).readPages():
        for revision in page.readRevisions():
            print(u"\t".join(revision_fields(page, revision)).encode("utf-8"))


if __name__ == "__main__":
    main()
# File: 02-generate_spells.R
#
# Turns the per-revision TSV produced by 01-extract_redirects.py into
# "redirect spells": one row per continuous period during which a page
# redirected to a given target.

Sys.setenv(TZ = "UTC")

library(data.table)

# options(cores = 20)
# options(mc.cores = 20)
# Sys.setenv(GOTO_NUM_THREADS=05)

# Build the redirect spells for a single page.
#
# page: page title used to select the page's rows from d.
# d:    data.table keyed on page.title with (at least) the columns
#       page.id, page.title, timestamp, redirect and target.
#
# Returns a data.frame with the columns page.id, start, end, page.title
# and target. Each row is one spell; `end` is the timestamp of the next
# transition, or NA when no later transition exists.
generate.spells <- function (page, d) {

    x <- d[page, mult="all"]
    x <- as.data.frame(x)
    x <- x[sort.list(x$timestamp),]

    # transform the target because there are some differences that don't
    # matter: underscores vs. spaces, case of the first letter, and any
    # "#section" fragment
    x$target <- gsub('_', ' ', x$target)
    x$target <- gsub("(^[[:alpha:]])", "\\U\\1", x$target, perl=TRUE)
    x$target <- gsub('\\#.*$', '', x$target)

    # lag both columns by one revision; the first revision is compared
    # against "not a redirect" / "no target"
    n <- nrow(x)
    x$redirect.prev <- c(FALSE, x$redirect[-n])
    x$target.prev <- c(NA, x$target[-n])

    # get a list of transitions: the redirect flag flipped, or both
    # targets are known and differ
    x <- x[x$redirect != x$redirect.prev |
           ((!is.na(x$target) & !is.na(x$target.prev)) &
            x$target != x$target.prev),]

    # each transition ends where the next one starts; the last one is
    # open-ended. Building `end` from the timestamp column keeps its class
    # identical for pages with one and with many transitions, so that
    # rbind()-ing the per-page results does not depend on page order.
    x$end <- c(x$timestamp[-1], NA)

    x <- x[x$redirect == TRUE,]

    # relabel the columns
    x <- x[,c("page.id", "timestamp", "end", "page.title", "target")]
    colnames(x) <- c("page.id", "start", "end", "page.title", "target")

    return(x)
}

# Read one bzip2-compressed TSV file and return all redirect spells in it.
#
# filename: path of a file of the form written by 01-extract_redirects.py
#           (header line followed by one line per revision).
#
# Returns the row-bound result of generate.spells() over every page that
# has at least one redirect revision.
#
# NOTE(review): read.delim() treats the literal string "NA" as missing, so
# a page title or redirect target that is exactly "NA" is read as NA.
filename.to.spells <- function (filename) {
    # quote the filename so spaces or shell metacharacters cannot break
    # (or inject into) the bzcat command line
    con <- pipe(paste("bzcat", shQuote(filename)))

    d <- read.delim(con, stringsAsFactors=FALSE, header=FALSE, skip=1,
                    encoding="UTF-8", quote="")

    colnames(d) <- c("page.id", "revision.id", "page.title", "timestamp",
                     "deleted", "redirect", "target")

    d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01 00:00:00")

    # deleted revisions carry no text, so their redirect status is unknown
    d <- d[!d$deleted,]

    redirected.pages <- unique(d$page.title[d$redirect])

    # convert to data.table keyed on title so generate.spells() can look
    # pages up by name
    d <- as.data.table(d)
    setkey(d, "page.title")

    redirect.spells <- do.call("rbind", lapply(redirected.pages, generate.spells, d))

    return(redirect.spells)
}


# save the run number (the zero-based job index is passed on the command line)
run <- as.numeric(commandArgs(TRUE)) + 1
run.string <- sprintf("%03d", run)

setwd("/nfs/home/B/bhill/data/wp-enwiki-redir")
redirect.spells <- filename.to.spells(list.files()[run])
+setwd("/nfs/home/B/bhill/data/wp-enwiki-redir-spells") +save(redirect.spells, file=paste("redirect_spells-", run.string, ".RData", sep="")) + +# debug code +# filename <- "wp_edits_redir_070.tsv.bz2" diff --git a/03-assemble_redirect_spells.R b/03-assemble_redirect_spells.R new file mode 100644 index 0000000..49f545b --- /dev/null +++ b/03-assemble_redirect_spells.R @@ -0,0 +1,7 @@ +# #1: redirect spells +setwd("~/data/wp-enwiki-redir-spells/") +redirect.spells <- do.call("rbind", lapply(list.files(), function (x) {load(x); return(redirect.spells)})) + +setwd("~/data/rdata") +save(redirect.spells, file="redirect_spells.RData") + diff --git a/README b/README new file mode 100644 index 0000000..486bc5e --- /dev/null +++ b/README @@ -0,0 +1,73 @@ +Step 1: Flag Redirects in Revisions +==================================== + +Dependencies: + +- Python 2.7 +- Wikimedia Utilities (https://bitbucket.org/halfak/wikimedia-utilities) + + Input: 7z compressed Wikimedia XML Dump files +Output: bzip compressed TSV files (one file per input file; one line per revision) + +Run the file `01-extract_redirects.py` to build a dataset of revisions or edits +that marks every revision as either containing a redirect or not. + +The script `01-extract_redirects.py` takes a MediaWiki dump file on STDIN and +outputs a TSV file on STDOUT of the following form: + +> page.id revision.id page.title timestamp deleted redirect target +> 1935456 17563584 Mikhail Alekseevich Lavrentiev 1116962833 FALSE FALSE NA +> 1935456 22034930 Mikhail Alekseevich Lavrentiev 1125245577 FALSE TRUE Mikhail Lavrentyev + +In this case, the first revision of the article "Mikhail Alekseevich +Lavrentiev" was not a redirect but the second is a redirect to "Mikhail +Lavrentyev". + +Because the full history dumps from the Wikimedia Foundation are split into many +files, it can be appropriate to parse these dumps in parallel. 
Although the +specific ways you choose to do this will vary by system, we've included +examples of the scripts we used with Condor on the Harvard/MIT Data Center +(HMDC) in the "example/" directory. They will not work without modification +for your computing environment but they will give you an idea of where you +might want to start. + +Step 2: Generate spells +==================================== + +Dependencies: + +- GNU R +- data.table (http://cran.r-project.org/web/packages/data.table/) + + Input: bzip compressed TSV files +Output: RData files containing data.frame of redirect spells named + `redirect.spells` (one file per input file) + +The file `02-generate_spells.R` contains an R function `generate.spells()` that +takes a page title and a table of edit data as created in step 1 and returns +a data frame of redirect spells for that page. + +It also contains a function `filename.to.spells()` which takes the filename of +a bzip compressed file of the form created in step 1 and outputs a full list of +redirect spells. + +In its current form, the R code is designed to be run on the HMDC cluster with +Condor using the scripts prefixed with "02" in the example directory. These +scripts can be modified to work in different configurations. + +Step 3: Assemble Redirects Data +==================================== + +Dependencies: + +- GNU R + + Input: RData files containing data.frame of redirect spells named + `redirect.spells` +Output: A combined RData file that contains all redirect spells + +The file `03-assemble_redirect_spells.R` contains R code that will read in all +of the separate RData files, assemble the many smaller dataframes into a +single data.frame, and then save that unified data.frame into a single RData +file. 
# ======================================================================
# File: example/01-condor_invoke_script.sh
# Submits one Condor job per 7z dump file found under the data directory.
# ======================================================================
#!/bin/bash -x

DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
NUM_RUNS=$(find "${DATA_DIR}/wp-enwiki-xml" -name '*7z' | wc -l)

# '$(Process)' is single-quoted on purpose: it must reach
# condor_submit_util literally instead of being expanded by bash.
condor_submit_util -x ./extract_redirects_wrapper.sh -i /dev/null -a '$(Process)' -n "${NUM_RUNS}"

# ======================================================================
# File: example/01-extract_redirects_wrapper.sh
# Runs the redirect extraction for a single dump file.
#   $1: zero-based job index; job N handles the (N+1)-th dump file.
# ======================================================================
#!/bin/bash

# Fail on a missing argument and on a failure of any pipeline stage, so a
# broken archive or a crashed extractor is not reported as success.
set -euo pipefail

RUN=$(( $1 + 1 ))
INDEX=$(printf "%03d" "$RUN")

CUR_DIR="/nfs/home/B/bhill/condor_jobs/extract_redirects-20140412"
DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
# find does not guarantee any output order; sort so that every job sees
# the same numbering of input files.
INPUT_FILE=$(find "${DATA_DIR}/wp-enwiki-xml" -name '*7z' | sort | sed -n "${RUN}p")
OUTPUT_FILE="${DATA_DIR}/wp-enwiki-redir/wp_edits_redir_${INDEX}.tsv.bz2"

# print material out
7za x -so "${INPUT_FILE}" | /usr/local/bin/python2.7 "${CUR_DIR}/extract_redirects.py" | bzip2 -c - > "${OUTPUT_FILE}"

# ======================================================================
# File: example/02-condor_invoke_script.sh
# Submits one Condor job per TSV file produced by step 1.
# ======================================================================
#!/bin/bash -x

DATA_DIR="/nfs/home/B/bhill/shared_space/barnstar"
NUM_RUNS=$(find "${DATA_DIR}/wp-enwiki-redir" -name '*tsv.bz2' | wc -l)

# '$(Process)' is single-quoted on purpose (see the first script).
condor_submit_util -i redir-edits_to_spells.R -a '--no-restore --no-save --args $(Process)' -n "${NUM_RUNS}"