setwd("~/japan_trip_2014") # url <- 'http://wiki.mako.cc/Yashetarium_ledger' # url <- 'http://en.wikipedia.org/wiki/Comparison_of_e-book_readers' # stringsAsFactors <- FALSE; empty.string <- TRUE read.mw.tables <- function (url, strip.html=FALSE, stringsAsFactors=FALSE, empty.string="") { # if it's a URL, we should get wiki text # TODO tweak this to try it before adding the action is.url <- regexpr("http",url) > 0 if (is.url) { file <- paste(url, "?action=raw", sep="") } else { file <- url } wiki.text <- do.call("paste", as.list(readLines(file, warn=FALSE))) # remove all non-wiki text wiki.text <- gsub('.*(\\{\\|.*?\\|\\})', '\\1', wiki.text) wiki.text <- gsub('^(.*\\|\\}).*$', '\\1', wiki.text) tables <- paste("{|", strsplit(wiki.text, '\\{\\|')[[1]]) # drop the first which is just the front tables <- tables[-1] # strip html tags out of the output tables <- gsub('<(?:[^>\'"]*|".*?"|\'.*?\')+>', '', tables) wikitable.to.df <- function (wt) { split.wt.into.rows <- function (wt) { wt.rows <- strsplit(wt, '\\s*\\|\\-\\s*')[[1]] # if there's a first row, that's style attributions wt.rows[-1] } split.wt.row.into.cells <- function (wt.row) { wt.cells <- strsplit(wt.row, '\\s*\\|\\s*')[[1]] # if the first cell has something, it's style and can be # dropped and returned wt.cells[-1] } # trip beginning and end tags and split into rows wt <- sub('^\\{\\|\\s*(.*?)\\s*\\|\\}$', '\\1', wt) wt.rows <- split.wt.into.rows(wt) # if it's a header, we keep it for later if (substr(wt.rows[1], 1, 1) == "!") { wt.header <- strsplit(sub('\\!\\s*(.*)$', '\\1', wt.rows[1]),'\\s*!\\s*')[[1]] wt.rows <- wt.rows[-1] } # turn wt into a table row.list <- lapply(wt.rows, split.wt.row.into.cells) # MW lists can have different number of cells, so we need to pad row.list <- lapply(row.list, function (x) { total <- max(sapply(row.list, length)) c(x, rep("", (total - length(x)))) }) # create the text matrix m <- do.call("rbind", row.list) # drop any any items that are empty missing.cols <- apply(m, 2, function (x) {all(x == "")}) if (any(missing.cols)) { m <- m[,!missing.cols] } # set the header correct if that's in the table if (exists("wt.header")) { # if the missing columns are the same ones, use that if (any(missing.cols) & all(missing.cols == (names(m) == ""))) { colnames(m) <- wt.header[!wt.header == ""] } else { wt.header <- c(wt.header, rep(NA, max(sapply(row.list, length)) - length(wt.header))) ## print(length(wt.header)) ## print(wt.header) ## print(max(sapply(row.list, length))) ## print(row.list) colnames(m) <- wt.header } } # turn any numbers into numbers d <- data.frame(m) d <- data.frame(lapply(d, function (x) { # convert any numbers into numbers if (all(grepl('^[0-9\\.]*$', x))) return(as.numeric(as.character(x))) else return(x) })) # set missing things to NA d[,sapply(d, class) == "factor"] <- do.call("data.frame", lapply(d[,sapply(d, class) == "factor"], function (x) {x[x==empty.string] <- NA;x})) # for every data.frame, try to change things to numbers if (!stringsAsFactors) { d[,sapply(d, class) == "factor"] <- do.call("cbind", lapply(d[, sapply(d, class) == "factor"], function (x) {as.character(x)})) } return(d) } wt <- lapply(tables, wikitable.to.df) # TODO write strip.wikimarkup function } ## run the zero sum thing ledger ########################################################### url <- 'http://wiki.mako.cc/Travel_plans/Winter_2014' d <- read.mw.tables(url)[[1]] colnames(d) <- tolower(colnames(d)) d$cost <- as.numeric(as.character(d$cost)) d$paid <- as.character(d$paid) d$beneficiaries <- as.character(d$beneficiaries) # replace all with everybody d$beneficiaries <- sub("All", "Aaron, Mako, Mika, Vaughn", d$beneficiaries) gen.cost <- function (x) { tmp.money <- rep(0, 4) names(tmp.money) <- c("Aaron", "Mako", "Mika", "Vaughn") payees <- x$paid purchasers <- strsplit(x$beneficiaries, ", ")[[1]] tmp.money[purchasers] <- x$cost / length(purchasers) tmp.money[payees] <- tmp.money[payees] - x$cost return(tmp.money) } e <- do.call("rbind", by(d, seq(1,dim(d)[1]), gen.cost)) round(apply(e, 2, sum))