From: Benjamin Mako Hill Date: Thu, 6 Mar 2014 23:03:52 +0000 (-0800) Subject: added version of the zerosum script for japan (forked from the yashetarium) X-Git-Url: https://projects.mako.cc/source/zerosum-wikitable/commitdiff_plain/0838f1355fab9e77fb517578c2364f07ec64492d?ds=inline added version of the zerosum script for japan (forked from the yashetarium) --- 0838f1355fab9e77fb517578c2364f07ec64492d diff --git a/zerosum.R b/zerosum.R new file mode 100644 index 0000000..1dfba92 --- /dev/null +++ b/zerosum.R @@ -0,0 +1,156 @@ +setwd("~/japan_trip_2014") + +# url <- 'http://wiki.mako.cc/Yashetarium_ledger' +# url <- 'http://en.wikipedia.org/wiki/Comparison_of_e-book_readers' +# stringsAsFactors <- FALSE; empty.string <- TRUE + +read.mw.tables <- function (url, + strip.html=FALSE, + stringsAsFactors=FALSE, + empty.string="") { + # if it's a URL, we should get wiki text + + # TODO tweak this to try it before adding the action + is.url <- regexpr("http",url) > 0 + if (is.url) { file <- paste(url, "?action=raw", sep="") + } else { file <- url } + + wiki.text <- do.call("paste", as.list(readLines(file, warn=FALSE))) + + # remove all non-wiki text + wiki.text <- gsub('.*(\\{\\|.*?\\|\\})', '\\1', wiki.text) + wiki.text <- gsub('^(.*\\|\\}).*$', '\\1', wiki.text) + + tables <- paste("{|", strsplit(wiki.text, '\\{\\|')[[1]]) + # drop the first which is just the front + tables <- tables[-1] + + # strip html tags out of the output + tables <- gsub('<(?:[^>\'"]*|".*?"|\'.*?\')+>', '', tables) + + wikitable.to.df <- function (wt) { + split.wt.into.rows <- function (wt) { + wt.rows <- strsplit(wt, '\\s*\\|\\-\\s*')[[1]] + + # if there's a first row, that's style attributions + wt.rows[-1] + } + + + split.wt.row.into.cells <- function (wt.row) { + wt.cells <- strsplit(wt.row, '\\s*\\|\\s*')[[1]] + + # if the first cell has something, it's style and can be + # dropped and returned + wt.cells[-1] + } + + # trip beginning and end tags and split into rows + wt <- sub('^\\{\\|\\s*(.*?)\\s*\\|\\}$', '\\1', wt) + wt.rows <- split.wt.into.rows(wt) + + # if it's a header, we keep it for later + if (substr(wt.rows[1], 1, 1) == "!") { + wt.header <- strsplit(sub('\\!\\s*(.*)$', + '\\1', wt.rows[1]),'\\s*!\\s*')[[1]] + wt.rows <- wt.rows[-1] + } + + # turn wt into a table + row.list <- lapply(wt.rows, split.wt.row.into.cells) + + # MW lists can have different number of cells, so we need to pad + row.list <- lapply(row.list, function (x) { + total <- max(sapply(row.list, length)) + c(x, rep("", (total - length(x)))) }) + + # create the text matrix + m <- do.call("rbind", row.list) + + # drop any any items that are empty + missing.cols <- apply(m, 2, function (x) {all(x == "")}) + if (any(missing.cols)) { + m <- m[,!missing.cols] + } + + # set the header correct if that's in the table + if (exists("wt.header")) { + # if the missing columns are the same ones, use that + if (any(missing.cols) & + all(missing.cols == (names(m) == ""))) { + colnames(m) <- wt.header[!wt.header == ""] + } else { + wt.header <- c(wt.header, rep(NA, + max(sapply(row.list, length)) - length(wt.header))) + + ## print(length(wt.header)) + ## print(wt.header) + ## print(max(sapply(row.list, length))) + ## print(row.list) + + colnames(m) <- wt.header + } + } + + # turn any numbers into numbers + d <- data.frame(m) + + d <- data.frame(lapply(d, + function (x) { + # convert any numbers into numbers + if (all(grepl('^[0-9\\.]*$', x))) + return(as.numeric(as.character(x))) + else return(x) + })) + + # set missing things to NA + d[,sapply(d, class) == "factor"] <- do.call("data.frame", + lapply(d[,sapply(d, class) == "factor"], + function (x) {x[x==empty.string] <- NA;x})) + + # for every data.frame, try to change things to numbers + if (!stringsAsFactors) { + d[,sapply(d, class) == "factor"] <- do.call("cbind", + lapply(d[, sapply(d, class) == "factor"], + function (x) {as.character(x)})) + } + + return(d) + + } + + wt <- lapply(tables, wikitable.to.df) + + # TODO write strip.wikimarkup function +} + + +## run the zero sum thing ledger +########################################################### +url <- 'http://wiki.mako.cc/Travel_plans/Winter_2014' +d <- read.mw.tables(url)[[1]] +colnames(d) <- tolower(colnames(d)) +d$cost <- as.numeric(as.character(d$cost)) +d$paid <- as.character(d$paid) +d$beneficiaries <- as.character(d$beneficiaries) + + +# replace all with everybody +d$beneficiaries <- sub("All", "Aaron, Mako, Mika, Vaughn", d$beneficiaries) + +gen.cost <- function (x) { + tmp.money <- rep(0, 4) + names(tmp.money) <- c("Aaron", "Mako", "Mika", "Vaughn") + + payees <- x$paid + purchasers <- strsplit(x$beneficiaries, ", ")[[1]] + + tmp.money[purchasers] <- x$cost / length(purchasers) + tmp.money[payees] <- tmp.money[payees] - x$cost + + return(tmp.money) +} + +e <- do.call("rbind", by(d, seq(1,dim(d)[1]), gen.cost)) +round(apply(e, 2, sum)) +ls