# Switch the session's working directory to the trip folder under the
# user's home directory. This changes global session state.
1 setwd("~/japan_trip_2014")
# The next three lines are commented out. They look like values for stepping
# through read.mw.tables by hand (url, stringsAsFactors, empty.string);
# NOTE(review): confirm before removing.
3 # url <- 'http://wiki.mako.cc/Yashetarium_ledger'
4 # url <- 'http://en.wikipedia.org/wiki/Comparison_of_e-book_readers'
5 # stringsAsFactors <- FALSE; empty.string <- TRUE
# read.mw.tables: read MediaWiki table markup from `url` and run each table
# through the nested helper wikitable.to.df.
#
# Arguments visible in this listing:
#   url              - a string; if it contains "http" it is treated as a wiki
#                      page URL, otherwise it is handed to readLines() as is.
#   stringsAsFactors - defaults to FALSE; when FALSE, factor columns are
#                      converted to character near the end of the helper.
# `empty.string` is used further down, but its declaration is not visible
# here (presumably another argument -- TODO confirm).
#
# NOTE(review): the embedded line numbers skip in many places (e.g. 9 -> 11,
# 35 -> 40, 91 -> 95, 115 -> 122). The end of the signature, the bodies of the
# small helpers, the definition of `d`, the closing braces and the return
# value are therefore not shown. The comments below describe only the
# statements that are visible.
7 read.mw.tables <- function (url,
9 stringsAsFactors=FALSE,
11 # if it's a URL, we should get wiki text
13 # TODO tweak this to try it before adding the action
# is.url is TRUE when "http" occurs anywhere in `url`, not only at the start.
14 is.url <- regexpr("http",url) > 0
# For URLs, append "?action=raw" to the address.
15 if (is.url) { file <- paste(url, "?action=raw", sep="")
16 } else { file <- url }
# Read every line and join them into one string; paste() puts a single space
# between consecutive lines.
18 wiki.text <- do.call("paste", as.list(readLines(file, warn=FALSE)))
20 # remove all non-wiki text
# First substitution: drop text in front of a `{| ... |}` span.
# Second substitution: drop everything after the last `|}`.
# NOTE(review): the leading `.*` in the first pattern is greedy; confirm that
# pages holding several tables keep all of them after this step.
21 wiki.text <- gsub('.*(\\{\\|.*?\\|\\})', '\\1', wiki.text)
22 wiki.text <- gsub('^(.*\\|\\}).*$', '\\1', wiki.text)
# Split on "{|" and put "{| " (paste adds the space) back in front of each
# piece, giving one string per piece.
24 tables <- paste("{|", strsplit(wiki.text, '\\{\\|')[[1]])
25 # drop the first which is just the front
# NOTE(review): the statement that performs this drop is not visible here.
28 # strip html tags out of the output
29 tables <- gsub('<(?:[^>\'"]*|".*?"|\'.*?\')+>', '', tables)
# wikitable.to.df: convert the markup of one table (string `wt`) into tabular
# form. Its return statement is not visible in this listing.
31 wikitable.to.df <- function (wt) {
# Helper: split table markup into rows at the "|-" separator, swallowing
# whitespace on both sides. The rest of the helper is not visible.
32 split.wt.into.rows <- function (wt) {
33 wt.rows <- strsplit(wt, '\\s*\\|\\-\\s*')[[1]]
35 # if there's a first row, that's style attributes
# Helper: split one row into cells at "|", swallowing surrounding
# whitespace. The rest of the helper is not visible.
40 split.wt.row.into.cells <- function (wt.row) {
41 wt.cells <- strsplit(wt.row, '\\s*\\|\\s*')[[1]]
43 # if the first cell has something, it's style and can be
44 # dropped and returned
48 # strip beginning and end tags and split into rows
49 wt <- sub('^\\{\\|\\s*(.*?)\\s*\\|\\}$', '\\1', wt)
50 wt.rows <- split.wt.into.rows(wt)
52 # if it's a header, we keep it for later
# A first row starting with "!" is treated as the header: the leading "!" is
# removed, the remainder is split on "!" into wt.header, and the row is
# removed from wt.rows.
53 if (substr(wt.rows[1], 1, 1) == "!") {
54 wt.header <- strsplit(sub('\\!\\s*(.*)$',
55 '\\1', wt.rows[1]),'\\s*!\\s*')[[1]]
56 wt.rows <- wt.rows[-1]
59 # turn wt into a table
60 row.list <- lapply(wt.rows, split.wt.row.into.cells)
62 # MW lists can have different number of cells, so we need to pad
# Each row is right-padded with "" up to the longest row's length. The
# maximum is recomputed inside the function for every row.
63 row.list <- lapply(row.list, function (x) {
64 total <- max(sapply(row.list, length))
65 c(x, rep("", (total - length(x)))) })
67 # create the text matrix
68 m <- do.call("rbind", row.list)
70 # drop any columns that are entirely empty
71 missing.cols <- apply(m, 2, function (x) {all(x == "")})
72 if (any(missing.cols)) {
# NOTE(review): without drop=FALSE this becomes a plain vector when only one
# column survives -- confirm whether that case can occur.
73 m <- m[,!missing.cols]
76 # set the header correctly if that's in the table
# NOTE(review): exists() also searches enclosing environments, so a
# `wt.header` defined outside this function would satisfy the test -- verify.
77 if (exists("wt.header")) {
78 # if the missing columns are the same ones, use that
# NOTE(review): `m` is built with rbind above, so names(m) is likely NULL
# here; colnames(m) or wt.header may have been intended. The condition also
# uses the elementwise `&` inside if(). Verify both.
79 if (any(missing.cols) &
80 all(missing.cols == (names(m) == ""))) {
81 colnames(m) <- wt.header[!wt.header == ""]
# Pad the header with NA up to the width of the widest row.
83 wt.header <- c(wt.header, rep(NA,
84 max(sapply(row.list, length)) - length(wt.header)))
86 ## print(length(wt.header))
88 ## print(max(sapply(row.list, length)))
91 colnames(m) <- wt.header
95 # turn any numbers into numbers
# NOTE(review): the statement that first creates `d` is not visible here.
98 d <- data.frame(lapply(d,
100 # convert any numbers into numbers
# The pattern accepts only digits and dots, and also matches the empty
# string; a column whose entries all match is converted with as.numeric().
101 if (all(grepl('^[0-9\\.]*$', x)))
102 return(as.numeric(as.character(x)))
106 # set missing things to NA
# In every factor column, entries equal to `empty.string` become NA.
107 d[,sapply(d, class) == "factor"] <- do.call("data.frame",
108 lapply(d[,sapply(d, class) == "factor"],
109 function (x) {x[x==empty.string] <- NA;x}))
111 # when stringsAsFactors is FALSE, convert factor columns to character
112 if (!stringsAsFactors) {
113 d[,sapply(d, class) == "factor"] <- do.call("cbind",
114 lapply(d[, sapply(d, class) == "factor"],
115 function (x) {as.character(x)}))
# Apply the helper to every table string found in the page.
122 wt <- lapply(tables, wikitable.to.df)
124 # TODO write strip.wikimarkup function
128 ## run the zero-sum ledger
129 ###########################################################
# Fetch the trip's wiki page and keep the first table found on it.
130 url <- 'http://wiki.mako.cc/Travel_plans/Winter_2014'
131 d <- read.mw.tables(url)[[1]]
# Lower-case the column names, then coerce the three columns used below:
# cost to numeric, paid and beneficiaries to character.
132 colnames(d) <- tolower(colnames(d))
133 d$cost <- as.numeric(as.character(d$cost))
134 d$paid <- as.character(d$paid)
135 d$beneficiaries <- as.character(d$beneficiaries)
138 # replace "All" with the full list of names
# NOTE(review): sub() replaces only the first match in each entry and matches
# "All" anywhere in the string, including inside a longer word -- confirm
# this is acceptable for the data.
139 d$beneficiaries <- sub("All", "Aaron, Mako, Mika, Vaughn", d$beneficiaries)
# gen.cost: compute per-person amounts for one ledger entry `x`; the code
# reads x$beneficiaries and x$cost.
# NOTE(review): the embedded line numbers skip here (143 -> 146, and nothing
# after 149), so the definition of `payees`, the return value and the closing
# brace are not visible in this listing.
141 gen.cost <- function (x) {
# Start every person at zero.
142 tmp.money <- rep(0, 4)
143 names(tmp.money) <- c("Aaron", "Mako", "Mika", "Vaughn")
# Beneficiaries are listed in one string separated by ", ".
146 purchasers <- strsplit(x$beneficiaries, ", ")[[1]]
# Each beneficiary is assigned an equal share of the cost.
148 tmp.money[purchasers] <- x$cost / length(purchasers)
# The full cost is subtracted for `payees` (presumably derived from x$paid --
# TODO confirm). If `payees` holds several names, each has the full cost
# subtracted.
149 tmp.money[payees] <- tmp.money[payees] - x$cost
# Call gen.cost once per row of `d` (by() groups on the row index) and stack
# the results row-wise into `e`.
# NOTE(review): seq(1, dim(d)[1]) yields c(1, 0) when `d` has no rows;
# seq_len() would avoid that -- confirm whether an empty table can occur.
154 e <- do.call("rbind", by(d, seq(1,dim(d)[1]), gen.cost))
# Sum each column of `e` and round the totals to whole numbers.
155 round(apply(e, 2, sum))