projects.mako.cc - zerosum-wikitable/blob - zerosum.R

   1 setwd("~/japan_trip_2014")
   2
   3 # url <- 'http://wiki.mako.cc/Yashetarium_ledger'
   4 # url <- 'http://en.wikipedia.org/wiki/Comparison_of_e-book_readers'
   5 # stringsAsFactors <- FALSE; empty.string <- TRUE
   6
   7 read.mw.tables <- function (url,
   8                             strip.html=FALSE,
   9                             stringsAsFactors=FALSE,
  10                             empty.string="") {
  11     # if it's a URL, we should get wiki text
  12
  13     # TODO tweak this to try it before adding the action
  14     is.url <- regexpr("http",url) > 0
  15     if (is.url) { file <- paste(url, "?action=raw", sep="")
  16     } else { file <- url }
  17
  18     wiki.text <- do.call("paste", as.list(readLines(file, warn=FALSE)))
  19
  20     # remove all non-wiki text
  21     wiki.text <- gsub('.*(\\{\\|.*?\\|\\})', '\\1', wiki.text)
  22     wiki.text <- gsub('^(.*\\|\\}).*$', '\\1', wiki.text)
  23
  24     tables <- paste("{|", strsplit(wiki.text, '\\{\\|')[[1]])
  25      # drop the first which is just the front
  26     tables <- tables[-1]
  27
  28     # strip html tags out of the output
  29     tables <- gsub('<(?:[^>\'"]*|".*?"|\'.*?\')+>', '', tables)
  30
  31     wikitable.to.df <- function (wt) {
  32         split.wt.into.rows <-  function (wt) {
  33             wt.rows <- strsplit(wt, '\\s*\\|\\-\\s*')[[1]]
  34
  35             # if there's a first row, that's style attributions
  36             wt.rows[-1]
  37         }
  38
  39
  40         split.wt.row.into.cells <- function (wt.row) {
  41             wt.cells <- strsplit(wt.row, '\\s*\\|\\s*')[[1]]
  42
  43             # if the first cell has something, it's style and can be
  44             # dropped and returned
  45             wt.cells[-1]
  46         }
  47
  48         # trip beginning and end tags and split into rows
  49         wt <- sub('^\\{\\|\\s*(.*?)\\s*\\|\\}$', '\\1', wt)
  50         wt.rows <- split.wt.into.rows(wt)
  51
  52         # if it's a header, we keep it for later
  53         if (substr(wt.rows[1], 1, 1) == "!") {
  54             wt.header <- strsplit(sub('\\!\\s*(.*)$',
  55                                       '\\1', wt.rows[1]),'\\s*!\\s*')[[1]]
  56             wt.rows <- wt.rows[-1]
  57         }
  58
  59         # turn wt into a table
  60         row.list <- lapply(wt.rows, split.wt.row.into.cells)
  61
  62         # MW lists can have different number of cells, so we need to pad
  63         row.list <- lapply(row.list, function (x) {
  64             total <- max(sapply(row.list, length))
  65             c(x, rep("", (total - length(x)))) })
  66
  67         # create the text matrix
  68         m <- do.call("rbind", row.list)
  69
  70         # drop any any items that are empty
  71         missing.cols <- apply(m, 2, function (x) {all(x == "")})
  72         if (any(missing.cols)) {
  73             m <- m[,!missing.cols]
  74         }
  75
  76         # set the header correct if that's in the table
  77         if (exists("wt.header")) {
  78             # if the missing columns are the same ones, use that
  79             if (any(missing.cols) &
  80                 all(missing.cols == (names(m) == ""))) {
  81                 colnames(m) <- wt.header[!wt.header == ""]
  82             } else {
  83                 wt.header <- c(wt.header, rep(NA,
  84                     max(sapply(row.list, length)) - length(wt.header)))
  85
  86             ## print(length(wt.header))
  87             ## print(wt.header)
  88             ## print(max(sapply(row.list, length)))
  89             ## print(row.list)
  90
  91                 colnames(m) <- wt.header
  92             }
  93         }
  94
  95         # turn any numbers into numbers
  96         d <- data.frame(m)
  97
  98         d <- data.frame(lapply(d,
  99             function (x) {
 100                 # convert any numbers into numbers
 101                 if (all(grepl('^[0-9\\.]*$', x)))
 102                     return(as.numeric(as.character(x)))
 103                 else return(x)
 104             }))
 105
 106         # set missing things to NA
 107         d[,sapply(d, class) == "factor"] <- do.call("data.frame",
 108             lapply(d[,sapply(d, class) == "factor"],
 109                    function (x) {x[x==empty.string] <- NA;x}))
 110
 111         # for every data.frame, try to change things to numbers
 112         if (!stringsAsFactors) {
 113             d[,sapply(d, class) == "factor"] <- do.call("cbind",
 114                          lapply(d[, sapply(d, class) == "factor"],
 115                                 function (x) {as.character(x)}))
 116         }
 117
 118         return(d)
 119
 120     }
 121
 122     wt <- lapply(tables, wikitable.to.df)
 123
 124      # TODO write strip.wikimarkup function
 125 }
 126
 127
 128 ## run the zero sum thing ledger
 129 ###########################################################
 130 url <- 'http://wiki.mako.cc/Travel_plans/Winter_2014'
 131 d <- read.mw.tables(url)[[1]]
 132 colnames(d) <- tolower(colnames(d))
 133 d$cost <- as.numeric(as.character(d$cost))
 134 d$paid <- as.character(d$paid)
 135 d$beneficiaries <- as.character(d$beneficiaries)
 136
 137
 138 # replace all with everybody
 139 d$beneficiaries <- sub("All", "Aaron, Mako, Mika, Vaughn", d$beneficiaries)
 140
 141 gen.cost <- function (x) {
 142     tmp.money <- rep(0, 4)
 143     names(tmp.money) <- c("Aaron", "Mako", "Mika", "Vaughn")
 144
 145     payees <- x$paid
 146     purchasers <- strsplit(x$beneficiaries, ", ")[[1]]
 147
 148     tmp.money[purchasers] <- x$cost / length(purchasers)
 149     tmp.money[payees] <- tmp.money[payees] - x$cost
 150
 151     return(tmp.money)
 152 }
 153
 154 e <- do.call("rbind", by(d, seq(1,dim(d)[1]), gen.cost))
 155 round(apply(e, 2, sum))
 156