+library(data.table)
+library(ggplot2)
+library(reshape)
+
+# Use the black-and-white ggplot2 theme for every plot in this script.
+theme_set(theme_bw())
+
+# Load the tab-separated, header-less metadata file into four named columns.
+d <- read.delim("mail_metadata.tsv", header=FALSE,
+                col.names=c("flags", "timestamp", "precedence", "google"))
+
+# Convert the timestamps to POSIXct in UTC (assumes the column holds seconds
+# since the Unix epoch -- TODO confirm against the input file). The origin is
+# passed as a plain string so that it is interpreted in UTC; the previous
+# as.POSIXct("1970-01-01 00:00:00") was parsed in the local time zone, which
+# shifted every timestamp by the local UTC offset.
+d$timestamp <- as.POSIXct(d$timestamp, tz="UTC", origin="1970-01-01")
+
+# limit the dataset to emails sent after 2004-04-01 00:00:00 (the cutoff is
+# parsed in the local time zone)
+d <- d[d$timestamp > as.POSIXct("2004-04-01 00:00:00"),]
+
+# label every email with the week its timestamp falls in
+d$week <- cut(d$timestamp, breaks="weeks")
+
+# Tabulate the precedence values for inspection, then keep only the messages
+# that have none (i.e. drop list mail) and discard the column.
+table(d[["precedence"]])
+d <- d[is.na(d[["precedence"]]), ]
+d[["precedence"]] <- NULL
+
+# A message is marked as replied when its flags contain the letter "R".
+d[["replied"]] <- grepl("R", d[["flags"]])
+
+# Summarise the number of emails, and of emails from Google, per week.
+#
+# Args:
+#   d: table of emails with a `week` column and a `google` column in which
+#      TRUE marks mail from Google. NOTE: setDT() converts `d` to a
+#      data.table by reference, so callers should not assume that the object
+#      they passed in is still a plain data.frame afterwards.
+#
+# Returns:
+#   A data.table with one row per week that contains more than one email,
+#   with columns `week` (Date), `total`, `google` and `google.prop`
+#   (google / total).
+google.by.week <- function (d) {
+  setDT(d)
+
+  # Count Google mail by summing matches so that a week without any Google
+  # mail yields 0. Indexing table(google)["TRUE"] returned NA for such weeks,
+  # which left their proportion NA and let complete.cases() filters drop
+  # them, overstating the share of Google mail.
+  weeks <- d[, list(total=length(google),
+                    google=sum(as.character(google) == "TRUE", na.rm=TRUE)),
+             by=week]
+
+  # drop weeks that contain only a single email
+  weeks <- weeks[weeks$total > 1,]
+  weeks$google.prop <- weeks$google / weeks$total
+
+  # turn the week labels (e.g. factor levels produced by cut()) into Dates
+  weeks$week <- as.Date(as.character(weeks$week))
+
+  weeks
+}
+
+# Yearly proportion of replied-to emails that came from Google.
+# Start from the weekly summary of replied-to mail, without incomplete rows.
+replied <- google.by.week(d[d$replied, ])
+replied <- replied[complete.cases(replied), ]
+
+# Sum the weekly counts within each calendar year and divide Google mail by
+# all mail; the years end up as the row names of the resulting data frame.
+replied.tbl <- as.data.frame(with(
+  replied,
+  tapply(google, format(week, "%Y"), sum) /
+    tapply(total, format(week, "%Y"), sum)))
+
+# name the ratio column and move the year labels into a column of their own
+names(replied.tbl) <- "prop.google"
+replied.tbl$year <- rownames(replied.tbl)
+rownames(replied.tbl) <- NULL
+
+ggplot(replied.tbl, aes(x=year, y=prop.google)) +
+  geom_bar(stat="identity")
+
+replied.tbl
+
+# Graph #1: Emails from Google Over Time
+#######################################################
+
+# Weekly counts reshaped to long format: one row per week and series, where
+# `variable` is "total" or "google" and `value` is the number of emails.
+raw.data <- google.by.week(d)
+raw.data$google.prop <- NULL
+
+raw.data <- melt(raw.data, id.vars="week")
+
+
+pdf(file="emails_gmail_over_time.pdf", width=10, height=6)
+
+# print() explicitly: a ggplot object is only drawn when it is printed, and
+# top-level auto-printing does not happen when the script is run through
+# source(), which would leave the PDF empty.
+print(
+  ggplot(data=raw.data) +
+    aes(x=week, y=value, color=variable, group=variable) +
+    geom_point() +
+    # show.legend replaces the deprecated show_guide argument
+    stat_smooth(method="loess", show.legend=FALSE) +
+    scale_color_discrete("", breaks=c("total", "google"),
+                         labels=c("All Emails", "From Google")) +
+    scale_x_date("Date") +
+    scale_y_continuous("Number of Emails")
+)
+
+dev.off()
+
+# Graph #2: Proportions of Email from Google
+#######################################################
+
+# Stack the weekly summaries for all mail and for replied-to mail, labelled
+# by `subset` so each gets its own facet.
+# Rows are selected with an explicit comma: d[d$replied] only picks rows when
+# d is a data.table, whereas on a data.frame it would index columns.
+prop.data <- rbind(cbind(google.by.week(d), subset="All Email"),
+                   cbind(google.by.week(d[d$replied, ]),
+                         subset="Emails with Replies"))
+
+
+pdf(file="emails_gmail_prop_over_time.pdf", width=10, height=8)
+
+# print() explicitly: a ggplot object is only drawn when it is printed, and
+# top-level auto-printing does not happen when the script is run through
+# source(), which would leave the PDF empty.
+print(
+  ggplot(data=prop.data) +
+    aes(x=week, y=google.prop, size=total, group=subset) +
+    geom_point() + facet_grid(subset~.) +
+    scale_y_continuous("Proportion from Google", limits=c(0,1)) +
+    scale_x_date("Date") +
+    scale_size("Emails") +
+    # show.legend replaces the deprecated show_guide argument
+    stat_smooth(method="loess", show.legend=FALSE)
+)
+
+dev.off()