]> projects.mako.cc - gmail-maildir-counter/blob - analysis.R
1f68d5bc1ac78eb6486fdac52734ba1232ee5b91
[gmail-maildir-counter] / analysis.R
1 library(data.table)
2 library(ggplot2)
3 library(reshape)
4
theme_set(theme_bw())

# Load the per-message metadata: tab-separated, no header row, one row per
# message with the four columns named below.
d <- read.delim("mail_metadata.tsv", header = FALSE,
    col.names = c("flags", "timestamp", "precedence", "google"))

# The numeric timestamps are converted as seconds since the Unix epoch.
# The origin is given as a plain string so that it is interpreted in UTC;
# as.POSIXct("1970-01-01 00:00:00") without a tz is parsed in the session's
# local time zone, which shifts every timestamp by the local UTC offset.
d$timestamp <- as.POSIXct(d$timestamp, tz = "UTC", origin = "1970-01-01")

# limit the dataset to emails timestamped after 2004-04-01 00:00:00 UTC
d <- d[d$timestamp > as.POSIXct("2004-04-01 00:00:00", tz = "UTC"), ]

# label every message with the week it falls in (factor of week start dates)
d$week <- cut(d$timestamp, breaks = "weeks")

# show the counts of each precedence value, then keep only the messages
# without one (this drops list mail)
print(table(d$precedence))
d <- d[is.na(d$precedence), ]
d$precedence <- NULL

# a message counts as replied-to when its flags contain an "R"
d$replied <- grepl("R", d$flags)
24
# Aggregate messages into per-week counts.
#
# Args:
#   d: data.frame or data.table with a `week` column to group on and a
#      `google` column whose value prints as "TRUE" for the messages that
#      should be counted as Google mail.
#
# Returns:
#   A data.table with one row per week that contains more than one message:
#     week        - the week label converted to a Date
#     total       - number of messages in the week
#     google      - number of messages in the week where google is TRUE
#     google.prop - google / total
#
# Note: setDT() converts `d` to a data.table by reference, so a data.frame
# passed in directly may be seen as a data.table by the caller afterwards.
google.by.week <- function (d) {
    setDT(d)

    # Count the TRUE values directly. Looking the count up with
    # table(google)["TRUE"] yields NA for a week without any Google mail,
    # which removes that week from downstream proportions and plots instead
    # of letting it contribute a count of 0.
    weeks <- d[, list(total = length(google),
                      google = sum(as.character(google) == "TRUE",
                                   na.rm = TRUE)),
               by = week]

    # drop weeks that contain only a single message
    weeks <- weeks[weeks$total > 1, ]
    weeks$google.prop <- weeks$google / weeks$total

    # the week labels are date strings; turn them into real Date values
    weeks$week <- as.Date(as.character(weeks$week))

    weeks
}
38
# find proportions per year, using only the messages that were replied to
replied <- google.by.week(d[d$replied, ])
# drop any week whose row contains a missing value
replied <- replied[complete.cases(replied), ]

# the year is the first four characters of the week's date
replied.year <- substr(as.character(replied$week), 1, 4)

# yearly proportion = summed weekly Google counts / summed weekly totals
replied.tbl <- as.data.frame(
    tapply(replied$google, replied.year, sum) /
    tapply(replied$total, replied.year, sum))

colnames(replied.tbl) <- "prop.google"
replied.tbl$year <- row.names(replied.tbl)
row.names(replied.tbl) <- NULL

# print explicitly: top-level values are not auto-printed when the script is
# run through source(), so the plot and table would otherwise not be shown
print(ggplot(data = replied.tbl) + aes(x = year, y = prop.google) +
    geom_bar(stat = "identity"))

print(replied.tbl)
55
# Graph #1: Emails from Google Over Time
#######################################################

raw.data <- google.by.week(d)
raw.data$google.prop <- NULL

# reshape to long format: one row per week and count type (total / google)
raw.data <- melt(raw.data, id.vars = "week")


pdf(file = "emails_gmail_over_time.pdf", width = 10, height = 6)

# print explicitly so the plot is drawn into the PDF even when the script is
# run through source(), where top-level values are not auto-printed
print(ggplot(data = raw.data) +
    aes(x = week, y = value, color = variable, group = variable) +
    geom_point() +
    stat_smooth(method = "loess", show.legend = FALSE) +
    scale_color_discrete("", breaks = c("total", "google"),
                         labels = c("All Emails", "From Google")) +
    scale_x_date("Date") +
    scale_y_continuous("Number of Emails"))

dev.off()
76
# Graph #2: Proportions of Email from Google
#######################################################

# Stack the weekly summaries for all mail and for replied-to mail, tagging
# each with a `subset` label used for faceting. The row subset is written as
# d[d$replied, ] so it selects rows whether d is a data.frame or a data.table
# (d[d$replied] selects columns on a plain data.frame).
prop.data <- rbind(cbind(google.by.week(d), subset = "All Email"),
                   cbind(google.by.week(d[d$replied, ]),
                         subset = "Email with Replies"))


pdf(file = "emails_gmail_prop_over_time.pdf", width = 10, height = 8)

# print explicitly so the plot is drawn into the PDF even when the script is
# run through source(), where top-level values are not auto-printed
print(ggplot(data = prop.data) +
    aes(x = week, y = google.prop, size = total, group = subset) +
    geom_point() + facet_grid(subset ~ .) +
    scale_y_continuous("Proportion from Google", limits = c(0, 1)) +
    scale_x_date("Date") +
    scale_size("Emails") +
    stat_smooth(method = "loess", show.legend = FALSE))

dev.off()

Benjamin Mako Hill || Want to submit a patch?