]> projects.mako.cc - gmail-maildir-counter/commitdiff
initial import of code to count messages from gmail
authorBenjamin Mako Hill <mako@atdot.cc>
Mon, 12 May 2014 00:20:32 +0000 (17:20 -0700)
committerBenjamin Mako Hill <mako@atdot.cc>
Mon, 12 May 2014 00:20:32 +0000 (17:20 -0700)
.gitignore [new file with mode: 0644]
README [new file with mode: 0644]
analysis.R [new file with mode: 0644]
count_gmail.py [new file with mode: 0755]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..3baaa88
--- /dev/null
@@ -0,0 +1,2 @@
+*png
+*pdf
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..6e6da54
--- /dev/null
+++ b/README
@@ -0,0 +1,28 @@
+Scripts to Count Messages from Google
+==============================================================
+
+Author: Benjamin Mako Hill (mako@atdot.cc)
+License: GNU General Public License version 3 or any later version
+Home: http://projects.mako.cc/
+
+1. Parse your mailbox using the count_gmail.py script
+--------------------------------------------------------------
+
+I ran the script like this:
+
+$ python count_gmail.py ~/incoming/mail/default > mail_metadata.tsv
+
+2. Parse the output using analysis.R
+--------------------------------------------------------------
+
+I run R interactively in Emacs/ESS, but you might want to use RStudio
+if you are not familiar with Emacs. Alternatively, if you wrote the output
+to mail_metadata.tsv as shown above, you can just run:
+
+$ R --no-save < analysis.R
+
+It will create two PDF files of graphs for you in the current directory.
+
+Then I converted the PDFs into PNGs with ImageMagick's mogrify:
+
+$ mogrify -format png *pdf
\ No newline at end of file
diff --git a/analysis.R b/analysis.R
new file mode 100644 (file)
index 0000000..6d3d2d2
--- /dev/null
@@ -0,0 +1,93 @@
+library(data.table)
+library(ggplot2)
+library(reshape)
+
+# Use ggplot2's black-and-white theme for every plot below.
+theme_set(theme_bw())
+
+# Load the tab-separated output of count_gmail.py: one row per message, no
+# header row. Columns: Maildir flags, Unix timestamp, Precedence header
+# (the literal "NA" when absent, which read.delim reads as a missing value),
+# and TRUE/FALSE for whether a Received header mentioned a Google domain.
+d <- read.delim("mail_metadata.tsv", header=FALSE,
+    col.names=c("flags", "timestamp", "precedence", "google"))
+
+# Convert seconds-since-epoch to POSIXct. The origin is pinned to UTC:
+# without tz= it would be parsed in the local timezone, shifting every
+# timestamp by the local UTC offset.
+d$timestamp <- as.POSIXct(d$timestamp, tz="UTC",
+                          origin=as.POSIXct("1970-01-01 00:00:00", tz="UTC"))
+
+# limit the dataset to emails sent after the cutoff (2004-04-01 00:00 UTC)
+d <- d[d$timestamp > as.POSIXct("2004-04-01 00:00:00", tz="UTC"),]
+
+# bucket each message into the week it was sent in
+d$week <- cut(d$timestamp, breaks="weeks")
+
+# tabulate the Precedence values, then keep only messages that had no
+# Precedence header (this is what drops mailing-list mail)
+table(d$precedence)
+d <- d[is.na(d$precedence),]
+d$precedence <- NULL
+
+# a message counts as replied-to when its flags contain 'R'
+d$replied <- grepl('R', d$flags)
+
+# Aggregate message counts per week.
+#
+# d: data frame of messages with a `week` column (the week factor produced
+#    by cut(..., breaks="weeks")) and a `google` column (TRUE/FALSE).
+#
+# Returns a data.table with one row per week that has more than one
+# message: week (as Date), total (number of messages), google (number of
+# messages flagged as Google) and google.prop (google / total).
+#
+# Note: setDT() converts its argument to a data.table by reference, so a
+# data frame passed in by name is a data.table after this call returns.
+google.by.week <- function (d) {
+    setDT(d)
+
+    # Count TRUE values with sum() rather than table(google)["TRUE"]: the
+    # table lookup yields NA for a week without any Google mail, so those
+    # weeks ended up with an NA count and proportion instead of 0.
+    weeks <- d[, list(total=.N,
+                      google=sum(as.logical(google), na.rm=TRUE)),
+               by=week]
+
+    # drop weeks that contain only a single message
+    weeks <- weeks[weeks$total > 1,]
+    weeks$google.prop <- weeks$google / weeks$total
+
+    # the week factor's labels are date strings; convert them to Date
+    weeks$week <- as.Date(as.character(weeks$week))
+
+    return(weeks)
+}
+
+# Yearly share of replied-to mail that was flagged as coming from Google
+replied <- google.by.week(d[d$replied,])
+replied <- replied[complete.cases(replied),]
+
+# pool the weekly counts by calendar year, then divide google by total
+replied.year <- substr(as.character(replied$week), 1, 4)
+google.per.year <- tapply(replied$google, replied.year, sum)
+total.per.year <- tapply(replied$total, replied.year, sum)
+yearly.prop <- google.per.year / total.per.year
+
+# one row per year: the proportion plus the year label as a character column
+replied.tbl <- data.frame(prop.google=as.vector(yearly.prop),
+                          year=as.character(names(yearly.prop)),
+                          stringsAsFactors=FALSE)
+
+ggplot(data=replied.tbl, aes(x=year, y=prop.google)) +
+    geom_bar(stat="identity")
+
+replied.tbl
+
+# Graph #1: Emails from Google Over Time
+#######################################################
+
+# weekly counts reshaped to long form: after dropping google.prop, the
+# remaining measures ("total" and "google") become rows keyed by week
+raw.data <- google.by.week(d)
+raw.data$google.prop <- NULL
+raw.data <- melt(raw.data, id.var="week")
+
+pdf(file="emails_gmail_over_time.pdf", width=10, height=6)
+
+# NOTE(review): show_guide was renamed show.legend in later ggplot2
+# releases -- confirm against the ggplot2 version this is run with
+ggplot(data=raw.data,
+       aes(x=week, y=value, color=variable, group=variable)) +
+    geom_point() +
+    stat_smooth(method="loess", show_guide=FALSE) +
+    scale_x_date("Date") +
+    scale_y_continuous("Number of Emails") +
+    scale_color_discrete("", breaks=c("total", "google"),
+                         labels=c("All Emails", "From Google"))
+
+dev.off()
+
+# Graph #2: Proportions of Email from Google
+#######################################################
+
+# Weekly Google proportion for all mail and for replied-to mail, stacked
+# with a `subset` label to facet on. The row filter is written d[rows,]
+# (with the comma) so it selects rows whether d is a plain data frame or a
+# data.table; without the comma it only works on a data.table.
+prop.data <- rbind(cbind(google.by.week(d), subset="All Email"),
+                   cbind(google.by.week(d[d$replied,]), subset="Emails with Replies"))
+
+
+pdf(file="emails_gmail_prop_over_time.pdf", width=10, height=8)
+
+# one panel per subset; point size shows how many emails the week had
+ggplot(data=prop.data) + aes(x=week, y=google.prop, size=total, group=subset) +
+    geom_point() + facet_grid(subset~.) +
+    scale_y_continuous("Proportion from Google", limits=c(0,1)) +
+    scale_x_date("Date") +
+    scale_size("Emails") +
+    stat_smooth(method="loess", show_guide=FALSE) 
+
+dev.off()
diff --git a/count_gmail.py b/count_gmail.py
new file mode 100755 (executable)
index 0000000..d379dae
--- /dev/null
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import mailbox
+import rfc822
+import email.utils
+import time
+import sys
+import os.path
+
+md_name = os.path.expanduser(sys.argv[1])
+
+inbox = mailbox.Maildir(md_name, factory=None)
+
+for msg in inbox:
+    flags = msg.get_flags()
+    #date = msg.get_date()
+    date = msg.get("Date")
+
+    if date == None:
+        continue
+    else:
+        date = email.utils.parsedate(date)
+        if date == None:
+            continue
+        else:
+            date = time.mktime(date)
+
+    precedence = msg.get("Precedence")
+    if precedence == None:
+        precedence = "NA"
+    
+    google = "FALSE"
+    recvd_list = msg.get_all("Received")
+
+    # skip this message if there are no received headers (malformed?)
+    if recvd_list == None:
+        continue
+
+    # if there is a list of received headers, skip
+    for recvd in recvd_list:
+        if "google.com" in recvd or "gmail.com" in recvd or "googlemail.com" in recvd:
+            google = "TRUE"
+
+    # put it together and output        
+    print("\t".join([flags, str(date), precedence, google]))
+
+

Benjamin Mako Hill || Want to submit a patch?