From 21ef016ae6a8068ec3bf8237b84176ffee105936 Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Sun, 11 May 2014 17:20:32 -0700
Subject: [PATCH] initial import of code to count messages from gmail

---
Note: README, analysis.R and count_gmail.py were revised after export, so
the blob ids on their "index" lines still refer to the original contents.

 .gitignore     |  2 ++
 README         | 28 +++++++++++++++
 analysis.R     | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++
 count_gmail.py | 74 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 198 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README
 create mode 100644 analysis.R
 create mode 100755 count_gmail.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3baaa88
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*png
+*pdf
diff --git a/README b/README
new file mode 100644
index 0000000..6e6da54
--- /dev/null
+++ b/README
@@ -0,0 +1,28 @@
+Scripts to Count Messages from Google
+==============================================================
+
+Author: Benjamin Mako Hill (mako@atdot.cc)
+License: GNU General Public License version 3 or any later version
+Home: http://projects.mako.cc/
+
+1. Parse your mailbox using the count_gmail.py script
+--------------------------------------------------------------
+
+I ran the script like this:
+
+$ python count_gmail.py ~/incoming/mail/default > mail_metadata.tsv
+
+2. Parse the output using analysis.R
+--------------------------------------------------------------
+
+I run R interactively in Emacs/ESS but you might want to use RStudio
+if you are not familiar with Emacs. Alternatively, if you also output
+into mail_metadata.tsv, you can just run:
+
+$ R --no-save < analysis.R
+
+It will create the two PDF files of graphs for you in the local directory.
+
+Then I converted the PDFs into PNGs with imagemagick's mogrify:
+
+$ mogrify -format png *pdf
\ No newline at end of file
diff --git a/analysis.R b/analysis.R
new file mode 100644
index 0000000..6d3d2d2
--- /dev/null
+++ b/analysis.R
@@ -0,0 +1,94 @@
+library(data.table)
+library(ggplot2)
+library(reshape)
+
+theme_set(theme_bw())
+
+d = read.delim("mail_metadata.tsv", header=FALSE,
+               col.names=c("flags", "timestamp", "precedence", "google"))
+
+# epoch seconds -> POSIXct; a character origin is parsed as GMT, not local time
+d$timestamp <- as.POSIXct(d$timestamp, tz="UTC",
+                          origin="1970-01-01")
+
+# limit the dataset to emails sent post timestamp
+d <- d[d$timestamp > as.POSIXct("2004-04-01 00:00:00"),]
+
+d$week <- cut(d$timestamp, breaks="weeks")
+
+# list and then drop list mail
+table(d$precedence)
+d <- d[is.na(d$precedence),]
+d$precedence <- NULL
+
+d$replied <- grepl('R', d$flags)
+
+google.by.week <- function (d) {
+    setDT(d)
+
+    weeks <- d[,list(total=length(google), google=table(google)["TRUE"]), by=week]
+
+    # drop things
+    weeks <- weeks[weeks$total > 1,]
+    weeks$google.prop <- weeks$google / weeks$total
+
+    weeks$week <- as.Date(as.character(weeks$week))
+
+    return(weeks)
+}
+
+# find proportions per year
+replied <- google.by.week(d[d$replied,])
+replied <- replied[complete.cases(replied),]
+
+replied.tbl <- as.data.frame(
+    tapply(replied$google, substr(as.character(replied$week), 1, 4), sum) /
+    tapply(replied$total, substr(as.character(replied$week), 1, 4), sum))
+
+colnames(replied.tbl) <- "prop.google"
+replied.tbl$year <- row.names(replied.tbl)
+row.names(replied.tbl) <- NULL
+
+ggplot(data=replied.tbl) + aes(x=year, y=prop.google) +
+    geom_bar(stat="identity")
+
+replied.tbl
+
+# Graph #1: Emails from Google Over Time
+#######################################################
+
+raw.data <- google.by.week(d)
+raw.data$google.prop <- NULL
+
+raw.data <- melt(raw.data, id.var="week")
+
+
+pdf(file="emails_gmail_over_time.pdf", width=10, height=6)
+
+ggplot(data=raw.data) + aes(x=week, y=value, color=variable, group=variable) +
+    geom_point() +
+    stat_smooth(method="loess", show_guide=FALSE) +
+    scale_color_discrete("", breaks=c("total", "google"),
+                         labels=c("All Emails", "From Google")) +
+    scale_x_date("Date") +
+    scale_y_continuous("Number of Emails")
+
+dev.off()
+
+# Graph #2: Proportions of Email from Google
+#######################################################
+
+prop.data <- rbind(cbind(google.by.week(d), subset="All Email"),
+                   cbind(google.by.week(d[d$replied,]), subset="Emails with Replies"))
+
+
+pdf(file="emails_gmail_prop_over_time.pdf", width=10, height=8)
+
+ggplot(data=prop.data) + aes(x=week, y=google.prop, size=total, group=subset) +
+    geom_point() + facet_grid(subset~.) +
+    scale_y_continuous("Proportion from Google", limits=c(0,1)) +
+    scale_x_date("Date") +
+    scale_size("Emails") +
+    stat_smooth(method="loess", show_guide=FALSE)
+
+dev.off()
diff --git a/count_gmail.py b/count_gmail.py
new file mode 100755
index 0000000..d379dae
--- /dev/null
+++ b/count_gmail.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""Print one tab-separated row of metadata per message in a Maildir.
+
+Usage: count_gmail.py MAILDIR > mail_metadata.tsv
+
+Columns: Maildir flags, Date header as seconds since the Unix epoch,
+Precedence header ("NA" when absent), and TRUE/FALSE for whether any
+Received header mentions a Google mail domain.
+"""
+
+import email.utils
+import mailbox
+import os.path
+import sys
+
+GOOGLE_DOMAINS = ("google.com", "gmail.com", "googlemail.com")
+
+
+def message_timestamp(msg):
+    """Return msg's Date header as epoch seconds, or None if unusable."""
+    raw_date = msg.get("Date")
+    if raw_date is None:
+        return None
+
+    parsed = email.utils.parsedate_tz(str(raw_date))
+    if parsed is None:
+        return None
+
+    try:
+        # mktime_tz() applies the sender's UTC offset; time.mktime() would
+        # reinterpret the sender's wall-clock time in this machine's zone.
+        return email.utils.mktime_tz(parsed)
+    except (OverflowError, ValueError):
+        return None
+
+
+def main(argv):
+    """Write a TSV row to stdout for each usable message in argv[1]."""
+    if len(argv) != 2:
+        sys.stderr.write("usage: %s MAILDIR\n" % argv[0])
+        return 2
+
+    # create=False: fail on a mistyped path instead of creating a new Maildir
+    md_name = os.path.expanduser(argv[1])
+    inbox = mailbox.Maildir(md_name, factory=None, create=False)
+
+    for msg in inbox:
+        date = message_timestamp(msg)
+        if date is None:
+            continue
+
+        # skip this message if there are no received headers (malformed?)
+        recvd_list = msg.get_all("Received")
+        if recvd_list is None:
+            continue
+
+        # flag the message if any relay in its Received chain is Google's
+        google = any(domain in str(recvd)
+                     for recvd in recvd_list
+                     for domain in GOOGLE_DOMAINS)
+
+        precedence = msg.get("Precedence")
+        if precedence is None:
+            precedence = "NA"
+
+        # put it together and output
+        print("\t".join([msg.get_flags(), str(date), str(precedence),
+                         "TRUE" if google else "FALSE"]))
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
-- 
2.39.5