import of github code used for the hackathon
[github-barcamp-201407] / analysis.R
1 # load up libraries and data
2 #######################################################################
3 library(ggplot2)
4 library(reshape2)
5 library(parallel)
6
# Read the one-week event dump, keeping text columns as character vectors.
d <- read.delim("gitdata_1week.tsv", stringsAsFactors=FALSE)

# The url column is not used by anything below.
d[["url"]] <- NULL

# Replace every "T" in the timestamp with a space, then parse it as a
# date-time (as.POSIXct uses the session's time zone here).
d$date <- as.POSIXct(gsub("T", " ", d$date))

# crop the event off the end
d$type <- gsub("Event$", "", d$type)

# Frequency tables, most frequent first: all event types, then the top
# entries by name and by actor.
rev(sort(table(d$type)))
head(rev(sort(table(d$name))))
head(rev(sort(table(d$actor))))

# Bin every event into its minute and its hour (factor columns).
d[c("min", "hour")] <- lapply(c("min", "hour"),
                              function (unit) cut(d$date, breaks=unit))
22
23 # generate graphs
24 ######################################################################
25
# Every plot from here until dev.off() becomes one page of this PDF.
pdf("github_graphs.pdf", width=10, height=7)

# graph all together: number of events in each hourly bin
grid.tmp <- as.data.frame(table(d$hour))
colnames(grid.tmp) <- c("date", "freq")
grid.tmp$date <- as.POSIXct(grid.tmp$date)

# Plots are print()ed explicitly: a ggplot object is only drawn when it is
# printed, and top-level auto-printing does not happen when this script is
# run through source().
print(ggplot(data=grid.tmp) + aes(x=date, y=freq) + geom_line())

# just types of events: tabulate the event types within each hour, then
# melt the nested list into long form (renamed to value/event/date below)
grid.tmp <- melt(lapply(tapply(d$type, d$hour, table), as.list), L2~L1)
colnames(grid.tmp) <- c("value", "event", "date")
grid.tmp$date <- as.POSIXct(grid.tmp$date)

# one line per event type
print(ggplot(data=grid.tmp) +
    aes(x=date, y=value, group=event, color=event) + geom_line())

# bars per event type
# NOTE(review): color= only affects the bar outlines; fill=event may have
# been intended -- confirm before changing the look of the plot.
print(ggplot(data=grid.tmp) +
    aes(x=date, y=value, group=event, color=event) + geom_bar(stat="identity"))

# one facet per event type, each with its own y axis
print(ggplot(data=grid.tmp) +
    aes(x=date, y=value, group=event, color=event, size=value) + geom_line() +
    facet_grid(event~., scales="free_y"))
49
50 # create first differences
# Build the hourly dataset that the phase diagrams are drawn from.
#
# Args:
#   d: data frame with one row per event and an `hour` factor column
#      (the hourly bin of the event).
#
# Returns:
#   A data frame with one row per hourly bin, excluding the first and the
#   last bin, with columns:
#     date          start of the bin (POSIXct)
#     freq          number of events in the bin
#     freq2         number of events in the previous bin
#     diff          freq2 - freq (previous minus current)
#     hour.of.week  1-based rank of the bin among the rows that are kept
#     hour          hour.of.week %% 24
#   Inputs with fewer than three bins yield a zero-row data frame.
build.phase.diagram.dataset <- function (d) {
    grid.tmp <- as.data.frame(table(d$hour))
    colnames(grid.tmp) <- c("date", "freq")
    grid.tmp$date <- as.POSIXct(grid.tmp$date)

    n <- nrow(grid.tmp)

    # Lag the counts by one bin. Indexing with seq_len(n) stays correct for
    # n < 2, where 1:(n-1) would count downwards.
    grid.tmp$freq2 <- c(NA, grid.tmp$freq)[seq_len(n)]
    grid.tmp$diff <- grid.tmp$freq2 - grid.tmp$freq

    # Drop the first bin (it has no predecessor, so freq2 is NA) and the
    # last bin. With fewer than three bins nothing is kept; 2:(n-1) would
    # instead select rows in reverse order.
    keep <- seq_len(n)
    keep <- keep[keep > 1 & keep < n]
    grid.tmp <- grid.tmp[keep, , drop=FALSE]

    # Rank of each remaining bin by date. `hour` is that rank modulo 24,
    # not the clock hour read from the timestamp.
    bin.rank <- as.numeric(as.factor(grid.tmp$date))
    grid.tmp$hour.of.week <- bin.rank
    grid.tmp$hour <- bin.rank %% 24
    grid.tmp
}
62
# Phase diagram for a single event type.
#
# Args:
#   subset: event type to plot; rows of the global `d` whose `type` equals
#           this value are used.
#
# Returns:
#   A ggplot object: a path through (diff, freq2), with line size and
#   colour mapped to `hour`, titled "Phase Diagram: <subset>".
phase.diagram <- function (subset) {
    events.of.type <- d[d$type == subset,]
    phase.data <- build.phase.diagram.dataset(events.of.type)
    plot.title <- paste("Phase Diagram:", subset)

    ggplot(phase.data) +
        aes(x=diff, y=freq2, size=hour, colour=hour) +
        geom_path() +
        ggtitle(plot.title)
}
69
# Phase diagram over all events, regardless of type.
# Plots are print()ed explicitly: a ggplot object is only drawn when it is
# printed, and top-level auto-printing does not happen under source().
grid.tmp <- build.phase.diagram.dataset(d)
print(ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) +
    geom_path() + ggtitle("Phase Diagram: All Activity"))

# One phase diagram per selected event type (inside a loop nothing is
# auto-printed, so print() is required here).
for (event.type in c("Push", "Watch", "Fork", "PullRequest")) {
    print(phase.diagram(event.type))
}

# Close the PDF device so the file is written out completely.
dev.off()

Benjamin Mako Hill || Want to submit a patch?