projects.mako.cc - noahs_yelp_analysis/commitdiff
generate a second graph that shows the percentage of reviews
author Benjamin Mako Hill <mako@atdot.cc>
Sun, 8 Apr 2012 22:06:57 +0000 (18:06 -0400)
committer Benjamin Mako Hill <mako@atdot.cc>
Sun, 8 Apr 2012 22:08:05 +0000 (18:08 -0400)
- also fix a few minor bugs including setting the correct order for the
  stars so it goes from highest to lowest as it is displayed

analysis.R

index 13ccaf30de2f40e32406feb03078907d4733e160..aadc02e6ccde4d9da620f181905b70c632b5f137 100644 (file)
@@ -1,6 +1,7 @@
-cat("\n","Enter textfile name","\n") # prompt
+qcat("\n","Enter textfile name","\n") # prompt
 y<-readLines(n=1) 
 y<-readLines(n=1) 
-x <- paste("lord-hobo-cambridge.txt",sep="")
+
+x <- "lord-hobo-cambridge.txt"
 d <- read.csv(x)[,c(1,2,3)]
 
 colnames(d) <- c("rating", "day.of.week", "day.in.review")
 d <- read.csv(x)[,c(1,2,3)]
 
 colnames(d) <- c("rating", "day.of.week", "day.in.review")
@@ -14,7 +15,8 @@ d$day.of.week[d$day.of.week == "3"] <- "Wednesday"
 d$day.of.week[d$day.of.week == "4"] <- "Thursday"
 d$day.of.week[d$day.of.week == "5"] <- "Friday"
 d$day.of.week[d$day.of.week == "6"] <- "Saturday"
 d$day.of.week[d$day.of.week == "4"] <- "Thursday"
 d$day.of.week[d$day.of.week == "5"] <- "Friday"
 d$day.of.week[d$day.of.week == "6"] <- "Saturday"
-d$day.of.week <- factor(d$day.of.week, levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
+d$day.of.week <- factor(d$day.of.week, levels = c("Monday","Tuesday","Wednesday",
+                                         "Thursday","Friday","Saturday","Sunday"))
 
 d$day.of.week <- as.factor(d$day.of.week) 
 
 
 d$day.of.week <- as.factor(d$day.of.week) 
 
@@ -22,15 +24,40 @@ t <- table(d$day.of.week, as.factor(d$rating))
 t2 <- prop.table(t,2)
 chisq.test(d$day.of.week, as.factor(d$rating))
 
 t2 <- prop.table(t,2)
 chisq.test(d$day.of.week, as.factor(d$rating))
 
-# draw a graph
+# compute and add a set of weights so that we can graph proportions
+# using the same data.frame
+w <- data.frame(1 / (table(d$day.of.week)/sum(table(d$day.of.week))))
+colnames(w) <- c("day.of.week", "plot.weight")
+d <- merge(d, w, by="day.of.week", all.x=TRUE, all.y=FALSE)
+
+# draw two graphs
 library(ggplot2)
 library(ggplot2)
-v <- paste(y,"-baaaarplot.png",sep="")
-png(v, width=8, height=6, unit="in", res=200)
-p <- qplot(day.of.week, data=d, fill=as.factor(rating), main=y)
-#p <- qplot(levels(d$day.of.week),as.factor(names(d$rating)), t, data=data.frame(t,levels(d$day.of.week)))
-p <- p + scale_x_discrete("Day of Week") + scale_y_continuous("yelp # star review") + scale_fill_discrete("") 
+
+# generate the unscaled count graph
+filename.count <- paste(y,"-count-png",sep="")
+
+png(filename.count, width=8, height=6, unit="in", res=200)
+p <- qplot(day.of.week, data=d, fill=as.factor(rating), geom="bar", main=y)
+p <- p + scale_x_discrete("Day of Week") +
+  scale_y_continuous("yelp # star review") +
+  scale_fill_discrete("", breaks=as.character(rev(1:5)) )
+print(p)
+dev.off()
+
+# generate the scaled proportion graph
+filename.prop <- paste(y,"-proportion-png",sep="")
+
+max.value <- tapply(d$plot.weight, d$day.of.week, sum)[1]
+
+p <- qplot(day.of.week, data=d, fill=as.factor(rating), geom="bar",
+           weight=plot.weight, main=y)
+p <- p + scale_x_discrete("Day of Week") +
+  scale_y_continuous("percentage of reviews",
+                     breaks=(0:4 * max.value/4), labels=seq(0,100, 25)) +
+  scale_fill_discrete("", breaks=as.character(rev(1:5)))
 print(p)
 dev.off()
 
 
 
 print(p)
 dev.off()
 
 
 
+

Benjamin Mako Hill || Want to submit a patch?