From: Benjamin Mako Hill
Date: Sun, 8 Apr 2012 22:06:57 +0000 (-0400)
Subject: generate a second graph that shows the percentage of reviews
X-Git-Url: https://projects.mako.cc/source/noahs_yelp_analysis/commitdiff_plain/ae54477ea84b57cfaa834c193054b510ad2dd221?hp=d9c54e8cf56aa6db7df4823a535047d85b03f373

generate a second graph that shows the percentage of reviews

- also fix a few minor bugs including setting the correct order for the stars so it goes from highest to lowest as it is displayed
---

diff --git a/analysis.R b/analysis.R
index 13ccaf3..aadc02e 100644
--- a/analysis.R
+++ b/analysis.R
@@ -1,6 +1,7 @@
-cat("\n","Enter textfile name","\n") # prompt
+qcat("\n","Enter textfile name","\n") # prompt
 y<-readLines(n=1)
-x <- paste("lord-hobo-cambridge.txt",sep="")
+
+x <- "lord-hobo-cambridge.txt"
 d <- read.csv(x)[,c(1,2,3)]

 colnames(d) <- c("rating", "day.of.week", "day.in.review")
@@ -14,7 +15,8 @@ d$day.of.week[d$day.of.week == "3"] <- "Wednesday"
 d$day.of.week[d$day.of.week == "4"] <- "Thursday"
 d$day.of.week[d$day.of.week == "5"] <- "Friday"
 d$day.of.week[d$day.of.week == "6"] <- "Saturday"
-d$day.of.week <- factor(d$day.of.week, levels = c("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday"))
+d$day.of.week <- factor(d$day.of.week, levels = c("Monday","Tuesday","Wednesday",
+ "Thursday","Friday","Saturday","Sunday"))
 d$day.of.week <- as.factor(d$day.of.week)


@@ -22,15 +24,40 @@ t <- table(d$day.of.week, as.factor(d$rating))
 t2 <- prop.table(t,2)
 chisq.test(d$day.of.week, as.factor(d$rating))

-# draw a graph
+# compute and add a set of weights so that we can graph proportions
+# using the same data.frame
+w <- data.frame(1 / (table(d$day.of.week)/sum(table(d$day.of.week))))
+colnames(w) <- c("day.of.week", "plot.weight")
+d <- merge(d, w, by="day.of.week", all.x=TRUE, all.y=FALSE)
+
+# draw two graphs
 library(ggplot2)
-v <- paste(y,"-baaaarplot.png",sep="")
-png(v, width=8, height=6, unit="in", res=200)
-p <- qplot(day.of.week, data=d, fill=as.factor(rating), main=y)
-#p <- qplot(levels(d$day.of.week),as.factor(names(d$rating)), t, data=data.frame(t,levels(d$day.of.week)))
-p <- p + scale_x_discrete("Day of Week") + scale_y_continuous("yelp # star review") + scale_fill_discrete("")
+
+# generate the unscaled count graph
+filename.count <- paste(y,"-count-png",sep="")
+
+png(filename.count, width=8, height=6, unit="in", res=200)
+p <- qplot(day.of.week, data=d, fill=as.factor(rating), geom="bar", main=y)
+p <- p + scale_x_discrete("Day of Week") +
+ scale_y_continuous("yelp # star review") +
+ scale_fill_discrete("", breaks=as.character(rev(1:5)) )
+print(p)
+dev.off()
+
+# generate the scaled proportion graph
+filename.prop <- paste(y,"-proportion-png",sep="")
+
+max.value <- tapply(d$plot.weight, d$day.of.week, sum)[1]
+
+p <- qplot(day.of.week, data=d, fill=as.factor(rating), geom="bar",
+ weight=plot.weight, main=y)
+p <- p + scale_x_discrete("Day of Week") +
+ scale_y_continuous("percentage of reviews",
+ breaks=(0:4 * max.value/4), labels=seq(0,100, 25)) +
+ scale_fill_discrete("", breaks=as.character(rev(1:5)))
 print(p)
 dev.off()



+