projects.mako.cc - harrypotter-wikipedia-cdsw-solutions/blob - solution5.py
initial versions of solutions to harry potter challenges
[harrypotter-wikipedia-cdsw-solutions] / solution5.py
# Q: Instead of "binning" your dataset by day, change to bin it by month for each of the two previous questions.

# This is a modified version of solution4.py. Changing solution3.py
# would be similar.

from collections import Counter
from csv import DictReader

# Default locations of the tab-separated input and output files.
INPUT_PATH = "hp_wiki.tsv"
OUTPUT_PATH = "hp_edits_by_month_top3_articles.tsv"

# How many of the most-edited articles to report on.
TOP_N = 3


def _count_edits_by_article(input_path):
    """STEP 1: read in the input file and count rows (edits) by article.

    Returns a Counter mapping each value of the 'title' column to the
    number of rows that carry it.
    """
    # newline="" is what the csv module asks for on files it reads; the
    # "with" block closes the file even if a row is malformed.
    with open(input_path, 'r', encoding="utf-8", newline="") as input_file:
        rows = DictReader(input_file, delimiter="\t")
        return Counter(row['title'] for row in rows)


def _find_top_articles(edits_by_article, n=TOP_N):
    """STEP 2: return the list of the (at most) n most-edited titles.

    Titles are ordered from most to fewest edits. The sort is stable, so
    titles with equal counts stay in the order they were first seen.
    """
    ranked = sorted(edits_by_article, key=edits_by_article.get, reverse=True)
    return ranked[:n]


def _count_top_article_edits_by_month(input_path, top_articles):
    """STEP 3: go back through the data and count edits per month.

    Only rows whose title is in top_articles are counted. Returns a dict
    of the form {month: {title: count}} in which every month that appears
    has an entry for every top article (0 when it had no edits).
    """
    wanted = set(top_articles)
    article_edits_by_month = {}

    with open(input_path, 'r', encoding="utf-8", newline="") as input_file:
        for row in DictReader(input_file, delimiter="\t"):
            title = row['title']

            if title not in wanted:
                continue

            # NOTE: this line is the key difference: keeping the first 7
            # characters of the timestamp bins by month rather than by
            # day (presumably timestamps start with "YYYY-MM"; verify
            # against the data file).
            month = row['timestamp'][0:7]

            if month not in article_edits_by_month:
                # first edit seen in this month: start every top article
                # at zero so each output row has a value in every column
                article_edits_by_month[month] = dict.fromkeys(top_articles, 0)

            article_edits_by_month[month][title] += 1

    return article_edits_by_month


def _write_edits_by_month(output_path, article_edits_by_month, top_articles):
    """STEP 4: print it all out as a tab-separated file.

    Writes a header row ("month" followed by the article titles) and then
    one row per month with the edit count of each top article.
    """
    with open(output_path, "w", encoding='utf-8') as output_file:
        # write a header
        title_header_string = "\t".join(top_articles)
        output_file.write("month\t" + title_header_string + "\n")

        # Write the months in sorted order rather than in the order they
        # happened to appear in the input, so the rows are stable and
        # ready to graph.
        for month in sorted(article_edits_by_month):
            counts = article_edits_by_month[month]
            title_values_string = "\t".join(
                str(counts[title]) for title in top_articles
            )
            output_file.write("\t".join([month, title_values_string]) + "\n")


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Write the monthly edit counts of the most-edited articles.

    Reads the tab-separated file at input_path (it must have 'title' and
    'timestamp' columns), finds the TOP_N articles with the most rows and
    writes their per-month edit counts to output_path.
    """
    edits_by_article = _count_edits_by_article(input_path)
    top_articles = _find_top_articles(edits_by_article)
    article_edits_by_month = _count_top_article_edits_by_month(
        input_path, top_articles
    )
    _write_edits_by_month(output_path, article_edits_by_month, top_articles)


if __name__ == "__main__":
    main()

# Example of interactive graph in Google Docs:
# http://mako.cc/go/0i

Benjamin Mako Hill || Want to submit a patch?