projects.mako.cc - harrypotter-wikipedia-cdsw-solutions/blob - solution5.py

   1 # Q: Instead of "binning" your dataset by day, change to bin it by month for each of the two previous questions.
   2
   3 # This is a modified version of solution4.py. Changing solution3.py
   4 # would be similar.
   5
   6 from csv import DictReader
   7
   8 # STEP 1: read in the input file and count by article
   9 input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
  10
  11 edits_by_article = {}
  12 for row in DictReader(input_file, delimiter="\t"):
  13     title = row['title']
  14
  15     if title in edits_by_article:
  16         edits_by_article[title] = edits_by_article[title] + 1
  17     else:
  18         edits_by_article[title] = 1
  19
  20 input_file.close()
  21
  22
  23 # STEP 2: find the list of the top 3 articles
  24 top_articles = []
  25 for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
  26     if len(top_articles) >= 3:
  27         break
  28     else:
  29         top_articles.append(title)
  30
  31
  32 # STEP 3: now, fill that by doing a version of the first count by
  33 # going back through the original data and this time just count each
  34 # of the three articles
  35
  36 article_edits_by_month = {}
  37
  38 input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
  39 for row in DictReader(input_file, delimiter="\t"):
  40     title = row['title']
  41
  42     if title not in top_articles:
  43         continue
  44
  45     # NOTE: this line is the key difference
  46     month = row['timestamp'][0:7]
  47
  48     if month in article_edits_by_month:
  49         article_edits_by_month[month][title] = article_edits_by_month[month][title] + 1
  50     else:
  51         article_edits_by_month[month] = {}
  52         for tmp_title in top_articles:
  53             if tmp_title == title:
  54                 article_edits_by_month[month][tmp_title] = 1
  55             else:
  56                 article_edits_by_month[month][tmp_title] = 0
  57
  58
  59 # STEP 4: print it all out
  60 # output the counts by month
  61 output_file = open("hp_edits_by_month_top3_articles.tsv", "w", encoding='utf-8')
  62
  63 # write a header
  64 title_header_string = "\t".join(top_articles)
  65
  66 output_file.write("month\t" + title_header_string + "\n")
  67
  68 # iterate through every month and print out data into the file
  69 for month in article_edits_by_month:
  70     title_values = []
  71     for title in top_articles:
  72         title_values.append(str(article_edits_by_month[month][title]))
  73
  74     title_values_string = "\t".join(title_values)
  75     output_file.write("\t".join([month, title_values_string]) + "\n")
  76
  77 output_file.close()
  78
  79 # Example of interactive graph in Google Docs:
  80 # http://mako.cc/go/0i