# projects.mako.cc - harrypotter-wikipedia-cdsw-solutions/blob - solution4.py
#
# Q: Create graphs in a spreadsheet of the trend lines (i.e., edits per day
# over time) for the three most popular articles?
"""Write a per-day edit-count table for the most-edited articles.

Reads a tab-separated file that has (at least) ``title`` and ``timestamp``
columns, finds the articles with the most rows (edits), and writes a
tab-separated table with one row per day and one column per top article.
The table is meant to be loaded into a spreadsheet and charted.
"""

from collections import Counter
from csv import DictReader

INPUT_PATH = "hp_wiki.tsv"
OUTPUT_PATH = "hp_edits_by_day_top3_articles.tsv"
TOP_N = 3


def _count_edits_by_article(input_path):
    """Return a Counter mapping each article title to its number of rows."""
    with open(input_path, 'r', encoding="utf-8") as input_file:
        return Counter(
            row['title'] for row in DictReader(input_file, delimiter="\t")
        )


def _top_articles(edits_by_article, top_n):
    """Return the ``top_n`` titles with the highest counts, largest first.

    ``sorted`` is stable, so titles with equal counts keep the order in
    which they were first seen in the input.
    """
    ranked = sorted(edits_by_article, key=edits_by_article.get, reverse=True)
    return ranked[:top_n]


def _count_daily_edits(input_path, top_articles):
    """Return ``{day: {title: count}}`` restricted to ``top_articles``.

    Every day that appears has an entry for each top article (0 when that
    article had no edits on that day), so each output row is complete.
    Days on which none of the top articles were edited do not appear.
    """
    wanted = set(top_articles)
    article_edits_by_day = {}

    with open(input_path, 'r', encoding="utf-8") as input_file:
        for row in DictReader(input_file, delimiter="\t"):
            title = row['title']
            if title not in wanted:
                continue

            # The day is the first 10 characters of the timestamp.
            # NOTE(review): presumably a "YYYY-MM-DD" prefix -- confirm
            # against the data file.
            day = row['timestamp'][0:10]

            if day not in article_edits_by_day:
                article_edits_by_day[day] = dict.fromkeys(top_articles, 0)
            article_edits_by_day[day][title] += 1

    return article_edits_by_day


def _write_daily_edits(output_path, top_articles, article_edits_by_day):
    """Write the header and one tab-separated row per day to ``output_path``.

    Rows are written in sorted day order so a chart built from the file
    reads left to right in time (this is chronological when the day strings
    are ISO "YYYY-MM-DD" dates).
    """
    with open(output_path, "w", encoding='utf-8') as output_file:
        # write a header
        title_header_string = "\t".join(top_articles)
        output_file.write("day\t" + title_header_string + "\n")

        # iterate through every day and print out data into the file
        for day in sorted(article_edits_by_day):
            counts_for_day = article_edits_by_day[day]
            title_values_string = "\t".join(
                str(counts_for_day[title]) for title in top_articles
            )
            output_file.write("\t".join([day, title_values_string]) + "\n")


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH, top_n=TOP_N):
    """Build the edits-per-day table for the ``top_n`` most-edited articles.

    Args:
        input_path: tab-separated input with ``title`` and ``timestamp``
            columns.
        output_path: file to (over)write with the resulting table.
        top_n: how many of the most-edited articles to include.
    """
    # STEP 1: read in the input file and count by article
    edits_by_article = _count_edits_by_article(input_path)

    # STEP 2: find the list of the top articles
    top_articles = _top_articles(edits_by_article, top_n)

    # STEP 3: go back through the original data and this time count, per
    # day, only the edits to the top articles
    article_edits_by_day = _count_daily_edits(input_path, top_articles)

    # STEP 4: output the counts by day
    _write_daily_edits(output_path, top_articles, article_edits_by_day)


if __name__ == "__main__":
    main()

# Example of interactive graph in Google Docs:
# http://mako.cc/go/0h