initial versions of solutions to harry potter challenges
author Benjamin Mako Hill <mako@atdot.cc>
Mon, 4 May 2015 23:50:45 +0000 (16:50 -0700)
committer Benjamin Mako Hill <mako@atdot.cc>
Mon, 4 May 2015 23:50:45 +0000 (16:50 -0700)
.gitignore [new file with mode: 0644]
README.md [new file with mode: 0644]
solution1.py [new file with mode: 0644]
solution2-alternative.py [new file with mode: 0644]
solution2.py [new file with mode: 0644]
solution3.py [new file with mode: 0644]
solution4.py [new file with mode: 0644]
solution5.py [new file with mode: 0644]
solution6.py [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
new file mode 100644 (file)
index 0000000..5e9600f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/*.tsv
+*~
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..9064cbe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+## Overview
+
+First, you'll need to either run the `build_hpwp_dataset.py` script or download the `hp_wiki.tsv` file. All of the solutions assume there is a file called `hp_wiki.tsv` in the same directory. Because this file is quite big, I've not included it here.
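+
+For example, to build the dataset from scratch (assuming a Python 3
+environment, which the solutions also require):
+
+    python3 build_hpwp_dataset.py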
+
+## Questions
+
+1. What are the most edited Harry Potter articles on Wikipedia?
+2. Who are the 5 most active editors on Harry Potter articles? How many edits have they made?
+3. Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most active editors.
+4. Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most popular articles.
+5. Instead of "binning" your dataset by day, bin it by month for each of the two previous questions.
+6. Pick a different topic in Wikipedia and download a new dataset. Answer the questions above for this other dataset.
+
diff --git a/solution1.py b/solution1.py
new file mode 100644 (file)
index 0000000..44ec35e
--- /dev/null
+++ b/solution1.py
@@ -0,0 +1,23 @@
+# Q: What are the most edited Harry Potter articles on Wikipedia?
+
+from csv import DictReader
+
+# read in the input file and count edits per article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+# I used this answer here:
+# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
+
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    print(title)
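+
+# A more compact way to do the same counting -- a sketch using
+# collections.Counter, assuming the same hp_wiki.tsv input:
+#
+#   from collections import Counter
+#   with open("hp_wiki.tsv", 'r', encoding="utf-8") as input_file:
+#       edits_by_article = Counter(row['title'] for row in
+#                                  DictReader(input_file, delimiter="\t"))
+#   for title, edits in edits_by_article.most_common():
+#       print(title)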
diff --git a/solution2-alternative.py b/solution2-alternative.py
new file mode 100644 (file)
index 0000000..95a9810
--- /dev/null
+++ b/solution2-alternative.py
@@ -0,0 +1,30 @@
+# Q: Who are the 5 most active editors on Harry Potter articles? How many edits have they made?
+#
+# This alternative writes the per-editor counts to a TSV file so that
+# they can be sorted (e.g., in a spreadsheet) to find the top 5.
+
+from csv import DictReader
+
+# read in the input file and count edits per editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+# output the per-editor counts
+output_file = open("hp_edits_by_user.tsv", "w", encoding='utf-8')
+
+# write a header
+output_file.write("user\tedits\n")
+
+# iterate through every editor and write their count to the file
+for user in edits_by_editor:
+    output_file.write("\t".join([user, str(edits_by_editor[user])]) + "\n")
+
+output_file.close()
+
diff --git a/solution2.py b/solution2.py
new file mode 100644 (file)
index 0000000..56851e5
--- /dev/null
+++ b/solution2.py
@@ -0,0 +1,29 @@
+# Q: Who are the 5 most active editors on Harry Potter articles? How many edits have they made?
+
+from csv import DictReader
+
+# read in the input file and count edits per editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+# I used this answer here:
+# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
+
+num_printed = 0
+for user in sorted(edits_by_editor, key=edits_by_editor.get, reverse=True):
+    print(user + " : " + str(edits_by_editor[user]))
+    num_printed = num_printed + 1
+    if num_printed >= 5:
+        break
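+
+# If edits_by_editor were built as a collections.Counter (see the
+# sketch at the end of solution1.py), the whole top-5 step above would
+# reduce to:
+#
+#   for user, edits in edits_by_editor.most_common(5):
+#       print(user + " : " + str(edits))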
diff --git a/solution3.py b/solution3.py
new file mode 100644 (file)
index 0000000..1321cbf
--- /dev/null
+++ b/solution3.py
@@ -0,0 +1,68 @@
+# Q: Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most active editors.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count edits per editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 editors
+top_editors = []
+for user in sorted(edits_by_editor, key=edits_by_editor.get, reverse=True):
+    if len(top_editors) >= 3:
+        break
+    else:
+        top_editors.append(user)
+
+# STEP 3.1: first, create a dictionary of dictionaries, one per user
+user_edits_by_day = {}
+for user in top_editors:
+    user_edits_by_day[user] = {}
+
+
+# STEP 3.2: now fill it in by going back through the original data
+# and, this time, counting edits per day for just the three top editors
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+    
+    if user not in top_editors:
+        continue
+
+    day = row['timestamp'][0:10]
+
+    if day in user_edits_by_day[user]:
+        user_edits_by_day[user][day] = user_edits_by_day[user][day] + 1
+    else:
+        user_edits_by_day[user][day] = 1
+
+input_file.close()
+
+# STEP 4: print it all out
+# output the counts by day
+output_file = open("hp_edits_by_day_top3_users.tsv", "w", encoding='utf-8')
+
+# write a header
+output_file.write("user\tday\tedits\n")
+
+# iterate through each top editor and, within each, every day (sorted
+# so the trend line comes out in chronological order)
+for user in top_editors:
+    for day in sorted(user_edits_by_day[user]):
+        output_file.write("\t".join([user, day, str(user_edits_by_day[user][day])]) + "\n")
+
+output_file.close()
+
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0g
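+
+# Note: days on which an editor made no edits are simply absent from
+# this output, so a spreadsheet line chart may connect points across
+# gaps; filling missing days with zeros would be one way to avoid that.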
diff --git a/solution4.py b/solution4.py
new file mode 100644 (file)
index 0000000..e06013a
--- /dev/null
+++ b/solution4.py
@@ -0,0 +1,76 @@
+# Q: Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most popular articles.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count by article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 articles
+top_articles = []
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    if len(top_articles) >= 3:
+        break
+    else:
+        top_articles.append(title)
+
+
+# STEP 3: go back through the original data and, this time, count
+# edits per day for just the three most edited articles
+
+article_edits_by_day = {}
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title not in top_articles:
+        continue
+    
+    day = row['timestamp'][0:10]
+
+    if day in article_edits_by_day:
+        article_edits_by_day[day][title] = article_edits_by_day[day][title] + 1
+    else:
+        article_edits_by_day[day] = {}
+        for tmp_title in top_articles:
+            if tmp_title == title:
+                article_edits_by_day[day][tmp_title] = 1
+            else:
+                article_edits_by_day[day][tmp_title] = 0
+
+input_file.close()
+
+# STEP 4: print it all out
+# output the counts by day
+output_file = open("hp_edits_by_day_top3_articles.tsv", "w", encoding='utf-8')
+
+# write a header
+title_header_string = "\t".join(top_articles)
+
+output_file.write("day\t" + title_header_string + "\n")
+
+# iterate through every day (sorted chronologically) and write a row
+for day in sorted(article_edits_by_day):
+    title_values = []
+    for title in top_articles:
+        title_values.append(str(article_edits_by_day[day][title]))
+
+    title_values_string = "\t".join(title_values)
+    output_file.write("\t".join([day, title_values_string]) + "\n")
+
+output_file.close()
+
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0h
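+
+# The nested counting in STEP 3 could also be done with
+# collections.defaultdict, which avoids special-casing the first edit
+# seen on each day -- a sketch, assuming the same input file:
+#
+#   from collections import defaultdict
+#   article_edits_by_day = defaultdict(lambda: {t: 0 for t in top_articles})
+#   with open("hp_wiki.tsv", 'r', encoding="utf-8") as input_file:
+#       for row in DictReader(input_file, delimiter="\t"):
+#           if row['title'] in top_articles:
+#               article_edits_by_day[row['timestamp'][0:10]][row['title']] += 1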
diff --git a/solution5.py b/solution5.py
new file mode 100644 (file)
index 0000000..0299e6e
--- /dev/null
+++ b/solution5.py
@@ -0,0 +1,80 @@
+# Q: Instead of "binning" your dataset by day, bin it by month for each of the two previous questions.
+
+# This is a modified version of solution4.py. Changing solution3.py
+# would be similar.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count by article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 articles
+top_articles = []
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    if len(top_articles) >= 3:
+        break
+    else:
+        top_articles.append(title)
+
+
+# STEP 3: go back through the original data and, this time, count
+# edits per month for just the three most edited articles
+
+article_edits_by_month = {}
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title not in top_articles:
+        continue
+
+    # NOTE: this line is the key difference
+    month = row['timestamp'][0:7]
+
+    if month in article_edits_by_month:
+        article_edits_by_month[month][title] = article_edits_by_month[month][title] + 1
+    else:
+        article_edits_by_month[month] = {}
+        for tmp_title in top_articles:
+            if tmp_title == title:
+                article_edits_by_month[month][tmp_title] = 1
+            else:
+                article_edits_by_month[month][tmp_title] = 0
+
+input_file.close()
+
+# STEP 4: print it all out
+# output the counts by month
+output_file = open("hp_edits_by_month_top3_articles.tsv", "w", encoding='utf-8')
+
+# write a header
+title_header_string = "\t".join(top_articles)
+
+output_file.write("month\t" + title_header_string + "\n")
+
+# iterate through every month (sorted chronologically) and write a row
+for month in sorted(article_edits_by_month):
+    title_values = []
+    for title in top_articles:
+        title_values.append(str(article_edits_by_month[month][title]))
+
+    title_values_string = "\t".join(title_values)
+    output_file.write("\t".join([month, title_values_string]) + "\n")
+
+output_file.close()
+
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0i
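+
+# The same one-line change -- taking row['timestamp'][0:7] instead of
+# [0:10] -- is all that solution3.py would need to bin the per-editor
+# counts by month as well.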
diff --git a/solution6.py b/solution6.py
new file mode 100644 (file)
index 0000000..72a627f
--- /dev/null
+++ b/solution6.py
@@ -0,0 +1,10 @@
+# Q: Pick a different topic in Wikipedia and download a new
+# dataset. Answer the questions above for this other dataset.
+#
+# The solution involves editing the file build_hpwp_dataset.py and
+# changing the following line:
+#
+# category = "Harry Potter"
+#
+# If you change "Harry Potter" to any other category on Wikipedia, the
+# script will work the same way.
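+#
+# For example, to build a dataset about Doctor Who instead (assuming
+# "Doctor Who" is a valid Wikipedia category name):
+#
+#   category = "Doctor Who"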
