From: Benjamin Mako Hill
Date: Mon, 4 May 2015 23:50:45 +0000 (-0700)
Subject: initial versions of solutions to harry potter challenges
X-Git-Url: https://projects.mako.cc/source/harrypotter-wikipedia-cdsw-solutions/commitdiff_plain/HEAD

initial versions of solutions to harry potter challenges
---

b73ee1667f70d185493600c757a94a8e09aee86d
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5e9600f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/*.tsv
+*~
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9064cbe
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+## Overview
+
+First, you'll need to run the `build_hpwp_dataset.py` script or download the `hp_wiki.tsv` file. The rest of the answers assume you have a file called `hp_wiki.tsv` in the same directory. Because this file is quite big, I've not included it here.
+
+## Questions
+
+1. What are the most edited articles on Harry Potter on Wikipedia?
+2. Who are the 5 most active editors on articles in Harry Potter? How many edits have they made?
+3. Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most active editors.
+4. Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most popular articles.
+5. Instead of "binning" your dataset by day, bin it by month for each of the two previous questions.
+6. Pick a different topic in Wikipedia and download a new dataset. Answer the questions above for this other dataset.
+
diff --git a/solution1.py b/solution1.py
new file mode 100644
index 0000000..44ec35e
--- /dev/null
+++ b/solution1.py
@@ -0,0 +1,36 @@
+# Q: What are the most edited articles on Harry Potter on Wikipedia?
+
+from csv import DictReader
+
+# read in the input file and count edits by article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+# I used this answer here:
+# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
+
+# print each article and its edit count, most edited first
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    print(title + " : " + str(edits_by_article[title]))
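+
+# An alternative worth knowing about (a sketch, not part of the original
+# solution): the standard library's collections.Counter does this counting
+# for you, and its most_common() method returns (key, count) pairs already
+# sorted from most to least.
+from collections import Counter
+
+with open("hp_wiki.tsv", 'r', encoding="utf-8") as alt_file:
+    article_counts = Counter(row['title'] for row in DictReader(alt_file, delimiter="\t"))
+
+for title, edits in article_counts.most_common():
+    print(title + " : " + str(edits))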
diff --git a/solution2-alternative.py b/solution2-alternative.py
new file mode 100644
index 0000000..95a9810
--- /dev/null
+++ b/solution2-alternative.py
@@ -0,0 +1,30 @@
+# Q: Who are the 5 most active editors on articles in Harry Potter? How many edits have they made?
+
+from csv import DictReader
+
+# read in the input file and count edits by editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+# output the counts by editor
+output_file = open("hp_edits_by_user.tsv", "w", encoding='utf-8')
+
+# write a header
+output_file.write("user\tedits\n")
+
+# iterate through every editor and print out data into the file
+for user in edits_by_editor:
+    output_file.write("\t".join([user, str(edits_by_editor[user])]) + "\n")
+
+output_file.close()
+
diff --git a/solution2.py b/solution2.py
new file mode 100644
index 0000000..56851e5
--- /dev/null
+++ b/solution2.py
@@ -0,0 +1,29 @@
+# Q: Who are the 5 most active editors on articles in Harry Potter? How many edits have they made?
+
+from csv import DictReader
+
+# read in the input file and count edits by editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+# I used this answer here:
+# https://stackoverflow.com/questions/613183/sort-a-python-dictionary-by-value
+
+# print the top 5 editors and their edit counts
+num_printed = 0
+for user in sorted(edits_by_editor, key=edits_by_editor.get, reverse=True):
+    print(user + " : " + str(edits_by_editor[user]))
+    num_printed = num_printed + 1
+    if num_printed >= 5:
+        break
+
diff --git a/solution3.py b/solution3.py
new file mode 100644
index 0000000..1321cbf
--- /dev/null
+++ b/solution3.py
@@ -0,0 +1,83 @@
+# Q: Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most active editors.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count edits by editor
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_editor = {}
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user in edits_by_editor:
+        edits_by_editor[user] = edits_by_editor[user] + 1
+    else:
+        edits_by_editor[user] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 editors
+top_editors = []
+for user in sorted(edits_by_editor, key=edits_by_editor.get, reverse=True):
+    if len(top_editors) >= 3:
+        break
+    else:
+        top_editors.append(user)
+
+# STEP 3.1: first, create a dictionary of dictionaries, one per user
+user_edits_by_day = {}
+for user in top_editors:
+    user_edits_by_day[user] = {}
+
+
+# STEP 3.2: now fill those in by going back through the original data and
+# counting edits again, this time per day and only for the three top
+# editors
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    user = row['user']
+
+    if user not in top_editors:
+        continue
+
+    day = row['timestamp'][0:10]
+
+    if day in user_edits_by_day[user]:
+        user_edits_by_day[user][day] = user_edits_by_day[user][day] + 1
+    else:
+        user_edits_by_day[user][day] = 1
+
+input_file.close()
+
+
+# STEP 4: print it all out
+# output the counts by day
+output_file = open("hp_edits_by_day_top3_users.tsv", "w", encoding='utf-8')
+
+# write a header
+output_file.write("user\tday\tedits\n")
+
+# iterate through every editor and day and print out data into the file
+for user in top_editors:
+    for day in user_edits_by_day[user]:
+        output_file.write("\t".join([user, day, str(user_edits_by_day[user][day])]) + "\n")
+
+output_file.close()
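+
+# One more caveat (a sketch beyond the original solution): dictionary key
+# order isn't guaranteed to be chronological, so the rows above may appear
+# in whatever order each day was first seen. Because the day strings are
+# ISO-formatted (YYYY-MM-DD), sorting them as plain strings sorts them by
+# date; this writes a date-sorted copy of the same table (the "_sorted"
+# filename is just for illustration):
+output_file = open("hp_edits_by_day_top3_users_sorted.tsv", "w", encoding='utf-8')
+output_file.write("user\tday\tedits\n")
+for user in top_editors:
+    for day in sorted(user_edits_by_day[user]):
+        output_file.write("\t".join([user, day, str(user_edits_by_day[user][day])]) + "\n")
+output_file.close()
+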
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0g
diff --git a/solution4.py b/solution4.py
new file mode 100644
index 0000000..e06013a
--- /dev/null
+++ b/solution4.py
@@ -0,0 +1,92 @@
+# Q: Create graphs in a spreadsheet of the trend lines (i.e., edits per day over time) for the three most popular articles.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count edits by article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 articles
+top_articles = []
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    if len(top_articles) >= 3:
+        break
+    else:
+        top_articles.append(title)
+
+
+# STEP 3: go back through the original data and count edits again, this
+# time per day and only for the three most edited articles, storing the
+# counts in a dictionary of dictionaries keyed by day
+
+article_edits_by_day = {}
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title not in top_articles:
+        continue
+
+    day = row['timestamp'][0:10]
+
+    if day in article_edits_by_day:
+        article_edits_by_day[day][title] = article_edits_by_day[day][title] + 1
+    else:
+        article_edits_by_day[day] = {}
+        for tmp_title in top_articles:
+            if tmp_title == title:
+                article_edits_by_day[day][tmp_title] = 1
+            else:
+                article_edits_by_day[day][tmp_title] = 0
+
+input_file.close()
+
+
+# STEP 4: print it all out
+# output the counts by day
+output_file = open("hp_edits_by_day_top3_articles.tsv", "w", encoding='utf-8')
+
+# write a header
+title_header_string = "\t".join(top_articles)
+
+output_file.write("day\t" + title_header_string + "\n")
+
+# iterate through every day and print out data into the file
+for day in article_edits_by_day:
+    title_values = []
+    for title in top_articles:
+        title_values.append(str(article_edits_by_day[day][title]))
+
+    title_values_string = "\t".join(title_values)
+    output_file.write("\t".join([day, title_values_string]) + "\n")
+
+output_file.close()
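+
+# A more compact variant (a sketch, not part of the original solution):
+# collections.defaultdict can create the per-day dictionary of zeroes on
+# first access, which removes the explicit if/else initialization above.
+# This recomputes the same counts as STEP 3:
+from collections import defaultdict
+
+compact_counts = defaultdict(lambda: dict.fromkeys(top_articles, 0))
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    if row['title'] in top_articles:
+        compact_counts[row['timestamp'][0:10]][row['title']] += 1
+input_file.close()
+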
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0h
diff --git a/solution5.py b/solution5.py
new file mode 100644
index 0000000..0299e6e
--- /dev/null
+++ b/solution5.py
@@ -0,0 +1,82 @@
+# Q: Instead of "binning" your dataset by day, bin it by month for each of the two previous questions.
+
+# This is a modified version of solution4.py. Changing solution3.py
+# would be similar.
+
+from csv import DictReader
+
+# STEP 1: read in the input file and count edits by article
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+
+edits_by_article = {}
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title in edits_by_article:
+        edits_by_article[title] = edits_by_article[title] + 1
+    else:
+        edits_by_article[title] = 1
+
+input_file.close()
+
+
+# STEP 2: find the list of the top 3 articles
+top_articles = []
+for title in sorted(edits_by_article, key=edits_by_article.get, reverse=True):
+    if len(top_articles) >= 3:
+        break
+    else:
+        top_articles.append(title)
+
+
+# STEP 3: go back through the original data and count edits again, this
+# time per month and only for the three most edited articles, storing the
+# counts in a dictionary of dictionaries keyed by month
+
+article_edits_by_month = {}
+
+input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
+for row in DictReader(input_file, delimiter="\t"):
+    title = row['title']
+
+    if title not in top_articles:
+        continue
+
+    # NOTE: this line is the key difference
+    month = row['timestamp'][0:7]
+
+    if month in article_edits_by_month:
+        article_edits_by_month[month][title] = article_edits_by_month[month][title] + 1
+    else:
+        article_edits_by_month[month] = {}
+        for tmp_title in top_articles:
+            if tmp_title == title:
+                article_edits_by_month[month][tmp_title] = 1
+            else:
+                article_edits_by_month[month][tmp_title] = 0
+
+input_file.close()
+
+
+# STEP 4: print it all out
+# output the counts by month
+output_file = open("hp_edits_by_month_top3_articles.tsv", "w", encoding='utf-8')
+
+# write a header
+title_header_string = "\t".join(top_articles)
+
+output_file.write("month\t" + title_header_string + "\n")
+
+# iterate through every month and print out data into the file
+for month in article_edits_by_month:
+    title_values = []
+    for title in top_articles:
+        title_values.append(str(article_edits_by_month[month][title]))
+
+    title_values_string = "\t".join(title_values)
+    output_file.write("\t".join([month, title_values_string]) + "\n")
+
+output_file.close()
+
+# Example of interactive graph in Google Docs:
+# http://mako.cc/go/0i
diff --git a/solution6.py b/solution6.py
new file mode 100644
index 0000000..72a627f
--- /dev/null
+++ b/solution6.py
@@ -0,0 +1,17 @@
+# Q: Pick a different topic in Wikipedia and download a new
+# dataset. Answer the questions above for this other dataset.
+#
+# The solution involves editing the file build_hpwp_dataset.py and
+# changing the following line:
+#
+# category = "Harry Potter"
+#
+# If you change "Harry Potter" to any other category on Wikipedia, the
+# script will work the same way.
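+#
+# For example (the category name here is just an illustration):
+#
+#   category = "Star Wars"
+#
+# would build a dataset of edits to articles in Wikipedia's "Star Wars"
+# category, which the other solution scripts could then analyze unchanged.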