From: Benjamin Mako Hill Date: Mon, 27 Apr 2015 22:35:56 +0000 (-0700) Subject: initial import of solutions for wikipedia data challenges X-Git-Url: https://projects.mako.cc/source/wikipedia-cdsw-solutions/commitdiff_plain?ds=inline initial import of solutions for wikipedia data challenges --- ad99cfbdf863dad5de738e8f98e6058e5e4a23a4 diff --git a/README.md b/README.md new file mode 100644 index 0000000..3c1ff10 --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +## Questions + +1. Save the revision metadata printed in wikipedia1-2.py to a file called "wikipedia_revisions.tsv". +2. Print out the revision ids and edit summaries (i.e., comment) of each revision for the article on Python. +3. Find out what other data or metadata you can print out for a revision for an article. +4. Which article is in more categories? Python (programming language) or R (programming language)? +5. Find out how many revisions to the article on "Python (programming language)" were made by user "Peterl"? How about "Hfastedge"? +6. How would you use the API to find out how many revisions/edits the user "Benjamin Mako Hill" has made to Wikipedia? +7. Can you build a list of all of the articles edited by "Benjamin Mako Hill"? What is the article with the longest title that user "Benjamin Mako Hill" has edited? +8. How many edits to the article "Python (programming language)" were made in 2014? + +## About + +Answers to the wikipedia-data-examples project created for the Community Data Science Workshop. + +Answers were created by: Benjamin Mako Hill + +All answers are written for Python 3. diff --git a/solution_1.py b/solution_1.py new file mode 100644 index 0000000..958c319 --- /dev/null +++ b/solution_1.py @@ -0,0 +1,38 @@ +# 1. Save the revision metadata printed in wikipedia1-2.py to a file called "wikipedia_revisions.tsv". 
+ + +import requests + +# raw string: +# ?action=query&prop=revisions&titles=Python_(programming_language)&rvlimit=500&rvprop=timestamp|user&format=json') + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 500, + 'rvprop' : "timestamp|user", + 'format' : 'json', + 'continue' : ''} + +output_file = open("wikipedia_revisions.tsv", 'w') + +# run a "while True" loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + page_title = response["query"]["pages"][page_id]["title"] + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + print(page_title + "\t" + rev["user"] + "\t" + rev["timestamp"], file=output_file) + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + + +output_file.close() diff --git a/solution_2.py b/solution_2.py new file mode 100644 index 0000000..05d2476 --- /dev/null +++ b/solution_2.py @@ -0,0 +1,31 @@ +# 2. Print out the revision ids and edit summaries (i.e., comment) of each revision for the article on Python. 
+ +import requests + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 100, + # changed this line to add ids|comment + 'rvprop' : "ids|comment", + 'format' : 'json', + 'continue' : ''} + +# run a "while True" loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + # changed this line to add revid and comment + print(str(rev["revid"]) + "\t" + rev["comment"]) + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + diff --git a/solution_3.py b/solution_3.py new file mode 100644 index 0000000..376c2e8 --- /dev/null +++ b/solution_3.py @@ -0,0 +1,23 @@ +# Find out what other data or metadata you can print out for a +# revision for an article. +# +# The answer to this question comes down to the possible options you +# can give to the "rvprop" parameter to requests. 
You can find that +# list here: https://www.mediawiki.org/wiki/API:Revisions#Parameters +# +# Here's the list at the time of writing: +# +# rvprop: Which properties to get for each revision +# ids: Get both of these IDs: revid, parentid (default) MW 1.11+ +# flags: Whether the revision was a minor edit (default) MW 1.11+ +# timestamp: The date and time the revision was made (default) +# user: The user who made the revision, as well as userhidden and anon flags (default) MW 1.8 +# userid: User id of revision creator, as well as userhidden and anon flags MW 1.17+ +# sha1: SHA-1 (base 16) of the revision MW 1.19+ +# contentmodel: Content model id of the revision MW 1.21+ +# comment: The edit comment (default) +# parsedcomment: The edit/log comment in HTML format with wikilinks and section references expanded into hyperlinks MW 1.16 +# size: The size of the revision text in bytes MW 1.11+ +# content: The revision content. If set, the maximum limit will be 10 times as low +# tags: Any tags for this revision, such as those added by AbuseFilter MW 1.16+ + diff --git a/solution_4.py b/solution_4.py new file mode 100644 index 0000000..5bdf189 --- /dev/null +++ b/solution_4.py @@ -0,0 +1,33 @@ +# 4. Which article is in more categories? Python (programming language) or R (programming language)? 
+ +import requests + +article_list = ["Python (programming language)", "R (programming language)"] + +for article in article_list: + # Get the list of categories + parameters = {'action' : 'query', + 'titles' : article, + 'prop' : 'categories', + 'format' : 'json', + 'continue' : ''} + + # reset the counter to zero once per article + counter = 0 + + # run a "while True" loop + while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + for category in response["query"]["pages"][page_id]['categories']: + counter = counter + 1 + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + + output_line = "%s: %s categories" % (article, counter) + print(output_line) diff --git a/solution_5-advanced.py b/solution_5-advanced.py new file mode 100644 index 0000000..fedf8ab --- /dev/null +++ b/solution_5-advanced.py @@ -0,0 +1,43 @@ +# 5. Find out how many revisions to the article on "Python +# (programming language)" were made by user "Peterl"? How about +# "Hfastedge"? 
+ +import requests + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 500, + 'rvprop' : "ids|user", + 'format' : 'json', + 'continue' : ''} + +user_counts = {} + +# run a "while True" loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + current_user = rev['user'] + + if current_user in user_counts: + user_counts[current_user] = user_counts[current_user] + 1 + else: + user_counts[current_user] = 1 + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + + +# now that we've built up the dictionary, let's print it out +for editor in user_counts.keys(): + print("%s made %s edits" % (editor, user_counts[editor])) + diff --git a/solution_5.py b/solution_5.py new file mode 100644 index 0000000..cd63da8 --- /dev/null +++ b/solution_5.py @@ -0,0 +1,42 @@ +# 5. Find out how many revisions to the article on "Python +# (programming language)" were made by user "Peterl"? How about +# "Hfastedge"? 
+ +import requests + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 500, + 'rvprop' : "ids|user", + 'format' : 'json', + 'continue' : ''} + +user_list = ['Peterl', 'Hfastedge'] + +counter_peterl = 0 +counter_hfastedge = 0 + +# run a "while True" loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + if rev['user'] == "Peterl": + counter_peterl = counter_peterl + 1 + if rev['user'] == "Hfastedge": + counter_hfastedge = counter_hfastedge + 1 + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + + +print("Peterl made %s edits" % counter_peterl) +print("Hfastedge made %s edits" % counter_hfastedge) diff --git a/solution_6.py b/solution_6.py new file mode 100644 index 0000000..4aaf38e --- /dev/null +++ b/solution_6.py @@ -0,0 +1,17 @@ +# 6. How would you use the API to find out how many revisions/edits the user "Benjamin Mako Hill" has made to Wikipedia? + +# I found documentation here: https://www.mediawiki.org/wiki/API:Users + +import requests + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'list' : 'users', + 'ususers' : 'Benjamin Mako Hill', + 'usprop' : 'editcount', + 'format' : 'json' } + +wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) +response = wp_call.json() + +print(response['query']['users'][0]['editcount']) diff --git a/solution_7.py b/solution_7.py new file mode 100644 index 0000000..107cc2b --- /dev/null +++ b/solution_7.py @@ -0,0 +1,44 @@ +# 7. Can you build a list of all of the articles edited by "Benjamin +# Mako Hill"? 
What is the article with the longest title that user +# Benjamin Mako Hill has edited? + +# Step 1: Searching around on Google, I found this documentation which +# seemed like the right way to answer this question: +# https://www.mediawiki.org/wiki/API:Usercontribs + +import requests + +edited_pages = [] + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'list' : 'usercontribs', + 'ucuser' : 'Benjamin Mako Hill', + 'uclimit' : 500, + 'ucprop' : 'title', + 'format' : 'json', + 'continue' : ''} + +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + contribs = response['query']['usercontribs'] + + for contrib in contribs: + if contrib['title'] not in edited_pages: + edited_pages.append(contrib['title']) + + # keep looping if we need to continue + if 'continue' in response: + parameters.update(response['continue']) + else: + break + +# print out the list of pages +counter = 0 +for page in edited_pages: + counter = counter + 1 + print(page) + +print("TOTAL PAGES: %s" % counter) diff --git a/solution_8.py b/solution_8.py new file mode 100644 index 0000000..89586f0 --- /dev/null +++ b/solution_8.py @@ -0,0 +1,37 @@ +# 8. How many edits to the article "Python (programming language)" +# were made in 2014? 
+ +# Useful page is here: https://www.mediawiki.org/wiki/API:Revisions + +import requests + + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 500, + 'rvprop' : "timestamp", + 'format' : 'json', + 'continue' : ''} + +edits_in_2014 = 0 + +# run a "while True" loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + if rev['timestamp'][0:4] == "2014": + edits_in_2014 = edits_in_2014 + 1 + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + +print(edits_in_2014)