X-Git-Url: https://projects.mako.cc/source/harrypotter-wikipedia-cdsw/blobdiff_plain/ce5c13c094d659125fe85d59b9bc0e4c2bf40072..3248892a26a9f80a1a8d6ef5da9ad89a26ca03df:/build_harry_potter_dataset.ipynb diff --git a/build_harry_potter_dataset.ipynb b/build_harry_potter_dataset.ipynb new file mode 100644 index 0000000..a9a62a5 --- /dev/null +++ b/build_harry_potter_dataset.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_article_revisions(title):\n", + " revisions = []\n", + "\n", + " # create a base url for the api and then a normal url which is initially\n", + " # just a copy of it\n", + " # The following line is what the requests call is doing, basically.\n", + " # \"http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\".format(title)\n", + " wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n", + "\n", + " parameters = {'action' : 'query',\n", + " 'titles' : title,\n", + " 'prop' : 'revisions',\n", + " 'rvprop' : 'flags|timestamp|user|size|ids',\n", + " 'rvlimit' : 500,\n", + " 'format' : 'json',\n", + " 'continue' : '' }\n", + "\n", + " # we'll repeat this forever (i.e., we'll only stop when we find\n", + " # the \"break\" command)\n", + " while True:\n", + " # the first line open the urls but also handles unicode urls\n", + " call = requests.get(wp_api_url, params=parameters)\n", + " api_answer = call.json()\n", + "\n", + " # get the list of pages from the json object\n", + " pages = api_answer[\"query\"][\"pages\"]\n", + "\n", + " # for every page, (there should always be only one) get its revisions:\n", + " for page in pages.keys():\n", + " query_revisions = pages[page][\"revisions\"]\n", + "\n", + " # for every revision, first we do 
some cleaning up\n", + " for rev in query_revisions:\n", + " #print(rev)\n", + " # let's continue/skip this revision if the user is hidden\n", + " if \"userhidden\" in rev:\n", + " continue\n", + " \n", + " # 1: add a title field for the article because we're going to mix them together\n", + " rev[\"title\"] = title\n", + "\n", + " # 2: let's \"recode\" anon so it's true or false instead of present/missing\n", + " if \"anon\" in rev:\n", + " rev[\"anon\"] = True\n", + " else:\n", + " rev[\"anon\"] = False\n", + "\n", + " # 3: let's recode \"minor\" in the same way\n", + " if \"minor\" in rev:\n", + " rev[\"minor\"] = True\n", + " else:\n", + " rev[\"minor\"] = False\n", + "\n", + " # we're going to change the timestamp to make it work a little better in excel/spreadsheets\n", + " rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"T\", \" \")\n", + " rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"Z\", \"\")\n", + "\n", + " # finally, save the revisions we've seen to a variable\n", + " revisions.append(rev)\n", + "\n", + " # 'continue' tells us there are more revisions to add\n", + " if 'continue' in api_answer:\n", + " # replace the 'continue' parameter with the contents of the\n", + " # api_answer dictionary.\n", + " parameters.update(api_answer['continue'])\n", + " else:\n", + " break\n", + "\n", + " # return all the revisions for this page\n", + " return(revisions)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "category = \"Harry Potter\"\n", + "\n", + "# we'll use another api called petscan (formerly catscan2) to grab a list of pages in\n", + "# categories and subcategories. 
it works like all the other apis we've\n", + "# studied!\n", + "#\n", + "# The following requests call basically does the same thing as this string:\n", + "# \"https://petscan.wmflabs.org/?depth=10&categories={0}&format=json&doit=1\".format(category)\n", + "url_catscan = \"https://petscan.wmflabs.org/\"\n", + "\n", + "parameters = {'depth' : 10,\n", + " 'categories' : category,\n", + " 'format' : 'json',\n", + " 'doit' : 1}\n", + "\n", + "# r = requests.get(\"http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories=Harry Potter&doit=1&format=json\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r = requests.get(url_catscan, params=parameters)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "articles_json = r.json()\n", + "articles = articles_json[\"*\"][0][\"a\"][\"*\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# open a file to print the header\n", + "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n", + "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# for every article\n", + "for article in articles[0:10]:\n", + " # skip this unless it's an article\n", + " if article[\"namespace\"] != 0:\n", + " continue\n", + "\n", + " # first grab the article's title\n", + " title = article[\"title\"]\n", + " print(title)\n", + "\n", + " # get the list of revisions from our function and then iterate through it,\n", + " # printing it to our output file\n", + " revisions = get_article_revisions(title)\n", + " for rev in revisions:\n", + " print(\"\\t\".join([rev[\"title\"], rev[\"user\"], rev[\"timestamp\"],\n", + " str(rev[\"size\"]), 
str(rev[\"anon\"]),\n", + " str(rev[\"minor\"]), str(rev[\"revid\"])]),\n", + " file=output_file)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# close the file, we're done here!\n", + "output_file.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}