import requests


def get_article_revisions(title):
    """Fetch the full revision history of a Wikipedia article.

    Pages through the MediaWiki API 500 revisions at a time, repeating
    until the API stops returning a 'continue' token.

    Parameters
    ----------
    title : str
        The article title, e.g. "Harry Potter".

    Returns
    -------
    list of dict
        One dict per revision with keys: title, user, timestamp, size,
        anon, minor, revid (plus whatever else the API returned).
        Revisions whose user is hidden are skipped.
    """
    revisions = []

    # Use https: Wikipedia force-redirects plain http, which would cost an
    # extra round-trip on every single request.
    # The requests call below is equivalent to fetching:
    # f"https://en.wikipedia.org/w/api.php/?action=query&titles={title}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue="
    wp_api_url = "https://en.wikipedia.org/w/api.php/"

    parameters = {'action': 'query',
                  'titles': title,
                  'prop': 'revisions',
                  'rvprop': 'flags|timestamp|user|size|ids',
                  'rvlimit': 500,
                  'format': 'json',
                  'continue': ''}

    # Repeat forever; we only stop when we hit the "break" below.
    while True:
        # requests handles URL-encoding of unicode titles for us
        call = requests.get(wp_api_url, params=parameters)
        api_answer = call.json()

        # get the list of pages from the json object
        pages = api_answer["query"]["pages"]

        # For every page (there should always be only one) get its
        # revisions.  A missing or invalid title yields a page entry with
        # no "revisions" key at all, so use .get() with a default instead
        # of indexing to avoid a KeyError.
        for page in pages:
            for rev in pages[page].get("revisions", []):
                # skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field for the article because we're going
                # to mix revisions from many articles together downstream
                rev["title"] = title

                # 2 & 3: "recode" anon and minor as True/False booleans
                # instead of present/missing flags (the RHS membership test
                # is evaluated before the key is assigned, so this is safe)
                rev["anon"] = "anon" in rev
                rev["minor"] = "minor" in rev

                # reformat the ISO 8601 timestamp ("...T...Z") so it works
                # a little better in excel/spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, save the revision we've seen to a variable
                revisions.append(rev)

        # the presence of 'continue' in the answer tells us there are more
        # revisions; merging its contents into our parameters requests the
        # next page on the following iteration
        if 'continue' in api_answer:
            parameters.update(api_answer['continue'])
        else:
            break

    # return all the revisions for this page
    return revisions


# We'll use another API called PetScan to grab the list of pages in a
# category and its subcategories.  It works like all the other APIs we've
# studied!  The requests call below is equivalent to fetching:
# f"https://petscan.wmflabs.org/?depth=10&categories={category}&format=json&doit=1"
category = "Harry Potter"
url_petscan = "https://petscan.wmflabs.org/"

parameters = {'depth': 10,
              'categories': category,
              'format': 'json',
              'doit': 1}

r = requests.get(url_petscan, params=parameters)
articles_json = r.json()
articles = articles_json["*"][0]["a"]["*"]

# Write one TSV row per revision.  The context manager guarantees the file
# is closed (and flushed) even if a request raises partway through.
with open("hp_wiki.tsv", "w", encoding='utf-8') as output_file:
    # header row first
    print("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "revid"]),
          file=output_file)

    for article in articles:
        # namespace 0 is the article namespace; skip talk/category/etc.
        if article["namespace"] != 0:
            continue

        # first grab the article's title
        title = article["title"]
        print(title)

        # get the list of revisions from our function and print each one
        # to our output file
        for rev in get_article_revisions(title):
            print("\t".join([rev["title"], rev["user"], rev["timestamp"],
                             str(rev["size"]), str(rev["anon"]),
                             str(rev["minor"]), str(rev["revid"])]),
                  file=output_file)