update lecture material to move to notebook
[harrypotter-wikipedia-cdsw] / build_harry_potter_dataset.ipynb
diff --git a/build_harry_potter_dataset.ipynb b/build_harry_potter_dataset.ipynb
new file mode 100644 (file)
index 0000000..a9a62a5
--- /dev/null
@@ -0,0 +1,209 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_article_revisions(title):\n",
+    "    revisions = []\n",
+    "\n",
+    "    # create a base url for the api; the query parameters live in a\n",
+    "    # separate dictionary and requests combines the two for us.\n",
+    "    # The following url is basically what the requests call ends up fetching:\n",
+    "    # \"http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\".format(title)\n",
+    "    wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n",
+    "\n",
+    "    parameters = {'action' : 'query',\n",
+    "                  'titles' : title,\n",
+    "                  'prop' : 'revisions',\n",
+    "                  'rvprop' : 'flags|timestamp|user|size|ids',\n",
+    "                  'rvlimit' : 500,\n",
+    "                  'format' : 'json',\n",
+    "                  'continue' : '' }\n",
+    "\n",
+    "    # we'll repeat this forever (i.e., we'll only stop when we find\n",
+    "    # the \"break\" command)\n",
+    "    while True:\n",
+    "        # the first line opens the url and also handles unicode urls\n",
+    "        call = requests.get(wp_api_url, params=parameters)\n",
+    "        api_answer = call.json()\n",
+    "\n",
+    "        # get the list of pages from the json object\n",
+    "        pages = api_answer[\"query\"][\"pages\"]\n",
+    "\n",
+    "        # for every page (there should always be only one), get its revisions:\n",
+    "        for page in pages.keys():\n",
+    "            query_revisions = pages[page][\"revisions\"]\n",
+    "\n",
+    "            # for every revision, first we do some cleaning up\n",
+    "            for rev in query_revisions:\n",
+    "                #print(rev)\n",
+    "                # let's continue/skip this revision if the user is hidden\n",
+    "                if \"userhidden\" in rev:\n",
+    "                    continue\n",
+    "                \n",
+    "                # 1: add a title field for the article because we're going to mix them together\n",
+    "                rev[\"title\"] = title\n",
+    "\n",
+    "                # 2: let's \"recode\" anon so it's true or false instead of present/missing\n",
+    "                if \"anon\" in rev:\n",
+    "                    rev[\"anon\"] = True\n",
+    "                else:\n",
+    "                    rev[\"anon\"] = False\n",
+    "\n",
+    "                # 3: let's recode \"minor\" in the same way\n",
+    "                if \"minor\" in rev:\n",
+    "                    rev[\"minor\"] = True\n",
+    "                else:\n",
+    "                    rev[\"minor\"] = False\n",
+    "\n",
+    "                # we're going to change the timestamp to make it work a little better in excel/spreadsheets\n",
+    "                rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"T\", \" \")\n",
+    "                rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"Z\", \"\")\n",
+    "\n",
+    "                # finally, save the revisions we've seen to a variable\n",
+    "                revisions.append(rev)\n",
+    "\n",
+    "        # 'continue' tells us there are more revisions to add\n",
+    "        if 'continue' in api_answer:\n",
+    "            # merge the contents of api_answer['continue'] into our\n",
+    "            # parameters so the next request picks up where this one ended.\n",
+    "            parameters.update(api_answer['continue'])\n",
+    "        else:\n",
+    "            break\n",
+    "\n",
+    "    # return all the revisions for this page\n",
+    "    return(revisions)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "category = \"Harry Potter\"\n",
+    "\n",
+    "# we'll use another api called petscan (the successor to catscan2) to grab\n",
+    "# a list of pages in categories and subcategories. it works like all the\n",
+    "# other apis we've studied!\n",
+    "#\n",
+    "# The following requests call basically does the same thing as this string:\n",
+    "# \"https://petscan.wmflabs.org/?depth=10&categories={0}&format=json&doit=1\".format(category)\n",
+    "url_catscan = \"https://petscan.wmflabs.org/\"\n",
+    "\n",
+    "parameters = {'depth' : 10,\n",
+    "              'categories' : category,\n",
+    "              'format' : 'json',\n",
+    "              'doit' : 1}\n",
+    "\n",
+    "# r = requests.get(\"https://petscan.wmflabs.org/?depth=10&categories=Harry Potter&doit=1&format=json\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "r = requests.get(url_catscan, params=parameters)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "articles_json = r.json()\n",
+    "articles = articles_json[\"*\"][0][\"a\"][\"*\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# open a file to print the header\n",
+    "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n",
+    "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for every article\n",
+    "for article in articles[0:10]:\n",
+    "    # skip anything that isn't an article (i.e., not in namespace 0)\n",
+    "    if article[\"namespace\"] != 0:\n",
+    "        continue\n",
+    "\n",
+    "    # first grab the article's title\n",
+    "    title = article[\"title\"]\n",
+    "    print(title)\n",
+    "\n",
+    "    # get the list of revisions from our function and then iterate through it,\n",
+    "    # printing it to our output file\n",
+    "    revisions = get_article_revisions(title)\n",
+    "    for rev in revisions:\n",
+    "        print(\"\\t\".join([rev[\"title\"], rev[\"user\"], rev[\"timestamp\"],\n",
+    "                         str(rev[\"size\"]), str(rev[\"anon\"]),\n",
+    "                         str(rev[\"minor\"]), str(rev[\"revid\"])]),\n",
+    "             file=output_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# close the file, we're done here!\n",
+    "output_file.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Benjamin Mako Hill || Want to submit a patch?