projects.mako.cc - harrypotter-wikipedia-cdsw/blob - build_harry_potter_dataset.ipynb

   1 {
   2  "cells": [
   3   {
   4    "cell_type": "code",
   5    "execution_count": null,
   6    "metadata": {},
   7    "outputs": [],
   8    "source": [
   9     "import requests"
  10    ]
  11   },
  12   {
  13    "cell_type": "code",
  14    "execution_count": null,
  15    "metadata": {},
  16    "outputs": [],
  17    "source": [
  18     "def get_article_revisions(title):\n",
  19     "    \"\"\"Return every visible revision of one English Wikipedia article.\n",
  20     "\n",
  21     "    title is the article's title. The function pages through the MediaWiki\n",
  22     "    API 500 revisions at a time. Each revision is a dict of the fields\n",
  23     "    requested in rvprop plus \"title\"; \"anon\" and \"minor\" are recoded to\n",
  24     "    True/False. Revisions with a hidden user are skipped, and a page entry\n",
  25     "    without a \"revisions\" key contributes no rows.\n",
  26     "    \"\"\"\n",
  27     "    revisions = []\n",
  28     "\n",
  29     "    # requests builds the query string from the dict below, so the call is\n",
  30     "    # equivalent to fetching the url with ?action=query&titles=...&continue= appended\n",
  31     "    wp_api_url = \"https://en.wikipedia.org/w/api.php\"\n",
  32     "\n",
  33     "    parameters = {'action' : 'query',\n",
  34     "                  'titles' : title,\n",
  35     "                  'prop' : 'revisions',\n",
  36     "                  'rvprop' : 'flags|timestamp|user|size|ids',\n",
  37     "                  'rvlimit' : 500,\n",
  38     "                  'format' : 'json',\n",
  39     "                  'continue' : '' }\n",
  40     "\n",
  41     "    # we'll repeat this forever (i.e., we'll only stop when we find\n",
  42     "    # the \"break\" command)\n",
  43     "    while True:\n",
  44     "        # fetch one batch; stop with an exception on an HTTP error instead\n",
  45     "        # of trying to parse an error page as json\n",
  46     "        call = requests.get(wp_api_url, params=parameters)\n",
  47     "        call.raise_for_status()\n",
  48     "        api_answer = call.json()\n",
  49     "\n",
  50     "        # get the list of pages from the json object\n",
  51     "        pages = api_answer[\"query\"][\"pages\"]\n",
  52     "\n",
  53     "        # for every page, (there should always be only one) get its revisions;\n",
  54     "        # a page entry with no \"revisions\" key (e.g. a title that does not\n",
  55     "        # exist) is treated as having none instead of raising KeyError\n",
  56     "        for page in pages.values():\n",
  57     "            for rev in page.get(\"revisions\", []):\n",
  58     "                # let's continue/skip this revision if the user is hidden\n",
  59     "                if \"userhidden\" in rev:\n",
  60     "                    continue\n",
  61     "\n",
  62     "                # 1: add a title field for the article because we're going to mix them together\n",
  63     "                rev[\"title\"] = title\n",
  64     "\n",
  65     "                # 2: let's \"recode\" anon so it's true or false instead of present/missing\n",
  66     "                rev[\"anon\"] = \"anon\" in rev\n",
  67     "\n",
  68     "                # 3: let's recode \"minor\" in the same way\n",
  69     "                rev[\"minor\"] = \"minor\" in rev\n",
  70     "\n",
  71     "                # we're going to change the timestamp to make it work a little better in excel/spreadsheets\n",
  72     "                rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"T\", \" \")\n",
  73     "                rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"Z\", \"\")\n",
  74     "\n",
  75     "                # finally, save the revisions we've seen to a variable\n",
  76     "                revisions.append(rev)\n",
  77     "\n",
  78     "        # no 'continue' in the answer means there is nothing more to fetch\n",
  79     "        if 'continue' not in api_answer:\n",
  80     "            break\n",
  81     "\n",
  82     "        # copy the continuation values from the answer into the parameters\n",
  83     "        # so that the next request picks up where this one stopped\n",
  84     "        parameters.update(api_answer['continue'])\n",
  85     "\n",
  86     "    # return all the revisions for this page\n",
  87     "    return revisions\n"
  88    ]
  89   },
  90   {
  91    "cell_type": "code",
  92    "execution_count": null,
  93    "metadata": {},
  94    "outputs": [],
  95    "source": [
  96     "category = \"Harry Potter\"\n",
  97     "\n",
  98     "# petscan is a web tool that lists the pages in a category and its\n",
  99     "# subcategories. we query it the same way as the other apis we've\n",
 100     "# studied: an endpoint url plus a dictionary of query parameters.\n",
 101     "#\n",
 102     "# requests will combine the two below into a single url of the form\n",
 103     "# <url_catscan>?depth=10&categories=<category>&format=json&doit=1\n",
 104     "url_catscan = \"https://petscan.wmflabs.org/\"\n",
 105     "\n",
 106     "parameters = dict(depth=10,\n",
 107     "                  categories=category,\n",
 108     "                  format='json',\n",
 109     "                  doit=1)\n",
 110     "\n",
 111     "# (the request itself is sent in the next cell)\n"
 112    ]
 113   },
 114   {
 115    "cell_type": "code",
 116    "execution_count": null,
 117    "metadata": {},
 118    "outputs": [],
 119    "source": [
 120     "r = requests.get(url_catscan, params=parameters)  # send the category query to petscan"
 121    ]
 122   },
 123   {
 124    "cell_type": "code",
 125    "execution_count": null,
 126    "metadata": {},
 127    "outputs": [],
 128    "source": [
 129     "articles_json = r.json()  # decode the json text of the reply\n",
 130     "articles = articles_json[\"*\"][0][\"a\"][\"*\"]  # the list of pages is nested inside the reply"
 131    ]
 132   },
 133   {
 134    "cell_type": "code",
 135    "execution_count": null,
 136    "metadata": {},
 137    "outputs": [],
 138    "source": [
 139     "# create the output file and write the column names as its first row\n",
 140     "output_file = open(\"hp_wiki.tsv\", mode=\"w\", encoding='utf-8')\n",
 141     "print(\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\", sep=\"\\t\", file=output_file)\n"
 142    ]
 143   },
 144   {
 145    "cell_type": "code",
 146    "execution_count": null,
 147    "metadata": {},
 148    "outputs": [],
 149    "source": [
 150     "# for each of the first ten entries in the list\n",
 151     "for article in articles[0:10]:\n",
 152     "    # only namespace 0 counts as an article; ignore everything else\n",
 153     "    if article[\"namespace\"] == 0:\n",
 154     "        # remember the title and show it so we can follow along\n",
 155     "        title = article[\"title\"]\n",
 156     "        print(title)\n",
 157     "\n",
 158     "        # ask our function for the list of revisions, then write each\n",
 159     "        # one to the output file as a single row of tab-separated\n",
 160     "        # values, in the same column order as the header\n",
 161     "        revisions = get_article_revisions(title)\n",
 162     "        for rev in revisions:\n",
 163     "            print(rev[\"title\"], rev[\"user\"], rev[\"timestamp\"],\n",
 164     "                  rev[\"size\"], rev[\"anon\"],\n",
 165     "                  rev[\"minor\"], rev[\"revid\"],\n",
 166     "                  sep=\"\\t\",\n",
 167     "                  file=output_file)"
 168    ]
 169   },
 170   {
 171    "cell_type": "code",
 172    "execution_count": null,
 173    "metadata": {},
 174    "outputs": [],
 175    "source": [
 176     "# close the file so everything we printed is written to disk; we're done here!\n",
 177     "output_file.close()"
 178    ]
 179   },
 180   {
 181    "cell_type": "code",
 182    "execution_count": null,
 183    "metadata": {},
 184    "outputs": [],
 185    "source": []
 186   }
 187  ],
 188  "metadata": {
 189   "kernelspec": {
 190    "display_name": "Python 3",
 191    "language": "python",
 192    "name": "python3"
 193   },
 194   "language_info": {
 195    "codemirror_mode": {
 196     "name": "ipython",
 197     "version": 3
 198    },
 199    "file_extension": ".py",
 200    "mimetype": "text/x-python",
 201    "name": "python",
 202    "nbconvert_exporter": "python",
 203    "pygments_lexer": "ipython3",
 204    "version": "3.7.3"
 205   }
 206  },
 207  "nbformat": 4,
 208  "nbformat_minor": 2
 209 }