Update the Harry Potter notebook for the morning lecture
[harrypotter-wikipedia-cdsw] / build_harry_potter_dataset.ipynb
index a9a62a562f25c837cbcb214ca77d10f83cdcf747..e075e9a8d4f640e8968a08fdd5a00b75f1a4520f 100644 (file)
@@ -21,7 +21,8 @@
     "    # create a base url for the api and then a normal url which is initially\n",
     "    # just a copy of it\n",
     "    # The following line is what the requests call is doing, basically.\n",
-    "    # \"http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\".format(title)\n",
+    "    # f\"http://en.wikipedia.org/w/api.php/?action=query&titles={title}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\"\n",
+    "    # e.g.: http://en.wikipedia.org/w/api.php/?action=query&titles=Harry_Potter&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\n",
     "    wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n",
     "\n",
     "    parameters = {'action' : 'query',\n",
@@ -43,7 +44,7 @@
     "        pages = api_answer[\"query\"][\"pages\"]\n",
     "\n",
     "        # for every page (there should always be only one), get its revisions:\n",
-    "        for page in pages.keys():\n",
+    "        for page in pages:\n",
     "            query_revisions = pages[page][\"revisions\"]\n",
     "\n",
     "            # for every revision, first we do some cleaning up\n",
    "source": [
     "category = \"Harry Potter\"\n",
     "\n",
-    "# we'll use another api called catscan2 to grab a list of pages in\n",
+    "# we'll use another api called petscan to grab a list of pages in\n",
     "# categories and subcategories. it works like all the other apis we've\n",
     "# studied!\n",
     "#\n",
     "# The following requests call basically does the same thing as this string:\n",
-    "# \"http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories={0}&doit=1&format=json\".format(category)\n",
-    "url_catscan = \"https://petscan.wmflabs.org/\"\n",
+    "# f\"https://petscan.wmflabs.org/?depth=10&categories={category}&format=json&doit=1\"\n",
+    "url_petscan = \"https://petscan.wmflabs.org/\"\n",
     "\n",
     "parameters = {'depth' : 10,\n",
     "              'categories' : category,\n",
     "              'format' : 'json',\n",
     "              'doit' : 1}\n",
     "\n",
-    "# r = requests.get(\"http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories=Harry Potter&doit=1&format=json\"\n"
+    "# r = requests.get(\"https://petscan.wmflabs.org/?depth=10&categories=Harry Potter&format=json&doit=1\")"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
-    "r = requests.get(url_catscan, params=parameters)"
+    "r = requests.get(url_petscan, params=parameters)"
    ]
   },
   {
    "source": [
     "# open a file to print the header\n",
     "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n",
-    "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)\n"
+    "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)"
    ]
   },
   {
    "outputs": [],
    "source": [
     "# for every article\n",
-    "for article in articles[0:10]:\n",
+    "for article in articles:\n",
     "    # skip this entry unless it's an article (namespace 0)\n",
     "    if article[\"namespace\"] != 0:\n",
     "        continue\n",

Benjamin Mako Hill || Want to submit a patch?