Update the Harry Potter dataset notebook for the morning lecture

[harrypotter-wikipedia-cdsw] / build_harry_potter_dataset.ipynb
diff --git a/build_harry_potter_dataset.ipynb b/build_harry_potter_dataset.ipynb

index a9a62a562f25c837cbcb214ca77d10f83cdcf747..e075e9a8d4f640e8968a08fdd5a00b75f1a4520f 100644 (file)
--- a/build_harry_potter_dataset.ipynb
+++ b/build_harry_potter_dataset.ipynb
@@ -21,7 +21,8 @@
      "    # create a base url for the api and then a normal url which is initially\n",
      "    # just a copy of it\n",
      "    # The following line is what the requests call is doing, basically.\n",
-    "    # \"http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\".format(title)\n",
+    "    # f\"http://en.wikipedia.org/w/api.php/?action=query&titles={title}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\"\n",
+    "    # e.g.: http://en.wikipedia.org/w/api.php/?action=query&titles=Harry_Potter&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\n",
      "    wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n",
      "\n",
      "    parameters = {'action' : 'query',\n",
@@ -43,7 +44,7 @@
      "        pages = api_answer[\"query\"][\"pages\"]\n",
      "\n",
      "        # for every page, (there should always be only one) get its revisions:\n",
-    "        for page in pages.keys():\n",
+    "        for page in pages:\n",
      "            query_revisions = pages[page][\"revisions\"]\n",
      "\n",
      "            # for every revision, first we do some cleaning up\n",
@@ -95,20 +96,20 @@
     "source": [
      "category = \"Harry Potter\"\n",
      "\n",
-    "# we'll use another api called catscan2 to grab a list of pages in\n",
+    "# we'll use another api called petscan to grab a list of pages in\n",
      "# categories and subcategories. it works like all the other apis we've\n",
      "# studied!\n",
      "#\n",
      "# The following requests call basically does the same thing as this string:\n",
-    "# \"http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories={0}&doit=1&format=json\".format(category)\n",
-    "url_catscan = \"https://petscan.wmflabs.org/\"\n",
+    "# f\"https://petscan.wmflabs.org/?depth=10&categories={category}&format=json&doit=1\"\n",
+    "url_petscan = \"https://petscan.wmflabs.org/\"\n",
      "\n",
      "parameters = {'depth' : 10,\n",
      "              'categories' : category,\n",
      "              'format' : 'json',\n",
      "              'doit' : 1}\n",
      "\n",
-    "# r = requests.get(\"http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories=Harry Potter&doit=1&format=json\"\n"
+    "# r = requests.get(\"https://petscan.wmflabs.org/?depth=10&categories=Harry Potter&format=json&doit=1\")"
     ]
    },
    {
@@ -117,7 +118,7 @@
     "metadata": {},
     "outputs": [],
     "source": [
-    "r = requests.get(url_catscan, params=parameters)"
+    "r = requests.get(url_petscan, params=parameters)"
     ]
    },
    {
@@ -138,7 +139,7 @@
     "source": [
      "# open a file to print the header\n",
      "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n",
-    "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)\n"
+    "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)"
     ]
    },
    {
@@ -148,7 +149,7 @@
     "outputs": [],
     "source": [
      "# for every article\n",
-    "for article in articles[0:10]:\n",
+    "for article in articles:\n",
      "    # skip this until it's an article\n",
      "    if article[\"namespace\"] != 0:\n",
      "        continue\n",