renamed file to match the content
[harrypotter-wikipedia-cdsw] / build_harry_potter_dataset.ipynb
1 {
2  "cells": [
3   {
4    "cell_type": "code",
5    "execution_count": null,
6    "metadata": {},
7    "outputs": [],
8    "source": [
9     "import requests"
10    ]
11   },
12   {
13    "cell_type": "code",
14    "execution_count": null,
15    "metadata": {},
16    "outputs": [],
17    "source": [
18     "def get_article_revisions(title):\n",
19     "    \"\"\"Fetch the full revision history of one English Wikipedia article.\n",
20     "\n",
21     "    Pages through the MediaWiki API 500 revisions at a time, following the\n",
22     "    'continue' tokens until the history is exhausted.\n",
23     "\n",
24     "    Each returned revision is the API's dict with three changes: a 'title'\n",
25     "    key is added, 'anon' and 'minor' are recoded to booleans, and the\n",
26     "    timestamp has its 'T' replaced by a space and its 'Z' removed.\n",
27     "    Revisions whose user is hidden are dropped.\n",
28     "\n",
29     "    Raises requests.HTTPError on a failed HTTP request and RuntimeError if\n",
30     "    the API itself reports an error.\n",
31     "    \"\"\"\n",
32     "    revisions = []\n",
33     "\n",
34     "    # The requests call below basically fetches this url, e.g.:\n",
35     "    # http://en.wikipedia.org/w/api.php/?action=query&titles=Harry_Potter&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\n",
36     "    wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n",
37     "\n",
38     "    parameters = {'action' : 'query',\n",
39     "                  'titles' : title,\n",
40     "                  'prop' : 'revisions',\n",
41     "                  'rvprop' : 'flags|timestamp|user|size|ids',\n",
42     "                  'rvlimit' : 500,\n",
43     "                  'format' : 'json',\n",
44     "                  'continue' : '' }\n",
45     "\n",
46     "    # we'll repeat this forever (i.e., we'll only stop when we find\n",
47     "    # the \"break\" command)\n",
48     "    while True:\n",
49     "        # requests builds the query string and also handles unicode titles\n",
50     "        call = requests.get(wp_api_url, params=parameters)\n",
51     "        # fail loudly on an HTTP error instead of trying to parse an error page\n",
52     "        call.raise_for_status()\n",
53     "        api_answer = call.json()\n",
54     "\n",
55     "        # the api reports its own errors inside an otherwise successful reply\n",
56     "        if \"error\" in api_answer:\n",
57     "            raise RuntimeError(f\"API error for {title!r}: {api_answer['error']}\")\n",
58     "\n",
59     "        # get the pages from the json object (there should always be only one)\n",
60     "        pages = api_answer.get(\"query\", {}).get(\"pages\", {})\n",
61     "\n",
62     "        for page in pages.values():\n",
63     "            # a page the api reports as missing has no \"revisions\" key, so\n",
64     "            # treat that as an empty history instead of raising KeyError\n",
65     "            for rev in page.get(\"revisions\", []):\n",
66     "                # let's continue/skip this revision if the user is hidden\n",
67     "                if \"userhidden\" in rev:\n",
68     "                    continue\n",
69     "\n",
70     "                # add a title field for the article because we're going to mix them together\n",
71     "                rev[\"title\"] = title\n",
72     "\n",
73     "                # \"recode\" anon and minor so they're true or false instead of present/missing\n",
74     "                rev[\"anon\"] = \"anon\" in rev\n",
75     "                rev[\"minor\"] = \"minor\" in rev\n",
76     "\n",
77     "                # change the timestamp to make it work a little better in excel/spreadsheets\n",
78     "                rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"T\", \" \").replace(\"Z\", \"\")\n",
79     "\n",
80     "                revisions.append(rev)\n",
81     "\n",
82     "        # 'continue' tells us there's more revisions to add; without it we're done\n",
83     "        if 'continue' not in api_answer:\n",
84     "            break\n",
85     "        # carry the continuation values over into the next request's parameters\n",
86     "        parameters.update(api_answer['continue'])\n",
87     "\n",
88     "    return revisions\n"
89    ]
90   },
91   {
92    "cell_type": "code",
93    "execution_count": null,
94    "metadata": {},
95    "outputs": [],
96    "source": [
97     "category = \"Harry Potter\"\n",
98     "\n",
99     "# petscan is another api: we'll use it to grab the list of pages in a\n",
100     "# category and its subcategories. it works like all the other apis\n",
101     "# we've studied!\n",
102     "#\n",
103     "# A requests call with the values below basically fetches this url:\n",
104     "# f\"https://petscan.wmflabs.org/?depth=10&categories={category}&format=json&doit=1\"\n",
105     "# i.e. https://petscan.wmflabs.org/?depth=10&categories=Harry Potter&format=json&doit=1\n",
106     "url_petscan = \"https://petscan.wmflabs.org/\"\n",
107     "\n",
108     "parameters = dict(\n",
109     "    depth=10,\n",
110     "    categories=category,\n",
111     "    format='json',\n",
112     "    doit=1)"
113    ]
114   },
115   {
116    "cell_type": "code",
117    "execution_count": null,
118    "metadata": {},
119    "outputs": [],
120    "source": [
121     "r = requests.get(url_petscan, params=parameters)  # send the petscan query built from url_petscan and parameters"
122    ]
123   },
124   {
125    "cell_type": "code",
126    "execution_count": null,
127    "metadata": {},
128    "outputs": [],
129    "source": [
130     "articles_json = r.json()  # decode the petscan reply from json\n",
131     "articles = articles_json[\"*\"][0][\"a\"][\"*\"]  # the page records; the loop below reads each one's \"namespace\" and \"title\""
132    ]
133   },
134   {
135    "cell_type": "code",
136    "execution_count": null,
137    "metadata": {},
138    "outputs": [],
139    "source": [
140     "# open the output file and print the tab-separated header; it stays open for the cells below\n",
141     "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n",
142     "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)"
143    ]
144   },
145   {
146    "cell_type": "code",
147    "execution_count": null,
148    "metadata": {},
149    "outputs": [],
150    "source": [
151     "# for every article\n",
152     "# go through every page that petscan gave us\n",
153     "for article in articles:\n",
154     "    # only pages in namespace 0 are articles; anything else is skipped\n",
155     "    if article[\"namespace\"] == 0:\n",
156     "        # first grab the article's title and print it so we can watch progress\n",
157     "        title = article[\"title\"]\n",
158     "        print(title)\n",
159     "\n",
160     "        # get the list of revisions from our function, then write each one\n",
161     "        # to our output file as a row of tab-separated fields\n",
162     "        revisions = get_article_revisions(title)\n",
163     "        for rev in revisions:\n",
164     "            # the first three fields are joined as-is; the rest go through str()\n",
165     "            print(\"\\t\".join([rev[\"title\"], rev[\"user\"], rev[\"timestamp\"]]\n",
166     "                            + [str(rev[field]) for field in\n",
167     "                               (\"size\", \"anon\", \"minor\", \"revid\")]),\n",
168     "                  file=output_file)"
169    ]
170   },
171   {
172    "cell_type": "code",
173    "execution_count": null,
174    "metadata": {},
175    "outputs": [],
176    "source": [
177     "# close the file (this also flushes any buffered rows to disk), we're done here!\n",
178     "output_file.close()"
179    ]
180   },
181   {
182    "cell_type": "code",
183    "execution_count": null,
184    "metadata": {},
185    "outputs": [],
186    "source": []
187   }
188  ],
189  "metadata": {
190   "kernelspec": {
191    "display_name": "Python 3",
192    "language": "python",
193    "name": "python3"
194   },
195   "language_info": {
196    "codemirror_mode": {
197     "name": "ipython",
198     "version": 3
199    },
200    "file_extension": ".py",
201    "mimetype": "text/x-python",
202    "name": "python",
203    "nbconvert_exporter": "python",
204    "pygments_lexer": "ipython3",
205    "version": "3.7.3"
206   }
207  },
208  "nbformat": 4,
209  "nbformat_minor": 2
210 }

Benjamin Mako Hill || Want to submit a patch?