5 "execution_count": null,
14 "execution_count": null,
18 "def get_article_revisions(title):\n",
21 " # create a base url for the api; the query parameters are kept in a\n",
22 " # separate dictionary and handed to requests along with the url\n",
23 " # The following line is what the requests call is doing, basically.\n",
24 " # f\"http://en.wikipedia.org/w/api.php/?action=query&titles={title}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\"\n",
25 " # e.g.: http://en.wikipedia.org/w/api.php/?action=query&titles=Harry_Potter&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=\n",
26 " wp_api_url = \"http://en.wikipedia.org/w/api.php/\"\n",
28 " parameters = {'action' : 'query',\n",
29 " 'titles' : title,\n",
30 " 'prop' : 'revisions',\n",
31 " 'rvprop' : 'flags|timestamp|user|size|ids',\n",
32 " 'rvlimit' : 500,\n",
33 " 'format' : 'json',\n",
34 " 'continue' : '' }\n",
36 " # we'll repeat this forever (i.e., we'll only stop when we find\n",
37 " # the \"break\" command)\n",
39 " # the first line opens the url and also handles unicode urls\n",
40 " call = requests.get(wp_api_url, params=parameters)\n",
41 " api_answer = call.json()\n",
43 " # get the list of pages from the json object\n",
44 " pages = api_answer[\"query\"][\"pages\"]\n",
46 " # for every page (there should always be only one), get its revisions:\n",
47 " for page in pages:\n",
48 " query_revisions = pages[page][\"revisions\"]\n",
50 " # for every revision, first we do some cleaning up\n",
51 " for rev in query_revisions:\n",
53 " # let's continue/skip this revision if the user is hidden\n",
54 " if \"userhidden\" in rev:\n",
57 " # 1: add a title field for the article because we're going to mix them together\n",
58 " rev[\"title\"] = title\n",
60 " # 2: let's \"recode\" anon so it's true or false instead of present/missing\n",
61 " if \"anon\" in rev:\n",
62 " rev[\"anon\"] = True\n",
64 " rev[\"anon\"] = False\n",
66 " # 3: let's recode \"minor\" in the same way\n",
67 " if \"minor\" in rev:\n",
68 " rev[\"minor\"] = True\n",
70 " rev[\"minor\"] = False\n",
72 " # we're going to change the timestamp to make it work a little better in excel/spreadsheets\n",
73 " rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"T\", \" \")\n",
74 " rev[\"timestamp\"] = rev[\"timestamp\"].replace(\"Z\", \"\")\n",
76 " # finally, save the revisions we've seen to a variable\n",
77 " revisions.append(rev)\n",
79 " # 'continue' tells us there are more revisions to add\n",
80 " if 'continue' in api_answer:\n",
81 " # replace the 'continue' parameter with the contents of the\n",
82 " # api_answer dictionary.\n",
83 " parameters.update(api_answer['continue'])\n",
87 " # return all the revisions for this page\n",
88 " return(revisions)\n"
93 "execution_count": null,
97 "category = \"Harry Potter\"\n",
99 "# we'll use another api called petscan to grab a list of pages in\n",
100 "# categories and subcategories. it works like all the other apis we've\n",
103 "# The following requests call basically does the same thing as this string:\n",
104 "# f\"https://petscan.wmflabs.org/?depth=10&categories={category}&format=json&doit=1\"\n",
105 "url_petscan = \"https://petscan.wmflabs.org/\"\n",
107 "parameters = {'depth' : 10,\n",
108 " 'categories' : category,\n",
109 " 'format' : 'json',\n",
112 "# r = requests.get(\"https://petscan.wmflabs.org/?depth=10&categories=Harry Potter&format=json&doit=1\")"
117 "execution_count": null,
121 "r = requests.get(url_petscan, params=parameters)"
126 "execution_count": null,
130 "articles_json = r.json()\n",
131 "articles = articles_json[\"*\"][0][\"a\"][\"*\"]"
136 "execution_count": null,
140 "# open a file to print the header\n",
141 "output_file = open(\"hp_wiki.tsv\", \"w\", encoding='utf-8')\n",
142 "print(\"\\t\".join([\"title\", \"user\", \"timestamp\", \"size\", \"anon\", \"minor\", \"revid\"]), file=output_file)"
147 "execution_count": null,
151 "# for every article\n",
152 "for article in articles:\n",
153 " # skip this unless it's an article\n",
154 " if article[\"namespace\"] != 0:\n",
157 " # first grab the article's title\n",
158 " title = article[\"title\"]\n",
161 " # get the list of revisions from our function and then iterate through it,\n",
162 " # printing it to our output file\n",
163 " revisions = get_article_revisions(title)\n",
164 " for rev in revisions:\n",
165 " print(\"\\t\".join([rev[\"title\"], rev[\"user\"], rev[\"timestamp\"],\n",
166 " str(rev[\"size\"]), str(rev[\"anon\"]),\n",
167 " str(rev[\"minor\"]), str(rev[\"revid\"])]),\n",
173 "execution_count": null,
177 "# close the file, we're done here!\n",
178 "output_file.close()"
183 "execution_count": null,
191 "display_name": "Python 3",
192 "language": "python",
200 "file_extension": ".py",
201 "mimetype": "text/x-python",
203 "nbconvert_exporter": "python",
204 "pygments_lexer": "ipython3",