projects
/
harrypotter-wikipedia-cdsw
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
7a78606
)
fix comments to include the full url being created
author
Benjamin Mako Hill
<mako@atdot.cc>
Wed, 10 Jun 2015 16:26:13 +0000
(09:26 -0700)
committer
Benjamin Mako Hill
<mako@atdot.cc>
Wed, 10 Jun 2015 16:26:13 +0000
(09:26 -0700)
build_hpwp_dataset.py
patch
|
blob
|
history
diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py
index 38c299ded2ccd158e1cc6d04becd07d634ceb9d2..97cc5338251324b76ecef0467313dc31141b90ed 100644
(file)
--- a/build_hpwp_dataset.py
+++ b/build_hpwp_dataset.py
@@ -12,7 +12,7 @@ def get_article_revisions(title):
# create a base url for the api and then a normal url which is initially
# just a copy of it
# The following line is what the requests call is doing, basically.
# create a base url for the api and then a normal url which is initially
# just a copy of it
# The following line is what the requests call is doing, basically.
- # "http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json".format(title)
+ # "http://en.wikipedia.org/w/api.php/?action=query&titles={0}&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json&continue=".format(title)
wp_api_url = "http://en.wikipedia.org/w/api.php/"
parameters = {'action' : 'query',
wp_api_url = "http://en.wikipedia.org/w/api.php/"
parameters = {'action' : 'query',
@@ -39,6 +39,7 @@ def get_article_revisions(title):
# for every revision, first we do some cleaning up
for rev in query_revisions:
# for every revision, first we do some cleaning up
for rev in query_revisions:
+ #print(rev)
# let's continue/skip this revision if the user is hidden
if "userhidden" in rev:
continue
# let's continue/skip this revision if the user is hidden
if "userhidden" in rev:
continue
@@ -91,6 +92,8 @@ parameters = {'depth' : 10,
'format' : 'json',
'doit' : 1}
'format' : 'json',
'doit' : 1}
+# r = requests.get("http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories=Harry Potter&doit=1&format=json"
+
r = requests.get(url_catscan, params=parameters)
articles_json = r.json()
articles = articles_json["*"][0]["a"]["*"]
r = requests.get(url_catscan, params=parameters)
articles_json = r.json()
articles = articles_json["*"][0]["a"]["*"]
Benjamin Mako Hill
||
Want to submit a patch?