projects
/
harrypotter-wikipedia-cdsw
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
catscan is now petscan
[harrypotter-wikipedia-cdsw]
/
build_hpwp_dataset.py
diff --git a/build_hpwp_dataset.py b/build_hpwp_dataset.py
index 97cc5338251324b76ecef0467313dc31141b90ed..b42107223c29b420af035abd8656f53315ffabd1 100644
(file)
--- a/build_hpwp_dataset.py
+++ b/build_hpwp_dataset.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# coding=utf-8
#!/usr/bin/env python
# coding=utf-8
+import encoding_fix
import requests
# get_article_revisions is a function that takes an article title in
import requests
# get_article_revisions is a function that takes an article title in
@@ -85,7 +86,7 @@
category = "Harry Potter"
#
# The following requests call basically does the same thing as this string:
# "http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories={0}&doit=1&format=json".format(category)
#
# The following requests call basically does the same thing as this string:
# "http://tools.wmflabs.org/catscan2/catscan2.php?depth=10&categories={0}&doit=1&format=json".format(category)
-url_catscan = "http://tools.wmflabs.org/catscan2/catscan2.php"
+url_catscan = "https://petscan.wmflabs.org/"
parameters = {'depth' : 10,
'categories' : category,
parameters = {'depth' : 10,
'categories' : category,
@@ -104,9 +105,13 @@
output.write("\t".join(["title", "user", "timestamp", "size", "anon", "minor", "
# for every article
for article in articles:
# for every article
for article in articles:
+ # skip this until it's an article
+ if article["namespace"] != 0:
+ continue
# first grab the article's title
# first grab the article's title
- title = article["a"]["title"]
+ title = article["title"]
+ print(title)
# get the list of revisions from our function and then iterate through it,
# printing it to our output file
# get the list of revisions from our function and then iterate through it,
# printing it to our output file
@@ -118,5 +123,4 @@
for article in articles:
# close the file, we're done here!
output.close()
# close the file, we're done here!
output.close()
-
-
+
Benjamin Mako Hill
||
Want to submit a patch?