updated links and metadata
[redirect-tools] / 01-extract_redirects.py
1 #!/usr/bin/env python27
2
3 from wmf import dump
4 import sys
5 import re
6
# Column names of the tab-separated output, in the order they are emitted.
HEADER = ["page.id", "revision.id", "page.title", "timestamp",
          "deleted", "redirect", "target"]

# Redirect directive anchored at the very start of the revision text (the
# pattern is applied with .match()).  Whitespace between the keyword and the
# link is optional, and the link body is captured non-greedily so that it
# stops at the first "]]" rather than swallowing any later links or
# templates found on the same line.
REDIRECT_RE = re.compile(r"#redirect\s*\[\[(.*?)\]\]", re.IGNORECASE)


def redirect_columns(text):
    """Return the [deleted, redirect, target] columns for a revision text.

    text -- the revision's wikitext, or None.  A text of None is reported
            as a deleted revision.

    The result is a list of three strings:
      deleted  -- "TRUE" when text is None, otherwise "FALSE"
      redirect -- "TRUE"/"FALSE", or "NA" when the text is unavailable
      target   -- the raw contents of the redirect link, or "NA"
    """
    if text is None:
        # Without the text there is no way to tell whether it redirected.
        return ["TRUE", "NA", "NA"]

    found = REDIRECT_RE.match(text)
    if found is None:
        return ["FALSE", "FALSE", "NA"]

    return ["FALSE", "TRUE", found.group(1)]


def main():
    """Read a dump from stdin and print one tab-separated row per revision.

    A header row is printed first; every following row holds the page id,
    revision id, page title and timestamp followed by the three columns
    produced by redirect_columns().  Rows are encoded as UTF-8.
    """
    dump_iterator = dump.Iterator(sys.stdin)

    print(u"\t".join(HEADER))

    for page in dump_iterator.readPages():
        for revision in page.readRevisions():
            row = [
                unicode(page.getId()),
                unicode(revision.getId()),
                unicode(page.getTitle()),
                unicode(revision.getTimestamp()),
            ]
            row.extend(redirect_columns(revision.getText()))

            print(u"\t".join(row).encode("utf-8"))


if __name__ == "__main__":
    main()

Benjamin Mako Hill || Want to submit a patch?