1 #!/usr/bin/env python27
# Read a page/revision dump from stdin and print one tab-separated row per
# revision: page id, revision id, page title, timestamp, whether the revision
# text was deleted, whether the text is a redirect, and the redirect target.
#
# NOTE(review): this block was reconstructed from a whitespace-mangled excerpt
# in which some lines (the rev_data initialisation and the if/else headers)
# were not visible. The control flow below is inferred from the surviving
# inline comments -- confirm against the original before merging.

# Matches a redirect directive at the very start of the revision text.
# The target group is non-greedy so it stops at the first "]]"; a greedy
# ".*" would swallow further wiki links on the same line, e.g.
# "#REDIRECT [[A]] [[Category:B]]" would yield "A]] [[Category:B".
# Compiled once here instead of being re-resolved for every revision.
_REDIRECT_RE = re.compile(r"^#redirect \[\[(.*?)\]\]", re.IGNORECASE)

dumpIterator = dump.Iterator(sys.stdin)

# Header row; column order must match the fields collected per revision below.
print(u"\t".join(["page.id", "revision.id", "page.title", "timestamp", "deleted", "redirect", "target"]))

for page in dumpIterator.readPages():
    # Do things with a page,
    # like extract its title: page.getTitle()
    # or its ID: page.getId()

    for revision in page.readRevisions():
        # Identification columns shared by every row.
        rev_data = [
            unicode(page.getId()),
            unicode(revision.getId()),
            unicode(page.getTitle()),
            unicode(revision.getTimestamp()),
        ]

        text = revision.getText()

        # NOTE(review): the original condition was not visible; this assumes
        # getText() returns None when the revision text was deleted -- confirm.
        if text is None:
            rev_data.append("TRUE")  # revision was deleted
            rev_data.append("NA")  # redirect bool = unknown
            rev_data.append("NA")  # redirect target missing
        else:
            rev_data.append("FALSE")  # revision was not deleted
            match = _REDIRECT_RE.match(text)
            if match:
                rev_data.append("TRUE")  # redirect bool = TRUE
                rev_data.append(match.group(1))  # redirect target
            else:
                rev_data.append("FALSE")  # redirect bool = FALSE
                rev_data.append("NA")  # redirect target missing

        # Encode once at the output boundary; titles and targets may be
        # non-ASCII.
        print(u"\t".join(rev_data).encode("utf-8"))