updated links in the homepage
[protection-tools] / 03-parse_mw_eventlog.py
1 #!/usr/bin/env python3
2
3 import re
4 import sys
5 import os.path
6
7 from xml.sax import handler, make_parser
8 from xml.sax.saxutils import XMLFilterBase
9 from collections import defaultdict
10 import calendar
11
class WikiLogItem(object):
    """
    Plain record for the fields of a single <logitem> element of the dump.

    Every slot starts out as the empty string; whoever fills the record
    overwrites the fields it knows about.
    """
    __slots__ = (
        'id', 'action', 'type', 'timestamp', 'logtitle',
        'comment', 'params', 'username', 'userid', 'contrib_deleted',
    )

    def __init__(self):
        for field in self.__slots__:
            setattr(self, field, '')

    def __str__(self):
        # repr() of a dict keyed by slot name, in declaration order
        snapshot = {}
        for field in self.__slots__:
            snapshot[field] = getattr(self, field)
        return repr(snapshot)
34
class text_normalize_filter(XMLFilterBase):
    """
    SAX filter that merges contiguous text nodes into a single one.

    Text received through characters() / ignorableWhitespace() is buffered
    and handed to the downstream handler as ONE characters() call the next
    time _complete_text_node() runs.  A downstream handler therefore sees
    the full text of an element at once, even when the parser delivers it
    in several pieces.

    upstream   -- the XMLReader this filter wraps (passed to XMLFilterBase)
    downstream -- object whose characters() receives the merged text

    Recipe by Uche Ogbuji, James Kew and Peter Cogolo. Retrieved from:
    Python Cookbook, 2nd ed., by Alex Martelli, Anna Martelli Ravenscroft,
    and David Ascher (O'Reilly Media, 2005) 0-596-00797-3
    """
    def __init__(self, upstream, downstream):
        XMLFilterBase.__init__(self, upstream)
        self._downstream = downstream
        self._accumulator = []

    def _complete_text_node(self):
        """Flush any buffered text downstream as one characters() event."""
        if self._accumulator:
            self._downstream.characters(''.join(self._accumulator))
            self._accumulator = []

    def characters(self, text):
        """Buffer a chunk of character data instead of forwarding it."""
        self._accumulator.append(text)

    def ignorableWhitespace(self, ws):
        """Buffer ignorable whitespace together with regular text."""
        # The previous code appended the undefined name `text` here, which
        # raised NameError as soon as the callback fired.
        self._accumulator.append(ws)

    # The method used to be spelled with a capital S, which is not the name
    # SAX calls; keep the old spelling as an alias for existing callers.
    ignorableWhiteSpace = ignorableWhitespace
57
def _wrap_complete(method_name):
    """
    Install a method called `method_name` on text_normalize_filter.

    The installed method first flushes the filter's buffered text and then
    calls the same-named method on the filter's downstream handler with
    whatever arguments it was given.
    """
    def _flush_then_forward(self, *args, **kwargs):
        self._complete_text_node()
        target = getattr(self._downstream, method_name)
        target(*args, **kwargs)

    _flush_then_forward.__name__ = method_name
    setattr(text_normalize_filter, method_name, _flush_then_forward)
64
# Element boundaries end a text node: route both events through the
# flush-then-forward wrapper.
for n in ('startElement', 'endElement'):
    _wrap_complete(n)
67
class WikiDumpHandler(handler.ContentHandler):
    """
    ContentHandler that assembles each <logitem> element of a log dump into
    a WikiLogItem and hands the finished item to the supplied callback.

    logItemCallBack -- callable taking one log item, invoked when a
                       </logitem> end tag is seen; may be None.

    Each text callback OVERWRITES the matching field, so text delivered in
    several pieces keeps only the last piece; the handler presumably relies
    on being fed merged text nodes.
    """
    # Elements whose text is stored in the attribute of the same name no
    # matter where they appear inside the log item.
    _SIMPLE_FIELDS = frozenset(
        ('action', 'type', 'logtitle', 'timestamp', 'comment', 'params'))

    def __init__(self, logItemCallBack=None):
        handler.ContentHandler.__init__(self)
        self.currentTag = ''
        self.insideContribTag = False
        self.logItemCallBack = logItemCallBack
        self.logItemsProcessed = 0
        # None until the first <logitem> opens, so that elements appearing
        # before it are skipped instead of raising AttributeError.
        self.currentLogItem = None

    def startElement(self, name, attrs):
        """Track the open element; start a new item on <logitem>."""
        self.currentTag = name
        if name == 'logitem':
            self.currentLogItem = WikiLogItem()
        elif name == 'contributor':
            # inside <contributor>, <id> is the user id, not the log id
            self.insideContribTag = True
            if self.currentLogItem is not None:
                self.currentLogItem.contrib_deleted = 'deleted' in attrs

    def endElement(self, name):
        """Dispatch the finished item on </logitem>; reset tag tracking."""
        if name == 'logitem':
            if self.logItemCallBack is not None:
                self.logItemCallBack(self.currentLogItem)
            self.logItemsProcessed += 1
        elif name == 'contributor':
            self.insideContribTag = False
        # text that follows a closing tag belongs to no field
        self.currentTag = ''

    def characters(self, content):
        """Store `content` in the field matching the currently open tag."""
        item = self.currentLogItem
        if item is None:
            # text outside any <logitem>: nothing to attach it to
            return

        tag = self.currentTag
        if tag == 'id':
            if self.insideContribTag:
                item.userid = content
            else:
                item.id = content
        elif tag == 'username':
            # usernames are only recorded inside <contributor>
            if self.insideContribTag:
                item.username = content
        elif tag in self._SIMPLE_FIELDS:
            setattr(item, tag, content)
123
class logExporter:
    """
    Writes selected log items to three tab-separated files:
    <output_base>-moves.tsv, <output_base>-protections.tsv and
    <output_base>-deletions.tsv.

    Use as a context manager so the files are closed on exit, and feed log
    items to conditionallyPrint().

    input       -- stored as self.input_file; not read by this class
    output_base -- path prefix for the three output files
    """
    _WHITESPACE = re.compile(r'\s')

    def __init__(self, input, output_base="output"):
        self.input_file = input

        # Titles and comments are frequently non-ASCII, so write UTF-8
        # explicitly instead of depending on the locale's encoding.
        self.move_log = self.prot_log = self.del_log = None
        try:
            self.move_log = open(output_base + "-moves.tsv", "w",
                                 encoding="utf-8")
            self.prot_log = open(output_base + "-protections.tsv", "w",
                                 encoding="utf-8")
            self.del_log = open(output_base + "-deletions.tsv", "w",
                                encoding="utf-8")
        except OSError:
            # do not leak the files that were opened before the failure
            self.__close_logs()
            raise

        # titles that currently have a recorded protection or move event
        self.prot_titles = set()

        # month name -> month number (1-12)
        self.cal_dict = {v: k for k, v in enumerate(calendar.month_name)}
        self.r_param_string = re.compile(r'\[(?P<right>\w+)=(?P<group>\w+)\] \((?P<period>.*?)\)+')
        self.r_expir_string = re.compile(r'expires (?P<hour>\d{2}):(?P<min>\d{2}), (?P<day>\d+) (?P<month>\w+) (?P<year>\d{4})')

        # this marks whether we have moved into late 2008
        # when material is being recorded consistently
        self.in_window = False

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.__close_logs()

    def __close_logs(self):
        """Close whichever output files have been opened."""
        for log in (self.move_log, self.prot_log, self.del_log):
            if log is not None:
                log.close()

    def __flush(self):
        self.move_log.flush()
        self.prot_log.flush()
        self.del_log.flush()
        sys.stdout.flush()
        sys.stderr.flush()

    def __clean_timestamp(self, timestamp):
        """Turn '2009-01-02T03:04:05Z' into '2009-01-02 03:04:05'."""
        return timestamp.replace("T", " ").replace("Z", "")

    def __clean_logItem(self, logItem):
        """Normalise `logItem` in place for TSV output and return it."""
        # tabs/newlines inside a field would break the one-row-per-event
        # tab-separated layout
        logItem.comment = self._WHITESPACE.sub(' ', logItem.comment)
        logItem.params = self._WHITESPACE.sub(' ', logItem.params)

        # keep userid and username, but only if the contributor is not
        # marked as deleted
        if logItem.contrib_deleted:
            logItem.userid = ""
            logItem.username = ""
        else:
            logItem.username = self._WHITESPACE.sub(' ', logItem.username)

        logItem.timestamp = self.__clean_timestamp(logItem.timestamp)
        return logItem

    def __format_expiry(self, raw_period):
        """
        Convert the parenthesised period of a protection parameter.

        Returns '' for an indefinite protection, 'YYYY-MM-DD HH:MM:00' for
        a parsed expiry, and None when the text cannot be parsed.
        """
        if "indefinite" in raw_period:
            return ""

        m = self.r_expir_string.match(raw_period)
        if m is None:
            return None

        month = self.cal_dict.get(m.group("month"))
        if not month:
            # unknown month name
            return None

        return "%d-%02d-%02d %02d:%02d:00" % (
            int(m.group("year")), month, int(m.group("day")),
            int(m.group("hour")), int(m.group("min")))

    def printDelete(self, logItem):
        """Write one deletion row: id, quoted title, action, timestamp."""
        logItem = self.__clean_logItem(logItem)

        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp]
        print("\t".join(output), file=self.del_log)

    def printMove(self, logItem):
        """Write one move row and start tracking the new title."""
        logItem = self.__clean_logItem(logItem)

        output = [logItem.id, logItem.timestamp,
                  '"' + logItem.params + '"',    # old location
                  '"' + logItem.logtitle + '"']  # new location
        print("\t".join(output), file=self.move_log)

        # add the title to the set of tracked titles
        self.prot_titles.add(logItem.logtitle)

    def printProtect(self, logItem):
        """
        Write one protection row per '[right=group] (period)' entry found
        in the item's params, then start tracking the title.

        Entries whose expiry cannot be parsed are reported on stderr and
        left out, rather than aborting the run or being recorded as
        indefinite.
        """
        logItem = self.__clean_logItem(logItem)

        rights = {}
        for m in self.r_param_string.finditer(logItem.params):
            expiry = self.__format_expiry(m.group("period"))
            if expiry is None:
                print("unparseable expiry %r in log item %s"
                      % (m.group("period"), logItem.id), file=sys.stderr)
                continue
            # a right listed twice keeps its last group/expiry
            rights[m.group("right")] = (m.group("group"), expiry)

        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp]

        for right, (group, expir) in rights.items():
            print("\t".join(output + [right, group, expir]),
                  file=self.prot_log)

        # add the title to the set of tracked titles
        self.prot_titles.add(logItem.logtitle)

    def printUnprotect(self, logItem):
        """Write one unprotect row (empty right/group/expiry columns)."""
        logItem = self.__clean_logItem(logItem)

        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp,
                  '', '', '']
        print("\t".join(output), file=self.prot_log)

        # the title is no longer tracked
        self.prot_titles.discard(logItem.logtitle)

    def conditionallyPrint(self, logItem):
        """Route `logItem` to the matching writer, or drop it."""
        # print deletions only if we've seen a protection event
        if logItem.type == 'delete' \
                and logItem.logtitle in self.prot_titles:
            self.printDelete(logItem)

        elif logItem.type == "protect":
            if logItem.action == "move_prot":
                self.printMove(logItem)

            elif logItem.action in ("protect", "modify"):
                # this limits it to only things after 2008 when this
                # data started being stored in params
                if not logItem.params:
                    return
                self.in_window = True
                self.printProtect(logItem)

            elif logItem.action == "unprotect":
                if self.in_window:
                    self.printUnprotect(logItem)

            else:
                # this is some kind of error so we'll print the item and
                # return
                print(logItem, file=sys.stderr)
271
272
def parseWithCallback(incoming_data, callback):
    """
    Parse `incoming_data` with SAX and call `callback` for each log item.

    `incoming_data` is passed straight to the parser's parse(); `callback`
    becomes the logItemCallBack of a WikiDumpHandler.
    """
    dump_handler = WikiDumpHandler(logItemCallBack=callback)

    sax_parser = make_parser()
    sax_parser.setFeature(handler.feature_namespaces, 0)

    # put text_normalize_filter between the parser and the handler
    normalizing_reader = text_normalize_filter(sax_parser, dump_handler)
    normalizing_reader.parse(incoming_data)
282
if __name__ == "__main__":
    # When called as a script, argv[1] is taken to be the dump's filename
    # and the output files are named after it (last extension removed).
    # Without an argument the dump is read from standard input and the
    # output files are named "output-*.tsv".

    if len(sys.argv) > 1:
        # Binary mode: the XML parser decodes the bytes itself, so parsing
        # does not depend on the locale's default text encoding.
        input_file = open(sys.argv[1], "rb")
        output_base = re.sub(r'\.\w+$', '', os.path.basename(sys.argv[1]))
        close_input = True
    else:
        # fall back to the text stream if stdin has no binary buffer
        input_file = getattr(sys.stdin, "buffer", sys.stdin)
        output_base = "output"
        close_input = False

    try:
        with logExporter(input_file, output_base) as exporter:
            parseWithCallback(input_file, exporter.conditionallyPrint)
    finally:
        # the file opened above used to be left open
        if close_input:
            input_file.close()
299

Benjamin Mako Hill || Want to submit a patch?