7 from xml.sax import handler, make_parser
8 from xml.sax.saxutils import XMLFilterBase
9 from collections import defaultdict
class WikiLogItem(object):
    """
    Holds data related to one <logitem> element parsed from the dump.

    Every slot starts out as the empty string; the SAX handler then
    overwrites the fields it encounters while parsing.
    """

    # NOTE(review): the __slots__ line is missing from the extracted source.
    # This tuple lists every attribute the rest of the file reads or writes
    # on a log item -- confirm against the original definition.
    __slots__ = ('id', 'type', 'action', 'timestamp', 'logtitle', 'comment',
                 'params', 'userid', 'username', 'contrib_deleted')

    def __init__(self):
        for attr in self.__slots__:
            setattr(self, attr, '')

    def __repr__(self):
        # Render as a plain dict so a log item printed to stderr is readable.
        return repr({k: getattr(self, k) for k in self.__slots__})
class text_normalize_filter(XMLFilterBase):
    """
    SAX filter to ensure that contiguous text nodes are merged into one.

    That hopefully speeds up the parsing process a lot, especially when
    reading revisions with long text.

    Recipe by Uche Ogbuji, James Kew and Peter Cogolo. Retrieved from: Python
    Cookbook, 2nd ed., by Alex Martelli, Anna Martelli Ravenscroft, and
    David Ascher (O'Reilly Media, 2005) 0-596-00797-3
    """

    def __init__(self, upstream, downstream):
        """
        upstream   -- the XMLReader this filter wraps
        downstream -- the handler that receives the merged text nodes
        """
        XMLFilterBase.__init__(self, upstream)
        self._downstream = downstream
        # Text chunks received since the last non-text event.
        self._accumulator = []

    def _complete_text_node(self):
        """Send any buffered text downstream as one characters() call."""
        if self._accumulator:
            self._downstream.characters(''.join(self._accumulator))
            self._accumulator = []

    def characters(self, text):
        """Buffer a chunk of character data instead of forwarding it."""
        self._accumulator.append(text)

    def ignorableWhiteSpace(self, ws):
        """Buffer ignorable whitespace together with ordinary text."""
        # The original appended the undefined name `text` here, which would
        # raise NameError on the first call.
        self._accumulator.append(ws)

    # The SAX ContentHandler callback is spelled `ignorableWhitespace`
    # (lower-case "s"); the historical spelling above is kept for
    # compatibility and aliased so the parser can actually reach it.
    ignorableWhitespace = ignorableWhiteSpace
def _wrap_complete(method_name):
    """
    Install a method called `method_name` on text_normalize_filter.

    The installed method first flushes any buffered text through
    _complete_text_node() and then forwards the call, with all of its
    arguments, to the same-named method of the filter's downstream handler.
    """
    def method(self, *a, **k):
        self._complete_text_node()
        getattr(self._downstream, method_name)(*a, **k)
    method.__name__ = method_name
    setattr(text_normalize_filter, method_name, method)
# Wrap the element callbacks so buffered text is flushed before each of them.
# NOTE(review): the loop body is missing from the extracted source; calling
# _wrap_complete(n) is what the function's signature implies.
for n in '''startElement endElement'''.split():
    _wrap_complete(n)
class WikiDumpHandler(handler.ContentHandler):
    """
    A ContentHandler designed to pull the fields of <logitem> elements out
    of a wiki log dump. These are assembled into WikiLogItem objects and
    sent off to the supplied callback.

    characters() stores the text it is given as-is, so it expects each
    element's text to arrive in a single call.
    """

    # Elements whose text is copied to the same-named log item attribute.
    _TEXT_FIELDS = frozenset(
        ('action', 'type', 'logtitle', 'timestamp', 'comment', 'params'))

    def __init__(self, logItemCallBack=None):
        """
        logItemCallBack -- called with each completed log item; may be None
        """
        handler.ContentHandler.__init__(self)
        self.currentTag = None
        self.currentLogItem = None
        self.insideContribTag = False
        self.logItemCallBack = logItemCallBack
        self.logItemsProcessed = 0

    def startElement(self, name, attrs):
        self.currentTag = name
        if name == 'logitem':
            self.currentLogItem = WikiLogItem()
        elif name == 'contributor':
            # while inside <contributor>, <id> means the user id
            self.insideContribTag = True
            if self.currentLogItem is not None:
                self.currentLogItem.contrib_deleted = 'deleted' in attrs

    def endElement(self, name):
        if name == 'logitem':
            if self.logItemCallBack is not None:
                self.logItemCallBack(self.currentLogItem)
            self.logItemsProcessed += 1
        elif name == 'contributor':
            # we've finished the contributor section
            self.insideContribTag = False
        # Forget the element we just left; otherwise the whitespace between
        # elements would be stored over the field that was just read.
        self.currentTag = None

    def characters(self, content):
        item = self.currentLogItem
        tag = self.currentTag
        if item is None or tag is None:
            # text outside any <logitem>, or between two elements
            return

        if tag == 'id':
            if self.insideContribTag:
                item.userid = content
            else:
                item.id = content
        elif tag == 'username':
            if self.insideContribTag:
                item.username = content
        elif tag in self._TEXT_FIELDS:
            setattr(item, tag, content)
# NOTE(review): the class header is missing from the extracted source; the
# name comes from the `logExporter(...)` call in the script section.
class logExporter(object):
    """
    Writes move, protection and deletion log items to tab-separated files.

    Three files are opened, named after `output_base`: "-moves.tsv",
    "-protections.tsv" and "-deletions.tsv". Use the object as a context
    manager so they are closed, and feed it log items through
    conditionallyPrint().
    """

    # Month name -> month number ("January" -> 1), for expiry strings.
    cal_dict = {v: k for k, v in enumerate(calendar.month_name)}
    # One "[right=group] (period)" entry of a protection's params string.
    r_param_string = re.compile(r'\[(?P<right>\w+)=(?P<group>\w+)\] \((?P<period>.*?)\)+')
    # The "expires HH:MM, D Month YYYY" form of a period.
    r_expir_string = re.compile(r'expires (?P<hour>\d{2}):(?P<min>\d{2}), (?P<day>\d+) (?P<month>\w+) (?P<year>\d{4})')

    def __init__(self, input, output_base="output"):
        """
        input       -- the input stream; only stored on the instance
        output_base -- filename prefix for the three output files
        """
        self.input_file = input
        self.move_log = open(output_base + "-moves.tsv", "w")
        self.prot_log = open(output_base + "-protections.tsv", "w")
        self.del_log = open(output_base + "-deletions.tsv", "w")

        # Titles with a protection event seen so far. Only the keys matter;
        # a plain dict behaves exactly like the former defaultdict(None).
        self.prot_titles = {}

        # this marks whether we have moved into late 2008
        # when material is being recorded consistently
        self.in_window = False

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Close every log, including the deletions log.
        self.move_log.close()
        self.prot_log.close()
        self.del_log.close()

    # NOTE(review): the `def` line of this method is missing from the
    # extracted source; the name `flush` is presumed -- confirm.
    def flush(self):
        """Flush all output files."""
        self.move_log.flush()
        self.prot_log.flush()
        self.del_log.flush()

    def __clean_timestamp(self, timestamp):
        """Turn "2008-01-02T03:04:05Z" into "2008-01-02 03:04:05"."""
        return timestamp.replace("T", " ").replace("Z", "")

    def __clean_logItem(self, logItem):
        """Normalise a log item's fields in place and return the item."""
        # Tabs and newlines would break the TSV layout.
        logItem.comment = re.sub(r'\s', r' ', logItem.comment)
        logItem.params = re.sub(r'\s', r' ', logItem.params)

        # add userid and username, but only if it's not deleted
        if logItem.contrib_deleted:
            # NOTE(review): blanking userid is presumed from the comment
            # above; only the username line is visible in the source.
            logItem.userid = ""
            logItem.username = ""
        else:
            logItem.username = re.sub(r'\s', r' ', logItem.username)

        logItem.timestamp = self.__clean_timestamp(logItem.timestamp)
        return logItem

    def __parse_period(self, raw_period):
        """
        Return the expiry of a protection as "YYYY-MM-DD HH:MM:00".

        Periods marked "indefinite" give the empty string.
        NOTE(review): the value used for indefinite periods is missing from
        the extracted source; "" is presumed.

        A period that is neither indefinite nor in the expected "expires"
        form is reported on stderr and also gives the empty string, rather
        than crashing the whole export.
        """
        if "indefinite" in raw_period:
            return ""

        m = self.r_expir_string.match(raw_period)
        month = self.cal_dict.get(m.group("month")) if m else None
        if not month:
            print("unparseable expiry: %r" % (raw_period,), file=sys.stderr)
            return ""

        period_nums = (int(m.group("year")), month, int(m.group("day")),
                       int(m.group("hour")), int(m.group("min")))
        return "%d-%02d-%02d %02d:%02d:00" % period_nums

    def printDelete(self, logItem):
        """Write one row to the deletions log."""
        logItem = self.__clean_logItem(logItem)

        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp]
        print("\t".join(output), file=self.del_log)

    def printMove(self, logItem):
        """Write one row to the moves log and remember the title."""
        logItem = self.__clean_logItem(logItem)

        output = [logItem.id, logItem.timestamp,
                  '"' + logItem.params + '"',    # old location
                  '"' + logItem.logtitle + '"']  # new location
        print("\t".join(output), file=self.move_log)

        # add the title to the list of titles
        self.prot_titles[logItem.logtitle] = None

    def printProtect(self, logItem):
        """Write one protections row per right found in the params string."""
        logItem = self.__clean_logItem(logItem)

        # right -> (group, expiry); a repeated right keeps the last entry
        rights = {}
        for m in self.r_param_string.finditer(logItem.params):
            period = self.__parse_period(m.group("period"))
            rights[m.group("right")] = (m.group("group"), period)

        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp]
        for right, (group, expir) in rights.items():
            print("\t".join(output + [right, group, expir]),
                  file=self.prot_log)

        # add the title to the list of titles
        self.prot_titles[logItem.logtitle] = None

    def printUnprotect(self, logItem):
        """Write one unprotect row and forget the title."""
        logItem = self.__clean_logItem(logItem)

        # NOTE(review): the end of this list is missing from the extracted
        # source; three empty fields are presumed so the row lines up with
        # the right/group/expiry columns written by printProtect.
        output = [logItem.id, '"' + logItem.logtitle + '"',
                  logItem.action, logItem.timestamp,
                  '', '', '']
        print("\t".join(output), file=self.prot_log)

        # remove the current title from the list of titles
        self.prot_titles.pop(logItem.logtitle, None)

    def conditionallyPrint(self, logItem):
        """Route one log item to the matching print method, if any."""
        # print deletions only if we've seen a protection event
        if logItem.type == 'delete' \
                and logItem.logtitle in self.prot_titles:
            self.printDelete(logItem)

        elif logItem.type == "protect":
            if logItem.action == "move_prot":
                self.printMove(logItem)

            elif logItem.action in ("protect", "modify"):
                # this limits it to only things after 2008 when this
                # data started being stored in params
                if not logItem.params:
                    return
                self.in_window = True
                self.printProtect(logItem)

            elif logItem.action == "unprotect":
                if self.in_window:
                    self.printUnprotect(logItem)

            else:
                # this is some kind of error so we'll print the article and
                # move on
                # NOTE(review): the `else:` line is missing from the
                # extracted source; it is presumed to belong to the action
                # chain, not the outer type chain.
                print(logItem, file=sys.stderr)
def parseWithCallback(incoming_data, callback):
    """
    Parse a log dump and hand every completed log item to `callback`.

    incoming_data -- the source passed to the SAX parser's parse()
    callback      -- called with each log item when its element ends
    """
    parser = make_parser()
    parser.setFeature(handler.feature_namespaces, 0)

    # apply the text_normalize_filter so each element's text reaches the
    # dump handler as a single characters() call
    wdh = WikiDumpHandler(logItemCallBack=callback)
    filter_handler = text_normalize_filter(parser, wdh)

    filter_handler.parse(incoming_data)
if __name__ == "__main__":
    # When called as a script, argv[1] is assumed to be the dump's filename
    # and the output files are named after it (extension removed). If it's
    # missing, we read sys.stdin and name the output files "output-*".
    if len(sys.argv) > 1:
        # Binary mode lets the XML parser take the encoding from the
        # document itself instead of depending on the locale's default.
        input_file = open(sys.argv[1], "rb")
        output_base = re.sub(r'\.\w+$', '', os.path.basename(sys.argv[1]))
    else:
        input_file = sys.stdin
        output_base = "output"

    try:
        with logExporter(input_file, output_base) as exporter:
            parseWithCallback(input_file, exporter.conditionallyPrint)
    finally:
        # Close the dump we opened ourselves; stdin is left alone.
        if input_file is not sys.stdin:
            input_file.close()