projects.mako.cc - protection-tools/blob - 03-parse_mw_eventlog.py

   1 #!/usr/bin/env python3
   2
   3 import re
   4 import sys
   5 import os.path
   6
   7 from xml.sax import handler, make_parser
   8 from xml.sax.saxutils import XMLFilterBase
   9 from collections import defaultdict
  10 import calendar
  11
  12 class WikiLogItem(object):
  13     """
  14     Holds data related to one <logitem> element parsed from the dump
  15     """
  16     __slots__ = (
  17         'id',
  18         'action',
  19         'type',
  20         'timestamp',
  21         'logtitle',
  22         'comment',
  23         'params',
  24         'username',
  25         'userid',
  26         'contrib_deleted'
  27     )
  28     def __init__(self):
  29         for attr in self.__slots__:
  30             setattr(self, attr, '')
  31
  32     def __str__(self):
  33         return repr({k: getattr(self, k) for k in self.__slots__})
  34
  35 class text_normalize_filter(XMLFilterBase):
  36     """
  37     SAX filter to ensure that contiguous texts nodes are merged into one
  38     That hopefully speeds up the parsing process a lot, specially when
  39     reading revisions with long text
  40
  41     Recipe by Uche Ogbuji, James Kew and Peter Cogolo Retrieved from: Python
  42     Cookbook, 2nd ed., by Alex Martelli, Anna Martelli Ravenscroft, and
  43     David Ascher (O'Reillly Media, 2005) 0-596-00797-3
  44     """
  45     def __init__(self, upstream, downstream):
  46         XMLFilterBase.__init__(self, upstream)
  47         self._downstream=downstream
  48         self._accumulator=[]
  49     def _complete_text_node(self):
  50         if self._accumulator:
  51             self._downstream.characters(''.join(self._accumulator))
  52             self._accumulator=[]
  53     def characters(self, text):
  54         self._accumulator.append(text)
  55     def ignorableWhiteSpace(self, ws):
  56         self._accumulator.append(text)
  57
  58 def _wrap_complete(method_name):
  59     def method(self, *a, **k):
  60         self._complete_text_node()
  61         getattr(self._downstream, method_name)(*a, **k)
  62     method.__name__= method_name
  63     setattr(text_normalize_filter, method_name, method)
  64
  65 for n in '''startElement endElement'''.split():
  66     _wrap_complete(n)
  67
  68 class WikiDumpHandler(handler.ContentHandler):
  69     """
  70     A ContentHandler designed to pull out page ids, titles and text from
  71     Wiki pages. These are assembled into WikiLogItem objects and sent off to
  72     the supplied callback.
  73     """
  74     def __init__(self, logItemCallBack=None):
  75         handler.ContentHandler.__init__(self)
  76         self.currentTag = ''
  77         self.insideContribTag = False
  78         self.logItemCallBack = logItemCallBack
  79         self.logItemsProcessed = 0
  80
  81     def startElement(self, name, attrs):
  82         self.currentTag = name
  83         if (name == 'logitem'):
  84             # add a log item
  85             self.currentLogItem = WikiLogItem()
  86         elif (name == 'contributor'):
  87             # when we're in revision, ignore ids
  88             self.insideContribTag = True
  89             if 'deleted' in attrs:
  90                 self.currentLogItem.contrib_deleted = True
  91             else:
  92                 self.currentLogItem.contrib_deleted = False
  93
  94     def endElement(self, name):
  95         if (name == 'logitem'):
  96             if self.logItemCallBack is not None:
  97                 self.logItemCallBack(self.currentLogItem)
  98             self.logItemsProcessed += 1
  99         elif (name == 'contributor'):
 100             # we've finished the revision section
 101             self.insideContribTag = False
 102         self.currentTag = ''
 103
 104     def characters(self, content):
 105         if (self.currentTag == 'id' and not self.insideContribTag):
 106             self.currentLogItem.id = content
 107         elif (self.currentTag == 'id' and self.insideContribTag):
 108             self.currentLogItem.userid = content
 109         elif (self.currentTag == 'username' and self.insideContribTag):
 110             self.currentLogItem.username = content
 111         elif (self.currentTag == 'action'):
 112             self.currentLogItem.action = content
 113         elif (self.currentTag == 'type'):
 114             self.currentLogItem.type = content
 115         elif (self.currentTag == 'logtitle'):
 116             self.currentLogItem.logtitle = content
 117         elif (self.currentTag == 'timestamp'):
 118             self.currentLogItem.timestamp = content
 119         elif (self.currentTag == 'comment'):
 120             self.currentLogItem.comment = content
 121         elif (self.currentTag == 'params'):
 122             self.currentLogItem.params = content
 123
 124 class logExporter:
 125     def __init__(self, input, output_base="output"):
 126         self.input_file = input
 127         self.move_log = open(output_base + "-moves.tsv", "w")
 128         self.prot_log = open(output_base + "-protections.tsv", "w")
 129         self.del_log = open(output_base + "-deletions.tsv", "w")
 130
 131         self.prot_titles = defaultdict(None)
 132
 133         self.cal_dict = {v: k for k,v in enumerate(calendar.month_name)}
 134         self.r_param_string = re.compile(r'\[(?P<right>\w+)=(?P<group>\w+)\] \((?P<period>.*?)\)+')
 135         self.r_expir_string = re.compile(r'expires (?P<hour>\d{2}):(?P<min>\d{2}), (?P<day>\d+) (?P<month>\w+) (?P<year>\d{4})')
 136
 137         # this marks whether we have moved into late 2008
 138         # when material is being recorded consistently
 139         self.in_window = False
 140
 141     def __enter__(self):
 142         return self
 143
 144     def __exit__(self, type, value, traceback):
 145         self.move_log.close()
 146         self.prot_log.close()
 147         self.del_log.close()
 148
 149     def __flush(self):
 150         self.move_log.flush()
 151         self.prot_log.flush()
 152         self.del_log.flush()
 153         sys.stdout.flush()
 154         sys.stderr.flush()
 155
 156     def __clean_timestamp(self, timestamp):
 157         timestamp = timestamp.replace("T", " ")
 158         timestamp = timestamp.replace("Z", "")
 159
 160         return timestamp
 161
 162     def __clean_logItem(self, logItem):
 163         logItem.comment = re.sub(r'\s', r' ', logItem.comment)
 164         logItem.params = re.sub(r'\s', r' ', logItem.params)
 165
 166         # add userid and username, but only if it's not deleted
 167         if logItem.contrib_deleted:
 168             logItem.userid = ""
 169             logItem.username = ""
 170         else:
 171             logItem.username = re.sub(r'\s', r' ', logItem.username)
 172
 173         logItem.timestamp = self.__clean_timestamp(logItem.timestamp)
 174
 175         return logItem
 176
 177     def printDelete(self, logItem):
 178         logItem = self.__clean_logItem(logItem)
 179
 180         output = [logItem.id, '"' + logItem.logtitle + '"',
 181                   logItem.action, logItem.timestamp]
 182
 183         print("\t".join(output), file=self.del_log)
 184
 185     def printMove(self, logItem):
 186         logItem = self.__clean_logItem(logItem)
 187
 188         output = [logItem.id, logItem.timestamp,
 189                   '"' + logItem.params  + '"', # old location
 190                   '"' + logItem.logtitle + '"'] # new location
 191         print("\t".join(output), file=self.move_log)
 192
 193         # add the title to the list of titles
 194         self.prot_titles[logItem.logtitle] = None
 195
 196     def printProtect(self, logItem):
 197         logItem = self.__clean_logItem(logItem)
 198
 199         param_string = logItem.params
 200         rights = {}
 201
 202         for m in self.r_param_string.finditer(param_string):
 203             right = m.group("right")
 204             group = m.group("group")
 205             raw_period = m.group("period")
 206
 207             if not re.search("indefinite", raw_period):
 208                 m2 = self.r_expir_string.match(raw_period)
 209                 period_nums = [int(x) for x in [m2.group("year"),
 210                                                 self.cal_dict[m2.group("month")],
 211                                                 m2.group("day"),
 212                                                 m2.group("hour"),
 213                                                 m2.group("min")]]
 214                 period_nums = tuple(period_nums)
 215                 period = "%d-%02d-%02d %02d:%02d:00" % period_nums
 216             else:
 217                 period = ""
 218
 219             rights[right] = (group, period)
 220
 221         output = [logItem.id, '"' + logItem.logtitle + '"',
 222                   logItem.action, logItem.timestamp]
 223
 224         for right in rights:
 225             group, expir = rights[right]
 226             print("\t".join(output + [right, group, expir]),
 227             file=self.prot_log)
 228
 229         # add the title to the list of titles
 230         self.prot_titles[logItem.logtitle] = None
 231
 232     def printUnprotect(self, logItem):
 233         logItem = self.__clean_logItem(logItem)
 234
 235         output = [logItem.id, '"' + logItem.logtitle + '"',
 236                   logItem.action, logItem.timestamp,
 237                   '', '', '']
 238         print("\t".join(output), file=self.prot_log)
 239
 240         # remove the current title from the list of titles
 241         self.prot_titles.pop(logItem.logtitle, None)
 242
 243     def conditionallyPrint(self, logItem):
 244         # print deletions only if we've seen a protection event
 245         if logItem.type == 'delete' \
 246             and logItem.logtitle in self.prot_titles:
 247             self.printDelete(logItem)
 248
 249         elif logItem.type == "protect":
 250             if logItem.action == "move_prot":
 251                 self.printMove(logItem)
 252
 253             elif logItem.action == "protect" \
 254                 or logItem.action == "modify":
 255
 256                 # this limits it to only things after 2008 when this
 257                 # data started being stored in params
 258                 if not logItem.params:
 259                     return
 260                 else:
 261                     self.in_window = True
 262                     self.printProtect(logItem)
 263
 264             elif logItem.action == "unprotect":
 265                 if self.in_window: self.printUnprotect(logItem)
 266
 267             else:
 268                 # this is some kind of error so we'll print the article and
 269                 # return
 270                 print(logItem, file=sys.stderr)
 271
 272
 273 def parseWithCallback(incoming_data, callback):
 274     parser = make_parser()
 275     parser.setFeature(handler.feature_namespaces, 0)
 276
 277     # apply the text_normalize_filter
 278     wdh = WikiDumpHandler(logItemCallBack=callback)
 279     filter_handler = text_normalize_filter(parser, wdh)
 280
 281     filter_handler.parse(incoming_data)
 282
 283 if __name__ == "__main__":
 284     """
 285     When called as script, argv[1] is assumed to be a filename and we
 286     simply print pages found. If it's missing, we just use sys.stdin
 287     instead.
 288     """
 289
 290     if len(sys.argv) > 1:
 291         input_file = open(sys.argv[1], "r")
 292         output_base = re.sub(r'\.\w+$', '', os.path.basename(sys.argv[1]))
 293     else:
 294         input_file = sys.stdin
 295         output_base = "output"
 296
 297     with logExporter(input_file, output_base) as exporter:
 298         parseWithCallback(input_file, exporter.conditionallyPrint)
 299