13 from collections import OrderedDict
16 from ordereddict import OrderedDict
21 import simplejson as json
33 from cStringIO import StringIO
35 from StringIO import StringIO
def parse_timestamp(t):
    """Convert a timestamp string of the form 'YYYY-MM-DDTHH:MM:SSZ' to a time tuple.

    The all-zero timestamp '0000-00-00T00:00:00Z' is special-cased and yields
    a plain tuple of eight zeros; every other value is handed to
    ``time.strptime`` with the format '%Y-%m-%dT%H:%M:%SZ'.
    """
    # NOTE(review): the zero placeholder has 8 fields, whereas the value
    # returned by time.strptime has 9 -- confirm callers do not rely on length.
    zero_stamp = '0000-00-00T00:00:00Z'
    if t != zero_stamp:
        return time.strptime(t, '%Y-%m-%dT%H:%M:%SZ')
    return (0, 0, 0, 0, 0, 0, 0, 0)
44 class WaitToken(object):
47 self.id = '%x' % random.randint(0, sys.maxint)
56 def __init__(self, host, path='/w/', ext='.php', pool=None, retry_timeout=30,
57 max_retries=25, wait_callback=lambda *x: None, clients_useragent=None,
58 max_lag=3, compress=True, force_login=True, do_init=True, httpauth=None):
59 # Setup member variables
63 self.credentials = None
64 self.compress = compress
65 self.httpauth = httpauth
66 self.retry_timeout = retry_timeout
67 self.max_retries = max_retries
68 self.wait_callback = wait_callback
69 self.max_lag = str(max_lag)
70 self.force_login = force_login
72 # The token string => token object mapping
73 self.wait_tokens = weakref.WeakKeyDictionary()
76 self.blocked = False # Whether current user is blocked
77 self.hasmsg = False # Whether current user has new messages
78 self.groups = [] # Groups current user belongs to
79 self.rights = [] # Rights current user has
80 self.tokens = {} # Edit tokens of the current user
83 self.namespaces = self.default_namespaces
88 self.connection = requests.Session()
89 self.connection.headers['User-Agent'] = 'MwClient/' + __ver__ + ' (https://github.com/mwclient/mwclient)'
91 self.connection.headers['User-Agent'] = clients_useragent + ' - ' + self.connection.headers['User-Agent']
93 self.connection = pool
96 self.pages = listing.PageList(self)
97 self.categories = listing.PageList(self, namespace=14)
98 self.images = listing.PageList(self, namespace=6)
100 # Compat page generators
101 self.Pages = self.pages
102 self.Categories = self.categories
103 self.Images = self.images
105 # Initialization status
106 self.initialized = False
111 except errors.APIError, e:
112 # Private wiki, do init after login
113 if e[0] not in (u'unknown_action', u'readapidenied'):
117 meta = self.api('query', meta='siteinfo|userinfo',
118 siprop='general|namespaces', uiprop='groups|rights')
121 self.site = meta['query']['general']
122 self.namespaces = dict(((i['id'], i.get('*', '')) for i in meta['query']['namespaces'].itervalues()))
123 self.writeapi = 'writeapi' in self.site
126 if self.site['generator'].startswith('MediaWiki '):
127 version = self.site['generator'][10:].split('.')
132 if s[i] < '0' or s[i] > '9':
136 return (int(s[:i]), s[i:], )
138 return (int(s[:i]), )
139 self.version = sum((split_num(s) for s in version), ())
141 if len(self.version) < 2:
142 raise errors.MediaWikiVersionError('Unknown MediaWiki %s' % '.'.join(version))
144 raise errors.MediaWikiVersionError('Unknown generator %s' % self.site['generator'])
146 # Require MediaWiki version >= 1.16
150 userinfo = meta['query']['userinfo']
151 self.username = userinfo['name']
152 self.groups = userinfo.get('groups', [])
153 self.rights = userinfo.get('rights', [])
154 self.initialized = True
# Built-in namespace id => namespace name mapping. The constructor assigns
# this to ``self.namespaces`` before any site information has been fetched.
default_namespaces = {
    0: u'',
    1: u'Talk',
    2: u'User',
    3: u'User talk',
    4: u'Project',
    5: u'Project talk',
    6: u'Image',
    7: u'Image talk',
    8: u'MediaWiki',
    9: u'MediaWiki talk',
    10: u'Template',
    11: u'Template talk',
    12: u'Help',
    13: u'Help talk',
    14: u'Category',
    15: u'Category talk',
    -1: u'Special',
    -2: u'Media',
}
161 return "<Site object '%s%s'>" % (self.host, self.path)
163 def api(self, action, *args, **kwargs):
165 Perform a generic API call and handle errors. All arguments will be passed on.
168 To get coordinates from the GeoData MediaWiki extension at English Wikipedia:
170 >>> site = Site('en.wikipedia.org')
171 >>> result = site.api('query', prop='coordinates', titles='Oslo|Copenhagen')
172 >>> for page in result['query']['pages'].values():
173 ... if 'coordinates' in page:
174 ... print page['title'], page['coordinates'][0]['lat'], page['coordinates'][0]['lon']
176 Copenhagen 55.6761 12.5683
179 The raw response from the API call, as a dictionary.
182 if action == 'query':
184 kwargs['meta'] += '|userinfo'
186 kwargs['meta'] = 'userinfo'
187 if 'uiprop' in kwargs:
188 kwargs['uiprop'] += '|blockinfo|hasmsg'
190 kwargs['uiprop'] = 'blockinfo|hasmsg'
192 token = self.wait_token()
194 info = self.raw_api(action, **kwargs)
197 res = self.handle_api_result(info, token=token)
201 def handle_api_result(self, info, kwargs=None, token=None):
203 token = self.wait_token()
206 userinfo = info['query']['userinfo']
209 if 'blockedby' in userinfo:
210 self.blocked = (userinfo['blockedby'], userinfo.get('blockreason', u''))
213 self.hasmsg = 'message' in userinfo
214 self.logged_in = 'anon' not in userinfo
216 if info['error']['code'] in (u'internal_api_error_DBConnectionError', u'internal_api_error_DBQueryError'):
219 if '*' in info['error']:
220 raise errors.APIError(info['error']['code'],
221 info['error']['info'], info['error']['*'])
222 raise errors.APIError(info['error']['code'],
223 info['error']['info'], kwargs)
227 def _query_string(*args, **kwargs):
229 qs1 = [(k, v) for k, v in kwargs.iteritems() if k not in ('wpEditToken', 'token')]
230 qs2 = [(k, v) for k, v in kwargs.iteritems() if k in ('wpEditToken', 'token')]
231 return OrderedDict(qs1 + qs2)
233 def raw_call(self, script, data, files=None):
234 url = self.path + script + self.ext
236 if self.compress and gzip:
237 headers['Accept-Encoding'] = 'gzip'
238 if self.httpauth is not None:
239 credentials = base64.encodestring('%s:%s' % self.httpauth).replace('\n', '')
240 headers['Authorization'] = 'Basic %s' % credentials
241 token = self.wait_token((script, data))
243 scheme = 'http' # Should we move to 'https' as default?
245 if type(host) is tuple:
248 fullurl = '{scheme}://{host}{url}'.format(scheme=scheme, host=host, url=url)
251 stream = self.connection.post(fullurl, data=data, files=files, headers=headers)
254 except requests.exceptions.HTTPError, e:
257 if e[0] == 503 and e[1].getheader('X-Database-Lag'):
258 self.wait(token, int(e[1].getheader('Retry-After')))
259 elif e[0] < 500 or e[0] > 599:
263 except requests.exceptions.TooManyRedirects:
265 except requests.exceptions.ConnectionError:
266 print 'connection error'
271 def raw_api(self, action, *args, **kwargs):
272 """Sends a call to the API."""
273 kwargs['action'] = action
274 kwargs['format'] = 'json'
275 data = self._query_string(*args, **kwargs)
276 res = self.raw_call('api', data)
279 return json.loads(res)
281 if res.startswith('MediaWiki API is not enabled for this site.'):
282 raise errors.APIDisabledError
def raw_index(self, action, *args, **kwargs):
    """Send a request to index.php instead of the API.

    ``action`` is stored under the 'action' key and the site's ``max_lag``
    setting under 'maxlag'. The request data is then built with
    ``_query_string`` and handed to ``raw_call('index', ...)``, whose result
    is returned unchanged.
    """
    kwargs.update(action=action, maxlag=self.max_lag)
    payload = self._query_string(*args, **kwargs)
    return self.raw_call('index', payload)
292 def wait_token(self, args=None):
294 self.wait_tokens[token] = (0, args)
297 def wait(self, token, min_wait=0):
298 retry, args = self.wait_tokens[token]
299 self.wait_tokens[token] = (retry + 1, args)
300 if retry > self.max_retries and self.max_retries != -1:
301 raise errors.MaximumRetriesExceeded(self, token, args)
302 self.wait_callback(self, token, retry, args)
304 timeout = self.retry_timeout * retry
305 if timeout < min_wait:
308 return self.wait_tokens[token]
310 def require(self, major, minor, revision=None, raise_error=True):
311 if self.version is None:
312 if raise_error is None:
314 raise RuntimeError('Site %s has not yet been initialized' % repr(self))
317 if self.version[:2] >= (major, minor):
320 raise errors.MediaWikiVersionError('Requires version %s.%s, current version is %s.%s'
321 % ((major, minor) + self.version[:2]))
325 raise NotImplementedError
328 def email(self, user, text, subject, cc=False):
330 Send email to a specified user on the wiki.
333 ... site.email('SomeUser', 'Some message', 'Some subject')
334 ... except mwclient.errors.NoSpecifiedEmailError, e:
335 ... print 'The user does not accept email, or has not specified an email address.'
338 user (str): User name of the recipient
339 text (str): Body of the email
340 subject (str): Subject of the email
341 cc (bool): True to send a copy of the email to yourself (default is False)
344 Dictionary of the JSON response
347 NoSpecifiedEmailError (mwclient.errors.NoSpecifiedEmailError): if recipient does not accept email
348 EmailError (mwclient.errors.EmailError): on other errors
351 token = self.get_token('email')
354 info = self.api('emailuser', target=user, subject=subject,
355 text=text, ccme=cc, token=token)
356 except errors.APIError, e:
357 if e[0] == u'noemail':
358 raise errors.NoSpecifiedEmail(user, e[1])
359 raise errors.EmailError(*e)
363 def login(self, username=None, password=None, cookies=None, domain=None):
364 """Login to the wiki."""
366 if username and password:
367 self.credentials = (username, password, domain)
369 if self.host not in self.conn.cookies:
370 self.conn.cookies[self.host] = http.CookieJar()
371 self.conn.cookies[self.host].update(cookies)
374 wait_token = self.wait_token()
376 'lgname': self.credentials[0],
377 'lgpassword': self.credentials[1]
379 if self.credentials[2]:
380 kwargs['lgdomain'] = self.credentials[2]
382 login = self.api('login', **kwargs)
383 if login['login']['result'] == 'Success':
385 elif login['login']['result'] == 'NeedToken':
386 kwargs['lgtoken'] = login['login']['token']
387 elif login['login']['result'] == 'Throttled':
388 self.wait(wait_token, login['login'].get('wait', 5))
390 raise errors.LoginError(self, login['login'])
393 info = self.api('query', meta='userinfo', uiprop='groups|rights')
394 userinfo = info['query']['userinfo']
395 self.username = userinfo['name']
396 self.groups = userinfo.get('groups', [])
397 self.rights = userinfo.get('rights', [])
402 def get_token(self, type, force=False, title=None):
404 if self.version[:2] >= (1, 24):
405 # The 'csrf' (cross-site request forgery) token introduced in 1.24 replaces
406 # the majority of older tokens, like edittoken and movetoken.
407 if type not in ['watch', 'patrol', 'rollback', 'userrights']:
410 if type not in self.tokens:
411 self.tokens[type] = '0'
413 if self.tokens.get(type, '0') == '0' or force:
415 if self.version[:2] >= (1, 24):
416 info = self.api('query', meta='tokens', type=type)
417 self.tokens[type] = info['query']['tokens']['%stoken' % type]
421 # Some dummy title was needed to get a token prior to 1.24
423 info = self.api('query', titles=title,
424 prop='info', intoken=type)
425 for i in info['query']['pages'].itervalues():
426 if i['title'] == title:
427 self.tokens[type] = i['%stoken' % type]
429 return self.tokens[type]
431 def upload(self, file=None, filename=None, description='', ignore=False, file_size=None,
432 url=None, filekey=None, comment=None):
434 Uploads a file to the site. Returns JSON result from the API.
435 Can raise `errors.InsufficientPermission` and `requests.exceptions.HTTPError`.
438 - file : File object or stream to upload.
439 - filename : Destination filename, don't include namespace
441 - description : Wikitext for the file description page.
442 - ignore : True to upload despite any warnings.
443 - file_size : Deprecated in mwclient 0.7
444 - url : URL to fetch the file from.
445 - filekey : Key that identifies a previous upload that was
447 - comment : Upload comment. Also used as the initial page text
448 for new files if `description` is not specified.
450 Note that one of `file`, `filekey` and `url` must be specified, but not more
451 than one. For normal uploads, you specify `file`.
455 >>> client.upload(open('somefile', 'rb'), filename='somefile.jpg',
456 description='Some description')
459 if file_size is not None:
460 # Note that DeprecationWarning is hidden by default since Python 2.7
462 'file_size is deprecated since mwclient 0.7',
468 raise TypeError('filename must be specified')
470 if len([x for x in [file, filekey, url] if x is not None]) != 1:
471 raise TypeError("exactly one of 'file', 'filekey' and 'url' must be specified")
473 image = self.Images[filename]
474 if not image.can('upload'):
475 raise errors.InsufficientPermission(filename)
480 predata['comment'] = description
482 predata['comment'] = comment
483 predata['text'] = description
486 predata['ignorewarnings'] = 'true'
487 predata['token'] = image.get_token('edit')
488 predata['action'] = 'upload'
489 predata['format'] = 'json'
490 predata['filename'] = filename
494 # Renamed from sessionkey to filekey
495 # https://git.wikimedia.org/commit/mediawiki%2Fcore.git/5f13517e
496 if self.version[:2] < (1, 18):
497 predata['sessionkey'] = filekey
499 predata['filekey'] = filekey
504 files = {'file': file}
506 wait_token = self.wait_token()
509 data = self.raw_call('api', postdata, files)
510 info = json.loads(data)
513 if self.handle_api_result(info, kwargs=predata):
514 return info.get('upload', {})
515 except requests.exceptions.HTTPError, e:
516 if e[0] == 503 and e[1].getheader('X-Database-Lag'):
517 self.wait(wait_token, int(e[1].getheader('Retry-After')))
518 elif e[0] < 500 or e[0] > 599:
521 self.wait(wait_token)
522 except requests.exceptions.ConnectionError:
523 self.wait(wait_token)
525 def parse(self, text=None, title=None, page=None):
528 kwargs['text'] = text
529 if title is not None:
530 kwargs['title'] = title
532 kwargs['page'] = page
533 result = self.api('parse', **kwargs)
534 return result['parse']
536 # def block(self): TODO?
def allpages(self, start=None, prefix=None, namespace='0', filterredir='all',
             minsize=None, maxsize=None, prtype=None, prlevel=None,
             limit=None, dir='ascending', filterlanglinks='all', generator=True):
    """Retrieve all pages on the wiki as a generator.

    The filter arguments (``start`` is sent under the name 'from') are passed
    to ``listing.List.generate_kwargs`` together with the prefix obtained from
    ``listing.List.get_prefix('ap', generator)``. The list class is chosen by
    ``listing.List.get_list(generator)`` and is instantiated for the
    'allpages' list with ``return_values='title'`` and the given ``limit``.
    """
    list_prefix = listing.List.get_prefix('ap', generator)
    query_args = dict(listing.List.generate_kwargs(
        list_prefix, ('from', start),
        prefix=prefix,
        minsize=minsize,
        maxsize=maxsize,
        prtype=prtype,
        prlevel=prlevel,
        namespace=namespace,
        filterredir=filterredir,
        dir=dir,
        filterlanglinks=filterlanglinks,
    ))
    list_class = listing.List.get_list(generator)
    return list_class(self, 'allpages', 'ap', limit=limit,
                      return_values='title', **query_args)
554 def allimages(self, start=None, prefix=None, minsize=None, maxsize=None, limit=None,
555 dir='ascending', sha1=None, sha1base36=None, prop='timestamp|url',
557 """Retrieve all images on the wiki as a generator."""
559 pfx = listing.List.get_prefix('ai', generator)
560 kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix,
561 minsize=minsize, maxsize=maxsize,
562 dir=dir, sha1=sha1, sha1base36=sha1base36))
563 return listing.List.get_list(generator)(self, 'allimages', 'ai', limit=limit, return_values='timestamp|url', **kwargs)
565 def alllinks(self, start=None, prefix=None, unique=False, prop='title',
566 namespace='0', limit=None, generator=True):
567 """Retrieve a list of all links on the wiki as a generator."""
569 pfx = listing.List.get_prefix('al', generator)
570 kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix,
571 prop=prop, namespace=namespace))
573 kwargs[pfx + 'unique'] = '1'
574 return listing.List.get_list(generator)(self, 'alllinks', 'al', limit=limit, return_values='title', **kwargs)
def allcategories(self, start=None, prefix=None, dir='ascending', limit=None, generator=True):
    """Retrieve all categories on the wiki as a generator.

    ``start`` (sent under the name 'from'), ``prefix`` and ``dir`` are passed
    to ``listing.List.generate_kwargs`` with the prefix returned by
    ``listing.List.get_prefix('ac', generator)``. The list class chosen by
    ``listing.List.get_list(generator)`` is instantiated for the
    'allcategories' list with the given ``limit``.
    """
    list_prefix = listing.List.get_prefix('ac', generator)
    query_args = dict(listing.List.generate_kwargs(
        list_prefix, ('from', start), prefix=prefix, dir=dir))
    list_class = listing.List.get_list(generator)
    return list_class(self, 'allcategories', 'ac', limit=limit, **query_args)
583 def allusers(self, start=None, prefix=None, group=None, prop=None, limit=None,
584 witheditsonly=False, activeusers=False, rights=None):
585 """Retrieve all users on the wiki as a generator."""
587 kwargs = dict(listing.List.generate_kwargs('au', ('from', start), prefix=prefix,
588 group=group, prop=prop,
590 witheditsonly=witheditsonly,
591 activeusers=activeusers))
593 return listing.List(self, 'allusers', 'au', limit=limit, **kwargs)
def blocks(self, start=None, end=None, dir='older', ids=None, users=None, limit=None,
           prop='id|user|by|timestamp|expiry|reason|flags'):
    """Retrieve blocks as a generator.

    Args:
        start, end, dir, ids, users, prop: Filter arguments, passed to
            ``listing.List.generate_kwargs`` with the 'bk' prefix.
        limit: Passed on to ``listing.List`` as ``limit``.

    Returns:
        A ``listing.List`` over the 'blocks' list.

    Each block is a dictionary containing:
    - user: the username or IP address of the user
    - id: the ID of the block
    - timestamp: when the block was added
    - expiry: when the block runs out (infinity for indefinite blocks)
    - reason: the reason they are blocked
    - allowusertalk: key is present (empty string) if the user is allowed to edit their user talk page
    - by: the administrator who blocked the user
    - nocreate: key is present (empty string) if the user's ability to create accounts has been disabled.
    """
    # TODO: Fix. Fix what?
    # ``ids`` was previously accepted by the signature but never forwarded,
    # so filtering by block id was silently ignored. It is now passed along
    # with the other filters.
    kwargs = dict(listing.List.generate_kwargs('bk', start=start, end=end, dir=dir,
                                               ids=ids, users=users, prop=prop))
    return listing.List(self, 'blocks', 'bk', limit=limit, **kwargs)
def deletedrevisions(self, start=None, end=None, dir='older', namespace=None,
                     limit=None, prop='user|comment'):
    """Return a ``listing.List`` over the 'deletedrevs' list.

    ``start``, ``end``, ``dir``, ``namespace`` and ``prop`` are passed to
    ``listing.List.generate_kwargs`` with the 'dr' prefix; ``limit`` is
    handed to ``listing.List`` directly.
    """
    query_args = dict(listing.List.generate_kwargs(
        'dr',
        start=start,
        end=end,
        dir=dir,
        namespace=namespace,
        prop=prop,
    ))
    return listing.List(self, 'deletedrevs', 'dr', limit=limit, **query_args)
def exturlusage(self, query, prop=None, protocol='http', namespace=None, limit=None):
    """Retrieve, as a generator, the pages that link to a particular domain or URL.

    This API call mirrors the Special:LinkSearch function on-wiki.

    ``query`` can be a domain like 'bbc.co.uk'. Wildcards can be used,
    e.g. '*.bbc.co.uk'. Alternatively, a query can contain a full domain
    name and some or all of a URL, e.g. '*.wikipedia.org/wiki/*'.

    See <https://meta.wikimedia.org/wiki/Help:Linksearch> for details.

    The generator returns dictionaries containing four keys:
    - url: the URL linked to.
    - ns: namespace of the wiki page
    - pageid: the ID of the wiki page
    - title: the page title.
    """
    query_args = dict(listing.List.generate_kwargs(
        'eu',
        query=query,
        prop=prop,
        protocol=protocol,
        namespace=namespace,
    ))
    return listing.List(self, 'exturlusage', 'eu', limit=limit, **query_args)
def logevents(self, type=None, prop=None, start=None, end=None,
              dir='older', user=None, title=None, limit=None, action=None):
    """Return a ``listing.List`` over the 'logevents' list.

    All arguments except ``limit`` are passed to
    ``listing.List.generate_kwargs`` with the 'le' prefix; ``limit`` is
    handed to ``listing.List`` directly.
    """
    query_args = dict(listing.List.generate_kwargs(
        'le',
        prop=prop,
        type=type,
        start=start,
        end=end,
        dir=dir,
        user=user,
        title=title,
        action=action,
    ))
    return listing.List(self, 'logevents', 'le', limit=limit, **query_args)
654 # def protectedtitles requires 1.15
def random(self, namespace, limit=20):
    """Retrieve a generator of random pages from a particular namespace.

    ``limit`` specifies the number of random articles retrieved.
    ``namespace`` is a namespace identifier integer.

    The generator contains dictionaries with namespace, page ID and title.
    """
    query_args = dict(listing.List.generate_kwargs('rn', namespace=namespace))
    return listing.List(self, 'random', 'rn', limit=limit, **query_args)
def recentchanges(self, start=None, end=None, dir='older', namespace=None,
                  prop=None, show=None, limit=None, type=None):
    """Return a ``listing.List`` over the 'recentchanges' list.

    All arguments except ``limit`` are passed to
    ``listing.List.generate_kwargs`` with the 'rc' prefix; ``limit`` is
    handed to ``listing.List`` directly.
    """
    query_args = dict(listing.List.generate_kwargs(
        'rc',
        start=start,
        end=end,
        dir=dir,
        namespace=namespace,
        prop=prop,
        show=show,
        type=type,
    ))
    return listing.List(self, 'recentchanges', 'rc', limit=limit, **query_args)
675 def search(self, search, namespace='0', what='title', redirects=False, limit=None):
677 kwargs = dict(listing.List.generate_kwargs('sr', search=search, namespace=namespace, what=what))
679 kwargs['srredirects'] = '1'
680 return listing.List(self, 'search', 'sr', limit=limit, **kwargs)
def usercontributions(self, user, start=None, end=None, dir='older', namespace=None,
                      prop=None, show=None, limit=None):
    """Return a ``listing.List`` over the 'usercontribs' list for ``user``.

    All arguments except ``limit`` are passed to
    ``listing.List.generate_kwargs`` with the 'uc' prefix; ``limit`` is
    handed to ``listing.List`` directly.
    """
    query_args = dict(listing.List.generate_kwargs(
        'uc',
        user=user,
        start=start,
        end=end,
        dir=dir,
        namespace=namespace,
        prop=prop,
        show=show,
    ))
    return listing.List(self, 'usercontribs', 'uc', limit=limit, **query_args)
def users(self, users, prop='blockinfo|groups|editcount'):
    """Return a ``listing.List`` over the 'users' list.

    ``users`` is an iterable of strings; the names are joined with '|' and
    sent as ``ususers``, while ``prop`` is sent as ``usprop``.
    """
    joined_names = '|'.join(users)
    return listing.List(self, 'users', 'us', ususers=joined_names, usprop=prop)
693 def watchlist(self, allrev=False, start=None, end=None, namespace=None, dir='older',
694 prop=None, show=None, limit=None):
696 kwargs = dict(listing.List.generate_kwargs('wl', start=start, end=end,
697 namespace=namespace, dir=dir, prop=prop, show=show))
699 kwargs['wlallrev'] = '1'
700 return listing.List(self, 'watchlist', 'wl', limit=limit, **kwargs)
702 def expandtemplates(self, text, title=None, generatexml=False):
703 """Takes wikitext (text) and expands templates."""
707 kwargs['title'] = title
709 kwargs['generatexml'] = '1'
711 result = self.api('expandtemplates', text=text, **kwargs)
714 return result['expandtemplates']['*'], result['parsetree']['*']
716 return result['expandtemplates']['*']
718 def ask(self, query, title=None):
719 """Ask a query against Semantic MediaWiki."""
722 kwargs['title'] = title
723 result = self.raw_api('ask', query=query, **kwargs)
724 return result['query']['results']