From 99337ede51f8abbcf0663e7c93672f70f9b6ddd9 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Mon, 20 Apr 2015 16:58:59 -0700 Subject: [PATCH] revamped the wikipedia project - removed mwclient (requests params argument is basically good enough) - shows examples that uses params and that do it the "hard way" to show what it's doing - added examples for getting pages in category and categories for a page --- LICENSE.md.mwclient | 22 - README.rst.mwclient | 82 --- REFERENCE.md.mwclient | 115 ----- mwclient/__init__.py | 28 -- mwclient/client.py | 724 --------------------------- mwclient/errors.py | 58 --- mwclient/ex.py | 84 ---- mwclient/listing.py | 237 --------- mwclient/page.py | 390 --------------- wikipedia-mwc1.py | 13 - wikipedia-mwc2.py | 6 - wikipedia-raw1.py => wikipedia1-1.py | 2 +- wikipedia1-2.py | 33 ++ wikipedia-raw2.py => wikipedia2-1.py | 6 +- wikipedia2-2.py | 32 ++ wikipedia3.py | 16 + wikipedia4.py | 24 + 17 files changed, 108 insertions(+), 1764 deletions(-) delete mode 100644 LICENSE.md.mwclient delete mode 100644 README.rst.mwclient delete mode 100644 REFERENCE.md.mwclient delete mode 100644 mwclient/__init__.py delete mode 100644 mwclient/client.py delete mode 100644 mwclient/errors.py delete mode 100644 mwclient/ex.py delete mode 100644 mwclient/listing.py delete mode 100644 mwclient/page.py delete mode 100644 wikipedia-mwc1.py delete mode 100644 wikipedia-mwc2.py rename wikipedia-raw1.py => wikipedia1-1.py (76%) create mode 100644 wikipedia1-2.py rename wikipedia-raw2.py => wikipedia2-1.py (76%) create mode 100644 wikipedia2-2.py create mode 100644 wikipedia3.py create mode 100644 wikipedia4.py diff --git a/LICENSE.md.mwclient b/LICENSE.md.mwclient deleted file mode 100644 index 9359f20..0000000 --- a/LICENSE.md.mwclient +++ /dev/null @@ -1,22 +0,0 @@ -Copyright (c) 2006-2013 Bryan Tong Minh - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the 
"Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.rst.mwclient b/README.rst.mwclient deleted file mode 100644 index 0683c31..0000000 --- a/README.rst.mwclient +++ /dev/null @@ -1,82 +0,0 @@ -mwclient -======== - -Mwclient is a client to the `MediaWiki API `_ -which provides access to most API functionality. -It depends heavily on Bob Ippolito's `SimpleJSON `_, -requires Python 2.4 and supports MediaWiki 1.11 and above. -For functions not available in the current MediaWiki, a ``MediaWikiVersionError`` is raised. - -This framework was written by Bryan Tong Minh, who released the latest stable -`version 0.6.5 `_ on 6 May 2011. -The current `development version `_ -can be installed directly off github: - -.. code-block:: console - - $ pip install git+git://github.com/mwclient/mwclient.git - -Please see the `release notes `_ -for a list of changes. - -Implementation notes --------------------- - -Most properties and generators accept the same parameters as the API, -without their two-letter prefix. Exceptions to this rule: - -* ``Image.imageinfo`` is the imageinfo of the latest image. 
- Earlier versions can be fetched using ``imagehistory()`` -* ``Site.all*``: parameter ``[ap]from`` renamed to ``start`` -* ``categorymembers`` is implemented as ``Category.members`` -* ``deletedrevs`` is ``deletedrevisions`` -* ``usercontribs`` is ``usercontributions`` -* First parameters of ``search`` and ``usercontributions`` are ``search`` and ``user`` - respectively - -Properties and generators are implemented as Python generators. -Their limit parameter is only an indication of the number of items in one chunk. -It is not the total limit. -Doing ``list(generator(limit = limit))`` will return ALL items of generator, -and not be limited by the limit value. -Default chunk size is generally the maximum chunk size. - - -HTTPS ------ - -To use https, specify the host as a tuple in the form of ``('https', hostname)``. - - -Example -------- - -For more information, see the -`REFERENCE.md `_ file. - -.. code-block:: python - - # Initialize Site object - import mwclient - site = mwclient.Site('commons.wikimedia.org') - site.login(username, password) # Optional - - # Edit page - page = site.Pages['Commons:Sandbox'] - text = page.edit() - print 'Text in sandbox:', text.encode('utf-8') - page.save(text + u'\nExtra data', summary = 'Test edit') - - # Printing imageusage - image = site.Images['Example.jpg'] - print 'Image', image.name.encode('utf-8'), 'usage:' - for page in image.imageusage(): - print 'Used:', page.name.encode('utf-8'), '; namespace', page.namespace - print 'Image info:', image.imageinfo - - # Uploading a file - site.upload(open('file.jpg'), 'destination.jpg', 'Image description') - - # Listing all categories (don't do this in reality) - for category in site.allcategories(): - print category diff --git a/REFERENCE.md.mwclient b/REFERENCE.md.mwclient deleted file mode 100644 index cdd1732..0000000 --- a/REFERENCE.md.mwclient +++ /dev/null @@ -1,115 +0,0 @@ -This file is intended to be a reference to mwclient. -The current version is mwclient 0.6.5. 
- -The mwclient framework provides access to the MediaWiki API. -It provides the functions of the MediaWiki API in a Pythonic manner. - -## Sites ## -The `Site` object is the most important class. -It represents a MediaWiki site. -Its constructor accepts various arguments, -of which the first two, `host` and `path`, are the most important. -They represent respectively -the hostname without protocol -and the root directory where `api.php` is located. -The path parameter should end with a slash, /. -Protocols other than HTTP and HTTPS are currently not supported. - -```python -#http -site = mwclient.Site(host, path = '/w/', ...) - -#https -site = mwclient.Site(('https', host), path = '/w/', ...) -``` - -### Pages ### -Sites provide access to pages via various generators and the Pages object. -The base Page object is called Page -and from that derive Category and Image. -When the page is retrieved via `Site.Pages` or a generator, -it will check automatically which of those three specific types -should be returned. -To get a page by its name, call `Site.Pages` as a scriptable object: - -```python -page = site.Pages['Template:Stub'] -image = site.Pages['Image:Wiki.png'] # This will return an Image object -image2 = site.Images['Wiki.png'] # The same image -``` - -Alternatively, `Site.Images` and `Site.Categories` are provided, -which do exactly the same as `Site.Pages`, -except that they require the page name without its namespace prefixed. - -#### PageProperties #### -The `Page` object provides many generators available in the API. -In addition to the page properties listed in the API documentation, -also the lists backlinks and embedded in are members of the Page object. For more information about using generators -see the section on generators below. - -`Category` objects provide an extra function, `members` -to list all members of a category. -The Category object can also be used itself -as an iterator yielding all its members. 
- -```python -#list pages in Category:Help by name -category = site.Pages['Category:Help'] -for page in category: - print page.name -``` - -`Image` objects have additional functions `imagehistory` and `imageusage` -which represent the old versions of the image and its usage, respectively. -`Image.download` returns a file object to the full size image. - -```python -fr = image.download() -fw = open('Wiki.png', 'rb') -while True: - s = fr.read(4096) - if not s: break - fw.write(s) -fr.close() # Always close those file objects !!! -fw.close() -``` - -#### Editing pages #### -Call `Page.text()` to retrieve the page content. -Use `Page.save(text, summary = u'', ...)` to save the page. -If available, `Page.save` uses the API to edit, -but falls back to the old way if the write API is not available. - -## Generators ## - -## Exceptions ## - -## Implementation notes ## -Most properties and generators accept the same parameters as the API, -without their two-letter prefix. -Exceptions: -* `Image.imageinfo` is the `imageinfo` of the *latest* image. -Earlier versions can be fetched using `imagehistory()` -* `Site.all*`: parameter `(ap)from` renamed to `start` -* `categorymembers` is implemented as `Category.members` -* `deletedrevs` is `deletedrevisions` -* `usercontribs` is `usercontributions` -* First parameters of `search` and `usercontributions` - are `search` and `user`, respectively - -Properties and generators are implemented as Python generators. -Their limit parameter is only an indication -of the number of items in one chunk. -It is not the total limit. -Doing `list(generator(limit = limit))` will return -ALL items of generator, and not be limited by the limit value. -Use `list(generator(max_items = max_items))` -to limit the amount of items returned. -Default chunk size is generally the maximum chunk size. 
- -## Links ## -* Project page at GitHub: https://github.com/mwclient/mwclient -* More in-depth documentation on the GitHub wiki: -https://github.com/mwclient/mwclient/wiki -* MediaWiki API documentation: https://mediawiki.org/wiki/API diff --git a/mwclient/__init__.py b/mwclient/__init__.py deleted file mode 100644 index bb32dde..0000000 --- a/mwclient/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -""" - Copyright (c) 2006-2011 Bryan Tong Minh - - Permission is hereby granted, free of charge, to any person - obtaining a copy of this software and associated documentation - files (the "Software"), to deal in the Software without - restriction, including without limitation the rights to use, - copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the - Software is furnished to do so, subject to the following - conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - OTHER DEALINGS IN THE SOFTWARE. 
-""" - -from errors import * -from client import Site, __ver__ -import ex diff --git a/mwclient/client.py b/mwclient/client.py deleted file mode 100644 index be4457b..0000000 --- a/mwclient/client.py +++ /dev/null @@ -1,724 +0,0 @@ -__ver__ = '0.7.1dev' - -import warnings -import urllib -import time -import random -import sys -import weakref -import base64 - -try: - # Python 2.7+ - from collections import OrderedDict -except ImportError: - # Python 2.6 - from ordereddict import OrderedDict - -try: - import json -except ImportError: - import simplejson as json -import requests - -import errors -import listing -import page - -try: - import gzip -except ImportError: - gzip = None -try: - from cStringIO import StringIO -except ImportError: - from StringIO import StringIO - - -def parse_timestamp(t): - if t == '0000-00-00T00:00:00Z': - return (0, 0, 0, 0, 0, 0, 0, 0) - return time.strptime(t, '%Y-%m-%dT%H:%M:%SZ') - - -class WaitToken(object): - - def __init__(self): - self.id = '%x' % random.randint(0, sys.maxint) - - def __hash__(self): - return hash(self.id) - - -class Site(object): - api_limit = 500 - - def __init__(self, host, path='/w/', ext='.php', pool=None, retry_timeout=30, - max_retries=25, wait_callback=lambda *x: None, clients_useragent=None, - max_lag=3, compress=True, force_login=True, do_init=True, httpauth=None): - # Setup member variables - self.host = host - self.path = path - self.ext = ext - self.credentials = None - self.compress = compress - self.httpauth = httpauth - self.retry_timeout = retry_timeout - self.max_retries = max_retries - self.wait_callback = wait_callback - self.max_lag = str(max_lag) - self.force_login = force_login - - # The token string => token object mapping - self.wait_tokens = weakref.WeakKeyDictionary() - - # Site properties - self.blocked = False # Whether current user is blocked - self.hasmsg = False # Whether current user has new messages - self.groups = [] # Groups current user belongs to - self.rights = [] # Rights 
current user has - self.tokens = {} # Edit tokens of the current user - self.version = None - - self.namespaces = self.default_namespaces - self.writeapi = False - - # Setup connection - if pool is None: - self.connection = requests.Session() - self.connection.headers['User-Agent'] = 'MwClient/' + __ver__ + ' (https://github.com/mwclient/mwclient)' - if clients_useragent: - self.connection.headers['User-Agent'] = clients_useragent + ' - ' + self.connection.headers['User-Agent'] - else: - self.connection = pool - - # Page generators - self.pages = listing.PageList(self) - self.categories = listing.PageList(self, namespace=14) - self.images = listing.PageList(self, namespace=6) - - # Compat page generators - self.Pages = self.pages - self.Categories = self.categories - self.Images = self.images - - # Initialization status - self.initialized = False - - if do_init: - try: - self.site_init() - except errors.APIError, e: - # Private wiki, do init after login - if e[0] not in (u'unknown_action', u'readapidenied'): - raise - - def site_init(self): - meta = self.api('query', meta='siteinfo|userinfo', - siprop='general|namespaces', uiprop='groups|rights') - - # Extract site info - self.site = meta['query']['general'] - self.namespaces = dict(((i['id'], i.get('*', '')) for i in meta['query']['namespaces'].itervalues())) - self.writeapi = 'writeapi' in self.site - - # Determine version - if self.site['generator'].startswith('MediaWiki '): - version = self.site['generator'][10:].split('.') - - def split_num(s): - i = 0 - while i < len(s): - if s[i] < '0' or s[i] > '9': - break - i += 1 - if s[i:]: - return (int(s[:i]), s[i:], ) - else: - return (int(s[:i]), ) - self.version = sum((split_num(s) for s in version), ()) - - if len(self.version) < 2: - raise errors.MediaWikiVersionError('Unknown MediaWiki %s' % '.'.join(version)) - else: - raise errors.MediaWikiVersionError('Unknown generator %s' % self.site['generator']) - - # Require MediaWiki version >= 1.16 - self.require(1, 
16) - - # User info - userinfo = meta['query']['userinfo'] - self.username = userinfo['name'] - self.groups = userinfo.get('groups', []) - self.rights = userinfo.get('rights', []) - self.initialized = True - - default_namespaces = {0: u'', 1: u'Talk', 2: u'User', 3: u'User talk', 4: u'Project', 5: u'Project talk', - 6: u'Image', 7: u'Image talk', 8: u'MediaWiki', 9: u'MediaWiki talk', 10: u'Template', 11: u'Template talk', - 12: u'Help', 13: u'Help talk', 14: u'Category', 15: u'Category talk', -1: u'Special', -2: u'Media'} - - def __repr__(self): - return "" % (self.host, self.path) - - def api(self, action, *args, **kwargs): - """ - Perform a generic API call and handle errors. All arguments will be passed on. - - Example: - To get coordinates from the GeoData MediaWiki extension at English Wikipedia: - - >>> site = Site('en.wikipedia.org') - >>> result = site.api('query', prop='coordinates', titles='Oslo|Copenhagen') - >>> for page in result['query']['pages'].values(): - ... if 'coordinates' in page: - ... print page['title'], page['coordinates'][0]['lat'], page['coordinates'][0]['lon'] - Oslo 59.95 10.75 - Copenhagen 55.6761 12.5683 - - Returns: - The raw response from the API call, as a dictionary. 
- """ - kwargs.update(args) - if action == 'query': - if 'meta' in kwargs: - kwargs['meta'] += '|userinfo' - else: - kwargs['meta'] = 'userinfo' - if 'uiprop' in kwargs: - kwargs['uiprop'] += '|blockinfo|hasmsg' - else: - kwargs['uiprop'] = 'blockinfo|hasmsg' - - token = self.wait_token() - while True: - info = self.raw_api(action, **kwargs) - if not info: - info = {} - res = self.handle_api_result(info, token=token) - if res: - return info - - def handle_api_result(self, info, kwargs=None, token=None): - if token is None: - token = self.wait_token() - - try: - userinfo = info['query']['userinfo'] - except KeyError: - userinfo = () - if 'blockedby' in userinfo: - self.blocked = (userinfo['blockedby'], userinfo.get('blockreason', u'')) - else: - self.blocked = False - self.hasmsg = 'message' in userinfo - self.logged_in = 'anon' not in userinfo - if 'error' in info: - if info['error']['code'] in (u'internal_api_error_DBConnectionError', u'internal_api_error_DBQueryError'): - self.wait(token) - return False - if '*' in info['error']: - raise errors.APIError(info['error']['code'], - info['error']['info'], info['error']['*']) - raise errors.APIError(info['error']['code'], - info['error']['info'], kwargs) - return True - - @staticmethod - def _query_string(*args, **kwargs): - kwargs.update(args) - qs1 = [(k, v) for k, v in kwargs.iteritems() if k not in ('wpEditToken', 'token')] - qs2 = [(k, v) for k, v in kwargs.iteritems() if k in ('wpEditToken', 'token')] - return OrderedDict(qs1 + qs2) - - def raw_call(self, script, data, files=None): - url = self.path + script + self.ext - headers = {} - if self.compress and gzip: - headers['Accept-Encoding'] = 'gzip' - if self.httpauth is not None: - credentials = base64.encodestring('%s:%s' % self.httpauth).replace('\n', '') - headers['Authorization'] = 'Basic %s' % credentials - token = self.wait_token((script, data)) - while True: - scheme = 'http' # Should we move to 'https' as default? 
- host = self.host - if type(host) is tuple: - scheme, host = host - - fullurl = '{scheme}://{host}{url}'.format(scheme=scheme, host=host, url=url) - - try: - stream = self.connection.post(fullurl, data=data, files=files, headers=headers) - return stream.text - - except requests.exceptions.HTTPError, e: - print 'http error' - print e - if e[0] == 503 and e[1].getheader('X-Database-Lag'): - self.wait(token, int(e[1].getheader('Retry-After'))) - elif e[0] < 500 or e[0] > 599: - raise - else: - self.wait(token) - except requests.exceptions.TooManyRedirects: - raise - except requests.exceptions.ConnectionError: - print 'connection error' - self.wait(token) - except ValueError: - self.wait(token) - - def raw_api(self, action, *args, **kwargs): - """Sends a call to the API.""" - kwargs['action'] = action - kwargs['format'] = 'json' - data = self._query_string(*args, **kwargs) - res = self.raw_call('api', data) - - try: - return json.loads(res) - except ValueError: - if res.startswith('MediaWiki API is not enabled for this site.'): - raise errors.APIDisabledError - raise - - def raw_index(self, action, *args, **kwargs): - """Sends a call to index.php rather than the API.""" - kwargs['action'] = action - kwargs['maxlag'] = self.max_lag - data = self._query_string(*args, **kwargs) - return self.raw_call('index', data) - - def wait_token(self, args=None): - token = WaitToken() - self.wait_tokens[token] = (0, args) - return token - - def wait(self, token, min_wait=0): - retry, args = self.wait_tokens[token] - self.wait_tokens[token] = (retry + 1, args) - if retry > self.max_retries and self.max_retries != -1: - raise errors.MaximumRetriesExceeded(self, token, args) - self.wait_callback(self, token, retry, args) - - timeout = self.retry_timeout * retry - if timeout < min_wait: - timeout = min_wait - time.sleep(timeout) - return self.wait_tokens[token] - - def require(self, major, minor, revision=None, raise_error=True): - if self.version is None: - if raise_error is None: - 
return - raise RuntimeError('Site %s has not yet been initialized' % repr(self)) - - if revision is None: - if self.version[:2] >= (major, minor): - return True - elif raise_error: - raise errors.MediaWikiVersionError('Requires version %s.%s, current version is %s.%s' - % ((major, minor) + self.version[:2])) - else: - return False - else: - raise NotImplementedError - - # Actions - def email(self, user, text, subject, cc=False): - """ - Send email to a specified user on the wiki. - - >>> try: - ... site.email('SomeUser', 'Some message', 'Some subject') - ... except mwclient.errors.NoSpecifiedEmailError, e: - ... print 'The user does not accept email, or has not specified an email address.' - - Args: - user (str): User name of the recipient - text (str): Body of the email - subject (str): Subject of the email - cc (bool): True to send a copy of the email to yourself (default is False) - - Returns: - Dictionary of the JSON response - - Raises: - NoSpecifiedEmailError (mwclient.errors.NoSpecifiedEmailError): if recipient does not accept email - EmailError (mwclient.errors.EmailError): on other errors - """ - - token = self.get_token('email') - - try: - info = self.api('emailuser', target=user, subject=subject, - text=text, ccme=cc, token=token) - except errors.APIError, e: - if e[0] == u'noemail': - raise errors.NoSpecifiedEmail(user, e[1]) - raise errors.EmailError(*e) - - return info - - def login(self, username=None, password=None, cookies=None, domain=None): - """Login to the wiki.""" - - if username and password: - self.credentials = (username, password, domain) - if cookies: - if self.host not in self.conn.cookies: - self.conn.cookies[self.host] = http.CookieJar() - self.conn.cookies[self.host].update(cookies) - - if self.credentials: - wait_token = self.wait_token() - kwargs = { - 'lgname': self.credentials[0], - 'lgpassword': self.credentials[1] - } - if self.credentials[2]: - kwargs['lgdomain'] = self.credentials[2] - while True: - login = self.api('login', 
**kwargs) - if login['login']['result'] == 'Success': - break - elif login['login']['result'] == 'NeedToken': - kwargs['lgtoken'] = login['login']['token'] - elif login['login']['result'] == 'Throttled': - self.wait(wait_token, login['login'].get('wait', 5)) - else: - raise errors.LoginError(self, login['login']) - - if self.initialized: - info = self.api('query', meta='userinfo', uiprop='groups|rights') - userinfo = info['query']['userinfo'] - self.username = userinfo['name'] - self.groups = userinfo.get('groups', []) - self.rights = userinfo.get('rights', []) - self.tokens = {} - else: - self.site_init() - - def get_token(self, type, force=False, title=None): - - if self.version[:2] >= (1, 24): - # The 'csrf' (cross-site request forgery) token introduced in 1.24 replaces - # the majority of older tokens, like edittoken and movetoken. - if type not in ['watch', 'patrol', 'rollback', 'userrights']: - type = 'csrf' - - if type not in self.tokens: - self.tokens[type] = '0' - - if self.tokens.get(type, '0') == '0' or force: - - if self.version[:2] >= (1, 24): - info = self.api('query', meta='tokens', type=type) - self.tokens[type] = info['query']['tokens']['%stoken' % type] - - else: - if title is None: - # Some dummy title was needed to get a token prior to 1.24 - title = 'Test' - info = self.api('query', titles=title, - prop='info', intoken=type) - for i in info['query']['pages'].itervalues(): - if i['title'] == title: - self.tokens[type] = i['%stoken' % type] - - return self.tokens[type] - - def upload(self, file=None, filename=None, description='', ignore=False, file_size=None, - url=None, filekey=None, comment=None): - """ - Uploads a file to the site. Returns JSON result from the API. - Can raise `errors.InsufficientPermission` and `requests.exceptions.HTTPError`. - - : Parameters : - - file : File object or stream to upload. 
- - filename : Destination filename, don't include namespace - prefix like 'File:' - - description : Wikitext for the file description page. - - ignore : True to upload despite any warnings. - - file_size : Deprecated in mwclient 0.7 - - url : URL to fetch the file from. - - filekey : Key that identifies a previous upload that was - stashed temporarily. - - comment : Upload comment. Also used as the initial page text - for new files if `description` is not specified. - - Note that one of `file`, `filekey` and `url` must be specified, but not more - than one. For normal uploads, you specify `file`. - - Example: - - >>> client.upload(open('somefile', 'rb'), filename='somefile.jpg', - description='Some description') - """ - - if file_size is not None: - # Note that DeprecationWarning is hidden by default since Python 2.7 - warnings.warn( - 'file_size is deprecated since mwclient 0.7', - DeprecationWarning - ) - file_size = None - - if filename is None: - raise TypeError('filename must be specified') - - if len([x for x in [file, filekey, url] if x is not None]) != 1: - raise TypeError("exactly one of 'file', 'filekey' and 'url' must be specified") - - image = self.Images[filename] - if not image.can('upload'): - raise errors.InsufficientPermission(filename) - - predata = {} - - if comment is None: - predata['comment'] = description - else: - predata['comment'] = comment - predata['text'] = description - - if ignore: - predata['ignorewarnings'] = 'true' - predata['token'] = image.get_token('edit') - predata['action'] = 'upload' - predata['format'] = 'json' - predata['filename'] = filename - if url: - predata['url'] = url - - # Renamed from sessionkey to filekey - # https://git.wikimedia.org/commit/mediawiki%2Fcore.git/5f13517e - if self.version[:2] < (1, 18): - predata['sessionkey'] = filekey - else: - predata['filekey'] = filekey - - postdata = predata - files = None - if file is not None: - files = {'file': file} - - wait_token = self.wait_token() - while True: - 
try: - data = self.raw_call('api', postdata, files) - info = json.loads(data) - if not info: - info = {} - if self.handle_api_result(info, kwargs=predata): - return info.get('upload', {}) - except requests.exceptions.HTTPError, e: - if e[0] == 503 and e[1].getheader('X-Database-Lag'): - self.wait(wait_token, int(e[1].getheader('Retry-After'))) - elif e[0] < 500 or e[0] > 599: - raise - else: - self.wait(wait_token) - except requests.exceptions.ConnectionError: - self.wait(wait_token) - - def parse(self, text=None, title=None, page=None): - kwargs = {} - if text is not None: - kwargs['text'] = text - if title is not None: - kwargs['title'] = title - if page is not None: - kwargs['page'] = page - result = self.api('parse', **kwargs) - return result['parse'] - - # def block(self): TODO? - # def unblock: TODO? - # def patrol: TODO? - # def import: TODO? - - # Lists - def allpages(self, start=None, prefix=None, namespace='0', filterredir='all', - minsize=None, maxsize=None, prtype=None, prlevel=None, - limit=None, dir='ascending', filterlanglinks='all', generator=True): - """Retrieve all pages on the wiki as a generator.""" - - pfx = listing.List.get_prefix('ap', generator) - kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix, - minsize=minsize, maxsize=maxsize, prtype=prtype, prlevel=prlevel, - namespace=namespace, filterredir=filterredir, dir=dir, - filterlanglinks=filterlanglinks)) - return listing.List.get_list(generator)(self, 'allpages', 'ap', limit=limit, return_values='title', **kwargs) - - def allimages(self, start=None, prefix=None, minsize=None, maxsize=None, limit=None, - dir='ascending', sha1=None, sha1base36=None, prop='timestamp|url', - generator=True): - """Retrieve all images on the wiki as a generator.""" - - pfx = listing.List.get_prefix('ai', generator) - kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix, - minsize=minsize, maxsize=maxsize, - dir=dir, sha1=sha1, sha1base36=sha1base36)) - return 
listing.List.get_list(generator)(self, 'allimages', 'ai', limit=limit, return_values='timestamp|url', **kwargs) - - def alllinks(self, start=None, prefix=None, unique=False, prop='title', - namespace='0', limit=None, generator=True): - """Retrieve a list of all links on the wiki as a generator.""" - - pfx = listing.List.get_prefix('al', generator) - kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix, - prop=prop, namespace=namespace)) - if unique: - kwargs[pfx + 'unique'] = '1' - return listing.List.get_list(generator)(self, 'alllinks', 'al', limit=limit, return_values='title', **kwargs) - - def allcategories(self, start=None, prefix=None, dir='ascending', limit=None, generator=True): - """Retrieve all categories on the wiki as a generator.""" - - pfx = listing.List.get_prefix('ac', generator) - kwargs = dict(listing.List.generate_kwargs(pfx, ('from', start), prefix=prefix, dir=dir)) - return listing.List.get_list(generator)(self, 'allcategories', 'ac', limit=limit, **kwargs) - - def allusers(self, start=None, prefix=None, group=None, prop=None, limit=None, - witheditsonly=False, activeusers=False, rights=None): - """Retrieve all users on the wiki as a generator.""" - - kwargs = dict(listing.List.generate_kwargs('au', ('from', start), prefix=prefix, - group=group, prop=prop, - rights=rights, - witheditsonly=witheditsonly, - activeusers=activeusers)) - print kwargs - return listing.List(self, 'allusers', 'au', limit=limit, **kwargs) - - def blocks(self, start=None, end=None, dir='older', ids=None, users=None, limit=None, - prop='id|user|by|timestamp|expiry|reason|flags'): - """Retrieve blocks as a generator. 
- - Each block is a dictionary containing: - - user: the username or IP address of the user - - id: the ID of the block - - timestamp: when the block was added - - expiry: when the block runs out (infinity for indefinite blocks) - - reason: the reason they are blocked - - allowusertalk: key is present (empty string) if the user is allowed to edit their user talk page - - by: the administrator who blocked the user - - nocreate: key is present (empty string) if the user's ability to create accounts has been disabled. - - """ - - # TODO: Fix. Fix what? - kwargs = dict(listing.List.generate_kwargs('bk', start=start, end=end, dir=dir, - users=users, prop=prop)) - return listing.List(self, 'blocks', 'bk', limit=limit, **kwargs) - - def deletedrevisions(self, start=None, end=None, dir='older', namespace=None, - limit=None, prop='user|comment'): - # TODO: Fix - - kwargs = dict(listing.List.generate_kwargs('dr', start=start, end=end, dir=dir, - namespace=namespace, prop=prop)) - return listing.List(self, 'deletedrevs', 'dr', limit=limit, **kwargs) - - def exturlusage(self, query, prop=None, protocol='http', namespace=None, limit=None): - """Retrieves list of pages that link to a particular domain or URL as a generator. - - This API call mirrors the Special:LinkSearch function on-wiki. - - Query can be a domain like 'bbc.co.uk'. Wildcards can be used, e.g. '*.bbc.co.uk'. - Alternatively, a query can contain a full domain name and some or all of a URL: - e.g. '*.wikipedia.org/wiki/*' - - See for details. - - The generator returns dictionaries containing three keys: - - url: the URL linked to. - - ns: namespace of the wiki page - - pageid: the ID of the wiki page - - title: the page title. 
- - """ - - kwargs = dict(listing.List.generate_kwargs('eu', query=query, prop=prop, - protocol=protocol, namespace=namespace)) - return listing.List(self, 'exturlusage', 'eu', limit=limit, **kwargs) - - def logevents(self, type=None, prop=None, start=None, end=None, - dir='older', user=None, title=None, limit=None, action=None): - - kwargs = dict(listing.List.generate_kwargs('le', prop=prop, type=type, start=start, - end=end, dir=dir, user=user, title=title, action=action)) - return listing.List(self, 'logevents', 'le', limit=limit, **kwargs) - - # def protectedtitles requires 1.15 - def random(self, namespace, limit=20): - """Retrieves a generator of random page from a particular namespace. - - limit specifies the number of random articles retrieved. - namespace is a namespace identifier integer. - - Generator contains dictionary with namespace, page ID and title. - - """ - - kwargs = dict(listing.List.generate_kwargs('rn', namespace=namespace)) - return listing.List(self, 'random', 'rn', limit=limit, **kwargs) - - def recentchanges(self, start=None, end=None, dir='older', namespace=None, - prop=None, show=None, limit=None, type=None): - - kwargs = dict(listing.List.generate_kwargs('rc', start=start, end=end, dir=dir, - namespace=namespace, prop=prop, show=show, type=type)) - return listing.List(self, 'recentchanges', 'rc', limit=limit, **kwargs) - - def search(self, search, namespace='0', what='title', redirects=False, limit=None): - - kwargs = dict(listing.List.generate_kwargs('sr', search=search, namespace=namespace, what=what)) - if redirects: - kwargs['srredirects'] = '1' - return listing.List(self, 'search', 'sr', limit=limit, **kwargs) - - def usercontributions(self, user, start=None, end=None, dir='older', namespace=None, - prop=None, show=None, limit=None): - - kwargs = dict(listing.List.generate_kwargs('uc', user=user, start=start, end=end, - dir=dir, namespace=namespace, prop=prop, show=show)) - return listing.List(self, 'usercontribs', 'uc', 
limit=limit, **kwargs) - - def users(self, users, prop='blockinfo|groups|editcount'): - - return listing.List(self, 'users', 'us', ususers='|'.join(users), usprop=prop) - - def watchlist(self, allrev=False, start=None, end=None, namespace=None, dir='older', - prop=None, show=None, limit=None): - - kwargs = dict(listing.List.generate_kwargs('wl', start=start, end=end, - namespace=namespace, dir=dir, prop=prop, show=show)) - if allrev: - kwargs['wlallrev'] = '1' - return listing.List(self, 'watchlist', 'wl', limit=limit, **kwargs) - - def expandtemplates(self, text, title=None, generatexml=False): - """Takes wikitext (text) and expands templates.""" - - kwargs = {} - if title is None: - kwargs['title'] = title - if generatexml: - kwargs['generatexml'] = '1' - - result = self.api('expandtemplates', text=text, **kwargs) - - if generatexml: - return result['expandtemplates']['*'], result['parsetree']['*'] - else: - return result['expandtemplates']['*'] - - def ask(self, query, title=None): - """Ask a query against Semantic MediaWiki.""" - kwargs = {} - if title is None: - kwargs['title'] = title - result = self.raw_api('ask', query=query, **kwargs) - return result['query']['results'] diff --git a/mwclient/errors.py b/mwclient/errors.py deleted file mode 100644 index 6540c2d..0000000 --- a/mwclient/errors.py +++ /dev/null @@ -1,58 +0,0 @@ -class MwClientError(RuntimeError): - pass - - -class MediaWikiVersionError(MwClientError): - pass - - -class APIDisabledError(MwClientError): - pass - - -class MaximumRetriesExceeded(MwClientError): - pass - - -class APIError(MwClientError): - - def __init__(self, code, info, kwargs): - self.code = code - self.info = info - MwClientError.__init__(self, code, info, kwargs) - - -class InsufficientPermission(MwClientError): - pass - - -class UserBlocked(InsufficientPermission): - pass - - -class EditError(MwClientError): - pass - - -class ProtectedPageError(EditError, InsufficientPermission): - pass - - -class FileExists(EditError): - 
pass - - -class LoginError(MwClientError): - pass - - -class EmailError(MwClientError): - pass - - -class NoSpecifiedEmail(EmailError): - pass - - -class NoWriteApi(MwClientError): - pass diff --git a/mwclient/ex.py b/mwclient/ex.py deleted file mode 100644 index db4006c..0000000 --- a/mwclient/ex.py +++ /dev/null @@ -1,84 +0,0 @@ -import client -import requests - - -def read_config(config_files, **predata): - cfg = {} - for config_file in config_files: - cfg.update(_read_config_file( - config_file, predata)) - return cfg - - -def _read_config_file(_config_file, predata): - _file = open(_config_file) - exec _file in globals(), predata - _file.close() - - for _k, _v in predata.iteritems(): - if not _k.startswith('_'): - yield _k, _v - for _k, _v in locals().iteritems(): - if not _k.startswith('_'): - yield _k, _v - - -class SiteList(object): - - def __init__(self): - self.sites = {} - - def __getitem__(self, key): - if key not in self.sites: - self.sites[key] = {} - return self.sites[key] - - def __iter__(self): - return self.sites.itervalues() - - -class ConfiguredSite(client.Site): - - def __init__(self, *config_files, **kwargs): - self.config = read_config(config_files, sites=SiteList()) - - if 'name' in kwargs: - self.config.update(self.config['sites'][kwargs['name']]) - - do_login = 'username' in self.config and 'password' in self.config - - client.Site.__init__(self, host=self.config['host'], - path=self.config['path'], ext=self.config.get('ext', '.php'), - do_init=not do_login, - retry_timeout=self.config.get('retry_timeout', 30), - max_retries=self.config.get('max_retries', -1)) - - if do_login: - self.login(self.config['username'], - self.config['password']) - - -class ConfiguredPool(list): - - def __init__(self, *config_files): - self.config = read_config(config_files, sites=SiteList()) - self.pool = requests.Session() - - config = dict([(k, v) for k, v in self.config.iteritems() - if k != 'sites']) - - for site in self.config['sites']: - cfg = 
config.copy() - cfg.update(site) - site.update(cfg) - - do_login = 'username' in site and 'password' in site - - self.append(client.Site(host=site['host'], - path=site['path'], ext=site.get('ext', '.php'), - pool=self.pool, do_init=not do_login, - retry_timeout=site.get('retry_timeout', 30), - max_retries=site.get('max_retries', -1))) - if do_login: - self[-1].login(site['username'], site['password']) - self[-1].config = site diff --git a/mwclient/listing.py b/mwclient/listing.py deleted file mode 100644 index d6bc1e8..0000000 --- a/mwclient/listing.py +++ /dev/null @@ -1,237 +0,0 @@ -import client -import page - - -class List(object): - - def __init__(self, site, list_name, prefix, limit=None, return_values=None, max_items=None, *args, **kwargs): - # NOTE: Fix limit - self.site = site - self.list_name = list_name - self.generator = 'list' - self.prefix = prefix - - kwargs.update(args) - self.args = kwargs - - if limit is None: - limit = site.api_limit - self.args[self.prefix + 'limit'] = str(limit) - if 'continue' not in self.args: - self.args['continue'] = '' - - self.count = 0 - self.max_items = max_items - - self._iter = iter(xrange(0)) - - self.last = False - self.result_member = list_name - self.return_values = return_values - - def __iter__(self): - return self - - def next(self, full=False): - if self.max_items is not None: - if self.count >= self.max_items: - raise StopIteration - try: - item = self._iter.next() - self.count += 1 - if 'timestamp' in item: - item['timestamp'] = client.parse_timestamp(item['timestamp']) - if full: - return item - - if type(self.return_values) is tuple: - return tuple((item[i] for i in self.return_values)) - elif self.return_values is None: - return item - else: - return item[self.return_values] - - except StopIteration: - if self.last: - raise StopIteration - self.load_chunk() - return List.next(self, full=full) - - def load_chunk(self): - data = self.site.api('query', (self.generator, self.list_name), *[(str(k), v) for k, v 
in self.args.iteritems()]) - if not data: - # Non existent page - raise StopIteration - self.set_iter(data) - - if data.get('continue'): - # New style continuation, added in MediaWiki 1.21 - self.args.update(data['continue']) - - elif self.list_name in data.get('query-continue', ()): - # Old style continuation - self.args.update(data['query-continue'][self.list_name]) - - else: - self.last = True - - def set_iter(self, data): - if self.result_member not in data['query']: - self._iter = iter(xrange(0)) - elif type(data['query'][self.result_member]) is list: - self._iter = iter(data['query'][self.result_member]) - else: - self._iter = data['query'][self.result_member].itervalues() - - def __repr__(self): - return "" % (self.list_name, self.site) - - @staticmethod - def generate_kwargs(_prefix, *args, **kwargs): - kwargs.update(args) - for key, value in kwargs.iteritems(): - if value is not None and value is not False: - yield _prefix + key, value - - @staticmethod - def get_prefix(prefix, generator=False): - if generator: - return 'g' + prefix - else: - return prefix - - @staticmethod - def get_list(generator=False): - if generator: - return GeneratorList - else: - return List - - -class GeneratorList(List): - - def __init__(self, site, list_name, prefix, *args, **kwargs): - List.__init__(self, site, list_name, prefix, *args, **kwargs) - - self.args['g' + self.prefix + 'limit'] = self.args[self.prefix + 'limit'] - del self.args[self.prefix + 'limit'] - self.generator = 'generator' - - self.args['prop'] = 'info|imageinfo' - self.args['inprop'] = 'protection' - - self.result_member = 'pages' - - self.page_class = page.Page - - def next(self): - info = List.next(self, full=True) - if info['ns'] == 14: - return Category(self.site, u'', info) - if info['ns'] == 6: - return page.Image(self.site, u'', info) - return page.Page(self.site, u'', info) - - def load_chunk(self): - # Put this here so that the constructor does not fail - # on uninitialized sites - 
self.args['iiprop'] = 'timestamp|user|comment|url|size|sha1|metadata|archivename' - return List.load_chunk(self) - - -class Category(page.Page, GeneratorList): - - def __init__(self, site, name, info=None, namespace=None): - page.Page.__init__(self, site, name, info) - kwargs = {} - kwargs['gcmtitle'] = self.name - if namespace: - kwargs['gcmnamespace'] = namespace - GeneratorList.__init__(self, site, 'categorymembers', 'cm', **kwargs) - - def __repr__(self): - return "" % (self.name.encode('utf-8'), self.site) - - def members(self, prop='ids|title', namespace=None, sort='sortkey', - dir='asc', start=None, end=None, generator=True): - prefix = self.get_prefix('cm', generator) - kwargs = dict(self.generate_kwargs(prefix, prop=prop, namespace=namespace, - sort=sort, dir=dir, start=start, end=end, title=self.name)) - return self.get_list(generator)(self.site, 'categorymembers', 'cm', **kwargs) - - -class PageList(GeneratorList): - - def __init__(self, site, prefix=None, start=None, namespace=0, redirects='all'): - self.namespace = namespace - - kwargs = {} - if prefix: - kwargs['apprefix'] = prefix - if start: - kwargs['apfrom'] = start - - GeneratorList.__init__(self, site, 'allpages', 'ap', - apnamespace=str(namespace), apfilterredir=redirects, **kwargs) - - def __getitem__(self, name): - return self.get(name, None) - - def get(self, name, info=()): - if self.namespace == 14: - return Category(self.site, self.site.namespaces[14] + ':' + name, info) - elif self.namespace == 6: - return page.Image(self.site, self.site.namespaces[6] + ':' + name, info) - elif self.namespace != 0: - return page.Page(self.site, self.site.namespaces[self.namespace] + ':' + name, info) - else: - # Guessing page class - if type(name) is not int: - namespace = self.guess_namespace(name) - if namespace == 14: - return Category(self.site, name, info) - elif namespace == 6: - return page.Image(self.site, name, info) - return page.Page(self.site, name, info) - - def guess_namespace(self, name): 
- normal_name = page.Page.normalize_title(name) - for ns in self.site.namespaces: - if ns == 0: - continue - if name.startswith(u'%s:' % self.site.namespaces[ns].replace(' ', '_')): - return ns - elif ns in self.site.default_namespaces: - if name.startswith(u'%s:' % self.site.default_namespaces[ns].replace(' ', '_')): - return ns - return 0 - - -class PageProperty(List): - - def __init__(self, page, prop, prefix, *args, **kwargs): - List.__init__(self, page.site, prop, prefix, titles=page.name, *args, **kwargs) - self.page = page - self.generator = 'prop' - - def set_iter(self, data): - for page in data['query']['pages'].itervalues(): - if page['title'] == self.page.name: - self._iter = iter(page.get(self.list_name, ())) - return - raise StopIteration - - -class PagePropertyGenerator(GeneratorList): - - def __init__(self, page, prop, prefix, *args, **kwargs): - GeneratorList.__init__(self, page.site, prop, prefix, titles=page.name, *args, **kwargs) - self.page = page - - -class RevisionsIterator(PageProperty): - - def load_chunk(self): - if 'rvstartid' in self.args and 'rvstart' in self.args: - del self.args['rvstart'] - return PageProperty.load_chunk(self) diff --git a/mwclient/page.py b/mwclient/page.py deleted file mode 100644 index 80c3379..0000000 --- a/mwclient/page.py +++ /dev/null @@ -1,390 +0,0 @@ -import client -import errors -import listing - -import urllib -import urlparse -import time -import warnings - - -class Page(object): - - def __init__(self, site, name, info=None, extra_properties={}): - if type(name) is type(self): - return self.__dict__.update(name.__dict__) - self.site = site - self.name = name - self.section = None - - if not info: - if extra_properties: - prop = 'info|' + '|'.join(extra_properties.iterkeys()) - extra_props = [] - [extra_props.extend(extra_prop) for extra_prop in extra_properties.itervalues()] - else: - prop = 'info' - extra_props = () - - if type(name) is int: - info = self.site.api('query', prop=prop, pageids=name, - 
inprop='protection', *extra_props) - else: - info = self.site.api('query', prop=prop, titles=name, - inprop='protection', *extra_props) - info = info['query']['pages'].itervalues().next() - self._info = info - - self.namespace = info.get('ns', 0) - self.name = info.get('title', u'') - if self.namespace: - self.page_title = self.strip_namespace(self.name) - else: - self.page_title = self.name - - self.touched = client.parse_timestamp(info.get('touched', '0000-00-00T00:00:00Z')) - self.revision = info.get('lastrevid', 0) - self.exists = 'missing' not in info - self.length = info.get('length') - self.protection = dict([(i['type'], (i['level'], i['expiry'])) for i in info.get('protection', ()) if i]) - self.redirect = 'redirect' in info - - self.last_rev_time = None - self.edit_time = None - - def redirects_to(self): - """ Returns the redirect target page, or None if the page is not a redirect page.""" - info = self.site.api('query', prop='pageprops', titles=self.name, redirects='')['query'] - if 'redirects' in info: - for page in info['redirects']: - if page['from'] == self.name: - return Page(self.site, page['to']) - return None - else: - return None - - def resolve_redirect(self): - """ Returns the redirect target page, or the current page if it's not a redirect page.""" - target_page = self.redirects_to() - if target_page is None: - return self - else: - return target_page - - def __repr__(self): - return "" % (self.name.encode('utf-8'), self.site) - - def __unicode__(self): - return self.name - - @staticmethod - def strip_namespace(title): - if title[0] == ':': - title = title[1:] - return title[title.find(':') + 1:] - - @staticmethod - def normalize_title(title): - # TODO: Make site dependent - title = title.strip() - if title[0] == ':': - title = title[1:] - title = title[0].upper() + title[1:] - title = title.replace(' ', '_') - return title - - def can(self, action): - level = self.protection.get(action, (action, ))[0] - if level == 'sysop': - level = 
'editprotected' - - return level in self.site.rights - - def get_token(self, type, force=False): - return self.site.get_token(type, force, title=self.name) - - def get_expanded(self): - """Deprecated. Use page.text(expandtemplates=True) instead""" - warnings.warn("page.get_expanded() was deprecated in mwclient 0.7.0, use page.text(expandtemplates=True) instead.", - category=DeprecationWarning, stacklevel=2) - - return self.text(expandtemplates=True) - - def edit(self, *args, **kwargs): - """Deprecated. Use page.text() instead""" - warnings.warn("page.edit() was deprecated in mwclient 0.7.0, please use page.text() instead.", - category=DeprecationWarning, stacklevel=2) - return self.text(*args, **kwargs) - - def text(self, section=None, expandtemplates=False): - """ - Returns the current wikitext of the page, or of a specific section. - If the page does not exist, an empty string is returned. - - :Arguments: - - `section` : numbered section or `None` to get the whole page (default: `None`) - - `expandtemplates` : set to `True` to expand templates (default: `False`) - """ - - if not self.can('read'): - raise errors.InsufficientPermission(self) - if not self.exists: - return u'' - if section is not None: - section = str(section) - - revs = self.revisions(prop='content|timestamp', limit=1, section=section, expandtemplates=expandtemplates) - try: - rev = revs.next() - text = rev['*'] - self.section = section - self.last_rev_time = rev['timestamp'] - except StopIteration: - text = u'' - self.section = None - self.last_rev_time = None - if not expandtemplates: - self.edit_time = time.gmtime() - return text - - def save(self, text, summary=u'', minor=False, bot=True, section=None, **kwargs): - """ - Update the text of a section or the whole page by performing an edit operation. - """ - if not self.site.logged_in and self.site.force_login: - # Should we really check for this? 
- raise errors.LoginError(self.site, 'By default, mwclient protects you from accidentally ' + - 'editing without being logged in. If you actually want to edit without ' - 'logging in, you can set force_login on the Site object to False.') - if self.site.blocked: - raise errors.UserBlocked(self.site.blocked) - if not self.can('edit'): - raise errors.ProtectedPageError(self) - - if not section: - section = self.section - - if not self.site.writeapi: - raise errors.NoWriteApi(self) - - data = {} - if minor: - data['minor'] = '1' - if not minor: - data['notminor'] = '1' - if self.last_rev_time: - data['basetimestamp'] = time.strftime('%Y%m%d%H%M%S', self.last_rev_time) - if self.edit_time: - data['starttimestamp'] = time.strftime('%Y%m%d%H%M%S', self.edit_time) - if bot: - data['bot'] = '1' - if section: - data['section'] = section - - data.update(kwargs) - - def do_edit(): - result = self.site.api('edit', title=self.name, text=text, - summary=summary, token=self.get_token('edit'), - **data) - if result['edit'].get('result').lower() == 'failure': - raise errors.EditError(self, result['edit']) - return result - try: - result = do_edit() - except errors.APIError, e: - if e.code == 'badtoken': - # Retry, but only once to avoid an infinite loop - self.get_token('edit', force=True) - try: - result = do_edit() - except errors.APIError, e: - self.handle_edit_error(e, summary) - else: - self.handle_edit_error(e, summary) - - # 'newtimestamp' is not included if no change was made - if 'newtimestamp' in result['edit'].keys(): - self.last_rev_time = client.parse_timestamp(result['edit'].get('newtimestamp')) - return result['edit'] - - def handle_edit_error(self, e, summary): - if e.code == 'editconflict': - raise errors.EditError(self, summary, e.info) - elif e.code in ('protectedtitle', 'cantcreate', 'cantcreate-anon', 'noimageredirect-anon', - 'noimageredirect', 'noedit-anon', 'noedit'): - raise errors.ProtectedPageError(self, e.code, e.info) - else: - raise - - def move(self, 
new_title, reason='', move_talk=True, no_redirect=False): - """Move (rename) page to new_title. - - If user account is an administrator, specify no_direct as True to not - leave a redirect. - - If user does not have permission to move page, an InsufficientPermission - exception is raised. - - """ - if not self.can('move'): - raise errors.InsufficientPermission(self) - - if not self.site.writeapi: - raise errors.NoWriteApi(self) - - data = {} - if move_talk: - data['movetalk'] = '1' - if no_redirect: - data['noredirect'] = '1' - result = self.site.api('move', ('from', self.name), to=new_title, - token=self.get_token('move'), reason=reason, **data) - return result['move'] - - def delete(self, reason='', watch=False, unwatch=False, oldimage=False): - """Delete page. - - If user does not have permission to delete page, an InsufficientPermission - exception is raised. - - """ - if not self.can('delete'): - raise errors.InsufficientPermission(self) - - if not self.site.writeapi: - raise errors.NoWriteApi(self) - - data = {} - if watch: - data['watch'] = '1' - if unwatch: - data['unwatch'] = '1' - if oldimage: - data['oldimage'] = oldimage - result = self.site.api('delete', title=self.name, - token=self.get_token('delete'), - reason=reason, **data) - return result['delete'] - - def purge(self): - """Purge server-side cache of page. This will re-render templates and other - dynamic content. 
- - """ - self.site.raw_index('purge', title=self.name) - - # def watch: requires 1.14 - - # Properties - def backlinks(self, namespace=None, filterredir='all', redirect=False, limit=None, generator=True): - prefix = listing.List.get_prefix('bl', generator) - kwargs = dict(listing.List.generate_kwargs(prefix, - namespace=namespace, filterredir=filterredir)) - if redirect: - kwargs['%sredirect' % prefix] = '1' - kwargs[prefix + 'title'] = self.name - - return listing.List.get_list(generator)(self.site, 'backlinks', 'bl', limit=limit, return_values='title', **kwargs) - - def categories(self, generator=True): - if generator: - return listing.PagePropertyGenerator(self, 'categories', 'cl') - else: - # TODO: return sortkey if wanted - return listing.PageProperty(self, 'categories', 'cl', return_values='title') - - def embeddedin(self, namespace=None, filterredir='all', redirect=False, limit=None, generator=True): - prefix = listing.List.get_prefix('ei', generator) - kwargs = dict(listing.List.generate_kwargs(prefix, - namespace=namespace, filterredir=filterredir)) - if redirect: - kwargs['%sredirect' % prefix] = '1' - kwargs[prefix + 'title'] = self.name - - return listing.List.get_list(generator)(self.site, 'embeddedin', 'ei', limit=limit, return_values='title', **kwargs) - - def extlinks(self): - return listing.PageProperty(self, 'extlinks', 'el', return_values='*') - - def images(self, generator=True): - if generator: - return listing.PagePropertyGenerator(self, 'images', '') - else: - return listing.PageProperty(self, 'images', '', return_values='title') - - def iwlinks(self): - return listing.PageProperty(self, 'iwlinks', 'iw', return_values=('prefix', '*')) - - def langlinks(self, **kwargs): - return listing.PageProperty(self, 'langlinks', 'll', return_values=('lang', '*'), **kwargs) - - def links(self, namespace=None, generator=True, redirects=False): - kwargs = dict(listing.List.generate_kwargs('pl', namespace=namespace)) - if redirects: - kwargs['redirects'] = 
'1' - if generator: - return listing.PagePropertyGenerator(self, 'links', 'pl', **kwargs) - else: - return listing.PageProperty(self, 'links', 'pl', return_values='title', **kwargs) - - def revisions(self, startid=None, endid=None, start=None, end=None, - dir='older', user=None, excludeuser=None, limit=50, - prop='ids|timestamp|flags|comment|user', expandtemplates=False, section=None): - kwargs = dict(listing.List.generate_kwargs('rv', startid=startid, endid=endid, - start=start, end=end, user=user, excludeuser=excludeuser)) - kwargs['rvdir'] = dir - kwargs['rvprop'] = prop - if expandtemplates: - kwargs['rvexpandtemplates'] = '1' - if section is not None: - kwargs['rvsection'] = section - - return listing.RevisionsIterator(self, 'revisions', 'rv', limit=limit, **kwargs) - - def templates(self, namespace=None, generator=True): - kwargs = dict(listing.List.generate_kwargs('tl', namespace=namespace)) - if generator: - return listing.PagePropertyGenerator(self, 'templates', 'tl') - else: - return listing.PageProperty(self, 'templates', 'tl', return_values='title') - - -class Image(Page): - - def __init__(self, site, name, info=None): - Page.__init__(self, site, name, info, - extra_properties={'imageinfo': - (('iiprop', 'timestamp|user|comment|url|size|sha1|metadata|archivename'), ) - }) - self.imagerepository = self._info.get('imagerepository', '') - self.imageinfo = self._info.get('imageinfo', ({}, ))[0] - - def imagehistory(self): - return listing.PageProperty(self, 'imageinfo', 'ii', - iiprop='timestamp|user|comment|url|size|sha1|metadata|archivename') - - def imageusage(self, namespace=None, filterredir='all', redirect=False, - limit=None, generator=True): - prefix = listing.List.get_prefix('iu', generator) - kwargs = dict(listing.List.generate_kwargs(prefix, title=self.name, - namespace=namespace, filterredir=filterredir)) - if redirect: - kwargs['%sredirect' % prefix] = '1' - return listing.List.get_list(generator)(self.site, 'imageusage', 'iu', - limit=limit, 
return_values='title', **kwargs) - - def duplicatefiles(self, limit=None): - return listing.PageProperty(self, 'duplicatefiles', 'df', - dflimit=limit) - - def download(self): - url = self.imageinfo['url'] - if not url.startswith('http://'): - url = 'http://' + self.site.host + url - url = urlparse.urlparse(url) - # TODO: query string - return self.site.connection.get(url[1], url[2]) - - def __repr__(self): - return "" % (self.name.encode('utf-8'), self.site) diff --git a/wikipedia-mwc1.py b/wikipedia-mwc1.py deleted file mode 100644 index cac0576..0000000 --- a/wikipedia-mwc1.py +++ /dev/null @@ -1,13 +0,0 @@ -import time -import json -import mwclient - -def format_time(t): - return(time.strftime('%Y-%m-%d %H:%M:%S', t)) - -site = mwclient.Site('en.wikipedia.org') - -page = site.Pages["Data science"] - -for revision in page.revisions(): - print revision["user"] + "\t" + format_time(revision['timestamp']) diff --git a/wikipedia-mwc2.py b/wikipedia-mwc2.py deleted file mode 100644 index 0fee364..0000000 --- a/wikipedia-mwc2.py +++ /dev/null @@ -1,6 +0,0 @@ -import mwclient -site = mwclient.Site('en.wikipedia.org') - -category = site.Pages['Category:University of Washington'] -for page in category: - print page.name diff --git a/wikipedia-raw1.py b/wikipedia1-1.py similarity index 76% rename from wikipedia-raw1.py rename to wikipedia1-1.py index 05222a2..ca021d1 100644 --- a/wikipedia-raw1.py +++ b/wikipedia1-1.py @@ -1,6 +1,6 @@ import requests -wp_call = requests.get('https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Main_Page&rvlimit=100&rvprop=timestamp|user&format=json') +wp_call = requests.get('https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Python_(programming_language)&rvlimit=100&rvprop=timestamp|user&continue=&format=json') response = wp_call.json() diff --git a/wikipedia1-2.py b/wikipedia1-2.py new file mode 100644 index 0000000..98fd129 --- /dev/null +++ b/wikipedia1-2.py @@ -0,0 +1,33 @@ +import requests + +# raw 
string: +# ?action=query&prop=revisions&titles=Python_(programming_language)&rvlimit=100&rvprop=timestamp|user&format=json + +# parameter version which makes a little more sense +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'titles' : 'Python (programming language)', + 'rvlimit' : 100, + 'rvprop' : "timestamp|user", + 'format' : 'json', + 'continue' : ''} + +# run a while True loop +while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + print(parameters) + print(response) + + for page_id in response["query"]["pages"].keys(): + page_title = response["query"]["pages"][page_id]["title"] + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + print(page_title + "\t" + rev["user"] + "\t" + rev["timestamp"]) + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + diff --git a/wikipedia-raw2.py b/wikipedia2-1.py similarity index 76% rename from wikipedia-raw2.py rename to wikipedia2-1.py index a8efd34..3cb23d9 100644 --- a/wikipedia-raw2.py +++ b/wikipedia2-1.py @@ -1,9 +1,8 @@ -import time import requests -url_base = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=100&rvprop=timestamp|user&format=json' +url_base = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=100&rvprop=timestamp|user&continue=&format=json' -pages = ["Benjamin_Mako_Hill", "Python", "Data_science"] +pages = ["Benjamin Mako Hill", "Python", "Data science"] for page_title in pages: @@ -17,4 +16,3 @@ for page_title in pages: for rev in revisions: print(page_title + "\t" + rev["user"] + "\t" + rev["timestamp"]) - time.sleep(3) diff --git a/wikipedia2-2.py b/wikipedia2-2.py new file mode 100644 index 0000000..4dc2f30 --- /dev/null +++ b/wikipedia2-2.py @@ -0,0 +1,32 @@ +import requests + +# base url: +#
https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=100&rvprop=timestamp|user&format=json + +pages = ["Benjamin Mako Hill", "University of Washington", "Data science"] + +parameters = {'action' : 'query', + 'prop' : 'revisions', + 'rvlimit' : 100, + 'rvprop' : 'timestamp|user', + 'format' : 'json', + 'continue' : ''} + +for page_title in pages: + parameters['titles'] = page_title + + while True: + wp_call = requests.get('https://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + page_title = response["query"]["pages"][page_id]["title"] + revisions = response["query"]["pages"][page_id]["revisions"] + + for rev in revisions: + print(page_title + "\t" + rev["user"] + "\t" + rev["timestamp"]) + + if 'continue' in response: + parameters.update(response['continue']) + else: + break diff --git a/wikipedia3.py b/wikipedia3.py new file mode 100644 index 0000000..717b487 --- /dev/null +++ b/wikipedia3.py @@ -0,0 +1,16 @@ +import requests + +# I found documentation here: +# https://www.mediawiki.org/wiki/API:Categorymembers + +parameters = {'action' : 'query', + 'list' : 'categorymembers', + 'format' : 'json', + 'cmtitle' : 'Category:Programming languages created in 1991'} + +wp_call = requests.get('http://en.wikipedia.org/w/api.php', params=parameters) + +response = wp_call.json() + +for page in response['query']['categorymembers']: + print(page['title']) diff --git a/wikipedia4.py b/wikipedia4.py new file mode 100644 index 0000000..1798d1a --- /dev/null +++ b/wikipedia4.py @@ -0,0 +1,24 @@ +import requests + +# ?action=query&titles=Albert%20Einstein&prop=categories +# Get the list of categories for the Albert Einstein article. 
+ +parameters = {'action' : 'query', + 'titles' : 'Albert Einstein', + 'prop' : 'categories', + 'format' : 'json', + 'continue' : ''} + +while True: + wp_call = requests.get('http://en.wikipedia.org/w/api.php', params=parameters) + response = wp_call.json() + + for page_id in response["query"]["pages"].keys(): + for category in response["query"]["pages"][page_id]['categories']: + print(category['title']) + + if 'continue' in response: + parameters.update(response['continue']) + else: + break + -- 2.39.5