--- /dev/null
+# load up libraries and data
+#######################################################################
+library(ggplot2)
+library(reshape2)
+library(parallel)
+
+# Read the tab-separated event export; text columns stay character vectors.
+d <- read.delim("gitdata_1week.tsv", stringsAsFactors=FALSE)
+
+# The url column is not used below, so drop it.
+d$url <- NULL
+# Replace every "T" with a space before parsing the timestamps.
+# NOTE(review): presumably ISO-8601 strings such as "2012-01-01T00:00:00Z";
+# as.POSIXct() is called without tz/format, so the session time zone applies
+# -- confirm this is intended.
+d$date <- gsub("T", " ", d$date)
+d$date <- as.POSIXct(d$date)
+
+# Strip the trailing "Event" suffix from the event type names.
+d$type <- gsub("Event$", "", d$type)
+
+# Frequency tables, most frequent first: all event types, then the top
+# entries of the name and actor columns (shown only when auto-printed).
+rev(sort(table(d$type)))
+head(rev(sort(table(d$name))))
+head(rev(sort(table(d$actor))))
+
+# Bucket every event into per-minute and per-hour factors.
+d$min <- cut(d$date, breaks="min")
+d$hour <- cut(d$date, breaks="hour")
+
+# generate graphs
+######################################################################
+
+# All plots below go into one PDF device; the matching dev.off() comes later
+# in the script.
+pdf("github_graphs.pdf", width=10, height=7)
+
+# Total number of events per hour, all event types together.
+grid.tmp <- as.data.frame(table(d$hour))
+colnames(grid.tmp) <- c("date", "freq")
+grid.tmp$date <- as.POSIXct(grid.tmp$date)
+
+qplot(date, freq, data=grid.tmp, geom="line")
+
+# Per-hour counts split by event type: tapply() tabulates the types inside
+# each hour bucket, and melt() flattens the nested list into long format.
+# NOTE(review): the column order assumed by the rename below (value, event,
+# date) depends on how reshape2 orders the melted list levels -- confirm.
+grid.tmp <- melt(lapply(tapply(d$type, d$hour, table), as.list), L2~L1)
+colnames(grid.tmp) <- c("value", "event", "date")
+grid.tmp$date <- as.POSIXct(grid.tmp$date)
+
+# One line per event type.
+ggplot(data=grid.tmp) +
+ aes(x=date, y=value, group=event, color=event) + geom_line()
+
+# Same data drawn as bars.
+ggplot(data=grid.tmp) +
+ aes(x=date, y=value, group=event, color=event) + geom_bar(stat="identity")
+
+# One facet per event type, each with its own y scale.
+ggplot(data=grid.tmp) +
+ aes(x=date, y=value, group=event, color=event, size=value) + geom_line() +
+ facet_grid(event~., scale="free_y")
+
+# create first differences
+# Build the per-hour dataset used by the phase diagrams.
+#
+# Args:
+#   d: data frame with an `hour` factor column (hourly buckets from cut()).
+#
+# Returns:
+#   A data frame with one row per interior hourly bucket (the first and the
+#   last bucket are dropped) and the columns:
+#     date         - start of the bucket (POSIXct)
+#     freq         - number of events in the bucket
+#     freq2        - number of events in the previous bucket
+#     diff         - freq2 - freq (previous minus current)
+#     hour.of.week - 1-based position of the bucket among the kept rows
+#     hour         - hour.of.week modulo 24; this equals the clock hour only
+#                    when the first (dropped) bucket starts at midnight
+#   Inputs with fewer than three buckets give a zero-row data frame instead
+#   of an error or rows in reversed order.
+build.phase.diagram.dataset <- function (d) {
+  grid.tmp <- as.data.frame(table(d$hour))
+  colnames(grid.tmp) <- c("date", "freq")
+  grid.tmp$date <- as.POSIXct(grid.tmp$date)
+  n <- nrow(grid.tmp)
+  # Lag the counts by one bucket; head(x, -1) is safe for length 0 and 1,
+  # unlike x[1:(length(x) - 1)].
+  grid.tmp$freq2 <- c(NA, head(grid.tmp$freq, -1))[seq_len(n)]
+  grid.tmp$diff <- grid.tmp$freq2 - grid.tmp$freq
+  # Drop the first row (no predecessor) and the last row; a negative index
+  # on seq_len(n) never yields a reversed range such as 2:1.
+  grid.tmp <- grid.tmp[seq_len(n)[-c(1, n)], ]
+  grid.tmp$hour.of.week <- as.numeric(as.factor(grid.tmp$date))
+  grid.tmp$hour <- as.numeric(as.factor(grid.tmp$date)) %% 24
+  return(grid.tmp)
+}
+
+# Phase diagram for one event type.
+#
+# Args:
+#   subset: event type name as stored in d$type (e.g. "Push").
+#
+# Returns:
+#   A ggplot object tracing freq2 against diff, with both line size and
+#   colour mapped to `hour`. Reads the global data frame `d`.
+phase.diagram <- function (subset) {
+  events.of.type <- d[d$type == subset,]
+  phase.data <- build.phase.diagram.dataset(events.of.type)
+  plot.title <- paste("Phase Diagram:", subset)
+  ggplot(phase.data) + aes(x=diff, y=freq2, size=hour, colour=hour) +
+    geom_path() + ggtitle(plot.title)
+}
+
+# Phase diagram over every event type combined.
+grid.tmp <- build.phase.diagram.dataset(d)
+ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) +
+ geom_path() + ggtitle("Phase Diagram: All Activity")
+
+# One phase diagram per selected event type.
+phase.diagram("Push")
+phase.diagram("Watch")
+phase.diagram("Fork")
+phase.diagram("PullRequest")
+# Close the PDF device so the file is flushed to disk.
+dev.off()
--- /dev/null
+ijson-1.1/ijson
\ No newline at end of file
--- /dev/null
+Copyright (c) 2010, Ivan Sagalaev
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name "ijson" nor the names of its contributors
+ may be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- /dev/null
+Metadata-Version: 1.1
+Name: ijson
+Version: 1.1
+Summary: Iterative JSON parser with a standard Python iterator interface
+Home-page: https://github.com/isagalaev/ijson
+Author: Ivan Sagalaev
+Author-email: maniac@softwaremaniacs.org
+License: LICENSE.txt
+Description: =====
+ ijson
+ =====
+
+ Ijson is an iterative JSON parser with a standard Python iterator interface.
+
+
+ Usage
+ =====
+
+ All usage examples use a JSON document describing geographical
+ objects::
+
+ {
+ "earth": {
+ "europe": [
+ {"name": "Paris", "type": "city", "info": { ... }},
+ {"name": "Thames", "type": "river", "info": { ... }},
+ // ...
+ ],
+ "america": [
+ {"name": "Texas", "type": "state", "info": { ... }},
+ // ...
+ ]
+ }
+ }
+
+ Most common usage is having ijson yield native Python objects out of a JSON
+ stream located under a prefix. Here's how to process all European cities::
+
+ import ijson
+
+ f = urlopen('http://.../')
+ objects = ijson.items(f, 'earth.europe.item')
+ cities = (o for o in objects if o['type'] == 'city')
+ for city in cities:
+ do_something_with(city)
+
+ Sometimes, when dealing with a particularly large JSON payload, it may be
+ worthwhile not to construct individual Python objects at all and instead
+ react to individual events immediately, producing some result::
+
+ import ijson
+
+ parser = ijson.parse(urlopen('http://.../'))
+ stream.write('<geo>')
+ for prefix, event, value in parser:
+ if (prefix, event) == ('earth', 'map_key'):
+ stream.write('<%s>' % value)
+ continent = value
+ elif prefix.endswith('.name'):
+ stream.write('<object name="%s"/>' % value)
+ elif (prefix, event) == ('earth.%s' % continent, 'end_map'):
+ stream.write('</%s>' % continent)
+ stream.write('</geo>')
+
+
+ Backends
+ ========
+
+ Ijson provides several implementations of the actual parsing in the form of
+ backends located in ijson/backends:
+
+ - ``yajl2``: wrapper around `YAJL <http://lloyd.github.com/yajl/>`_ version 2.x
+ - ``yajl``: wrapper around `YAJL <http://lloyd.github.com/yajl/>`_ version 1.x
+ - ``python``: pure Python parser (good to use under PyPy)
+
+ You can import a specific backend and use it in the same way as the top level
+ library::
+
+ import ijson.backends.python as ijson
+
+ for item in ijson.items(...):
+ # ...
+
+ Importing the top level library as ``import ijson`` tries to import all backends
+ in order, so it either finds an appropriate version of YAJL or falls back to the
+ Python backend if none is found.
+
+
+ Acknowledgements
+ ================
+
+ Python parser in ijson is relatively simple thanks to `Douglas Crockford
+ <http://www.crockford.com/>`_ who invented a strict, easy to parse syntax.
+
+ The `YAJL <http://lloyd.github.com/yajl/>`_ library by `Lloyd Hilaiel
+ <http://lloyd.io/>`_ is the most popular and efficient way to parse JSON in an
+ iterative fashion.
+
+ Ijson was inspired by `yajl-py <http://pykler.github.com/yajl-py/>`_ wrapper by
+ `Hatem Nassrat <http://www.nassrat.ca/>`_. Though ijson borrows almost nothing
+ from the actual yajl-py code it was used as an example of integration with yajl
+ using ctypes.
+
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 3
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
--- /dev/null
+=====
+ijson
+=====
+
+Ijson is an iterative JSON parser with a standard Python iterator interface.
+
+
+Usage
+=====
+
+All usage examples use a JSON document describing geographical
+objects::
+
+ {
+ "earth": {
+ "europe": [
+ {"name": "Paris", "type": "city", "info": { ... }},
+ {"name": "Thames", "type": "river", "info": { ... }},
+ // ...
+ ],
+ "america": [
+ {"name": "Texas", "type": "state", "info": { ... }},
+ // ...
+ ]
+ }
+ }
+
+Most common usage is having ijson yield native Python objects out of a JSON
+stream located under a prefix. Here's how to process all European cities::
+
+ import ijson
+
+ f = urlopen('http://.../')
+ objects = ijson.items(f, 'earth.europe.item')
+ cities = (o for o in objects if o['type'] == 'city')
+ for city in cities:
+ do_something_with(city)
+
+Sometimes, when dealing with a particularly large JSON payload, it may be
+worthwhile not to construct individual Python objects at all and instead
+react to individual events immediately, producing some result::
+
+ import ijson
+
+ parser = ijson.parse(urlopen('http://.../'))
+ stream.write('<geo>')
+ for prefix, event, value in parser:
+ if (prefix, event) == ('earth', 'map_key'):
+ stream.write('<%s>' % value)
+ continent = value
+ elif prefix.endswith('.name'):
+ stream.write('<object name="%s"/>' % value)
+ elif (prefix, event) == ('earth.%s' % continent, 'end_map'):
+ stream.write('</%s>' % continent)
+ stream.write('</geo>')
+
+
+Backends
+========
+
+Ijson provides several implementations of the actual parsing in the form of
+backends located in ijson/backends:
+
+- ``yajl2``: wrapper around `YAJL <http://lloyd.github.com/yajl/>`_ version 2.x
+- ``yajl``: wrapper around `YAJL <http://lloyd.github.com/yajl/>`_ version 1.x
+- ``python``: pure Python parser (good to use under PyPy)
+
+You can import a specific backend and use it in the same way as the top level
+library::
+
+ import ijson.backends.python as ijson
+
+ for item in ijson.items(...):
+ # ...
+
+Importing the top level library as ``import ijson`` tries to import all backends
+in order, so it either finds an appropriate version of YAJL or falls back to the
+Python backend if none is found.
+
+
+Acknowledgements
+================
+
+Python parser in ijson is relatively simple thanks to `Douglas Crockford
+<http://www.crockford.com/>`_ who invented a strict, easy to parse syntax.
+
+The `YAJL <http://lloyd.github.com/yajl/>`_ library by `Lloyd Hilaiel
+<http://lloyd.io/>`_ is the most popular and efficient way to parse JSON in an
+iterative fashion.
+
+Ijson was inspired by `yajl-py <http://pykler.github.com/yajl-py/>`_ wrapper by
+`Hatem Nassrat <http://www.nassrat.ca/>`_. Though ijson borrows almost nothing
+from the actual yajl-py code it was used as an example of integration with yajl
+using ctypes.
--- /dev/null
+'''
+Iterative JSON parser.
+
+Main API:
+
+- ``ijson.parse``: iterator returning parsing events with the object tree context,
+ see ``ijson.common.parse`` for docs.
+
+- ``ijson.items``: iterator returning Python objects found under a specified prefix,
+ see ``ijson.common.items`` for docs.
+
+Top-level ``ijson`` module tries to automatically find and import a suitable
+parsing backend. You can also explicitly import a required backend from
+``ijson.backends``.
+'''
+
+from ijson.common import JSONError, IncompleteJSONError, ObjectBuilder
+from ijson.backends import YAJLImportError
+
+# Pick a parsing backend at import time: try the YAJL 2.x wrapper first, then
+# the YAJL 1.x wrapper, and finally the pure-Python parser. Only
+# YAJLImportError triggers the fallback; any other exception propagates.
+try:
+ import ijson.backends.yajl2 as backend
+except YAJLImportError:
+ try:
+ import ijson.backends.yajl as backend
+ except YAJLImportError:
+ import ijson.backends.python as backend
+
+
+# Re-export the selected backend's entry points as the package-level API.
+basic_parse = backend.basic_parse
+parse = backend.parse
+items = backend.items
--- /dev/null
+from ctypes import util, cdll
+
+class YAJLImportError(ImportError):
+ pass
+
+def find_yajl(required):
+ so_name = util.find_library('yajl')
+ if so_name is None:
+ raise YAJLImportError('YAJL shared object not found.')
+ yajl = cdll.LoadLibrary(so_name)
+ major, rest = divmod(yajl.yajl_version(), 10000)
+ minor, micro = divmod(rest, 100)
+ if major != required:
+ raise YAJLImportError('YAJL version %s.x required, found %s.%s.%s' % (required, major, minor, micro))
+ return yajl
--- /dev/null
+'''
+Pure-python parsing backend.
+'''
+from __future__ import unicode_literals
+from decimal import Decimal
+import re
+from codecs import unicode_escape_decode
+
+from ijson import common
+from ijson.compat import chr
+
+
+BUFSIZE = 16 * 1024
+NONWS = re.compile(r'\S')
+LEXTERM = re.compile(r'[^a-z0-9\.+-]')
+
+
+class UnexpectedSymbol(common.JSONError):
+ def __init__(self, symbol, reader):
+ super(UnexpectedSymbol, self).__init__('Unexpected symbol "%s" at %d' % (symbol[0], reader.pos - len(symbol)))
+
+class Lexer(object):
+ '''
+ JSON lexer. Supports iterator interface.
+ '''
+ def __init__(self, f):
+ self.f = f
+
+ def __iter__(self):
+ self.buffer = ''
+ self.pos = 0
+ return self
+
+ def __next__(self):
+ while True:
+ match = NONWS.search(self.buffer, self.pos)
+ if match:
+ self.pos = match.start()
+ char = self.buffer[self.pos]
+ if 'a' <= char <= 'z' or '0' <= char <= '9' or char == '-':
+ return self.lexem()
+ elif char == '"':
+ return self.stringlexem()
+ else:
+ self.pos += 1
+ return char
+ self.buffer = self.f.read(BUFSIZE).decode('utf-8')
+ self.pos = 0
+ if not len(self.buffer):
+ raise StopIteration
+ next = __next__
+
+ def lexem(self):
+ current = self.pos
+ while True:
+ match = LEXTERM.search(self.buffer, current)
+ if match:
+ current = match.start()
+ break
+ else:
+ current = len(self.buffer)
+ self.buffer += self.f.read(BUFSIZE).decode('utf-8')
+ if len(self.buffer) == current:
+ break
+ result = self.buffer[self.pos:current]
+ self.pos = current
+ if self.pos > BUFSIZE:
+ self.buffer = self.buffer[self.pos:]
+ self.pos = 0
+ return result
+
+ def stringlexem(self):
+ start = self.pos + 1
+ while True:
+ try:
+ end = self.buffer.index('"', start)
+ escpos = end - 1
+ while self.buffer[escpos] == '\\':
+ escpos -= 1
+ if (end - escpos) % 2 == 0:
+ start = end + 1
+ else:
+ result = self.buffer[self.pos:end + 1]
+ self.pos = end + 1
+ return result
+ except ValueError:
+ old_len = len(self.buffer)
+ self.buffer += self.f.read(BUFSIZE).decode('utf-8')
+ if len(self.buffer) == old_len:
+ raise common.IncompleteJSONError()
+
+def unescape(s):
+ start = 0
+ while start < len(s):
+ pos = s.find('\\', start)
+ if pos == -1:
+ yield s[start:]
+ break
+ yield s[start:pos]
+ pos += 1
+ esc = s[pos]
+ if esc == 'b':
+ yield '\b'
+ elif esc == 'f':
+ yield '\f'
+ elif esc == 'n':
+ yield '\n'
+ elif esc == 'r':
+ yield '\r'
+ elif esc == 't':
+ yield '\t'
+ elif esc == 'u':
+ yield chr(int(s[pos + 1:pos + 5], 16))
+ pos += 4
+ else:
+ yield esc
+ start = pos + 1
+
+def parse_value(lexer, symbol=None):
+ try:
+ if symbol is None:
+ symbol = next(lexer)
+ if symbol == 'null':
+ yield ('null', None)
+ elif symbol == 'true':
+ yield ('boolean', True)
+ elif symbol == 'false':
+ yield ('boolean', False)
+ elif symbol == '[':
+ for event in parse_array(lexer):
+ yield event
+ elif symbol == '{':
+ for event in parse_object(lexer):
+ yield event
+ elif symbol[0] == '"':
+ yield ('string', ''.join(unescape(symbol[1:-1])))
+ else:
+ try:
+ number = Decimal(symbol) if '.' in symbol else int(symbol)
+ yield ('number', number)
+ except ValueError:
+ raise UnexpectedSymbol(symbol, lexer)
+ except StopIteration:
+ raise common.IncompleteJSONError()
+
+def parse_array(lexer):
+ yield ('start_array', None)
+ symbol = next(lexer)
+ if symbol != ']':
+ while True:
+ for event in parse_value(lexer, symbol):
+ yield event
+ symbol = next(lexer)
+ if symbol == ']':
+ break
+ if symbol != ',':
+ raise UnexpectedSymbol(symbol, lexer)
+ symbol = next(lexer)
+ yield ('end_array', None)
+
+def parse_object(lexer):
+ yield ('start_map', None)
+ symbol = next(lexer)
+ if symbol != '}':
+ while True:
+ if symbol[0] != '"':
+ raise UnexpectedSymbol(symbol, lexer)
+ yield ('map_key', symbol[1:-1])
+ symbol = next(lexer)
+ if symbol != ':':
+ raise UnexpectedSymbol(symbol, lexer)
+ for event in parse_value(lexer):
+ yield event
+ symbol = next(lexer)
+ if symbol == '}':
+ break
+ if symbol != ',':
+ raise UnexpectedSymbol(symbol, lexer)
+ symbol = next(lexer)
+ yield ('end_map', None)
+
+def basic_parse(file):
+ '''
+ Iterator yielding unprefixed events.
+
+ Parameters:
+
+ - file: a readable file-like object with JSON input
+ '''
+ lexer = iter(Lexer(file))
+ for value in parse_value(lexer):
+ yield value
+ try:
+ next(lexer)
+ except StopIteration:
+ pass
+ else:
+ raise common.JSONError('Additional data')
+
+def parse(file):
+ '''
+ Backend-specific wrapper for ijson.common.parse.
+ '''
+ return common.parse(basic_parse(file))
+
+def items(file, prefix):
+ '''
+ Backend-specific wrapper for ijson.common.items.
+ '''
+ return common.items(parse(file), prefix)
--- /dev/null
+'''
+Wrapper for YAJL C library version 1.x.
+'''
+
+from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \
+ c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \
+ cdll, util, c_char
+from decimal import Decimal
+
+from ijson import common, backends
+from ijson.compat import b2s
+
+
+# Load the YAJL 1.x shared library; find_yajl raises YAJLImportError when it
+# is not available.
+yajl = backends.find_yajl(1)
+
+# Both functions return pointers; declare that so ctypes does not fall back
+# to its default int return type.
+yajl.yajl_alloc.restype = POINTER(c_char)
+yajl.yajl_get_error.restype = POINTER(c_char)
+
+# ctypes prototypes of the parser callbacks. Each returns an int and takes a
+# context pointer first, followed by the event's value arguments (for C_STR,
+# a byte pointer and a length).
+C_EMPTY = CFUNCTYPE(c_int, c_void_p)
+C_INT = CFUNCTYPE(c_int, c_void_p, c_int)
+C_LONG = CFUNCTYPE(c_int, c_void_p, c_long)
+C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double)
+C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint)
+
+
+def number(value):
+ '''
+ Helper function casting a string that represents any Javascript number
+ into appropriate Python value: either int or Decimal.
+ '''
+ try:
+ return int(value)
+ except ValueError:
+ return Decimal(value)
+
+# Each entry is (event name, ctypes callback prototype, converter). The
+# converter receives the callback's arguments after the context pointer and
+# returns the Python value reported for the event.
+_callback_data = [
+ # Mapping of JSON parser events to callback C types and value converters.
+ # Used to define the Callbacks structure and actual callback functions
+ # inside the parse function.
+ ('null', C_EMPTY, lambda: None),
+ ('boolean', C_INT, lambda v: bool(v)),
+ # "integer" and "double" aren't actually yielded by yajl since "number"
+ # takes precedence if defined
+ # NOTE(review): C_LONG and C_DOUBLE pass one value argument, but the two
+ # converters below take two (v, l); they would fail if ever invoked.
+ ('integer', C_LONG, lambda v, l: int(string_at(v, l))),
+ ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))),
+ ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))),
+ ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')),
+ ('start_map', C_EMPTY, lambda: None),
+ ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))),
+ ('end_map', C_EMPTY, lambda: None),
+ ('start_array', C_EMPTY, lambda: None),
+ ('end_array', C_EMPTY, lambda: None),
+]
+
+# ctypes structure holding one callback pointer per parser event, with the
+# fields in the same order as the entries of _callback_data.
+class Callbacks(Structure):
+ _fields_ = [(name, type) for name, type, func in _callback_data]
+
+# ctypes structure with the two parser options that basic_parse passes to
+# yajl_alloc.
+class Config(Structure):
+ _fields_ = [
+ ("allowComments", c_uint),
+ ("checkUTF8", c_uint)
+ ]
+
+# Status codes returned by yajl_parse and yajl_parse_complete.
+YAJL_OK = 0
+YAJL_CANCELLED = 1
+YAJL_INSUFFICIENT_DATA = 2
+YAJL_ERROR = 3
+
+
+def basic_parse(f, allow_comments=False, check_utf8=False, buf_size=64 * 1024):
+ '''
+ Iterator yielding unprefixed events.
+
+ Parameters:
+
+ - f: a readable file-like object with JSON input
+ - allow_comments: tells parser to allow comments in JSON input
+ - check_utf8: if True, parser will cause an error if input is invalid utf-8
+ - buf_size: a size of an input buffer
+
+ Yields ``(event, value)`` tuples. Raises common.JSONError when YAJL reports
+ a parse error and common.IncompleteJSONError when input ends while YAJL
+ still reports insufficient data.
+ '''
+ # Events collected by the C callbacks during one yajl_parse call.
+ events = []
+
+ # Wraps a converter into a ctypes callback that records (event, value)
+ # and returns 1 to YAJL.
+ def callback(event, func_type, func):
+ def c_callback(context, *args):
+ events.append((event, func(*args)))
+ return 1
+ return func_type(c_callback)
+
+ callbacks = Callbacks(*[callback(*data) for data in _callback_data])
+ config = Config(allow_comments, check_utf8)
+ handle = yajl.yajl_alloc(byref(callbacks), byref(config), None, None)
+ try:
+ while True:
+ buffer = f.read(buf_size)
+ if buffer:
+ result = yajl.yajl_parse(handle, buffer, len(buffer))
+ else:
+ # End of input: ask YAJL to finish parsing what it has buffered.
+ result = yajl.yajl_parse_complete(handle)
+ if result == YAJL_ERROR:
+ # Copy the message out before freeing the buffer YAJL allocated.
+ perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer))
+ error = cast(perror, c_char_p).value
+ yajl.yajl_free_error(handle, perror)
+ raise common.JSONError(error)
+ if not buffer and not events:
+ # Input is exhausted and nothing is pending: stop, or report
+ # truncated JSON.
+ if result == YAJL_INSUFFICIENT_DATA:
+ raise common.IncompleteJSONError()
+ break
+
+ for event in events:
+ yield event
+ # Start a fresh list; the callbacks see the rebound name.
+ events = []
+ finally:
+ # Free the parser handle on every exit path.
+ yajl.yajl_free(handle)
+
+def parse(file, **kwargs):
+ '''
+ Backend-specific wrapper for ijson.common.parse.
+ '''
+ return common.parse(basic_parse(file, **kwargs))
+
+def items(file, prefix):
+ '''
+ Backend-specific wrapper for ijson.common.items.
+ '''
+ return common.items(parse(file), prefix)
--- /dev/null
+'''
+Wrapper for YAJL C library version 2.x.
+'''
+
+from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \
+ c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \
+ cdll, util, c_char
+from decimal import Decimal
+
+from ijson import common, backends
+from ijson.compat import b2s
+
+
+# Load the YAJL 2.x shared library; find_yajl raises YAJLImportError when it
+# is not available.
+yajl = backends.find_yajl(2)
+
+# Both functions return pointers; declare that so ctypes does not fall back
+# to its default int return type.
+yajl.yajl_alloc.restype = POINTER(c_char)
+yajl.yajl_get_error.restype = POINTER(c_char)
+
+# ctypes prototypes of the parser callbacks. Each returns an int and takes a
+# context pointer first, followed by the event's value arguments (for C_STR,
+# a byte pointer and a length).
+C_EMPTY = CFUNCTYPE(c_int, c_void_p)
+C_INT = CFUNCTYPE(c_int, c_void_p, c_int)
+C_LONG = CFUNCTYPE(c_int, c_void_p, c_long)
+C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double)
+C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint)
+
+
+def number(value):
+ '''
+ Helper function casting a string that represents any Javascript number
+ into appropriate Python value: either int or Decimal.
+ '''
+ try:
+ return int(value)
+ except ValueError:
+ return Decimal(value)
+
+# Each entry is (event name, ctypes callback prototype, converter). The
+# converter receives the callback's arguments after the context pointer and
+# returns the Python value reported for the event.
+_callback_data = [
+ # Mapping of JSON parser events to callback C types and value converters.
+ # Used to define the Callbacks structure and actual callback functions
+ # inside the parse function.
+ ('null', C_EMPTY, lambda: None),
+ ('boolean', C_INT, lambda v: bool(v)),
+ # "integer" and "double" aren't actually yielded by yajl since "number"
+ # takes precedence if defined
+ # NOTE(review): C_LONG and C_DOUBLE pass one value argument, but the two
+ # converters below take two (v, l); they would fail if ever invoked.
+ ('integer', C_LONG, lambda v, l: int(string_at(v, l))),
+ ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))),
+ ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))),
+ ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')),
+ ('start_map', C_EMPTY, lambda: None),
+ ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))),
+ ('end_map', C_EMPTY, lambda: None),
+ ('start_array', C_EMPTY, lambda: None),
+ ('end_array', C_EMPTY, lambda: None),
+]
+
+# ctypes structure holding one callback pointer per parser event, with the
+# fields in the same order as the entries of _callback_data.
+class Callbacks(Structure):
+ _fields_ = [(name, type) for name, type, func in _callback_data]
+
+# Status codes returned by yajl_parse and yajl_complete_parse.
+YAJL_OK = 0
+YAJL_CANCELLED = 1
+YAJL_INSUFFICIENT_DATA = 2
+YAJL_ERROR = 3
+
+# constants defined in yajl_parse.h
+# Option identifiers passed to yajl_config.
+YAJL_ALLOW_COMMENTS = 1
+YAJL_MULTIPLE_VALUES = 8
+
+
+def basic_parse(f, allow_comments=False, buf_size=64 * 1024,
+ multiple_values=False):
+ '''
+ Iterator yielding unprefixed events.
+
+ Parameters:
+
+ - f: a readable file-like object with JSON input
+ - allow_comments: tells parser to allow comments in JSON input
+ - buf_size: a size of an input buffer
+ - multiple_values: allows the parser to parse multiple JSON objects
+
+ Yields ``(event, value)`` tuples. Raises common.JSONError when YAJL reports
+ a parse error and common.IncompleteJSONError when input ends while YAJL
+ still reports insufficient data.
+ '''
+ # Events collected by the C callbacks during one yajl_parse call.
+ events = []
+
+ # Wraps a converter into a ctypes callback that records (event, value)
+ # and returns 1 to YAJL.
+ def callback(event, func_type, func):
+ def c_callback(context, *args):
+ events.append((event, func(*args)))
+ return 1
+ return func_type(c_callback)
+
+ callbacks = Callbacks(*[callback(*data) for data in _callback_data])
+ handle = yajl.yajl_alloc(byref(callbacks), None, None)
+ # Parser options are switched on for the handle after allocation.
+ if allow_comments:
+ yajl.yajl_config(handle, YAJL_ALLOW_COMMENTS, 1)
+ if multiple_values:
+ yajl.yajl_config(handle, YAJL_MULTIPLE_VALUES, 1)
+ try:
+ while True:
+ buffer = f.read(buf_size)
+ if buffer:
+ result = yajl.yajl_parse(handle, buffer, len(buffer))
+ else:
+ # End of input: ask YAJL to finish parsing what it has buffered.
+ result = yajl.yajl_complete_parse(handle)
+ if result == YAJL_ERROR:
+ # Copy the message out before freeing the buffer YAJL allocated.
+ perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer))
+ error = cast(perror, c_char_p).value
+ yajl.yajl_free_error(handle, perror)
+ raise common.JSONError(error)
+ if not buffer and not events:
+ # Input is exhausted and nothing is pending: stop, or report
+ # truncated JSON.
+ if result == YAJL_INSUFFICIENT_DATA:
+ raise common.IncompleteJSONError()
+ break
+
+ for event in events:
+ yield event
+ # Start a fresh list; the callbacks see the rebound name.
+ events = []
+ finally:
+ # Free the parser handle on every exit path.
+ yajl.yajl_free(handle)
+
+def parse(file, **kwargs):
+ '''
+ Backend-specific wrapper for ijson.common.parse.
+ '''
+ return common.parse(basic_parse(file, **kwargs))
+
+def items(file, prefix):
+ '''
+ Backend-specific wrapper for ijson.common.items.
+ '''
+ return common.items(parse(file), prefix)
--- /dev/null
+'''
+Backend independent higher level interfaces, common exceptions.
+'''
+
+class JSONError(Exception):
+ '''
+ Base exception for all parsing errors.
+ '''
+ pass
+
+class IncompleteJSONError(JSONError):
+ '''
+ Raised when the parser expects data and it's not available. May be
+ caused by malformed syntax or a broken source stream.
+ '''
+ def __init__(self):
+ super(IncompleteJSONError, self).__init__('Incomplete or empty JSON data')
+
+def parse(basic_events):
+ '''
+ An iterator returning parsing events with the information about their location
+ with the JSON object tree. Events are tuples ``(prefix, type, value)``.
+
+ Available types and values are:
+
+ ('null', None)
+ ('boolean', <True or False>)
+ ('number', <int or Decimal>)
+ ('string', <unicode>)
+ ('map_key', <str>)
+ ('start_map', None)
+ ('end_map', None)
+ ('start_array', None)
+ ('end_array', None)
+
+ Prefixes represent the path to the nested elements from the root of the JSON
+ document. For example, given this document::
+
+ {
+ "array": [1, 2],
+ "map": {
+ "key": "value"
+ }
+ }
+
+ the parser would yield events:
+
+ ('', 'start_map', None)
+ ('', 'map_key', 'array')
+ ('array', 'start_array', None)
+ ('array.item', 'number', 1)
+ ('array.item', 'number', 2)
+ ('array', 'end_array', None)
+ ('', 'map_key', 'map')
+ ('map', 'start_map', None)
+ ('map', 'map_key', 'key')
+ ('map.key', 'string', u'value')
+ ('map', 'end_map', None)
+ ('', 'end_map', None)
+
+ '''
+ path = []
+ for event, value in basic_events:
+ if event == 'map_key':
+ prefix = '.'.join(path[:-1])
+ path[-1] = value
+ elif event == 'start_map':
+ prefix = '.'.join(path)
+ path.append(None)
+ elif event == 'end_map':
+ path.pop()
+ prefix = '.'.join(path)
+ elif event == 'start_array':
+ prefix = '.'.join(path)
+ path.append('item')
+ elif event == 'end_array':
+ path.pop()
+ prefix = '.'.join(path)
+ else: # any scalar value
+ prefix = '.'.join(path)
+
+ yield prefix, event, value
+
+
+class ObjectBuilder(object):
+ '''
+ Incrementally builds an object from JSON parser events. Events are passed
+ into the `event` function that accepts two parameters: event type and
+ value. The object being built is available at any time from the `value`
+ attribute.
+
+ Example::
+
+ from StringIO import StringIO
+ from ijson.parse import basic_parse
+ from ijson.utils import ObjectBuilder
+
+ builder = ObjectBuilder()
+ f = StringIO('{"key": "value"})
+ for event, value in basic_parse(f):
+ builder.event(event, value)
+ print builder.value
+
+ '''
+ def __init__(self):
+ def initial_set(value):
+ self.value = value
+ self.containers = [initial_set]
+
+ def event(self, event, value):
+ if event == 'map_key':
+ self.key = value
+ elif event == 'start_map':
+ map = {}
+ self.containers[-1](map)
+ def setter(value):
+ map[self.key] = value
+ self.containers.append(setter)
+ elif event == 'start_array':
+ array = []
+ self.containers[-1](array)
+ self.containers.append(array.append)
+ elif event == 'end_array' or event == 'end_map':
+ self.containers.pop()
+ else:
+ self.containers[-1](value)
+
+def items(prefixed_events, prefix):
+ '''
+ An iterator returning native Python objects constructed from the events
+ under a given prefix.
+ '''
+ prefixed_events = iter(prefixed_events)
+ try:
+ while True:
+ current, event, value = next(prefixed_events)
+ if current == prefix:
+ if event in ('start_map', 'start_array'):
+ builder = ObjectBuilder()
+ end_event = event.replace('start', 'end')
+ while (current, event) != (prefix, end_event):
+ builder.event(event, value)
+ current, event, value = next(prefixed_events)
+ yield builder.value
+ else:
+ yield value
+ except StopIteration:
+ pass
--- /dev/null
+'''
+Python2/Python3 compatibility utilities.
+'''
+
+import sys
+
+
+# True when running under a Python major version below 3.
+IS_PY2 = sys.version_info[0] < 3
+
+
+# b2s converts bytes into the native str type: a no-op on Python 2, a UTF-8
+# decode on Python 3. chr is bound to a function returning a unicode
+# character for a code point on both versions.
+if IS_PY2:
+ b2s = lambda s: s
+ chr = unichr
+else:
+ def b2s(b):
+ return b.decode('utf-8')
+ chr = chr
--- /dev/null
+# -*- coding:utf-8 -*-
+from functools import wraps
+
+
+def coroutine(func):
+ '''
+ Wraps a generator which intended to be used as a pure coroutine by
+ .send()ing it values. The only thing that the wrapper does is calling
+ .next() for the first time which is required by Python generator protocol.
+ '''
+ @wraps(func)
+ def wrapper(*args, **kwargs):
+ g = func(*args, **kwargs)
+ next(g)
+ return g
+ return wrapper
+
+@coroutine
+def foreach(coroutine_func):
+ '''
+ Dispatches each JSON array item to a handler coroutine. A coroutine is
+ created anew for each item by calling `coroutine_func` callable. The
+ resulting coroutine should accept value in the form of tuple of values
+ generated by rich JSON parser: (prefix, event, value).
+
+ First event received by foreach should be a "start_array" event.
+ '''
+ g = None
+ base, event, value = yield
+ if event != 'start_array':
+ raise Exception('foreach requires "start_array" as the first event, got %s' % repr((base, event, value)))
+ START_EVENTS = set(['start_map', 'start_array', 'null', 'boolean', 'number', 'string'])
+ itemprefix = base + '.item' if base else 'item'
+ while True:
+ prefix, event, value = yield
+ if prefix == itemprefix and event in START_EVENTS:
+ g = coroutine_func()
+ if (prefix, event) != (base, 'end_array'):
+ g.send((prefix, event, value))
+
+@coroutine
+def dispatcher(targets):
+ '''
+ Dispatches JSON parser events into several handlers depending on event
+ prefixes.
+
+ Accepts a list of tuples (base_prefix, coroutine). A coroutine then
+ receives all the events with prefixes starting with its base_prefix.
+ '''
+ while True:
+ prefix, event, value = yield
+ for base, target in targets:
+ if prefix.startswith(base):
+ target.send((prefix, event, value))
+ break
--- /dev/null
+# -*- coding:utf-8 -*-
+from distutils.core import setup
+
+setup(
+ name = 'ijson',
+ version = '1.1',
+ author = 'Ivan Sagalaev',
+ author_email = 'maniac@softwaremaniacs.org',
+ packages = ['ijson', 'ijson.backends'],
+ url = 'https://github.com/isagalaev/ijson',
+ license = 'LICENSE.txt',
+ description = 'Iterative JSON parser with a standard Python iterator interface',
+ long_description = open('README.rst').read(),
+ classifiers = [
+ 'Development Status :: 5 - Production/Stable',
+ 'License :: OSI Approved :: BSD License',
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 3',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ ]
+)
--- /dev/null
+#!/usr/bin/env python
+
+import os
+import json
+import re
+
+#shameless copy paste from json/decoder.py
+FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
+WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
+
+class ConcatJSONDecoder(json.JSONDecoder):
+ def decode(self, s, _w=WHITESPACE.match):
+ s_len = len(s)
+
+ objs = []
+ end = 0
+ while end != s_len:
+ obj, end = self.raw_decode(s, idx=_w(s, end).end())
+ end = _w(s, end).end()
+ objs.append(obj)
+ return objs
+
+def print_json_file(filename):
+ events = json.loads(os.popen("zcat "+ filename).read(), cls=ConcatJSONDecoder)
+
+ for event in events:
+ # remove events which are not done to repositories
+ if event['type'] in ["GistEvent"]:
+ continue
+
+ ev = {}
+ ev['actor'] = event['actor']
+ ev['type'] = event['type']
+ ev['date'] = event['created_at']
+ ev['url'] = event['url']
+
+
+ if event.has_key('repository'):
+ ev['repo.name'] = event['repository']['name']
+ ev['repo.owner'] = event['repository']['owner']
+ ev['repo.watchers'] = str(event['repository']['watchers'])
+ ev['repo.forks'] = str(event['repository']['forks'])
+ else:
+ ev['repo.name'] = ""
+ ev['repo.owner'] = ""
+ ev['repo.watchers'] = ""
+ ev['repo.forks'] = ""
+
+ # print event # debug code
+ print u"\t".join([ev['actor'],
+ ev['type'],
+ ev['date'],
+ ev['url'],
+ ev['repo.name'],
+ ev['repo.owner'],
+ ev['repo.watchers'],
+ ev['repo.forks']]).encode("utf8")
+
+# Header row; the column order matches the rows printed by print_json_file.
+print "\t".join(["actor", "type", "date", "url", "name", "repo.owner",
+ "repo.watchers", "repo.forks"])
+
+# Convert every file in the data/ directory, in the order os.listdir returns
+# them.
+for filename in os.listdir("data/"):
+ print_json_file("data/" + filename)