From: Benjamin Mako Hill Date: Sat, 16 Aug 2014 02:35:55 +0000 (-0700) Subject: import of github code used for the hackathon X-Git-Url: https://projects.mako.cc/source/github-barcamp-201407/commitdiff_plain/680ee1f7103d948c203ee01401d1f9674a6f00f4 import of github code used for the hackathon Held w/ the SciencePo MediaLab in Paris on June 26-27, 2014 --- 680ee1f7103d948c203ee01401d1f9674a6f00f4 diff --git a/analysis.R b/analysis.R new file mode 100644 index 0000000..7527828 --- /dev/null +++ b/analysis.R @@ -0,0 +1,78 @@ +# load up libraries and data +####################################################################### +library(ggplot2) +library(reshape2) +library(parallel) + +d <- read.delim("gitdata_1week.tsv", stringsAsFactors=FALSE) + +d$url <- NULL +d$date <- gsub("T", " ", d$date) +d$date <- as.POSIXct(d$date) + +# crop the event off the end +d$type <- gsub("Event$", "", d$type) + +rev(sort(table(d$type))) +head(rev(sort(table(d$name)))) +head(rev(sort(table(d$actor)))) + +d$min <- cut(d$date, breaks="min") +d$hour <- cut(d$date, breaks="hour") + +# generate graphs +###################################################################### + +pdf("github_graphs.pdf", width=10, height=7) + +# graph all together +grid.tmp <- as.data.frame(table(d$hour)) +colnames(grid.tmp) <- c("date", "freq") +grid.tmp$date <- as.POSIXct(grid.tmp$date) + +qplot(date, freq, data=grid.tmp, geom="line") + +# just types of events +grid.tmp <- melt(lapply(tapply(d$type, d$hour, table), as.list), L2~L1) +colnames(grid.tmp) <- c("value", "event", "date") +grid.tmp$date <- as.POSIXct(grid.tmp$date) + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event) + geom_line() + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event) + geom_bar(stat="identity") + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event, size=value) + geom_line() + + facet_grid(event~., scale="free_y") + +# create first differences +build.phase.diagram.dataset 
<- function (d) { + grid.tmp <- as.data.frame(table(d$hour)) + colnames(grid.tmp) <- c("date", "freq") + grid.tmp$date <- as.POSIXct(grid.tmp$date) + grid.tmp$freq2 <- c(NA, grid.tmp$freq[1:(length(grid.tmp$freq)-1)]) + grid.tmp$diff <- grid.tmp$freq2 - grid.tmp$freq + grid.tmp <- grid.tmp[2:(dim(grid.tmp)[1]-1),] + grid.tmp$hour.of.week <- as.numeric(as.factor(grid.tmp$date)) + grid.tmp$hour <- as.numeric(as.factor(grid.tmp$date)) %% 24 + return(grid.tmp) +} + +phase.diagram <- function (subset) { + grid.tmp <- build.phase.diagram.dataset(d[d$type == subset,]) + ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) + + geom_path() + ggtitle(paste("Phase Diagram:", subset)) + +} + +grid.tmp <- build.phase.diagram.dataset(d) +ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) + + geom_path() + ggtitle("Phase Diagram: All Activity") + +phase.diagram("Push") +phase.diagram("Watch") +phase.diagram("Fork") +phase.diagram("PullRequest") +dev.off() diff --git a/ijson b/ijson new file mode 120000 index 0000000..f8035de --- /dev/null +++ b/ijson @@ -0,0 +1 @@ +ijson-1.1/ijson \ No newline at end of file diff --git a/ijson-1.1/LICENSE.txt b/ijson-1.1/LICENSE.txt new file mode 100644 index 0000000..89d119c --- /dev/null +++ b/ijson-1.1/LICENSE.txt @@ -0,0 +1,24 @@ +Copyright (c) 2010, Ivan Sagalaev +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name "ijson" nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/ijson-1.1/PKG-INFO b/ijson-1.1/PKG-INFO new file mode 100644 index 0000000..7349cd6 --- /dev/null +++ b/ijson-1.1/PKG-INFO @@ -0,0 +1,109 @@ +Metadata-Version: 1.1 +Name: ijson +Version: 1.1 +Summary: Iterative JSON parser with a standard Python iterator interface +Home-page: https://github.com/isagalaev/ijson +Author: Ivan Sagalaev +Author-email: maniac@softwaremaniacs.org +License: LICENSE.txt +Description: ===== + ijson + ===== + + Ijson is an iterative JSON parser with a standard Python iterator interface. + + + Usage + ===== + + All usage example will be using a JSON document describing geographical + objects:: + + { + "earth": { + "europe": [ + {"name": "Paris", "type": "city", "info": { ... }}, + {"name": "Thames", "type": "river", "info": { ... }}, + // ... + ], + "america": [ + {"name": "Texas", "type": "state", "info": { ... }}, + // ... + ] + } + } + + Most common usage is having ijson yield native Python objects out of a JSON + stream located under a prefix. 
Here's how to process all European cities:: + + import ijson + + f = urlopen('http://.../') + objects = ijson.items(f, 'earth.europe.item') + cities = (o for o in objects if o['type'] == 'city') + for city in cities: + do_something_with(city) + + Sometimes when dealing with a particularly large JSON payload it may be worth + not even constructing individual Python objects and instead reacting to individual + events, immediately producing some result:: + + import ijson + + parser = ijson.parse(urlopen('http://.../')) + stream.write('<geo>') + for prefix, event, value in parser: + if (prefix, event) == ('earth', 'map_key'): + stream.write('<%s>' % value) + continent = value + elif prefix.endswith('.name'): + stream.write('<object name="%s"/>' % value) + elif (prefix, event) == ('earth.%s' % continent, 'end_map'): + stream.write('</%s>' % continent) + stream.write('</geo>') + + + Backends + ======== + + Ijson provides several implementations of the actual parsing in the form of + backends located in ijson/backends: + + - ``yajl2``: wrapper around `YAJL `_ version 2.x + - ``yajl``: wrapper around `YAJL `_ version 1.x + - ``python``: pure Python parser (good to use under PyPy) + + You can import a specific backend and use it in the same way as the top level + library:: + + import ijson.backends.python as ijson + + for item in ijson.items(...): + # ... + + Importing the top level library as ``import ijson`` tries to import all backends + in order, so it either finds an appropriate version of YAJL or falls back to the + Python backend if none is found. + + + Acknowledgements + ================ + + Python parser in ijson is relatively simple thanks to `Douglas Crockford + `_ who invented a strict, easy to parse syntax. + + The `YAJL `_ library by `Lloyd Hilaiel + `_ is the most popular and efficient way to parse JSON in an + iterative fashion. + + Ijson was inspired by `yajl-py `_ wrapper by + `Hatem Nassrat `_. 
Though ijson borrows almost nothing + from the actual yajl-py code it was used as an example of integration with yajl + using ctypes. + +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Software Development :: Libraries :: Python Modules diff --git a/ijson-1.1/README.rst b/ijson-1.1/README.rst new file mode 100644 index 0000000..434dd5f --- /dev/null +++ b/ijson-1.1/README.rst @@ -0,0 +1,94 @@ +===== +ijson +===== + +Ijson is an iterative JSON parser with a standard Python iterator interface. + + +Usage +===== + +All usage example will be using a JSON document describing geographical +objects:: + + { + "earth": { + "europe": [ + {"name": "Paris", "type": "city", "info": { ... }}, + {"name": "Thames", "type": "river", "info": { ... }}, + // ... + ], + "america": [ + {"name": "Texas", "type": "state", "info": { ... }}, + // ... + ] + } + } + +Most common usage is having ijson yield native Python objects out of a JSON +stream located under a prefix. 
Here's how to process all European cities:: + + import ijson + + f = urlopen('http://.../') + objects = ijson.items(f, 'earth.europe.item') + cities = (o for o in objects if o['type'] == 'city') + for city in cities: + do_something_with(city) + +Sometimes when dealing with a particularly large JSON payload it may be worth +not even constructing individual Python objects and instead reacting to individual +events, immediately producing some result:: + + import ijson + + parser = ijson.parse(urlopen('http://.../')) + stream.write('<geo>') + for prefix, event, value in parser: + if (prefix, event) == ('earth', 'map_key'): + stream.write('<%s>' % value) + continent = value + elif prefix.endswith('.name'): + stream.write('<object name="%s"/>' % value) + elif (prefix, event) == ('earth.%s' % continent, 'end_map'): + stream.write('</%s>' % continent) + stream.write('</geo>') + + +Backends +======== + +Ijson provides several implementations of the actual parsing in the form of +backends located in ijson/backends: + +- ``yajl2``: wrapper around `YAJL `_ version 2.x +- ``yajl``: wrapper around `YAJL `_ version 1.x +- ``python``: pure Python parser (good to use under PyPy) + +You can import a specific backend and use it in the same way as the top level +library:: + + import ijson.backends.python as ijson + + for item in ijson.items(...): + # ... + +Importing the top level library as ``import ijson`` tries to import all backends +in order, so it either finds an appropriate version of YAJL or falls back to the +Python backend if none is found. + + +Acknowledgements +================ + +Python parser in ijson is relatively simple thanks to `Douglas Crockford +`_ who invented a strict, easy to parse syntax. + +The `YAJL `_ library by `Lloyd Hilaiel +`_ is the most popular and efficient way to parse JSON in an +iterative fashion. + +Ijson was inspired by `yajl-py `_ wrapper by +`Hatem Nassrat `_. Though ijson borrows almost nothing +from the actual yajl-py code it was used as an example of integration with yajl +using ctypes. 
diff --git a/ijson-1.1/ijson/__init__.py b/ijson-1.1/ijson/__init__.py new file mode 100644 index 0000000..c76eb3f --- /dev/null +++ b/ijson-1.1/ijson/__init__.py @@ -0,0 +1,31 @@ +''' +Iterative JSON parser. + +Main API: + +- ``ijson.parse``: iterator returning parsing events with the object tree context, + see ``ijson.common.parse`` for docs. + +- ``ijson.items``: iterator returning Python objects found under a specified prefix, + see ``ijson.common.items`` for docs. + +Top-level ``ijson`` module tries to automatically find and import a suitable +parsing backend. You can also explicitly import a required backend from +``ijson.backends``. +''' + +from ijson.common import JSONError, IncompleteJSONError, ObjectBuilder +from ijson.backends import YAJLImportError + +try: + import ijson.backends.yajl2 as backend +except YAJLImportError: + try: + import ijson.backends.yajl as backend + except YAJLImportError: + import ijson.backends.python as backend + + +basic_parse = backend.basic_parse +parse = backend.parse +items = backend.items diff --git a/ijson-1.1/ijson/__init__.pyc b/ijson-1.1/ijson/__init__.pyc new file mode 100644 index 0000000..a5ef9d3 Binary files /dev/null and b/ijson-1.1/ijson/__init__.pyc differ diff --git a/ijson-1.1/ijson/backends/__init__.py b/ijson-1.1/ijson/backends/__init__.py new file mode 100644 index 0000000..c03d567 --- /dev/null +++ b/ijson-1.1/ijson/backends/__init__.py @@ -0,0 +1,15 @@ +from ctypes import util, cdll + +class YAJLImportError(ImportError): + pass + +def find_yajl(required): + so_name = util.find_library('yajl') + if so_name is None: + raise YAJLImportError('YAJL shared object not found.') + yajl = cdll.LoadLibrary(so_name) + major, rest = divmod(yajl.yajl_version(), 10000) + minor, micro = divmod(rest, 100) + if major != required: + raise YAJLImportError('YAJL version %s.x required, found %s.%s.%s' % (required, major, minor, micro)) + return yajl diff --git a/ijson-1.1/ijson/backends/__init__.pyc 
b/ijson-1.1/ijson/backends/__init__.pyc new file mode 100644 index 0000000..2ab7682 Binary files /dev/null and b/ijson-1.1/ijson/backends/__init__.pyc differ diff --git a/ijson-1.1/ijson/backends/python.py b/ijson-1.1/ijson/backends/python.py new file mode 100644 index 0000000..ba10870 --- /dev/null +++ b/ijson-1.1/ijson/backends/python.py @@ -0,0 +1,210 @@ +''' +Pure-python parsing backend. +''' +from __future__ import unicode_literals +from decimal import Decimal +import re +from codecs import unicode_escape_decode + +from ijson import common +from ijson.compat import chr + + +BUFSIZE = 16 * 1024 +NONWS = re.compile(r'\S') +LEXTERM = re.compile(r'[^a-z0-9\.+-]') + + +class UnexpectedSymbol(common.JSONError): + def __init__(self, symbol, reader): + super(UnexpectedSymbol, self).__init__('Unexpected symbol "%s" at %d' % (symbol[0], reader.pos - len(symbol))) + +class Lexer(object): + ''' + JSON lexer. Supports iterator interface. + ''' + def __init__(self, f): + self.f = f + + def __iter__(self): + self.buffer = '' + self.pos = 0 + return self + + def __next__(self): + while True: + match = NONWS.search(self.buffer, self.pos) + if match: + self.pos = match.start() + char = self.buffer[self.pos] + if 'a' <= char <= 'z' or '0' <= char <= '9' or char == '-': + return self.lexem() + elif char == '"': + return self.stringlexem() + else: + self.pos += 1 + return char + self.buffer = self.f.read(BUFSIZE).decode('utf-8') + self.pos = 0 + if not len(self.buffer): + raise StopIteration + next = __next__ + + def lexem(self): + current = self.pos + while True: + match = LEXTERM.search(self.buffer, current) + if match: + current = match.start() + break + else: + current = len(self.buffer) + self.buffer += self.f.read(BUFSIZE).decode('utf-8') + if len(self.buffer) == current: + break + result = self.buffer[self.pos:current] + self.pos = current + if self.pos > BUFSIZE: + self.buffer = self.buffer[self.pos:] + self.pos = 0 + return result + + def stringlexem(self): + start = 
self.pos + 1 + while True: + try: + end = self.buffer.index('"', start) + escpos = end - 1 + while self.buffer[escpos] == '\\': + escpos -= 1 + if (end - escpos) % 2 == 0: + start = end + 1 + else: + result = self.buffer[self.pos:end + 1] + self.pos = end + 1 + return result + except ValueError: + old_len = len(self.buffer) + self.buffer += self.f.read(BUFSIZE).decode('utf-8') + if len(self.buffer) == old_len: + raise common.IncompleteJSONError() + +def unescape(s): + start = 0 + while start < len(s): + pos = s.find('\\', start) + if pos == -1: + yield s[start:] + break + yield s[start:pos] + pos += 1 + esc = s[pos] + if esc == 'b': + yield '\b' + elif esc == 'f': + yield '\f' + elif esc == 'n': + yield '\n' + elif esc == 'r': + yield '\r' + elif esc == 't': + yield '\t' + elif esc == 'u': + yield chr(int(s[pos + 1:pos + 5], 16)) + pos += 4 + else: + yield esc + start = pos + 1 + +def parse_value(lexer, symbol=None): + try: + if symbol is None: + symbol = next(lexer) + if symbol == 'null': + yield ('null', None) + elif symbol == 'true': + yield ('boolean', True) + elif symbol == 'false': + yield ('boolean', False) + elif symbol == '[': + for event in parse_array(lexer): + yield event + elif symbol == '{': + for event in parse_object(lexer): + yield event + elif symbol[0] == '"': + yield ('string', ''.join(unescape(symbol[1:-1]))) + else: + try: + number = Decimal(symbol) if '.' 
in symbol else int(symbol) + yield ('number', number) + except ValueError: + raise UnexpectedSymbol(symbol, lexer) + except StopIteration: + raise common.IncompleteJSONError() + +def parse_array(lexer): + yield ('start_array', None) + symbol = next(lexer) + if symbol != ']': + while True: + for event in parse_value(lexer, symbol): + yield event + symbol = next(lexer) + if symbol == ']': + break + if symbol != ',': + raise UnexpectedSymbol(symbol, lexer) + symbol = next(lexer) + yield ('end_array', None) + +def parse_object(lexer): + yield ('start_map', None) + symbol = next(lexer) + if symbol != '}': + while True: + if symbol[0] != '"': + raise UnexpectedSymbol(symbol, lexer) + yield ('map_key', symbol[1:-1]) + symbol = next(lexer) + if symbol != ':': + raise UnexpectedSymbol(symbol, lexer) + for event in parse_value(lexer): + yield event + symbol = next(lexer) + if symbol == '}': + break + if symbol != ',': + raise UnexpectedSymbol(symbol, lexer) + symbol = next(lexer) + yield ('end_map', None) + +def basic_parse(file): + ''' + Iterator yielding unprefixed events. + + Parameters: + + - file: a readable file-like object with JSON input + ''' + lexer = iter(Lexer(file)) + for value in parse_value(lexer): + yield value + try: + next(lexer) + except StopIteration: + pass + else: + raise common.JSONError('Additional data') + +def parse(file): + ''' + Backend-specific wrapper for ijson.common.parse. + ''' + return common.parse(basic_parse(file)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. + ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl.py b/ijson-1.1/ijson/backends/yajl.py new file mode 100644 index 0000000..fe61a0c --- /dev/null +++ b/ijson-1.1/ijson/backends/yajl.py @@ -0,0 +1,125 @@ +''' +Wrapper for YAJL C library version 1.x. 
+''' + +from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \ + c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \ + cdll, util, c_char +from decimal import Decimal + +from ijson import common, backends +from ijson.compat import b2s + + +yajl = backends.find_yajl(1) + +yajl.yajl_alloc.restype = POINTER(c_char) +yajl.yajl_get_error.restype = POINTER(c_char) + +C_EMPTY = CFUNCTYPE(c_int, c_void_p) +C_INT = CFUNCTYPE(c_int, c_void_p, c_int) +C_LONG = CFUNCTYPE(c_int, c_void_p, c_long) +C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double) +C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint) + + +def number(value): + ''' + Helper function casting a string that represents any Javascript number + into appropriate Python value: either int or Decimal. + ''' + try: + return int(value) + except ValueError: + return Decimal(value) + +_callback_data = [ + # Mapping of JSON parser events to callback C types and value converters. + # Used to define the Callbacks structure and actual callback functions + # inside the parse function. 
+ ('null', C_EMPTY, lambda: None), + ('boolean', C_INT, lambda v: bool(v)), + # "integer" and "double" aren't actually yielded by yajl since "number" + # takes precedence if defined + ('integer', C_LONG, lambda v, l: int(string_at(v, l))), + ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))), + ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))), + ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')), + ('start_map', C_EMPTY, lambda: None), + ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))), + ('end_map', C_EMPTY, lambda: None), + ('start_array', C_EMPTY, lambda: None), + ('end_array', C_EMPTY, lambda: None), +] + +class Callbacks(Structure): + _fields_ = [(name, type) for name, type, func in _callback_data] + +class Config(Structure): + _fields_ = [ + ("allowComments", c_uint), + ("checkUTF8", c_uint) + ] + +YAJL_OK = 0 +YAJL_CANCELLED = 1 +YAJL_INSUFFICIENT_DATA = 2 +YAJL_ERROR = 3 + + +def basic_parse(f, allow_comments=False, check_utf8=False, buf_size=64 * 1024): + ''' + Iterator yielding unprefixed events. 
+ + Parameters: + + - f: a readable file-like object with JSON input + - allow_comments: tells parser to allow comments in JSON input + - check_utf8: if True, parser will cause an error if input is invalid utf-8 + - buf_size: a size of an input buffer + ''' + events = [] + + def callback(event, func_type, func): + def c_callback(context, *args): + events.append((event, func(*args))) + return 1 + return func_type(c_callback) + + callbacks = Callbacks(*[callback(*data) for data in _callback_data]) + config = Config(allow_comments, check_utf8) + handle = yajl.yajl_alloc(byref(callbacks), byref(config), None, None) + try: + while True: + buffer = f.read(buf_size) + if buffer: + result = yajl.yajl_parse(handle, buffer, len(buffer)) + else: + result = yajl.yajl_parse_complete(handle) + if result == YAJL_ERROR: + perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer)) + error = cast(perror, c_char_p).value + yajl.yajl_free_error(handle, perror) + raise common.JSONError(error) + if not buffer and not events: + if result == YAJL_INSUFFICIENT_DATA: + raise common.IncompleteJSONError() + break + + for event in events: + yield event + events = [] + finally: + yajl.yajl_free(handle) + +def parse(file, **kwargs): + ''' + Backend-specific wrapper for ijson.common.parse. + ''' + return common.parse(basic_parse(file, **kwargs)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. + ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl2.py b/ijson-1.1/ijson/backends/yajl2.py new file mode 100644 index 0000000..3bfc7ba --- /dev/null +++ b/ijson-1.1/ijson/backends/yajl2.py @@ -0,0 +1,127 @@ +''' +Wrapper for YAJL C library version 2.x. 
+''' + +from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \ + c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \ + cdll, util, c_char +from decimal import Decimal + +from ijson import common, backends +from ijson.compat import b2s + + +yajl = backends.find_yajl(2) + +yajl.yajl_alloc.restype = POINTER(c_char) +yajl.yajl_get_error.restype = POINTER(c_char) + +C_EMPTY = CFUNCTYPE(c_int, c_void_p) +C_INT = CFUNCTYPE(c_int, c_void_p, c_int) +C_LONG = CFUNCTYPE(c_int, c_void_p, c_long) +C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double) +C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint) + + +def number(value): + ''' + Helper function casting a string that represents any Javascript number + into appropriate Python value: either int or Decimal. + ''' + try: + return int(value) + except ValueError: + return Decimal(value) + +_callback_data = [ + # Mapping of JSON parser events to callback C types and value converters. + # Used to define the Callbacks structure and actual callback functions + # inside the parse function. 
+ ('null', C_EMPTY, lambda: None), + ('boolean', C_INT, lambda v: bool(v)), + # "integer" and "double" aren't actually yielded by yajl since "number" + # takes precedence if defined + ('integer', C_LONG, lambda v, l: int(string_at(v, l))), + ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))), + ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))), + ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')), + ('start_map', C_EMPTY, lambda: None), + ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))), + ('end_map', C_EMPTY, lambda: None), + ('start_array', C_EMPTY, lambda: None), + ('end_array', C_EMPTY, lambda: None), +] + +class Callbacks(Structure): + _fields_ = [(name, type) for name, type, func in _callback_data] + +YAJL_OK = 0 +YAJL_CANCELLED = 1 +YAJL_INSUFFICIENT_DATA = 2 +YAJL_ERROR = 3 + +# constants defined in yajl_parse.h +YAJL_ALLOW_COMMENTS = 1 +YAJL_MULTIPLE_VALUES = 8 + + +def basic_parse(f, allow_comments=False, buf_size=64 * 1024, + multiple_values=False): + ''' + Iterator yielding unprefixed events. 
+ + Parameters: + + - f: a readable file-like object with JSON input + - allow_comments: tells parser to allow comments in JSON input + - buf_size: a size of an input buffer + - multiple_values: allows the parser to parse multiple JSON objects + ''' + events = [] + + def callback(event, func_type, func): + def c_callback(context, *args): + events.append((event, func(*args))) + return 1 + return func_type(c_callback) + + callbacks = Callbacks(*[callback(*data) for data in _callback_data]) + handle = yajl.yajl_alloc(byref(callbacks), None, None) + if allow_comments: + yajl.yajl_config(handle, YAJL_ALLOW_COMMENTS, 1) + if multiple_values: + yajl.yajl_config(handle, YAJL_MULTIPLE_VALUES, 1) + try: + while True: + buffer = f.read(buf_size) + if buffer: + result = yajl.yajl_parse(handle, buffer, len(buffer)) + else: + result = yajl.yajl_complete_parse(handle) + if result == YAJL_ERROR: + perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer)) + error = cast(perror, c_char_p).value + yajl.yajl_free_error(handle, perror) + raise common.JSONError(error) + if not buffer and not events: + if result == YAJL_INSUFFICIENT_DATA: + raise common.IncompleteJSONError() + break + + for event in events: + yield event + events = [] + finally: + yajl.yajl_free(handle) + +def parse(file, **kwargs): + ''' + Backend-specific wrapper for ijson.common.parse. + ''' + return common.parse(basic_parse(file, **kwargs)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. + ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl2.pyc b/ijson-1.1/ijson/backends/yajl2.pyc new file mode 100644 index 0000000..e5aa9ff Binary files /dev/null and b/ijson-1.1/ijson/backends/yajl2.pyc differ diff --git a/ijson-1.1/ijson/common.py b/ijson-1.1/ijson/common.py new file mode 100644 index 0000000..cbea028 --- /dev/null +++ b/ijson-1.1/ijson/common.py @@ -0,0 +1,148 @@ +''' +Backend independent higher level interfaces, common exceptions. 
+''' + +class JSONError(Exception): + ''' + Base exception for all parsing errors. + ''' + pass + +class IncompleteJSONError(JSONError): + ''' + Raised when the parser expects data and it's not available. May be + caused by malformed syntax or a broken source stream. + ''' + def __init__(self): + super(IncompleteJSONError, self).__init__('Incomplete or empty JSON data') + +def parse(basic_events): + ''' + An iterator returning parsing events with the information about their location + with the JSON object tree. Events are tuples ``(prefix, type, value)``. + + Available types and values are: + + ('null', None) + ('boolean', ) + ('number', ) + ('string', ) + ('map_key', ) + ('start_map', None) + ('end_map', None) + ('start_array', None) + ('end_array', None) + + Prefixes represent the path to the nested elements from the root of the JSON + document. For example, given this document:: + + { + "array": [1, 2], + "map": { + "key": "value" + } + } + + the parser would yield events: + + ('', 'start_map', None) + ('', 'map_key', 'array') + ('array', 'start_array', None) + ('array.item', 'number', 1) + ('array.item', 'number', 2) + ('array', 'end_array', None) + ('', 'map_key', 'map') + ('map', 'start_map', None) + ('map', 'map_key', 'key') + ('map.key', 'string', u'value') + ('map', 'end_map', None) + ('', 'end_map', None) + + ''' + path = [] + for event, value in basic_events: + if event == 'map_key': + prefix = '.'.join(path[:-1]) + path[-1] = value + elif event == 'start_map': + prefix = '.'.join(path) + path.append(None) + elif event == 'end_map': + path.pop() + prefix = '.'.join(path) + elif event == 'start_array': + prefix = '.'.join(path) + path.append('item') + elif event == 'end_array': + path.pop() + prefix = '.'.join(path) + else: # any scalar value + prefix = '.'.join(path) + + yield prefix, event, value + + +class ObjectBuilder(object): + ''' + Incrementally builds an object from JSON parser events. 
Events are passed + into the `event` function that accepts two parameters: event type and + value. The object being built is available at any time from the `value` + attribute. + + Example:: + + from StringIO import StringIO + from ijson.parse import basic_parse + from ijson.utils import ObjectBuilder + + builder = ObjectBuilder() + f = StringIO('{"key": "value"}) + for event, value in basic_parse(f): + builder.event(event, value) + print builder.value + + ''' + def __init__(self): + def initial_set(value): + self.value = value + self.containers = [initial_set] + + def event(self, event, value): + if event == 'map_key': + self.key = value + elif event == 'start_map': + map = {} + self.containers[-1](map) + def setter(value): + map[self.key] = value + self.containers.append(setter) + elif event == 'start_array': + array = [] + self.containers[-1](array) + self.containers.append(array.append) + elif event == 'end_array' or event == 'end_map': + self.containers.pop() + else: + self.containers[-1](value) + +def items(prefixed_events, prefix): + ''' + An iterator returning native Python objects constructed from the events + under a given prefix. 
+ ''' + prefixed_events = iter(prefixed_events) + try: + while True: + current, event, value = next(prefixed_events) + if current == prefix: + if event in ('start_map', 'start_array'): + builder = ObjectBuilder() + end_event = event.replace('start', 'end') + while (current, event) != (prefix, end_event): + builder.event(event, value) + current, event, value = next(prefixed_events) + yield builder.value + else: + yield value + except StopIteration: + pass diff --git a/ijson-1.1/ijson/common.pyc b/ijson-1.1/ijson/common.pyc new file mode 100644 index 0000000..cdf515f Binary files /dev/null and b/ijson-1.1/ijson/common.pyc differ diff --git a/ijson-1.1/ijson/compat.py b/ijson-1.1/ijson/compat.py new file mode 100644 index 0000000..b73c746 --- /dev/null +++ b/ijson-1.1/ijson/compat.py @@ -0,0 +1,17 @@ +''' +Python2/Python3 compatibility utilities. +''' + +import sys + + +IS_PY2 = sys.version_info[0] < 3 + + +if IS_PY2: + b2s = lambda s: s + chr = unichr +else: + def b2s(b): + return b.decode('utf-8') + chr = chr diff --git a/ijson-1.1/ijson/compat.pyc b/ijson-1.1/ijson/compat.pyc new file mode 100644 index 0000000..b820633 Binary files /dev/null and b/ijson-1.1/ijson/compat.pyc differ diff --git a/ijson-1.1/ijson/utils.py b/ijson-1.1/ijson/utils.py new file mode 100644 index 0000000..75092b5 --- /dev/null +++ b/ijson-1.1/ijson/utils.py @@ -0,0 +1,55 @@ +# -*- coding:utf-8 -*- +from functools import wraps + + +def coroutine(func): + ''' + Wraps a generator which intended to be used as a pure coroutine by + .send()ing it values. The only thing that the wrapper does is calling + .next() for the first time which is required by Python generator protocol. + ''' + @wraps(func) + def wrapper(*args, **kwargs): + g = func(*args, **kwargs) + next(g) + return g + return wrapper + +@coroutine +def foreach(coroutine_func): + ''' + Dispatches each JSON array item to a handler coroutine. A coroutine is + created anew for each item by calling `coroutine_func` callable. 
The + resulting coroutine should accept value in the form of tuple of values + generated by rich JSON parser: (prefix, event, value). + + First event received by foreach should be a "start_array" event. + ''' + g = None + base, event, value = yield + if event != 'start_array': + raise Exception('foreach requires "start_array" as the first event, got %s' % repr((base, event, value))) + START_EVENTS = set(['start_map', 'start_array', 'null', 'boolean', 'number', 'string']) + itemprefix = base + '.item' if base else 'item' + while True: + prefix, event, value = yield + if prefix == itemprefix and event in START_EVENTS: + g = coroutine_func() + if (prefix, event) != (base, 'end_array'): + g.send((prefix, event, value)) + +@coroutine +def dispatcher(targets): + ''' + Dispatches JSON parser events into several handlers depending on event + prefixes. + + Accepts a list of tuples (base_prefix, coroutine). A coroutine then + receives all the events with prefixes starting with its base_prefix. + ''' + while True: + prefix, event, value = yield + for base, target in targets: + if prefix.startswith(base): + target.send((prefix, event, value)) + break diff --git a/ijson-1.1/setup.py b/ijson-1.1/setup.py new file mode 100644 index 0000000..639f9bc --- /dev/null +++ b/ijson-1.1/setup.py @@ -0,0 +1,21 @@ +# -*- coding:utf-8 -*- +from distutils.core import setup + +setup( + name = 'ijson', + version = '1.1', + author = 'Ivan Sagalaev', + author_email = 'maniac@softwaremaniacs.org', + packages = ['ijson', 'ijson.backends'], + url = 'https://github.com/isagalaev/ijson', + license = 'LICENSE.txt', + description = 'Iterative JSON parser with a standard Python iterator interface', + long_description = open('README.rst').read(), + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'License :: OSI Approved :: BSD License', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Topic :: Software Development :: Libraries :: Python Modules', + ] 
#!/usr/bin/env python
"""Convert gzipped dumps of concatenated JSON events into TSV on stdout.

Every entry of ``data/`` is decompressed, split into its back-to-back JSON
documents, and written as one tab-separated row per event after a single
header row.  The code runs unchanged on Python 2.7 and Python 3.
"""

import gzip
import json
import os
import re
import sys

#shameless copy paste from json/decoder.py
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)

# Events which are not done to repositories; they are dropped from the output.
SKIPPED_TYPES = ("GistEvent",)

# Column names of the header row, in the same order as the fields of each row.
HEADER = ["actor", "type", "date", "url", "name", "repo.owner",
          "repo.watchers", "repo.forks"]


class ConcatJSONDecoder(json.JSONDecoder):
    """JSON decoder for several documents concatenated in one string."""

    def decode(self, s, _w=WHITESPACE.match):
        """Return the list of all JSON documents found in *s*.

        Documents may be separated (and surrounded) by any amount of JSON
        whitespace.  An empty or whitespace-only string yields ``[]``.
        Malformed input raises the same ``ValueError`` as ``raw_decode``.
        """
        s_len = len(s)

        objs = []
        # Skip leading whitespace *before* testing for the end of input, so
        # that whitespace-only input is treated like empty input instead of
        # being handed to raw_decode (which would raise).
        end = _w(s, 0).end()
        while end != s_len:
            obj, end = self.raw_decode(s, idx=end)
            end = _w(s, end).end()
            objs.append(obj)
        return objs


def _write_row(out, fields):
    """Write *fields* as one UTF-8 encoded, tab-separated line to *out*."""
    out.write((u"\t".join(fields) + u"\n").encode("utf8"))


def _binary_stdout():
    """Return a byte-oriented handle on stdout for both Python 2 and 3."""
    # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
    # sys.stdout itself accepts byte strings.
    return getattr(sys.stdout, "buffer", sys.stdout)


def print_json_file(filename, out=None):
    """Write one TSV row per event stored in the gzipped file *filename*.

    Parameters:

    - filename: path of a gzip-compressed file of concatenated JSON events
    - out: byte-oriented writable stream receiving the rows; defaults to
      standard output

    The file is read with the ``gzip`` module rather than by piping it
    through a shell ``zcat`` command, so unusual characters in *filename*
    can never be interpreted by a shell.  A file that cannot be read as
    gzip is reported on stderr and skipped, which mirrors what the former
    ``zcat`` pipeline did (error on stderr, no rows on stdout).

    Raises ``ValueError`` for malformed JSON and ``KeyError`` when an event
    lacks one of the mandatory fields.
    """
    if out is None:
        out = _binary_stdout()

    try:
        with gzip.open(filename, "rb") as handle:
            text = handle.read().decode("utf8")
    except (IOError, OSError, EOFError) as err:
        sys.stderr.write("skipping %s: %s\n" % (filename, err))
        return

    events = json.loads(text, cls=ConcatJSONDecoder)

    for event in events:
        # remove events which are not done to repositories
        if event['type'] in SKIPPED_TYPES:
            continue

        row = [event['actor'], event['type'], event['created_at'],
               event['url']]

        if 'repository' in event:
            repo = event['repository']
            row.extend([repo['name'], repo['owner'],
                        str(repo['watchers']), str(repo['forks'])])
        else:
            # Keep the column count constant for events without a repository.
            row.extend(["", "", "", ""])

        _write_row(out, row)


def main():
    """Print the header row, then the rows of every file in ``data/``."""
    _write_row(_binary_stdout(), HEADER)

    # Sorted so that repeated runs produce the rows in the same order.
    for filename in sorted(os.listdir("data/")):
        print_json_file("data/" + filename)


if __name__ == "__main__":
    main()