From 680ee1f7103d948c203ee01401d1f9674a6f00f4 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Fri, 15 Aug 2014 19:35:55 -0700 Subject: [PATCH 1/1] import of github code used for the hackathon Held w/ the SciencePo MediaLab in Paris on June 26-27, 2014 --- analysis.R | 78 ++++++++++ ijson | 1 + ijson-1.1/LICENSE.txt | 24 +++ ijson-1.1/PKG-INFO | 109 +++++++++++++ ijson-1.1/README.rst | 94 ++++++++++++ ijson-1.1/ijson/__init__.py | 31 ++++ ijson-1.1/ijson/__init__.pyc | Bin 0 -> 1090 bytes ijson-1.1/ijson/backends/__init__.py | 15 ++ ijson-1.1/ijson/backends/__init__.pyc | Bin 0 -> 921 bytes ijson-1.1/ijson/backends/python.py | 210 ++++++++++++++++++++++++++ ijson-1.1/ijson/backends/yajl.py | 125 +++++++++++++++ ijson-1.1/ijson/backends/yajl2.py | 127 ++++++++++++++++ ijson-1.1/ijson/backends/yajl2.pyc | Bin 0 -> 5611 bytes ijson-1.1/ijson/common.py | 148 ++++++++++++++++++ ijson-1.1/ijson/common.pyc | Bin 0 -> 4901 bytes ijson-1.1/ijson/compat.py | 17 +++ ijson-1.1/ijson/compat.pyc | Bin 0 -> 553 bytes ijson-1.1/ijson/utils.py | 55 +++++++ ijson-1.1/setup.py | 21 +++ parse_json.py | 63 ++++++++ 20 files changed, 1118 insertions(+) create mode 100644 analysis.R create mode 120000 ijson create mode 100644 ijson-1.1/LICENSE.txt create mode 100644 ijson-1.1/PKG-INFO create mode 100644 ijson-1.1/README.rst create mode 100644 ijson-1.1/ijson/__init__.py create mode 100644 ijson-1.1/ijson/__init__.pyc create mode 100644 ijson-1.1/ijson/backends/__init__.py create mode 100644 ijson-1.1/ijson/backends/__init__.pyc create mode 100644 ijson-1.1/ijson/backends/python.py create mode 100644 ijson-1.1/ijson/backends/yajl.py create mode 100644 ijson-1.1/ijson/backends/yajl2.py create mode 100644 ijson-1.1/ijson/backends/yajl2.pyc create mode 100644 ijson-1.1/ijson/common.py create mode 100644 ijson-1.1/ijson/common.pyc create mode 100644 ijson-1.1/ijson/compat.py create mode 100644 ijson-1.1/ijson/compat.pyc create mode 100644 ijson-1.1/ijson/utils.py create mode 100644 
ijson-1.1/setup.py create mode 100644 parse_json.py diff --git a/analysis.R b/analysis.R new file mode 100644 index 0000000..7527828 --- /dev/null +++ b/analysis.R @@ -0,0 +1,78 @@ +# load up libraries and data +####################################################################### +library(ggplot2) +library(reshape2) +library(parallel) + +d <- read.delim("gitdata_1week.tsv", stringsAsFactors=FALSE) + +d$url <- NULL +d$date <- gsub("T", " ", d$date) +d$date <- as.POSIXct(d$date) + +# crop the event off the end +d$type <- gsub("Event$", "", d$type) + +rev(sort(table(d$type))) +head(rev(sort(table(d$name)))) +head(rev(sort(table(d$actor)))) + +d$min <- cut(d$date, breaks="min") +d$hour <- cut(d$date, breaks="hour") + +# generate graphs +###################################################################### + +pdf("github_graphs.pdf", width=10, height=7) + +# graph all together +grid.tmp <- as.data.frame(table(d$hour)) +colnames(grid.tmp) <- c("date", "freq") +grid.tmp$date <- as.POSIXct(grid.tmp$date) + +qplot(date, freq, data=grid.tmp, geom="line") + +# just types of events +grid.tmp <- melt(lapply(tapply(d$type, d$hour, table), as.list), L2~L1) +colnames(grid.tmp) <- c("value", "event", "date") +grid.tmp$date <- as.POSIXct(grid.tmp$date) + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event) + geom_line() + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event) + geom_bar(stat="identity") + +ggplot(data=grid.tmp) + + aes(x=date, y=value, group=event, color=event, size=value) + geom_line() + + facet_grid(event~., scale="free_y") + +# create first differences +build.phase.diagram.dataset <- function (d) { + grid.tmp <- as.data.frame(table(d$hour)) + colnames(grid.tmp) <- c("date", "freq") + grid.tmp$date <- as.POSIXct(grid.tmp$date) + grid.tmp$freq2 <- c(NA, grid.tmp$freq[1:(length(grid.tmp$freq)-1)]) + grid.tmp$diff <- grid.tmp$freq2 - grid.tmp$freq + grid.tmp <- grid.tmp[2:(dim(grid.tmp)[1]-1),] + grid.tmp$hour.of.week <- 
as.numeric(as.factor(grid.tmp$date)) + grid.tmp$hour <- as.numeric(as.factor(grid.tmp$date)) %% 24 + return(grid.tmp) +} + +phase.diagram <- function (subset) { + grid.tmp <- build.phase.diagram.dataset(d[d$type == subset,]) + ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) + + geom_path() + ggtitle(paste("Phase Diagram:", subset)) + +} + +grid.tmp <- build.phase.diagram.dataset(d) +ggplot(grid.tmp) + aes(x=diff, y=freq2, size=hour, colour=hour) + + geom_path() + ggtitle("Phase Diagram: All Activity") + +phase.diagram("Push") +phase.diagram("Watch") +phase.diagram("Fork") +phase.diagram("PullRequest") +dev.off() diff --git a/ijson b/ijson new file mode 120000 index 0000000..f8035de --- /dev/null +++ b/ijson @@ -0,0 +1 @@ +ijson-1.1/ijson \ No newline at end of file diff --git a/ijson-1.1/LICENSE.txt b/ijson-1.1/LICENSE.txt new file mode 100644 index 0000000..89d119c --- /dev/null +++ b/ijson-1.1/LICENSE.txt @@ -0,0 +1,24 @@ +Copyright (c) 2010, Ivan Sagalaev +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name "ijson" nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/ijson-1.1/PKG-INFO b/ijson-1.1/PKG-INFO new file mode 100644 index 0000000..7349cd6 --- /dev/null +++ b/ijson-1.1/PKG-INFO @@ -0,0 +1,109 @@ +Metadata-Version: 1.1 +Name: ijson +Version: 1.1 +Summary: Iterative JSON parser with a standard Python iterator interface +Home-page: https://github.com/isagalaev/ijson +Author: Ivan Sagalaev +Author-email: maniac@softwaremaniacs.org +License: LICENSE.txt +Description: ===== + ijson + ===== + + Ijson is an iterative JSON parser with a standard Python iterator interface. + + + Usage + ===== + + All usage example will be using a JSON document describing geographical + objects:: + + { + "earth": { + "europe": [ + {"name": "Paris", "type": "city", "info": { ... }}, + {"name": "Thames", "type": "river", "info": { ... }}, + // ... + ], + "america": [ + {"name": "Texas", "type": "state", "info": { ... }}, + // ... + ] + } + } + + Most common usage is having ijson yield native Python objects out of a JSON + stream located under a prefix. 
Here's how to process all European cities:: + + import ijson + + f = urlopen('http://.../') + objects = ijson.items(f, 'earth.europe.item') + cities = (o for o in objects if o['type'] == 'city') + for city in cities: + do_something_with(city) + + Sometimes when dealing with a particularly large JSON payload it may be worth + not even constructing individual Python objects and instead reacting to individual events + immediately, producing some result:: + + import ijson + + parser = ijson.parse(urlopen('http://.../')) + stream.write('<geo>') + for prefix, event, value in parser: + if (prefix, event) == ('earth', 'map_key'): + stream.write('<%s>' % value) + continent = value + elif prefix.endswith('.name'): + stream.write('<object name="%s"/>' % value) + elif (prefix, event) == ('earth.%s' % continent, 'end_map'): + stream.write('</%s>' % continent) + stream.write('</geo>') + + + Backends + ======== + + Ijson provides several implementations of the actual parsing in the form of + backends located in ijson/backends: + + - ``yajl2``: wrapper around `YAJL `_ version 2.x + - ``yajl``: wrapper around `YAJL `_ version 1.x + - ``python``: pure Python parser (good to use under PyPy) + + You can import a specific backend and use it in the same way as the top level + library:: + + import ijson.backends.python as ijson + + for item in ijson.items(...): + # ... + + Importing the top level library as ``import ijson`` tries to import all backends + in order, so it either finds an appropriate version of YAJL or falls back to the + Python backend if none is found. + + + Acknowledgements + ================ + + Python parser in ijson is relatively simple thanks to `Douglas Crockford + `_ who invented a strict, easy to parse syntax. + + The `YAJL `_ library by `Lloyd Hilaiel + `_ is the most popular and efficient way to parse JSON in an + iterative fashion. + + Ijson was inspired by `yajl-py `_ wrapper by + `Hatem Nassrat `_. 
Though ijson borrows almost nothing + from the actual yajl-py code it was used as an example of integration with yajl + using ctypes. + +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 3 +Classifier: Topic :: Software Development :: Libraries :: Python Modules diff --git a/ijson-1.1/README.rst b/ijson-1.1/README.rst new file mode 100644 index 0000000..434dd5f --- /dev/null +++ b/ijson-1.1/README.rst @@ -0,0 +1,94 @@ +===== +ijson +===== + +Ijson is an iterative JSON parser with a standard Python iterator interface. + + +Usage +===== + +All usage example will be using a JSON document describing geographical +objects:: + + { + "earth": { + "europe": [ + {"name": "Paris", "type": "city", "info": { ... }}, + {"name": "Thames", "type": "river", "info": { ... }}, + // ... + ], + "america": [ + {"name": "Texas", "type": "state", "info": { ... }}, + // ... + ] + } + } + +Most common usage is having ijson yield native Python objects out of a JSON +stream located under a prefix. 
Here's how to process all European cities:: + + import ijson + + f = urlopen('http://.../') + objects = ijson.items(f, 'earth.europe.item') + cities = (o for o in objects if o['type'] == 'city') + for city in cities: + do_something_with(city) + +Sometimes when dealing with a particularly large JSON payload it may be worth +not even constructing individual Python objects and instead reacting to individual events +immediately, producing some result:: + + import ijson + + parser = ijson.parse(urlopen('http://.../')) + stream.write('<geo>') + for prefix, event, value in parser: + if (prefix, event) == ('earth', 'map_key'): + stream.write('<%s>' % value) + continent = value + elif prefix.endswith('.name'): + stream.write('<object name="%s"/>' % value) + elif (prefix, event) == ('earth.%s' % continent, 'end_map'): + stream.write('</%s>' % continent) + stream.write('</geo>') + + +Backends +======== + +Ijson provides several implementations of the actual parsing in the form of +backends located in ijson/backends: + +- ``yajl2``: wrapper around `YAJL `_ version 2.x +- ``yajl``: wrapper around `YAJL `_ version 1.x +- ``python``: pure Python parser (good to use under PyPy) + +You can import a specific backend and use it in the same way as the top level +library:: + + import ijson.backends.python as ijson + + for item in ijson.items(...): + # ... + +Importing the top level library as ``import ijson`` tries to import all backends +in order, so it either finds an appropriate version of YAJL or falls back to the +Python backend if none is found. + + +Acknowledgements +================ + +Python parser in ijson is relatively simple thanks to `Douglas Crockford +`_ who invented a strict, easy to parse syntax. + +The `YAJL `_ library by `Lloyd Hilaiel +`_ is the most popular and efficient way to parse JSON in an +iterative fashion. + +Ijson was inspired by `yajl-py `_ wrapper by +`Hatem Nassrat `_. Though ijson borrows almost nothing +from the actual yajl-py code it was used as an example of integration with yajl +using ctypes. 
diff --git a/ijson-1.1/ijson/__init__.py b/ijson-1.1/ijson/__init__.py new file mode 100644 index 0000000..c76eb3f --- /dev/null +++ b/ijson-1.1/ijson/__init__.py @@ -0,0 +1,31 @@ +''' +Iterative JSON parser. + +Main API: + +- ``ijson.parse``: iterator returning parsing events with the object tree context, + see ``ijson.common.parse`` for docs. + +- ``ijson.items``: iterator returning Python objects found under a specified prefix, + see ``ijson.common.items`` for docs. + +Top-level ``ijson`` module tries to automatically find and import a suitable +parsing backend. You can also explicitly import a required backend from +``ijson.backends``. +''' + +from ijson.common import JSONError, IncompleteJSONError, ObjectBuilder +from ijson.backends import YAJLImportError + +try: + import ijson.backends.yajl2 as backend +except YAJLImportError: + try: + import ijson.backends.yajl as backend + except YAJLImportError: + import ijson.backends.python as backend + + +basic_parse = backend.basic_parse +parse = backend.parse +items = backend.items diff --git a/ijson-1.1/ijson/__init__.pyc b/ijson-1.1/ijson/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5ef9d30afe86a07d7c77e2ce92702d4351d48a1 GIT binary patch literal 1090 zcmZ`%&2AGh5FTgKwAnVa<)=c3!yI#A&KFm+J`1@!9@(_8!&bYwHi=s z0d@KP2KCjWJU0<_0_2{D5aL8JIN>o&)*)F3uWUfF5tL0xHhsi#*FVDkWB_sBag#Z5 z#c}WW@BquqSZ;c8yq^h$yGQ+Ztr79MwDPv*pNPRSQ=UUa7 zEX7x9gmk#U3~kACF&2D;rQu_7eMj<||90{Zy4;rZ962v^IK)DaD#__Jf?ISNv&!j$ z`j=-?PH`+KaYjcJr8drIuY}9aB#(XTe0$Ea{DP}d4^MT4d8ROvRwKVIrO1UNj+;=! 
zFDqfF?m7g=Mi=qCXSI=9(u>7!`nnO#)U`?YLB2DF3V1-bzM}S)k~_YzcgTJi=JK`@ zlKRx8RRFh6;&gZKeLqwmC=R;!y=R$3n$l3x)UA;|TZh@)26TskCm1wHxO*gK{&|gq zP};-)?7e9=kuTf|Ip$F3H*;q8Icsxx$lshH)-`Lidqm1VfHG`_Ymui`R|BIJc^Vcu zYx15PjZF(o%{-k;nmr*b5@6PJV>y7Ys*ABK*HBxdR zoH%pfPjcc1z>GI-xOL@uJ06e6GjAsTbMN5y>5or2wqH#D*Cg(gK++PZfLdT8P)C?V z5Hdm$Vbd^zUixMIgjN96qS9;+npNZc<150q4_z(!y#up%9}*oQrak3npZ z{hNReb(U>*V9yA?gH?obk0~OqHW5BYSjSkm!cpLCYiwgp&M{ULIfnfiB2y>?BPkus zTIk)pL~$FEM}g91R#xr_<;s*M?n`D1nV4x=pSm<0ZiV5YQ8D>b#5U54jJMi+t0F%#(T5)4fyhg%nB zQ>shqQMk^@rrUS>%(mz7*Kd2aI1}1CZu2Qaos~2uJVv%~zA-6Hrf{foYj&N#rRn|- zMF}NvBAlChKmgGdha_F`SUe={sThSRrX~WpU(O2`7}%Bg-{>~QACc!SjiB9@3s?TK P8QxdSo#PUFQ7`@n!#BG~ literal 0 HcmV?d00001 diff --git a/ijson-1.1/ijson/backends/python.py b/ijson-1.1/ijson/backends/python.py new file mode 100644 index 0000000..ba10870 --- /dev/null +++ b/ijson-1.1/ijson/backends/python.py @@ -0,0 +1,210 @@ +''' +Pure-python parsing backend. +''' +from __future__ import unicode_literals +from decimal import Decimal +import re +from codecs import unicode_escape_decode + +from ijson import common +from ijson.compat import chr + + +BUFSIZE = 16 * 1024 +NONWS = re.compile(r'\S') +LEXTERM = re.compile(r'[^a-z0-9\.+-]') + + +class UnexpectedSymbol(common.JSONError): + def __init__(self, symbol, reader): + super(UnexpectedSymbol, self).__init__('Unexpected symbol "%s" at %d' % (symbol[0], reader.pos - len(symbol))) + +class Lexer(object): + ''' + JSON lexer. Supports iterator interface. 
+ ''' + def __init__(self, f): + self.f = f + + def __iter__(self): + self.buffer = '' + self.pos = 0 + return self + + def __next__(self): + while True: + match = NONWS.search(self.buffer, self.pos) + if match: + self.pos = match.start() + char = self.buffer[self.pos] + if 'a' <= char <= 'z' or '0' <= char <= '9' or char == '-': + return self.lexem() + elif char == '"': + return self.stringlexem() + else: + self.pos += 1 + return char + self.buffer = self.f.read(BUFSIZE).decode('utf-8') + self.pos = 0 + if not len(self.buffer): + raise StopIteration + next = __next__ + + def lexem(self): + current = self.pos + while True: + match = LEXTERM.search(self.buffer, current) + if match: + current = match.start() + break + else: + current = len(self.buffer) + self.buffer += self.f.read(BUFSIZE).decode('utf-8') + if len(self.buffer) == current: + break + result = self.buffer[self.pos:current] + self.pos = current + if self.pos > BUFSIZE: + self.buffer = self.buffer[self.pos:] + self.pos = 0 + return result + + def stringlexem(self): + start = self.pos + 1 + while True: + try: + end = self.buffer.index('"', start) + escpos = end - 1 + while self.buffer[escpos] == '\\': + escpos -= 1 + if (end - escpos) % 2 == 0: + start = end + 1 + else: + result = self.buffer[self.pos:end + 1] + self.pos = end + 1 + return result + except ValueError: + old_len = len(self.buffer) + self.buffer += self.f.read(BUFSIZE).decode('utf-8') + if len(self.buffer) == old_len: + raise common.IncompleteJSONError() + +def unescape(s): + start = 0 + while start < len(s): + pos = s.find('\\', start) + if pos == -1: + yield s[start:] + break + yield s[start:pos] + pos += 1 + esc = s[pos] + if esc == 'b': + yield '\b' + elif esc == 'f': + yield '\f' + elif esc == 'n': + yield '\n' + elif esc == 'r': + yield '\r' + elif esc == 't': + yield '\t' + elif esc == 'u': + yield chr(int(s[pos + 1:pos + 5], 16)) + pos += 4 + else: + yield esc + start = pos + 1 + +def parse_value(lexer, symbol=None): + try: + if 
symbol is None: + symbol = next(lexer) + if symbol == 'null': + yield ('null', None) + elif symbol == 'true': + yield ('boolean', True) + elif symbol == 'false': + yield ('boolean', False) + elif symbol == '[': + for event in parse_array(lexer): + yield event + elif symbol == '{': + for event in parse_object(lexer): + yield event + elif symbol[0] == '"': + yield ('string', ''.join(unescape(symbol[1:-1]))) + else: + try: + number = Decimal(symbol) if '.' in symbol else int(symbol) + yield ('number', number) + except ValueError: + raise UnexpectedSymbol(symbol, lexer) + except StopIteration: + raise common.IncompleteJSONError() + +def parse_array(lexer): + yield ('start_array', None) + symbol = next(lexer) + if symbol != ']': + while True: + for event in parse_value(lexer, symbol): + yield event + symbol = next(lexer) + if symbol == ']': + break + if symbol != ',': + raise UnexpectedSymbol(symbol, lexer) + symbol = next(lexer) + yield ('end_array', None) + +def parse_object(lexer): + yield ('start_map', None) + symbol = next(lexer) + if symbol != '}': + while True: + if symbol[0] != '"': + raise UnexpectedSymbol(symbol, lexer) + yield ('map_key', symbol[1:-1]) + symbol = next(lexer) + if symbol != ':': + raise UnexpectedSymbol(symbol, lexer) + for event in parse_value(lexer): + yield event + symbol = next(lexer) + if symbol == '}': + break + if symbol != ',': + raise UnexpectedSymbol(symbol, lexer) + symbol = next(lexer) + yield ('end_map', None) + +def basic_parse(file): + ''' + Iterator yielding unprefixed events. + + Parameters: + + - file: a readable file-like object with JSON input + ''' + lexer = iter(Lexer(file)) + for value in parse_value(lexer): + yield value + try: + next(lexer) + except StopIteration: + pass + else: + raise common.JSONError('Additional data') + +def parse(file): + ''' + Backend-specific wrapper for ijson.common.parse. 
+ ''' + return common.parse(basic_parse(file)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. + ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl.py b/ijson-1.1/ijson/backends/yajl.py new file mode 100644 index 0000000..fe61a0c --- /dev/null +++ b/ijson-1.1/ijson/backends/yajl.py @@ -0,0 +1,125 @@ +''' +Wrapper for YAJL C library version 1.x. +''' + +from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \ + c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \ + cdll, util, c_char +from decimal import Decimal + +from ijson import common, backends +from ijson.compat import b2s + + +yajl = backends.find_yajl(1) + +yajl.yajl_alloc.restype = POINTER(c_char) +yajl.yajl_get_error.restype = POINTER(c_char) + +C_EMPTY = CFUNCTYPE(c_int, c_void_p) +C_INT = CFUNCTYPE(c_int, c_void_p, c_int) +C_LONG = CFUNCTYPE(c_int, c_void_p, c_long) +C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double) +C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint) + + +def number(value): + ''' + Helper function casting a string that represents any Javascript number + into appropriate Python value: either int or Decimal. + ''' + try: + return int(value) + except ValueError: + return Decimal(value) + +_callback_data = [ + # Mapping of JSON parser events to callback C types and value converters. + # Used to define the Callbacks structure and actual callback functions + # inside the parse function. 
+ ('null', C_EMPTY, lambda: None), + ('boolean', C_INT, lambda v: bool(v)), + # "integer" and "double" aren't actually yielded by yajl since "number" + # takes precedence if defined + ('integer', C_LONG, lambda v, l: int(string_at(v, l))), + ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))), + ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))), + ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')), + ('start_map', C_EMPTY, lambda: None), + ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))), + ('end_map', C_EMPTY, lambda: None), + ('start_array', C_EMPTY, lambda: None), + ('end_array', C_EMPTY, lambda: None), +] + +class Callbacks(Structure): + _fields_ = [(name, type) for name, type, func in _callback_data] + +class Config(Structure): + _fields_ = [ + ("allowComments", c_uint), + ("checkUTF8", c_uint) + ] + +YAJL_OK = 0 +YAJL_CANCELLED = 1 +YAJL_INSUFFICIENT_DATA = 2 +YAJL_ERROR = 3 + + +def basic_parse(f, allow_comments=False, check_utf8=False, buf_size=64 * 1024): + ''' + Iterator yielding unprefixed events. 
+ + Parameters: + + - f: a readable file-like object with JSON input + - allow_comments: tells parser to allow comments in JSON input + - check_utf8: if True, parser will cause an error if input is invalid utf-8 + - buf_size: a size of an input buffer + ''' + events = [] + + def callback(event, func_type, func): + def c_callback(context, *args): + events.append((event, func(*args))) + return 1 + return func_type(c_callback) + + callbacks = Callbacks(*[callback(*data) for data in _callback_data]) + config = Config(allow_comments, check_utf8) + handle = yajl.yajl_alloc(byref(callbacks), byref(config), None, None) + try: + while True: + buffer = f.read(buf_size) + if buffer: + result = yajl.yajl_parse(handle, buffer, len(buffer)) + else: + result = yajl.yajl_parse_complete(handle) + if result == YAJL_ERROR: + perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer)) + error = cast(perror, c_char_p).value + yajl.yajl_free_error(handle, perror) + raise common.JSONError(error) + if not buffer and not events: + if result == YAJL_INSUFFICIENT_DATA: + raise common.IncompleteJSONError() + break + + for event in events: + yield event + events = [] + finally: + yajl.yajl_free(handle) + +def parse(file, **kwargs): + ''' + Backend-specific wrapper for ijson.common.parse. + ''' + return common.parse(basic_parse(file, **kwargs)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. + ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl2.py b/ijson-1.1/ijson/backends/yajl2.py new file mode 100644 index 0000000..3bfc7ba --- /dev/null +++ b/ijson-1.1/ijson/backends/yajl2.py @@ -0,0 +1,127 @@ +''' +Wrapper for YAJL C library version 2.x. 
+''' + +from ctypes import Structure, c_uint, c_ubyte, c_int, c_long, c_double, \ + c_void_p, c_char_p, CFUNCTYPE, POINTER, byref, string_at, cast , \ + cdll, util, c_char +from decimal import Decimal + +from ijson import common, backends +from ijson.compat import b2s + + +yajl = backends.find_yajl(2) + +yajl.yajl_alloc.restype = POINTER(c_char) +yajl.yajl_get_error.restype = POINTER(c_char) + +C_EMPTY = CFUNCTYPE(c_int, c_void_p) +C_INT = CFUNCTYPE(c_int, c_void_p, c_int) +C_LONG = CFUNCTYPE(c_int, c_void_p, c_long) +C_DOUBLE = CFUNCTYPE(c_int, c_void_p, c_double) +C_STR = CFUNCTYPE(c_int, c_void_p, POINTER(c_ubyte), c_uint) + + +def number(value): + ''' + Helper function casting a string that represents any Javascript number + into appropriate Python value: either int or Decimal. + ''' + try: + return int(value) + except ValueError: + return Decimal(value) + +_callback_data = [ + # Mapping of JSON parser events to callback C types and value converters. + # Used to define the Callbacks structure and actual callback functions + # inside the parse function. 
+ ('null', C_EMPTY, lambda: None), + ('boolean', C_INT, lambda v: bool(v)), + # "integer" and "double" aren't actually yielded by yajl since "number" + # takes precedence if defined + ('integer', C_LONG, lambda v, l: int(string_at(v, l))), + ('double', C_DOUBLE, lambda v, l: float(string_at(v, l))), + ('number', C_STR, lambda v, l: number(b2s(string_at(v, l)))), + ('string', C_STR, lambda v, l: string_at(v, l).decode('utf-8')), + ('start_map', C_EMPTY, lambda: None), + ('map_key', C_STR, lambda v, l: b2s(string_at(v, l))), + ('end_map', C_EMPTY, lambda: None), + ('start_array', C_EMPTY, lambda: None), + ('end_array', C_EMPTY, lambda: None), +] + +class Callbacks(Structure): + _fields_ = [(name, type) for name, type, func in _callback_data] + +YAJL_OK = 0 +YAJL_CANCELLED = 1 +YAJL_INSUFFICIENT_DATA = 2 +YAJL_ERROR = 3 + +# constants defined in yajl_parse.h +YAJL_ALLOW_COMMENTS = 1 +YAJL_MULTIPLE_VALUES = 8 + + +def basic_parse(f, allow_comments=False, buf_size=64 * 1024, + multiple_values=False): + ''' + Iterator yielding unprefixed events. 
+ + Parameters: + + - f: a readable file-like object with JSON input + - allow_comments: tells parser to allow comments in JSON input + - buf_size: a size of an input buffer + - multiple_values: allows the parser to parse multiple JSON objects + ''' + events = [] + + def callback(event, func_type, func): + def c_callback(context, *args): + events.append((event, func(*args))) + return 1 + return func_type(c_callback) + + callbacks = Callbacks(*[callback(*data) for data in _callback_data]) + handle = yajl.yajl_alloc(byref(callbacks), None, None) + if allow_comments: + yajl.yajl_config(handle, YAJL_ALLOW_COMMENTS, 1) + if multiple_values: + yajl.yajl_config(handle, YAJL_MULTIPLE_VALUES, 1) + try: + while True: + buffer = f.read(buf_size) + if buffer: + result = yajl.yajl_parse(handle, buffer, len(buffer)) + else: + result = yajl.yajl_complete_parse(handle) + if result == YAJL_ERROR: + perror = yajl.yajl_get_error(handle, 1, buffer, len(buffer)) + error = cast(perror, c_char_p).value + yajl.yajl_free_error(handle, perror) + raise common.JSONError(error) + if not buffer and not events: + if result == YAJL_INSUFFICIENT_DATA: + raise common.IncompleteJSONError() + break + + for event in events: + yield event + events = [] + finally: + yajl.yajl_free(handle) + +def parse(file, **kwargs): + ''' + Backend-specific wrapper for ijson.common.parse. + ''' + return common.parse(basic_parse(file, **kwargs)) + +def items(file, prefix): + ''' + Backend-specific wrapper for ijson.common.items. 
+ ''' + return common.items(parse(file), prefix) diff --git a/ijson-1.1/ijson/backends/yajl2.pyc b/ijson-1.1/ijson/backends/yajl2.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5aa9ff7f9aabc32e12b74a62b69de2444ede8fd GIT binary patch literal 5611 zcmcIo-E-T<5nm7#B~c%gB}@LQZ8&Zc+KJ@U{YpG;CNd?ba%4&cB`e0n#el$31_h8b z0NQ3My~JsqbegBW_D|}6lefOK``aTa$(@nMX-(>mcYAxgd;4{JGJl*Ze)Co14?guj zQ~diGw{#;%6c9Pw8x-UyGAPcGlcU%q$D}wdLq{k&L-AR1&XS=PsCkZp0=+QEIZtYW)G3VBd1_uD_h-i;7X?xzH9~5P zUht7Fks3@H_=1mj`3Qu`GQ>Ip85PKtBM|187mg;qV7fq^?j<8cb(MF{x97>8woW2Gi3*d44cmkkvDT=~-DlH<+?o zSd0sUDeHu%7yHv)w$RThU_Ioh!6vv2_zR+k&bt&?{E(ik&hA?~rp%AlE6l zO3nwQKBVBf^zYHPtgJ-}-UkV1iGs2WZcuQIYUK~$Sl75K?j&BPqY|spPOOK^_tve7 z6@~T0OM2E*m84<2W!+qQzElkV&7blNZ^_2Ft7S>o&$@}q3Ow`OZrIAQQ9Z2pGL?kqw^nVTTJI&Q zAvkH4gsolI%d$KOdue|VM3F$cSs3-%gL2tmEFrg|{4n;SGDj9^gmxUaTOvi>^Y>IM zNHw;}81D^gM>!!WYkVChSashV(1 z|H|8n7Jq)PBGG-frZS*>nd zSBa2fO}8yJWzy~>p_eIZvzI+)l25&;t8Q5;%pS9q8Ovfr_s_p1JF8_JGD8P*75Vo8 zHmxQ}JF#JLtqPo4sFis@(xqp36gJa#>&8HBH+o((y1CTpiC(pU=NT)7?P ztkdd7kq;m~P-}%-3QGp;U&6DhIRjR0x6~oy*K-HE+mRR71MgE{r`)vh>+N==yw=c~ zFs-o0$q!BWssPxg#eu9(4`n$?2{w+7I6Gv|I##T1~MXC zsxJpn?bGNcxD!9JFJcFziQYh(4;f!egTU0W_)~4Wn!G2oVUXpmdx2Nu34q3v!IN@7n z_^V?KD_#^KnWw`O!jbO)2ace)nlNaW2RT}Kbb-F&GK2rP*y1wIMETXo(I0d4gbU09 z*){EKa2&Z^UAN`M%5~+MaoxBbbR$fs8Nv1YtlhxNJbgznvJA3bM-LjfE60G{2vroM z?wfcsGVyP@rO2iQBjlb!#7k(R1KIF1bjZp+JR4Ow^5j)+=|32iPyRj8OD-ZM-v2^$ zkVhV1IH%&3N3(P=Li>5zAECdIDB>~(2LbAI&YJ|^KV5z4l zsFsR?x#=ZvG_H4VX~3e@xMg`~Qo*QO4YaddfAUI>J(z4{jB`}fNn zy)@*hWDvaEgqN~29^P<1B4<#-;OySk_uG;p7EoocOc2(vo zc`!^ehCPZ-Dx}hYP$N;Qk8+A(qG<$>JTo{K^y#iuYuk74u2t6fAnwZY*0PAQ*5aTM zIS#jsAEjJIqLx~?AwnIGME$VqM^*-rUI`xiV{C_0$khYV1H+}_Z;!oJz-oZI>L8RM zKg&1>WytS1elAH-tn~mF(cXnH5#Kq{Szz0`mxjL9^a@W?xE@Uwa;3t%#s#BbEJ!nr zaiheeOSzIUXGHB2Z<;-sX`GYiJfZR8+P1Orn}W zua8~(0b_N!$(2p{VcCgvRRR3fb8)KSw8M9hemVV=bA_3T`D(xm{grdcipZq%zz$2G z%`f+gwl~90%t!V^M)(tukV`0E4e|ahqq>s(=ej}L=b8x4<|7ZKNFHR1A4&Sb;ce*D 
zhO%uS>X1XmvEjWsX~3s!2}p#8jUvm2_X`_-hv1(e^1>It72App?O7mU) + ('number', ) + ('string', ) + ('map_key', ) + ('start_map', None) + ('end_map', None) + ('start_array', None) + ('end_array', None) + + Prefixes represent the path to the nested elements from the root of the JSON + document. For example, given this document:: + + { + "array": [1, 2], + "map": { + "key": "value" + } + } + + the parser would yield events: + + ('', 'start_map', None) + ('', 'map_key', 'array') + ('array', 'start_array', None) + ('array.item', 'number', 1) + ('array.item', 'number', 2) + ('array', 'end_array', None) + ('', 'map_key', 'map') + ('map', 'start_map', None) + ('map', 'map_key', 'key') + ('map.key', 'string', u'value') + ('map', 'end_map', None) + ('', 'end_map', None) + + ''' + path = [] + for event, value in basic_events: + if event == 'map_key': + prefix = '.'.join(path[:-1]) + path[-1] = value + elif event == 'start_map': + prefix = '.'.join(path) + path.append(None) + elif event == 'end_map': + path.pop() + prefix = '.'.join(path) + elif event == 'start_array': + prefix = '.'.join(path) + path.append('item') + elif event == 'end_array': + path.pop() + prefix = '.'.join(path) + else: # any scalar value + prefix = '.'.join(path) + + yield prefix, event, value + + +class ObjectBuilder(object): + ''' + Incrementally builds an object from JSON parser events. Events are passed + into the `event` function that accepts two parameters: event type and + value. The object being built is available at any time from the `value` + attribute. 
+ + Example:: + + from StringIO import StringIO + from ijson.parse import basic_parse + from ijson.utils import ObjectBuilder + + builder = ObjectBuilder() + f = StringIO('{"key": "value"}) + for event, value in basic_parse(f): + builder.event(event, value) + print builder.value + + ''' + def __init__(self): + def initial_set(value): + self.value = value + self.containers = [initial_set] + + def event(self, event, value): + if event == 'map_key': + self.key = value + elif event == 'start_map': + map = {} + self.containers[-1](map) + def setter(value): + map[self.key] = value + self.containers.append(setter) + elif event == 'start_array': + array = [] + self.containers[-1](array) + self.containers.append(array.append) + elif event == 'end_array' or event == 'end_map': + self.containers.pop() + else: + self.containers[-1](value) + +def items(prefixed_events, prefix): + ''' + An iterator returning native Python objects constructed from the events + under a given prefix. + ''' + prefixed_events = iter(prefixed_events) + try: + while True: + current, event, value = next(prefixed_events) + if current == prefix: + if event in ('start_map', 'start_array'): + builder = ObjectBuilder() + end_event = event.replace('start', 'end') + while (current, event) != (prefix, end_event): + builder.event(event, value) + current, event, value = next(prefixed_events) + yield builder.value + else: + yield value + except StopIteration: + pass diff --git a/ijson-1.1/ijson/common.pyc b/ijson-1.1/ijson/common.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cdf515f093877676956b9f99ab946c0744b914eb GIT binary patch literal 4901 zcma)AO>Z1Y8Lplgk3TlSj-xdR%SR~(JW;HH-3!7BD`Mi60>X*f3xTl9xTmLVx82h{ z+g0s&w6=s4w7b9^!I={$;DC_!hWG(o5d4Te?vVpL&s#G+WA8X%yUW#8PrX(3e$`W5 z|MdFOzkdCzC$Sp+)ban@cTasuf^^TT-lE!VEpeU)D3-RO_}FAUon$>TK*eNEA0`KfrqHQ5 zF)2EnDfXk-*bN=${eGTla~hk0OY+RNm)s(xe0TTp&UR7cMYWj#ty`Srb0`efg*~9G zsc^T&KKpePOBz30ku?*EdOt68l%{$R6*kEZw4qqcLuiXCatkOroh<5`PRDU0olZaR 
zl__86<+Yc`olXl?H7V>eUXvr6XJ7NXXb*-VA#-?a14Xmm4BSmr9%VobQsc}^_VsxH zr}qU@QT=sputt#|ppaFEQDRL`KR+~?c85lG2JM>Dfr*{fy~ss6q6OT#)miRzbP^?L z)J;uWe?J=Pt`RYDR8mZLsQXb0E%otYhnb5`HC3j&MGix1o0mmwv~`7v`d)1@70!q1 z`wkxa1r!c{fjWa%)zlH3Do`&16~IV-{X6zc08LcENlbs>hMG!|ReN)Y{>p(VLMm}& zog_;fOu$JUo2^Or|A#G_fUOWp?Wv%}VIc*CYzVpbkeG=Wsm-)2$G(Ec+qKaQL3#Lf&GOo4W&GXbm8CvhZ zU6h8t{Z^D(^X3?kmHny%eO}8Lie49Ti1HL6`9t+htErL{U&$VW2b1Ebem+t&>YVGu9zznAG&ea{EQz zmyRMw(#iLwPP>ReyqW&$!n;u;|~N1A4aeqYA2ru7|J=ABLb zlh-%&8}DvRcR>&6m_`E+t)cObnDEXN`n;;2dz(&W(dT)Y_Vh3@C~;iDBdgW4)}gv} zugS}L$5|r16JxD=rFsWo_gKjz3FFs^Ehq9Jv#UqzwrHG`YMH9Xf*E(a?pmx9&l zdTqJpt^@g4GT>I3q&+i(JZV786nUyhhUbGIdte*fqol6+;&h-?X(3+Z^u_w4ayyidInM=9GMpE7 z(+icX%%m)mM5}H6HVDvYDP67-&knu4>Tc1hppDoO2$fIX2FIk;)JbY^rA zZ2Rmqorq-Iu6XmK$2#c`^1|uyy9i;55|og_wy0wa^llO`R7S;Nt$Ba-CEySwm1>Fh?~=-eur7WuWjz`zNW- z%VPdIp2O44?8dY>Kek8Kzksdnge*hesw7KSHs$8UrZ-UWHsuv4gO~EMZUhqf|rJHl^wK{xA?29 zaq{rXC?rSA$tp8@&*1guYFD6u43FhJ3aOjF&EQ6`T4QQwZ7sMJtknFZvRUqPa!&~< z!p}z$SME8yjou&Nu{^<=H=DuYTuf1PUG6BxWB-8yDuL!<>vN@se^jTx=LN3#SUI?X zw7*jy-CuY|L8EulRk{{F3LH08@dp~>1%h~vjYs%cfG)TcZaqzP`e1x1kg;b?I_hy< z6~9wwbLvQj(En;Gm^tNpv3g2ilUItr&^7UYRpIaHQ+Qnihy3x9D*mp5AjkssBYd&g zZE=_O7=`4p*;kGX7l;$1pA6k0E*Jjzu=p5}fv%J>No{pZNV>BktHj3)UW`3u^=G-i z5Q~3{SBYl`3lRqedD!u{cYf8V*onQwNF-Fow4{&TcL|0s(AP!Xv%9KEVqMKYAc{GJF-=AYMEzy{N+bJoyd(pyagc=xZW_b i`S2bZ{}1Pv&jt7W=bJZ~V=Plov$ob;TUgv$yZ2w49!_ll literal 0 HcmV?d00001 diff --git a/ijson-1.1/ijson/compat.py b/ijson-1.1/ijson/compat.py new file mode 100644 index 0000000..b73c746 --- /dev/null +++ b/ijson-1.1/ijson/compat.py @@ -0,0 +1,17 @@ +''' +Python2/Python3 compatibility utilities. 
from functools import wraps


def coroutine(func):
    '''
    Wraps a generator which is intended to be used as a pure coroutine by
    .send()ing it values. The only thing the wrapper does is advance the
    new generator to its first `yield`, which the Python generator protocol
    requires before a value can be sent into it.
    '''
    @wraps(func)
    def wrapper(*args, **kwargs):
        g = func(*args, **kwargs)
        next(g)
        return g
    return wrapper


@coroutine
def foreach(coroutine_func):
    '''
    Dispatches each JSON array item to a handler coroutine. A coroutine is
    created anew for each item by calling the `coroutine_func` callable. The
    resulting coroutine should accept values in the form of the tuples
    generated by the rich JSON parser: (prefix, event, value).

    The first event received by foreach must be a "start_array" event;
    anything else raises ValueError out of the .send() call that delivered
    it. The closing "end_array" of that array is not forwarded to a handler.
    '''
    g = None
    base, event, value = yield
    if event != 'start_array':
        raise ValueError('foreach requires "start_array" as the first event, got %s' % repr((base, event, value)))
    # Events that can open a new direct item of the array.
    START_EVENTS = set(['start_map', 'start_array', 'null', 'boolean', 'number', 'string'])
    itemprefix = base + '.item' if base else 'item'
    while True:
        prefix, event, value = yield
        if prefix == itemprefix and event in START_EVENTS:
            # A new item begins: give it a fresh handler.
            g = coroutine_func()
        if (prefix, event) != (base, 'end_array'):
            g.send((prefix, event, value))


@coroutine
def dispatcher(targets):
    '''
    Dispatches JSON parser events into several handlers depending on event
    prefixes.

    Accepts a list of tuples (base_prefix, coroutine). Each event goes to the
    first coroutine in the list whose base_prefix is a leading substring of
    the event prefix (plain `str.startswith`, so order matters and more
    specific bases should come first). Events matching no base are dropped.
    '''
    while True:
        prefix, event, value = yield
        for base, target in targets:
            if prefix.startswith(base):
                target.send((prefix, event, value))
                break
import gzip
import json
import os
import re
import sys

# Shameless copy-paste from json/decoder.py
FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)

# Header row of the TSV output. The fifth column is labelled "name" rather
# than "repo.name"; this is kept on purpose because analysis.R in the same
# patch reads that column as `d$name`.
COLUMNS = ["actor", "type", "date", "url", "name", "repo.owner",
           "repo.watchers", "repo.forks"]


class ConcatJSONDecoder(json.JSONDecoder):
    """Decoder for a string holding several JSON documents back to back.

    The documents may be separated (and surrounded) by whitespace. `decode`
    returns a list with one decoded object per document, so an empty or
    whitespace-only string yields an empty list.
    """

    def decode(self, s, _w=WHITESPACE.match):
        s_len = len(s)

        objs = []
        # Skip leading whitespace before the loop so whitespace-only input
        # ends immediately instead of asking raw_decode to parse nothing.
        end = _w(s, 0).end()
        while end != s_len:
            obj, end = self.raw_decode(s, idx=end)
            end = _w(s, end).end()
            objs.append(obj)
        return objs


def _write_line(text):
    """Write `text` plus a newline to stdout, UTF-8 encoded on Python 2."""
    if sys.version_info[0] < 3:
        text = text.encode("utf8")
    sys.stdout.write(text + "\n")


def print_json_file(filename):
    """Print one tab-separated row per event stored in a gzipped file.

    `filename` is a gzip-compressed file of concatenated JSON event objects.
    Events of type "GistEvent" are skipped. The row holds actor, type,
    created_at, url and, when the event has a 'repository' entry, the
    repository's name, owner, watchers and forks (empty strings otherwise).

    A file that cannot be read as gzip is reported on stderr and skipped,
    which keeps the old `zcat` pipeline's behaviour of producing no rows for
    such a file.
    """
    # Read with the gzip module instead of `os.popen("zcat " + filename)`:
    # no shell is involved, so file names containing spaces or shell
    # metacharacters are safe.
    try:
        with gzip.open(filename, "rb") as handle:
            raw = handle.read()
    except (IOError, OSError, EOFError) as err:
        sys.stderr.write("skipping %s: %s\n" % (filename, err))
        return

    events = json.loads(raw.decode("utf-8"), cls=ConcatJSONDecoder)

    for event in events:
        # remove events which are not done to repositories
        if event['type'] in ["GistEvent"]:
            continue

        row = [event['actor'], event['type'], event['created_at'],
               event['url']]

        if 'repository' in event:
            repo = event['repository']
            row += [repo['name'], repo['owner'],
                    str(repo['watchers']), str(repo['forks'])]
        else:
            row += ["", "", "", ""]

        _write_line(u"\t".join(row))


def main(data_dir="data/"):
    """Print the TSV header, then the rows of every file in `data_dir`."""
    _write_line("\t".join(COLUMNS))
    # Sorted so the output order does not depend on directory listing order.
    for filename in sorted(os.listdir(data_dir)):
        print_json_file(os.path.join(data_dir, filename))


if __name__ == "__main__":
    main()