From: Zach Varberg Date: Fri, 24 Jul 2015 17:18:41 +0000 (-0500) Subject: Handle content-type header charset value for streaming API X-Git-Url: https://projects.mako.cc/source/twitter-api-cdsw/commitdiff_plain/4b0f484cf92d3dd4a04eff17a3c52e2dd1dde9b1?ds=inline Handle content-type header charset value for streaming API This also fixes https://github.com/tweepy/tweepy/issues/615 by properly decoding the python3 bytes object to a string. --- diff --git a/tweepy/streaming.py b/tweepy/streaming.py index c637264..ad7944c 100644 --- a/tweepy/streaming.py +++ b/tweepy/streaming.py @@ -7,6 +7,7 @@ from __future__ import absolute_import, print_function import logging +import re import requests from requests.exceptions import Timeout from threading import Thread @@ -148,10 +149,11 @@ class ReadBuffer(object): use small chunks so it can read the length and the tweet in 2 read calls. """ - def __init__(self, stream, chunk_size): + def __init__(self, stream, chunk_size, encoding='utf-8'): self._stream = stream - self._buffer = '' + self._buffer = six.b('') self._chunk_size = chunk_size + self._encoding = encoding def read_len(self, length): while not self._stream.closed: @@ -160,7 +162,13 @@ class ReadBuffer(object): read_len = max(self._chunk_size, length - len(self._buffer)) self._buffer += self._stream.read(read_len) - def read_line(self, sep='\n'): + def read_line(self, sep=six.b('\n')): + """Read the data stream until a given separator is found (default \n) + + :param sep: Separator to read until. 
Must be of the bytes type (str in python 2, + bytes in python 3) + :return: The str of the data read until sep + """ start = 0 while not self._stream.closed: loc = self._buffer.find(sep, start) @@ -173,7 +181,7 @@ class ReadBuffer(object): def _pop(self, length): r = self._buffer[:length] self._buffer = self._buffer[length:] - return r + return r.decode(self._encoding) class Stream(object): @@ -290,7 +298,14 @@ class Stream(object): self.running = False def _read_loop(self, resp): - buf = ReadBuffer(resp.raw, self.chunk_size) + charset = resp.headers.get('content-type', default='') + enc_search = re.search('charset=(?P<enc>\S*)', charset) + if enc_search is not None: + encoding = enc_search.group('enc') + else: + encoding = 'utf-8' + + buf = ReadBuffer(resp.raw, self.chunk_size, encoding=encoding) while self.running and not resp.raw.closed: length = 0