From: Benjamin Mako Hill Date: Fri, 23 Oct 2015 04:25:12 +0000 (-0700) Subject: updated wikipedia project to fix windows encoding issue X-Git-Url: https://projects.mako.cc/source/wikipedia-api-cdsw/commitdiff_plain/HEAD?hp=de6957e1346cfd24526dfbdfbe13fce1aab08dc8 updated wikipedia project to fix windows encoding issue --- diff --git a/encoding_fix.py b/encoding_fix.py new file mode 100644 index 0000000..d927bd1 --- /dev/null +++ b/encoding_fix.py @@ -0,0 +1,4 @@ +import os +if os.name == "nt": + import win_unicode_console + win_unicode_console.enable() diff --git a/wikipedia1-1.py b/wikipedia1-1.py index ca021d1..43182a2 100644 --- a/wikipedia1-1.py +++ b/wikipedia1-1.py @@ -1,3 +1,4 @@ +import encoding_fix import requests wp_call = requests.get('https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=Python_(programming_language)&rvlimit=100&rvprop=timestamp|user&continue=&format=json') diff --git a/wikipedia1-2.py b/wikipedia1-2.py index 14d6026..a33b819 100644 --- a/wikipedia1-2.py +++ b/wikipedia1-2.py @@ -1,3 +1,4 @@ +import encoding_fix import requests # raw string: diff --git a/wikipedia2-1.py b/wikipedia2-1.py index 3cb23d9..332779c 100644 --- a/wikipedia2-1.py +++ b/wikipedia2-1.py @@ -1,3 +1,4 @@ +import encoding_fix import requests url_base = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=%s&rvlimit=100&rvprop=timestamp|user&continue=&format=json' diff --git a/wikipedia2-2.py b/wikipedia2-2.py index 4dc2f30..02b566b 100644 --- a/wikipedia2-2.py +++ b/wikipedia2-2.py @@ -1,3 +1,4 @@ +import encoding_fix import requests # base url: diff --git a/wikipedia3.py b/wikipedia3.py index 717b487..027c8d1 100644 --- a/wikipedia3.py +++ b/wikipedia3.py @@ -1,3 +1,4 @@ +import encoding_fix import requests # I found documentation here: diff --git a/wikipedia4-1.py b/wikipedia4-1.py index 1798d1a..27f025f 100644 --- a/wikipedia4-1.py +++ b/wikipedia4-1.py @@ -1,3 +1,4 @@ +import encoding_fix import requests # 
?action=query&titles=Albert%20Einstein&prop=categories diff --git a/wikipedia4-2.py b/wikipedia4-2.py index 11eb967..c28e092 100644 --- a/wikipedia4-2.py +++ b/wikipedia4-2.py @@ -1,3 +1,4 @@ +import encoding_fix import requests # ?action=query&titles=Albert%20Einstein&prop=categories diff --git a/win_unicode_console/__init__.py b/win_unicode_console/__init__.py new file mode 100644 index 0000000..f9d1416 --- /dev/null +++ b/win_unicode_console/__init__.py @@ -0,0 +1,39 @@ + +from win_unicode_console import streams, console, readline_hook + +streams_ = streams + + +def enable(*, + streams=["stdin", "stdout", "stderr"], + transcode=None, + use_readline_hook=True, + use_pyreadline=True, + use_repl=False): + + if transcode is None: + if use_readline_hook and use_pyreadline and readline_hook.pyreadline: + transcode = True + # pyreadline assumes that encoding of all sys.stdio objects is the same + + elif use_repl: + transcode = False + + else: + transcode = True + # actually Python REPL assumes that sys.stdin.encoding == sys.stdout.encoding and cannot handle UTF-16 on both input and output + + streams_.enable(streams, transcode=transcode) + + if use_readline_hook: + readline_hook.enable(use_pyreadline=use_pyreadline) + + if use_repl: + console.enable() + +def disable(): + if console.running_console is not None: + console.disable() + + readline_hook.disable() + streams.disable() diff --git a/win_unicode_console/buffer.py b/win_unicode_console/buffer.py new file mode 100644 index 0000000..cd5853c --- /dev/null +++ b/win_unicode_console/buffer.py @@ -0,0 +1,51 @@ + +from ctypes import (byref, POINTER, Structure, pythonapi, + c_int, c_char, c_char_p, c_void_p, py_object, c_ssize_t) +import sys + +c_ssize_p = POINTER(c_ssize_t) + +PyObject_GetBuffer = pythonapi.PyObject_GetBuffer +PyBuffer_Release = pythonapi.PyBuffer_Release + + +PyBUF_SIMPLE = 0 +PyBUF_WRITABLE = 1 + + +class Py_buffer(Structure): + _fields_ = [ + ("buf", c_void_p), + ("obj", py_object), + ("len", 
c_ssize_t), + ("itemsize", c_ssize_t), + ("readonly", c_int), + ("ndim", c_int), + ("format", c_char_p), + ("shape", c_ssize_p), + ("strides", c_ssize_p), + ("suboffsets", c_ssize_p), + ("internal", c_void_p) + ] + + if sys.version_info[0] < 3: + _fields_.insert(-1, ("smalltable", c_ssize_t * 2)) + + @classmethod + def get_from(cls, obj, flags=PyBUF_SIMPLE): + buf = cls() + PyObject_GetBuffer(py_object(obj), byref(buf), flags) + return buf + + def release(self): + PyBuffer_Release(byref(self)) + + +def get_buffer(obj, writable=False): + buf = Py_buffer.get_from(obj, PyBUF_WRITABLE if writable else PyBUF_SIMPLE) + try: + buffer_type = c_char * buf.len + return buffer_type.from_address(buf.buf) + finally: + buf.release() + diff --git a/win_unicode_console/console.py b/win_unicode_console/console.py new file mode 100644 index 0000000..5aab6de --- /dev/null +++ b/win_unicode_console/console.py @@ -0,0 +1,96 @@ + +import code +import sys +import __main__ + + +def print_banner(): + print("Python {} on {}".format(sys.version, sys.platform)) + print('Type "help", "copyright", "credits" or "license" for more information.') + +class InteractiveConsole(code.InteractiveConsole): + # code.InteractiveConsole without banner + # exits on EOF + # also more robust treating of sys.ps1, sys.ps2 + # prints prompt into stderr rather than stdout + # flushes sys.stderr and sys.stdout + + def __init__(self, locals=None, filename=""): + self.done = False + super().__init__(locals, filename) + + def raw_input(self, prompt=""): + sys.stderr.write(prompt) + return input() + + def runcode(self, code): + super().runcode(code) + sys.stderr.flush() + sys.stdout.flush() + + def interact(self): + #sys.ps1 = "~>> " + #sys.ps2 = "~.. " + + try: + sys.ps1 + except AttributeError: + sys.ps1 = ">>> " + + try: + sys.ps2 + except AttributeError: + sys.ps2 = "... 
" + + more = 0 + while not self.done: + try: + if more: + try: + prompt = sys.ps2 + except AttributeError: + prompt = "" + else: + try: + prompt = sys.ps1 + except AttributeError: + prompt = "" + + try: + line = self.raw_input(prompt) + except EOFError: + self.on_EOF() + else: + more = self.push(line) + + except KeyboardInterrupt: + self.write("\nKeyboardInterrupt\n") + self.resetbuffer() + more = 0 + + def on_EOF(self): + self.write("\n") + # sys.exit() + raise SystemExit from None + + +running_console = None + +def enable(): + global running_console + + if running_console is not None: + raise RuntimeError("interactive console already running") + else: + running_console = InteractiveConsole(__main__.__dict__) + running_console.interact() + +def disable(): + global running_console + + if running_console is None: + raise RuntimeError("interactive console is not running") + else: + running_console.done = True + running_console = None + diff --git a/win_unicode_console/readline_hook.py b/win_unicode_console/readline_hook.py new file mode 100644 index 0000000..7946784 --- /dev/null +++ b/win_unicode_console/readline_hook.py @@ -0,0 +1,106 @@ + +import sys, traceback +from ctypes import pythonapi, cdll, c_size_t, c_char_p, c_void_p, cast, CFUNCTYPE, POINTER, addressof + +PyMem_Malloc = pythonapi.PyMem_Malloc +PyMem_Malloc.restype = c_size_t +PyMem_Malloc.argtypes = [c_size_t] + +strncpy = cdll.msvcrt.strncpy +strncpy.restype = c_char_p +strncpy.argtypes = [c_char_p, c_char_p, c_size_t] + +HOOKFUNC = CFUNCTYPE(c_char_p, c_void_p, c_void_p, c_char_p) + +PyOS_ReadlineFunctionPointer = c_void_p.in_dll(pythonapi, "PyOS_ReadlineFunctionPointer") + + +def new_zero_terminated_string(b): + p = PyMem_Malloc(len(b) + 1) + strncpy(cast(p, c_char_p), b, len(b) + 1) + return p + + +class ReadlineHookManager: + def __init__(self): + self.readline_wrapper_ref = HOOKFUNC(self.readline_wrapper) + self.address = c_void_p.from_address(addressof(self.readline_wrapper_ref)).value + 
self.original_address = PyOS_ReadlineFunctionPointer.value + self.readline_hook = None + + def readline_wrapper(self, stdin, stdout, prompt): + try: + try: + if sys.stdin.encoding != sys.stdout.encoding: + raise ValueError("sys.stdin.encoding != sys.stdout.encoding, readline hook doesn't know, which one to use to decode prompt") + + except ValueError: + traceback.print_exc(file=sys.stderr) + try: + prompt = prompt.decode("utf-8") + except UnicodeDecodeError: + prompt = "" + + else: + prompt = prompt.decode(sys.stdout.encoding) + + try: + line = self.readline_hook(prompt) + except KeyboardInterrupt: + return 0 + else: + return new_zero_terminated_string(line.encode(sys.stdin.encoding)) + + except: + print("Intenal win_unicode_console error", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + return new_zero_terminated_string(b"\n") + + def install_hook(self, hook): + self.readline_hook = hook + PyOS_ReadlineFunctionPointer.value = self.address + + def restore_original(self): + self.readline_hook = None + PyOS_ReadlineFunctionPointer.value = self.original_address + + +def readline(prompt): + sys.stdout.write(prompt) + sys.stdout.flush() + return sys.stdin.readline() + + +class PyReadlineManager: + def __init__(self): + self.original_codepage = pyreadline.unicode_helper.pyreadline_codepage + + def set_codepage(self, codepage): + pyreadline.unicode_helper.pyreadline_codepage = codepage + + def restore_original(self): + self.set_codepage(self.original_codepage) + +try: + import pyreadline.unicode_helper +except ImportError: + pyreadline = None +else: + pyreadline_manager = PyReadlineManager() + +manager = ReadlineHookManager() + + +def enable(*, use_pyreadline=True): + if use_pyreadline and pyreadline: + pyreadline_manager.set_codepage(sys.stdin.encoding) + # pyreadline assumes that encoding of all sys.stdio objects is the same + else: + manager.install_hook(readline) + +def disable(): + if pyreadline: + pyreadline_manager.restore_original() + + 
manager.restore_original() + diff --git a/win_unicode_console/runner.py b/win_unicode_console/runner.py new file mode 100644 index 0000000..4b22904 --- /dev/null +++ b/win_unicode_console/runner.py @@ -0,0 +1,96 @@ + +from types import CodeType as Code +import sys +import traceback +import __main__ +from ctypes import pythonapi, POINTER, c_long, cast +import tokenize + + +inspect_flag = cast(pythonapi.Py_InspectFlag, POINTER(c_long)).contents + +def set_inspect_flag(value): + inspect_flag.value = int(value) + + +def update_code(codeobj, **kwargs): + fields = ["argcount", "kwonlyargcount", "nlocals", "stacksize", "flags", + "code", "consts", "names", "varnames", "filename", "name", + "firstlineno", "lnotab", "freevars", "cellvars"] + + def field_values(): + for field in fields: + value = kwargs.get(field, None) + if value is None: + yield getattr(codeobj, "co_{}".format(field)) + else: + yield value + + return Code(*field_values()) + +def update_code_recursively(codeobj, **kwargs): + updated = {} + + def update(codeobj, **kwargs): + result = updated.get(codeobj, None) + if result is not None: + return result + + if any(isinstance(c, Code) for c in codeobj.co_consts): + consts = tuple(update(c, **kwargs) if isinstance(c, Code) else c + for c in codeobj.co_consts) + else: + consts = codeobj.co_consts + + result = update_code(codeobj, consts=consts, **kwargs) + updated[codeobj] = result + return result + + return update(codeobj, **kwargs) + + +def get_code(path): + with tokenize.open(path) as f: # opens with detected source encoding + source = f.read() + + try: + code = compile(source, path, "exec") + except UnicodeEncodeError: + code = compile(source, "", "exec") + code = update_code_recursively(code, filename=path) + # so code constains correct filename (even if it contains Unicode) + # and tracebacks show contents of code lines + + return code + +class MainLoader: + # to reload __main__ properly + + def __init__(self, path): + self.path = path + + def 
load_module(self, name): + code = get_code(self.path) + exec(code, __main__.__dict__) + return __main__ + +def run_script(): + sys.argv.pop(0) # so sys.argv looks correct from script being run + path = sys.argv[0] + __main__.__file__ = path + __main__.__loader__ = MainLoader(path) + + + try: + code = get_code(path) + except Exception as e: + traceback.print_exception(e.__class__, e, e.__traceback__.tb_next.tb_next, chain=False) + else: + try: + exec(code, __main__.__dict__) + except BaseException as e: + if not sys.flags.inspect and isinstance(e, SystemExit): + raise + else: + traceback.print_exception(e.__class__, e, e.__traceback__.tb_next) + diff --git a/win_unicode_console/streams.py b/win_unicode_console/streams.py new file mode 100644 index 0000000..be6927d --- /dev/null +++ b/win_unicode_console/streams.py @@ -0,0 +1,260 @@ + +from ctypes import byref, windll, c_ulong + +from win_unicode_console.buffer import get_buffer + +import io +import sys +import time + + +kernel32 = windll.kernel32 +GetStdHandle = kernel32.GetStdHandle +ReadConsoleW = kernel32.ReadConsoleW +WriteConsoleW = kernel32.WriteConsoleW +GetLastError = kernel32.GetLastError + + +ERROR_SUCCESS = 0 +ERROR_NOT_ENOUGH_MEMORY = 8 +ERROR_OPERATION_ABORTED = 995 + +STDIN_HANDLE = GetStdHandle(-10) +STDOUT_HANDLE = GetStdHandle(-11) +STDERR_HANDLE = GetStdHandle(-12) + +STDIN_FILENO = 0 +STDOUT_FILENO = 1 +STDERR_FILENO = 2 + +EOF = b"\x1a" + +MAX_BYTES_WRITTEN = 32767 # arbitrary because WriteConsoleW ability to write big buffers depends on heap usage + + +class ReprMixin: + def __repr__(self): + modname = self.__class__.__module__ + clsname = self.__class__.__qualname__ + attributes = [] + for name in ["name", "encoding"]: + try: + value = getattr(self, name) + except AttributeError: + pass + else: + attributes.append("{}={}".format(name, repr(value))) + + return "<{}.{} {}>".format(modname, clsname, " ".join(attributes)) + + +class WindowsConsoleRawIOBase(ReprMixin, io.RawIOBase): + def 
__init__(self, name, handle, fileno): + self.name = name + self.handle = handle + self.file_no = fileno + + def fileno(self): + return self.file_no + + def isatty(self): + super().isatty() # for close check in default implementation + return True + +class WindowsConsoleRawReader(WindowsConsoleRawIOBase): + def readable(self): + return True + + def readinto(self, b): + bytes_to_be_read = len(b) + if not bytes_to_be_read: + return 0 + elif bytes_to_be_read % 2: + raise ValueError("cannot read odd number of bytes from UTF-16-LE encoded console") + + buffer = get_buffer(b, writable=True) + code_units_to_be_read = bytes_to_be_read // 2 + code_units_read = c_ulong() + + retval = ReadConsoleW(self.handle, buffer, code_units_to_be_read, byref(code_units_read), None) + if GetLastError() == ERROR_OPERATION_ABORTED: + time.sleep(0.1) # wait for KeyboardInterrupt + if not retval: + raise OSError("Windows error {}".format(GetLastError())) + + if buffer[0] == EOF: + return 0 + else: + return 2 * code_units_read.value + +class WindowsConsoleRawWriter(WindowsConsoleRawIOBase): + def writable(self): + return True + + @staticmethod + def _error_message(errno): + if errno == ERROR_SUCCESS: + return "Windows error {} (ERROR_SUCCESS); zero bytes written on nonzero input, probably just one byte given".format(errno) + elif errno == ERROR_NOT_ENOUGH_MEMORY: + return "Windows error {} (ERROR_NOT_ENOUGH_MEMORY); try to lower `win_unicode_console.streams.MAX_BYTES_WRITTEN`".format(errno) + else: + return "Windows error {}".format(errno) + + def write(self, b): + bytes_to_be_written = len(b) + buffer = get_buffer(b) + code_units_to_be_written = min(bytes_to_be_written, MAX_BYTES_WRITTEN) // 2 + code_units_written = c_ulong() + + retval = WriteConsoleW(self.handle, buffer, code_units_to_be_written, byref(code_units_written), None) + bytes_written = 2 * code_units_written.value + + # fixes both infinite loop of io.BufferedWriter.flush() on when the buffer has odd length + # and situation when 
WriteConsoleW refuses to write lesser that MAX_BYTES_WRITTEN bytes + if bytes_written == 0 != bytes_to_be_written: + raise OSError(self._error_message(GetLastError())) + else: + return bytes_written + +class TextTranscodingWrapper(ReprMixin, io.TextIOBase): + encoding = None + + def __init__(self, base, encoding): + self.base = base + self.encoding = encoding + + @property + def errors(self): + return self.base.errors + + @property + def line_buffering(self): + return self.base.line_buffering + + def seekable(self): + return self.base.seekable() + + def readable(self): + return self.base.readable() + + def writable(self): + return self.base.writable() + + def flush(self): + self.base.flush() + + def close(self): + self.base.close() + + @property + def closed(self): + return self.base.closed + + @property + def name(self): + return self.base.name + + def fileno(self): + return self.base.fileno() + + def isatty(self): + return self.base.isatty() + + def write(self, s): + return self.base.write(s) + + def tell(self): + return self.base.tell() + + def truncate(self, pos=None): + return self.base.truncate(pos) + + def seek(self, cookie, whence=0): + return self.base.seek(cookie, whence) + + def read(self, size=None): + return self.base.read(size) + + def __next__(self): + return next(self.base) + + def readline(self, size=-1): + return self.base.readline(size) + + @property + def newlines(self): + return self.base.newlines + + +stdin_raw = WindowsConsoleRawReader("", STDIN_HANDLE, STDIN_FILENO) +stdout_raw = WindowsConsoleRawWriter("", STDOUT_HANDLE, STDOUT_FILENO) +stderr_raw = WindowsConsoleRawWriter("", STDERR_HANDLE, STDERR_FILENO) + +stdin_text = io.TextIOWrapper(io.BufferedReader(stdin_raw), encoding="utf-16-le", line_buffering=True) +stdout_text = io.TextIOWrapper(io.BufferedWriter(stdout_raw), encoding="utf-16-le", line_buffering=True) +stderr_text = io.TextIOWrapper(io.BufferedWriter(stderr_raw), encoding="utf-16-le", line_buffering=True) + 
# Variants of the console text streams that report "utf-8" as their encoding
# while delegating all actual I/O to the underlying console text streams.
stdin_text_transcoded = TextTranscodingWrapper(stdin_text, encoding="utf-8")
stdout_text_transcoded = TextTranscodingWrapper(stdout_text, encoding="utf-8")
stderr_text_transcoded = TextTranscodingWrapper(stderr_text, encoding="utf-8")


def disable():
    """Flush the current standard streams and restore the interpreter's originals.

    A standard stream that is currently ``None`` is skipped when flushing
    instead of raising ``AttributeError``.
    """
    for stream in (sys.stdin, sys.stdout, sys.stderr):
        if stream is not None:
            stream.flush()

    sys.stdin = sys.__stdin__
    sys.stdout = sys.__stdout__
    sys.stderr = sys.__stderr__


def check_stream(stream, fileno):
    """Return True if *stream* should be replaced by the console stream for *fileno*.

    ``None`` counts as replaceable.  Any other stream qualifies only when it
    reports file descriptor *fileno* and is a tty; in that case it is flushed
    before True is returned.  A stream without a usable ``fileno()`` is left
    alone (False).
    """
    if stream is None:  # e.g. with IDLE
        return True

    try:
        stream_fileno = stream.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # In-memory stream, or a stand-in object that has no fileno() at all:
        # not the console, so do not touch it.
        return False

    if stream_fileno != fileno or not stream.isatty():
        return False

    stream.flush()  # push out pending data before the stream gets swapped
    return True


def _swap_stream(name, fileno, plain, transcoded, transcode):
    """Install *transcoded* or *plain* as ``sys.<name>`` if the current stream qualifies."""
    if check_stream(getattr(sys, name), fileno):
        setattr(sys, name, transcoded if transcode else plain)


def enable_reader(*, transcode=True):
    """Replace ``sys.stdin`` with the console reader when it is attached to the console."""
    # transcoding because Python tokenizer cannot handle UTF-16
    _swap_stream("stdin", STDIN_FILENO, stdin_text, stdin_text_transcoded, transcode)


def enable_writer(*, transcode=True):
    """Replace ``sys.stdout`` with the console writer when it is attached to the console."""
    _swap_stream("stdout", STDOUT_FILENO, stdout_text, stdout_text_transcoded, transcode)


def enable_error_writer(*, transcode=True):
    """Replace ``sys.stderr`` with the console writer when it is attached to the console."""
    _swap_stream("stderr", STDERR_FILENO, stderr_text, stderr_text_transcoded, transcode)


# Maps a stream name accepted by enable() to the function that installs it.
enablers = {"stdin": enable_reader, "stdout": enable_writer, "stderr": enable_error_writer}


def enable(streams=("stdin", "stdout", "stderr"), *, transcode=frozenset(enablers.keys())):
    """Install the console streams named in *streams*.

    streams   -- iterable of stream names; each must be a key of ``enablers``.
    transcode -- ``True`` (all streams), ``False`` (none), or an iterable of
                 stream names for which the transcoded variant is installed.

    Raises ValueError if either argument contains an unknown stream name.
    """
    # Materialise both arguments once: validation and the loop below each need
    # a full pass, which would silently exhaust a one-shot iterable.
    streams = tuple(streams)
    if transcode is True:
        transcode = frozenset(enablers)
    elif transcode is False:
        transcode = frozenset()
    else:
        transcode = frozenset(transcode)

    if not (set(streams) | transcode) <= enablers.keys():
        raise ValueError("invalid stream names")

    for stream in streams:
        enablers[stream](transcode=(stream in transcode))