projects.mako.cc - harrypotter-wikipedia-cdsw/commitdiff
updated code to handle printing on windows
author: Benjamin Mako Hill <mako@atdot.cc>
Mon, 2 Nov 2015 00:42:42 +0000 (16:42 -0800)
committer: Benjamin Mako Hill <mako@atdot.cc>
Mon, 2 Nov 2015 00:42:42 +0000 (16:42 -0800)
.gitignore
build_hpwp_dataset.py
encoding_fix.py [new file with mode: 0644]
hpwp-minor.py
hpwp-trend.py
win_unicode_console/__init__.py [new file with mode: 0644]
win_unicode_console/buffer.py [new file with mode: 0644]
win_unicode_console/console.py [new file with mode: 0644]
win_unicode_console/readline_hook.py [new file with mode: 0644]
win_unicode_console/runner.py [new file with mode: 0644]
win_unicode_console/streams.py [new file with mode: 0644]

index 3c8ddf31d18b93464f10ab2cbd42dd65995dd1c5..3b7f1dff0a96de737f2c8a9490fb72bd874fd591 100644 (file)
@@ -1 +1,2 @@
 /*.tsv
 /*.tsv
+__pycache__/
index 62f7e4fd447bafbff39b4c8d78f9fe149c0efd87..9d381ae1e1a66b4e5904940c00f3d56791d4a53a 100644 (file)
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # coding=utf-8
 
 #!/usr/bin/env python
 # coding=utf-8
 
+import encoding_fix
 import requests
 
 # get_article_revisions is a function that takes an article title in
 import requests
 
 # get_article_revisions is a function that takes an article title in
@@ -107,6 +108,7 @@ for article in articles:
 
     # first grab the article's title
     title = article["a"]["title"]
 
     # first grab the article's title
     title = article["a"]["title"]
+    print(title)
 
     # get the list of revisions from our function and then iterate through it,
     # printing it to our output file
 
     # get the list of revisions from our function and then iterate through it,
     # printing it to our output file
diff --git a/encoding_fix.py b/encoding_fix.py
new file mode 100644 (file)
index 0000000..d927bd1
--- /dev/null
@@ -0,0 +1,4 @@
+# Importing this module calls win_unicode_console.enable() as a side effect
+# when os.name is "nt"; on any other platform it does nothing.
+import os
+if os.name == "nt":
+    # Imported here so the package is only loaded when it is actually used.
+    import win_unicode_console
+    win_unicode_console.enable()
index b3bfd2f66a86ddb300bd0d8eef73e38b1bacfdbf..79135a1c28049cd95e021fb02526f3f60fc370ef 100644 (file)
@@ -1,3 +1,5 @@
+import encoding_fix
+
 from csv import DictReader
 
 input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
 from csv import DictReader
 
 input_file = open("hp_wiki.tsv", 'r', encoding="utf-8")
index aaa2b70a0ef3a088fcad40d174571ccf806ff154..b9d1c7b8fb6e69c6d4c6b56b834515c8b4ae4e87 100644 (file)
@@ -1,3 +1,5 @@
+import encoding_fix
+
 from csv import DictReader
 
 # read in the input file and count by day
 from csv import DictReader
 
 # read in the input file and count by day
diff --git a/win_unicode_console/__init__.py b/win_unicode_console/__init__.py
new file mode 100644 (file)
index 0000000..f9d1416
--- /dev/null
@@ -0,0 +1,39 @@
+
+from win_unicode_console import streams, console, readline_hook
+
+streams_ = streams
+
+
+def enable(*, 
+       streams=["stdin", "stdout", "stderr"], 
+       transcode=None, 
+       use_readline_hook=True, 
+       use_pyreadline=True,
+       use_repl=False):
+       
+       if transcode is None:
+               if use_readline_hook and use_pyreadline and readline_hook.pyreadline:
+                       transcode = True
+                               # pyreadline assumes that encoding of all sys.stdio objects is the same
+                       
+               elif use_repl:
+                       transcode = False
+                       
+               else:
+                       transcode = True
+                               # actually Python REPL assumes that sys.stdin.encoding == sys.stdout.encoding and cannot handle UTF-16 on both input and output
+       
+       streams_.enable(streams, transcode=transcode)
+       
+       if use_readline_hook:
+               readline_hook.enable(use_pyreadline=use_pyreadline)
+       
+       if use_repl:
+               console.enable()
+
+def disable():
+       """Stop the custom console if one is running, then disable the readline
+       hook and the stream replacements."""
+       # console.disable() raises RuntimeError when no console is running,
+       # hence the guard.
+       if console.running_console is not None:
+               console.disable()
+       
+       readline_hook.disable()
+       streams.disable()
diff --git a/win_unicode_console/buffer.py b/win_unicode_console/buffer.py
new file mode 100644 (file)
index 0000000..cd5853c
--- /dev/null
@@ -0,0 +1,51 @@
+
+from ctypes import (byref, POINTER, Structure, pythonapi,
+       c_int, c_char, c_char_p, c_void_p, py_object, c_ssize_t)
+import sys
+
+# Pointer-to-Py_ssize_t type used by the Py_buffer field list.
+c_ssize_p = POINTER(c_ssize_t)
+
+# Buffer-protocol functions of the running interpreter, reached through ctypes.
+PyObject_GetBuffer = pythonapi.PyObject_GetBuffer
+PyBuffer_Release = pythonapi.PyBuffer_Release
+
+
+# Request flags passed as the last argument of PyObject_GetBuffer.
+PyBUF_SIMPLE = 0
+PyBUF_WRITABLE = 1
+
+
+class Py_buffer(Structure):
+       """ctypes structure passed by reference to PyObject_GetBuffer and
+       PyBuffer_Release.
+
+       NOTE(review): the field list is presumably meant to match the C-level
+       Py_buffer struct of the running interpreter -- confirm for every Python
+       version this is used with.
+       """
+       _fields_ = [
+               ("buf", c_void_p),
+               ("obj", py_object),
+               ("len", c_ssize_t),
+               ("itemsize", c_ssize_t),
+               ("readonly", c_int),
+               ("ndim", c_int),
+               ("format", c_char_p),
+               ("shape", c_ssize_p),
+               ("strides", c_ssize_p),
+               ("suboffsets", c_ssize_p),
+               ("internal", c_void_p)
+       ]
+       
+       # On Python 2 an extra "smalltable" member is inserted just before the
+       # last field ("internal").
+       if sys.version_info[0] < 3:
+               _fields_.insert(-1, ("smalltable", c_ssize_t * 2))
+       
+       @classmethod
+       def get_from(cls, obj, flags=PyBUF_SIMPLE):
+               """Return a new instance filled in by PyObject_GetBuffer(obj, ..., flags)."""
+               buf = cls()
+               PyObject_GetBuffer(py_object(obj), byref(buf), flags)
+               return buf
+       
+       def release(self):
+               """Call PyBuffer_Release on this structure."""
+               PyBuffer_Release(byref(self))
+
+
+def get_buffer(obj, writable=False):
+       buf = Py_buffer.get_from(obj, PyBUF_WRITABLE if writable else PyBUF_SIMPLE)
+       try:
+               buffer_type = c_char * buf.len
+               return buffer_type.from_address(buf.buf)
+       finally:
+               buf.release()
+
diff --git a/win_unicode_console/console.py b/win_unicode_console/console.py
new file mode 100644 (file)
index 0000000..5aab6de
--- /dev/null
@@ -0,0 +1,96 @@
+
+import code
+import sys
+import __main__
+
+
+def print_banner():
+       print("Python {} on {}".format(sys.version, sys.platform))
+       print('Type "help", "copyright", "credits" or "license" for more information.')
+
+class InteractiveConsole(code.InteractiveConsole):
+       # code.InteractiveConsole without banner
+       # exits on EOF
+       # also more robust treating of sys.ps1, sys.ps2
+       # prints prompt into stderr rather than stdout
+       # flushes sys.stderr and sys.stdout
+       
+       def __init__(self, locals=None, filename="<stdin>"):
+               self.done = False
+               super().__init__(locals, filename)
+       
+       def raw_input(self, prompt=""):
+               sys.stderr.write(prompt)
+               return input()
+       
+       def runcode(self, code):
+               super().runcode(code)
+               sys.stderr.flush()
+               sys.stdout.flush()
+       
+       def interact(self):
+               #sys.ps1 = "~>> "
+               #sys.ps2 = "~.. "
+               
+               try:
+                       sys.ps1
+               except AttributeError:
+                       sys.ps1 = ">>> "
+               
+               try:
+                       sys.ps2
+               except AttributeError:
+                       sys.ps2 = "... "
+               
+               more = 0
+               while not self.done:
+                       try:
+                               if more:
+                                       try:
+                                               prompt = sys.ps2
+                                       except AttributeError:
+                                               prompt = ""
+                               else:
+                                       try:
+                                               prompt = sys.ps1
+                                       except AttributeError:
+                                               prompt = ""
+                               
+                               try:
+                                       line = self.raw_input(prompt)
+                               except EOFError:
+                                       self.on_EOF()
+                               else:
+                                       more = self.push(line)
+                               
+                       except KeyboardInterrupt:
+                               self.write("\nKeyboardInterrupt\n")
+                               self.resetbuffer()
+                               more = 0
+       
+       def on_EOF(self):
+               self.write("\n")
+               # sys.exit()
+               raise SystemExit from None
+
+
+running_console = None
+
+def enable():
+       global running_console
+       
+       if running_console is not None:
+               raise RuntimeError("interactive console already running")
+       else:
+               running_console = InteractiveConsole(__main__.__dict__) 
+               running_console.interact() 
+
+def disable():
+       global running_console
+       
+       if running_console is None:
+               raise RuntimeError("interactive console is not running")
+       else:
+               running_console.done = True
+               running_console = None
+
diff --git a/win_unicode_console/readline_hook.py b/win_unicode_console/readline_hook.py
new file mode 100644 (file)
index 0000000..7946784
--- /dev/null
@@ -0,0 +1,106 @@
+
+import sys, traceback
+from ctypes import pythonapi, cdll, c_size_t, c_char_p, c_void_p, cast, CFUNCTYPE, POINTER, addressof
+
+# PyMem_Malloc of the running interpreter; its result is handled as a plain
+# integer (c_size_t) rather than as a ctypes pointer object.
+PyMem_Malloc = pythonapi.PyMem_Malloc
+PyMem_Malloc.restype = c_size_t
+PyMem_Malloc.argtypes = [c_size_t]
+
+# strncpy taken from msvcrt (the Microsoft C runtime library).
+strncpy = cdll.msvcrt.strncpy
+strncpy.restype = c_char_p
+strncpy.argtypes = [c_char_p, c_char_p, c_size_t]
+
+# Callback prototype: returns char*, takes (void*, void*, char*).
+HOOKFUNC = CFUNCTYPE(c_char_p, c_void_p, c_void_p, c_char_p)
+
+# The interpreter's PyOS_ReadlineFunctionPointer variable, accessed as a raw
+# pointer value so it can be read and overwritten.
+PyOS_ReadlineFunctionPointer = c_void_p.in_dll(pythonapi, "PyOS_ReadlineFunctionPointer")
+
+
+def new_zero_terminated_string(b):
+       p = PyMem_Malloc(len(b) + 1)
+       strncpy(cast(p, c_char_p), b, len(b) + 1)
+       return p
+
+
+class ReadlineHookManager:
+       def __init__(self):
+               self.readline_wrapper_ref = HOOKFUNC(self.readline_wrapper)
+               self.address = c_void_p.from_address(addressof(self.readline_wrapper_ref)).value
+               self.original_address = PyOS_ReadlineFunctionPointer.value
+               self.readline_hook = None
+       
+       def readline_wrapper(self, stdin, stdout, prompt):
+               try:
+                       try:
+                               if sys.stdin.encoding != sys.stdout.encoding:
+                                       raise ValueError("sys.stdin.encoding != sys.stdout.encoding, readline hook doesn't know, which one to use to decode prompt")
+                               
+                       except ValueError:
+                               traceback.print_exc(file=sys.stderr)
+                               try:
+                                       prompt = prompt.decode("utf-8")
+                               except UnicodeDecodeError:
+                                       prompt = ""
+                               
+                       else:
+                               prompt = prompt.decode(sys.stdout.encoding)
+                       
+                       try:
+                               line = self.readline_hook(prompt)
+                       except KeyboardInterrupt:
+                               return 0
+                       else:
+                               return new_zero_terminated_string(line.encode(sys.stdin.encoding))
+                       
+               except:
+                       print("Intenal win_unicode_console error", file=sys.stderr)
+                       traceback.print_exc(file=sys.stderr)
+                       return new_zero_terminated_string(b"\n")
+       
+       def install_hook(self, hook):
+               self.readline_hook = hook
+               PyOS_ReadlineFunctionPointer.value = self.address
+       
+       def restore_original(self):
+               self.readline_hook = None
+               PyOS_ReadlineFunctionPointer.value = self.original_address
+
+
+def readline(prompt):
+       sys.stdout.write(prompt)
+       sys.stdout.flush()
+       return sys.stdin.readline()
+
+
+class PyReadlineManager:
+       """Remembers and changes pyreadline.unicode_helper.pyreadline_codepage."""
+       def __init__(self):
+               # Value in effect at construction time, used by restore_original().
+               self.original_codepage = pyreadline.unicode_helper.pyreadline_codepage
+       
+       def set_codepage(self, codepage):
+               """Assign `codepage` to pyreadline.unicode_helper.pyreadline_codepage."""
+               pyreadline.unicode_helper.pyreadline_codepage = codepage
+       
+       def restore_original(self):
+               """Put back the codepage recorded in __init__."""
+               self.set_codepage(self.original_codepage)
+
+# pyreadline is optional: when it cannot be imported the name is bound to
+# None, and no PyReadlineManager is created.
+try:
+       import pyreadline.unicode_helper
+except ImportError:
+       pyreadline = None
+else:
+       pyreadline_manager = PyReadlineManager()
+
+# Module-wide hook manager used by enable() and disable() below.
+manager = ReadlineHookManager()
+
+
+def enable(*, use_pyreadline=True):
+       if use_pyreadline and pyreadline:
+               pyreadline_manager.set_codepage(sys.stdin.encoding)
+                       # pyreadline assumes that encoding of all sys.stdio objects is the same
+       else:
+               manager.install_hook(readline)
+
+def disable():
+       """Restore pyreadline's codepage (when pyreadline is available) and the
+       original readline function pointer."""
+       if pyreadline:
+               pyreadline_manager.restore_original()
+       
+       # Done unconditionally, whether or not a hook was installed.
+       manager.restore_original()
+
diff --git a/win_unicode_console/runner.py b/win_unicode_console/runner.py
new file mode 100644 (file)
index 0000000..4b22904
--- /dev/null
@@ -0,0 +1,96 @@
+
+from types import CodeType as Code
+import sys
+import traceback
+import __main__
+from ctypes import pythonapi, POINTER, c_long, cast
+import tokenize
+
+
+# pythonapi.Py_InspectFlag reinterpreted as a C long whose value can be set.
+inspect_flag = cast(pythonapi.Py_InspectFlag, POINTER(c_long)).contents
+
+def set_inspect_flag(value):
+       """Store int(value) into the Py_InspectFlag view above."""
+       inspect_flag.value = int(value)
+
+
+def update_code(codeobj, **kwargs):
+       fields = ["argcount", "kwonlyargcount", "nlocals", "stacksize", "flags",
+               "code", "consts", "names", "varnames", "filename", "name",
+               "firstlineno", "lnotab", "freevars", "cellvars"]
+       
+       def field_values():
+               for field in fields:
+                       value = kwargs.get(field, None)
+                       if value is None:
+                               yield getattr(codeobj, "co_{}".format(field))
+                       else:
+                               yield value
+       
+       return Code(*field_values())
+
+def update_code_recursively(codeobj, **kwargs):
+       updated = {}
+       
+       def update(codeobj, **kwargs):
+               result = updated.get(codeobj, None)
+               if result is not None:
+                       return result
+               
+               if any(isinstance(c, Code) for c in codeobj.co_consts):
+                       consts = tuple(update(c, **kwargs) if isinstance(c, Code) else c
+                               for c in codeobj.co_consts)
+               else:
+                       consts = codeobj.co_consts
+               
+               result = update_code(codeobj, consts=consts, **kwargs)
+               updated[codeobj] = result
+               return result
+       
+       return update(codeobj, **kwargs)
+
+
+def get_code(path):
+       with tokenize.open(path) as f:  # opens with detected source encoding
+               source = f.read()
+       
+       try:
+               code = compile(source, path, "exec")
+       except UnicodeEncodeError:
+               code = compile(source, "<encoding error>", "exec")
+               code = update_code_recursively(code, filename=path)
+                       # so code constains correct filename (even if it contains Unicode)
+                       # and tracebacks show contents of code lines
+       
+       return code
+
+class MainLoader:
+       # to reload __main__ properly
+       
+       def __init__(self, path):
+               self.path = path
+       
+       def load_module(self, name):
+               code = get_code(self.path)
+               exec(code, __main__.__dict__)
+               return __main__
+               
+def run_script():
+       """Run the script named by the second element of sys.argv inside __main__.
+
+       Errors while loading the script are printed.  Errors raised by the
+       script are printed too, except SystemExit, which is re-raised unless
+       sys.flags.inspect is set.
+       """
+       sys.argv.pop(0) # so sys.argv looks correct from script being run
+       path = sys.argv[0]
+       __main__.__file__ = path
+       __main__.__loader__ = MainLoader(path)
+       
+       
+       try:
+               code = get_code(path)
+       except Exception as e:
+               # The first two traceback entries are skipped; presumably these are
+               # the frames of run_script and get_code -- TODO confirm.
+               traceback.print_exception(e.__class__, e, e.__traceback__.tb_next.tb_next, chain=False)
+       else:
+               try:
+                       exec(code, __main__.__dict__)
+               except BaseException as e:
+                       if not sys.flags.inspect and isinstance(e, SystemExit):
+                               raise
+                       else:
+                               # The first traceback entry (this function's frame) is skipped.
+                               traceback.print_exception(e.__class__, e, e.__traceback__.tb_next)
+
diff --git a/win_unicode_console/streams.py b/win_unicode_console/streams.py
new file mode 100644 (file)
index 0000000..be6927d
--- /dev/null
@@ -0,0 +1,260 @@
+
+from ctypes import byref, windll, c_ulong
+
+from win_unicode_console.buffer import get_buffer
+
+import io
+import sys
+import time
+
+
+# kernel32 functions used by the raw console reader and writer below.
+kernel32 = windll.kernel32
+GetStdHandle = kernel32.GetStdHandle
+ReadConsoleW = kernel32.ReadConsoleW
+WriteConsoleW = kernel32.WriteConsoleW
+GetLastError = kernel32.GetLastError
+
+
+# Windows error codes that this module compares GetLastError() against.
+ERROR_SUCCESS = 0
+ERROR_NOT_ENOUGH_MEMORY = 8
+ERROR_OPERATION_ABORTED = 995
+
+# Handles of the three standard devices, fetched once at import time.
+STDIN_HANDLE = GetStdHandle(-10)
+STDOUT_HANDLE = GetStdHandle(-11)
+STDERR_HANDLE = GetStdHandle(-12)
+
+# File descriptor numbers reported by the replacement streams' fileno().
+STDIN_FILENO = 0
+STDOUT_FILENO = 1
+STDERR_FILENO = 2
+
+# A read whose first byte equals this value is reported as end of input.
+EOF = b"\x1a"
+
+MAX_BYTES_WRITTEN = 32767      # arbitrary because WriteConsoleW ability to write big buffers depends on heap usage
+
+
+class ReprMixin:
+       def __repr__(self):
+               modname = self.__class__.__module__
+               clsname = self.__class__.__qualname__
+               attributes = []
+               for name in ["name", "encoding"]:
+                       try:
+                               value = getattr(self, name)
+                       except AttributeError:
+                               pass
+                       else:
+                               attributes.append("{}={}".format(name, repr(value)))
+               
+               return "<{}.{} {}>".format(modname, clsname, " ".join(attributes))
+
+
+class WindowsConsoleRawIOBase(ReprMixin, io.RawIOBase):
+       """Common base of the raw console reader and writer: stores the stream
+       name, the console handle and the file descriptor number to report."""
+       def __init__(self, name, handle, fileno):
+               self.name = name
+               self.handle = handle
+               # Stored as file_no because fileno is the method name.
+               self.file_no = fileno
+       
+       def fileno(self):
+               """Return the descriptor number given at construction."""
+               return self.file_no
+       
+       def isatty(self):
+               """Always True for an open stream."""
+               super().isatty()        # for close check in default implementation
+               return True
+
+class WindowsConsoleRawReader(WindowsConsoleRawIOBase):
+       """Raw reader that fills buffers directly through ReadConsoleW."""
+       def readable(self):
+               return True
+       
+       def readinto(self, b):
+               """Read console input into the writable buffer `b`.
+
+               Returns the number of bytes stored (two per code unit read), or 0
+               when `b` is empty or the data read starts with the EOF byte.
+               Raises ValueError when len(b) is odd and OSError when ReadConsoleW
+               returns a false value.
+               """
+               bytes_to_be_read = len(b)
+               if not bytes_to_be_read:
+                       return 0
+               elif bytes_to_be_read % 2:
+                       raise ValueError("cannot read odd number of bytes from UTF-16-LE encoded console")
+               
+               buffer = get_buffer(b, writable=True)
+               code_units_to_be_read = bytes_to_be_read // 2
+               code_units_read = c_ulong()
+               
+               retval = ReadConsoleW(self.handle, buffer, code_units_to_be_read, byref(code_units_read), None)
+               # NOTE(review): GetLastError() is queried through a separate ctypes
+               # call after ReadConsoleW has returned -- confirm the value is still
+               # the one set by ReadConsoleW at that point.
+               if GetLastError() == ERROR_OPERATION_ABORTED:
+                       time.sleep(0.1) # wait for KeyboardInterrupt
+               if not retval:
+                       raise OSError("Windows error {}".format(GetLastError()))
+               
+               # Only the first byte of the buffer is compared against EOF.
+               if buffer[0] == EOF:
+                       return 0
+               else:
+                       return 2 * code_units_read.value
+
+class WindowsConsoleRawWriter(WindowsConsoleRawIOBase):
+       def writable(self):
+               return True
+       
+       @staticmethod
+       def _error_message(errno):
+               if errno == ERROR_SUCCESS:
+                       return "Windows error {} (ERROR_SUCCESS); zero bytes written on nonzero input, probably just one byte given".format(errno)
+               elif errno == ERROR_NOT_ENOUGH_MEMORY:
+                       return "Windows error {} (ERROR_NOT_ENOUGH_MEMORY); try to lower `win_unicode_console.streams.MAX_BYTES_WRITTEN`".format(errno)
+               else:
+                       return "Windows error {}".format(errno)
+       
+       def write(self, b):
+               bytes_to_be_written = len(b)
+               buffer = get_buffer(b)
+               code_units_to_be_written = min(bytes_to_be_written, MAX_BYTES_WRITTEN) // 2
+               code_units_written = c_ulong()
+               
+               retval = WriteConsoleW(self.handle, buffer, code_units_to_be_written, byref(code_units_written), None)
+               bytes_written = 2 * code_units_written.value
+               
+               # fixes both infinite loop of io.BufferedWriter.flush() on when the buffer has odd length
+               #       and situation when WriteConsoleW refuses to write lesser that MAX_BYTES_WRITTEN bytes
+               if bytes_written == 0 != bytes_to_be_written:
+                       raise OSError(self._error_message(GetLastError()))
+               else:
+                       return bytes_written
+
+class TextTranscodingWrapper(ReprMixin, io.TextIOBase):
+       """Text stream that reports the `encoding` given at construction while
+       delegating every operation and every other property to `base`."""
+       encoding = None
+       
+       def __init__(self, base, encoding):
+               self.base = base
+               # Shadows the class-level default with the advertised encoding.
+               self.encoding = encoding
+       
+       @property
+       def errors(self):
+               return self.base.errors
+       
+       @property
+       def line_buffering(self):
+               return self.base.line_buffering
+       
+       def seekable(self):
+               return self.base.seekable()
+       
+       def readable(self):
+               return self.base.readable()
+       
+       def writable(self):
+               return self.base.writable()
+       
+       def flush(self):
+               self.base.flush()
+       
+       def close(self):
+               self.base.close()
+       
+       @property
+       def closed(self):
+               return self.base.closed
+       
+       @property
+       def name(self):
+               return self.base.name
+       
+       def fileno(self):
+               return self.base.fileno()
+       
+       def isatty(self):
+               return self.base.isatty()
+       
+       def write(self, s):
+               return self.base.write(s)
+       
+       def tell(self):
+               return self.base.tell()
+       
+       def truncate(self, pos=None):
+               return self.base.truncate(pos)
+       
+       def seek(self, cookie, whence=0):
+               return self.base.seek(cookie, whence)
+       
+       def read(self, size=None):
+               return self.base.read(size)
+       
+       def __next__(self):
+               return next(self.base)
+       
+       def readline(self, size=-1):
+               return self.base.readline(size)
+       
+       @property
+       def newlines(self):
+               return self.base.newlines
+
+
+# Raw console streams, one per standard handle.
+stdin_raw = WindowsConsoleRawReader("<stdin>", STDIN_HANDLE, STDIN_FILENO)
+stdout_raw = WindowsConsoleRawWriter("<stdout>", STDOUT_HANDLE, STDOUT_FILENO)
+stderr_raw = WindowsConsoleRawWriter("<stderr>", STDERR_HANDLE, STDERR_FILENO)
+
+# Buffered, line-buffered text layers over the raw streams, using UTF-16-LE.
+stdin_text = io.TextIOWrapper(io.BufferedReader(stdin_raw), encoding="utf-16-le", line_buffering=True)
+stdout_text = io.TextIOWrapper(io.BufferedWriter(stdout_raw), encoding="utf-16-le", line_buffering=True)
+stderr_text = io.TextIOWrapper(io.BufferedWriter(stderr_raw), encoding="utf-16-le", line_buffering=True)
+
+# The same text streams, wrapped so that their `encoding` reads "utf-8".
+stdin_text_transcoded = TextTranscodingWrapper(stdin_text, encoding="utf-8")
+stdout_text_transcoded = TextTranscodingWrapper(stdout_text, encoding="utf-8")
+stderr_text_transcoded = TextTranscodingWrapper(stderr_text, encoding="utf-8")
+
+
+def disable():
+       sys.stdin.flush()
+       sys.stdout.flush()
+       sys.stderr.flush()
+       sys.stdin = sys.__stdin__
+       sys.stdout = sys.__stdout__
+       sys.stderr = sys.__stderr__
+
+def check_stream(stream, fileno):
+       if stream is None:      # e.g. with IDLE
+               return True
+       
+       try:
+               _fileno = stream.fileno()
+       except io.UnsupportedOperation:
+               return False
+       else:
+               if _fileno == fileno and stream.isatty():
+                       stream.flush()
+                       return True
+               else:
+                       return False
+       
+def enable_reader(*, transcode=True):
+               # transcoding because Python tokenizer cannot handle UTF-16
+       if check_stream(sys.stdin, STDIN_FILENO):
+               if transcode:
+                       sys.stdin = stdin_text_transcoded
+               else:
+                       sys.stdin = stdin_text
+
+def enable_writer(*, transcode=True):
+       if check_stream(sys.stdout, STDOUT_FILENO):
+               if transcode:
+                       sys.stdout = stdout_text_transcoded
+               else:
+                       sys.stdout = stdout_text
+
+def enable_error_writer(*, transcode=True):
+       if check_stream(sys.stderr, STDERR_FILENO):
+               if transcode:
+                       sys.stderr = stderr_text_transcoded
+               else:
+                       sys.stderr = stderr_text
+
+enablers = {"stdin": enable_reader, "stdout": enable_writer, "stderr": enable_error_writer}
+
+def enable(streams=("stdin", "stdout", "stderr"), *, transcode=frozenset(enablers.keys())):
+       if transcode is True:
+               transcode = enablers.keys()
+       elif transcode is False:
+               transcode = set()
+       
+       if not set(streams) | set(transcode) <= enablers.keys():
+               raise ValueError("invalid stream names")
+       
+       for stream in streams:
+               enablers[stream](transcode=(stream in transcode))
+

Benjamin Mako Hill || Want to submit a patch?