From 5de5b8f8fb9e044d3149b3c4e91369e43586cd5b Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill <mako@atdot.cc>
Date: Fri, 16 Feb 2018 18:32:08 -0800
Subject: [PATCH 1/1] initial import into git

---
 README          | 54 +++++++++++++++++++++++++++++++
 diary_parser.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 138 insertions(+)
 create mode 100644 README
 create mode 100755 diary_parser.py
diff --git a/README b/README
new file mode 100644
index 0000000..7a6f18e
--- /dev/null
+++ b/README
@@ -0,0 +1,54 @@
+This program will parse raw HTML pages of Kuro5hin diaries and post
+them to a Wordpress blog using the Wordpress XMLRPC API.
+
+Requirements
+=============
+
+To use the program you will need the following software:
+
+* Python 3
+* The python-wordpress-xmlrpc package:
+  https://pypi.python.org/pypi/python-wordpress-xmlrpc
+
+Of course, you will also need the Kuro5hin diary entries you want to
+import. I grabbed mine from "What's Left of K5, AKA Mumble's Archive"
+described here:
+
+https://kr5ddit.com/post/754
+
+Using the Program
+===================
+
+This is how I used the data:
+
+1.
+
+I downloaded and unzipped this file:
+
+http://k5.semantic-db.org/diary-slurp/161942--archive-diaries--html-diaries--nested-format.zip
+
+2.
+
+My username is "makohill" so I searched through and copied diary entries from the location of the unzipped entries with a command like this one:
+
+grep -l -r 'HREF="/user/makohill">makohill</A>' LOCATION_OF_ENTRIES|xargs -i cp {} .
+
+3.
+
+Once I did that, I modified and imported the data with a command like:
+
+./diary_parser.py 2002-12-26-9150-8083.html
+
+By default, the entries are posted with "pending" status so I could check them first. If you have many entries, you might want to tweak this. Details on the Wordpress XMLRPC API and the Python module I use are here:
+
+https://codex.wordpress.org/XML-RPC_WordPress_API/Posts
+https://python-wordpress-xmlrpc.readthedocs.io/en/latest/index.html
+
+Copyright and License
+======================
+© Benjamin Mako Hill, 2018
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
diff --git a/diary_parser.py b/diary_parser.py
new file mode 100755
index 0000000..c41cb95
--- /dev/null
+++ b/diary_parser.py
@@ -0,0 +1,84 @@
+#!/usr/bin/python3
+# -*- coding: utf-8  -*-
+""" Script to parse raw HTML pages that reflect Kuro5hin diaries and repost them to a Wordpress blog using the Wordpress XML RPC API. """
+
+# © Benjamin Mako Hill, 2018
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+from subprocess import check_output
+import sys
+from lxml import html
+import re
+from dateutil import parser
+
+from wordpress_xmlrpc import Client, WordPressPost
+from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
+from wordpress_xmlrpc.methods.users import GetUserInfo
+
+## EDIT THIS SECTION BEFORE RUNNING!
+#########################################################
+
+wp_user = "mako"  # WordPress login handed to the XML-RPC client below
+wp_password = check_output(["gpg", "--quiet", "--batch", "--decrypt", "/home/mako/.copyrighteous_password.gpg.asc"]).decode("utf-8").strip()  # output of gpg --decrypt on this file, decoded as UTF-8 and stripped
+wp_xmlrpc_endpoint = 'https://mako.cc/copyrighteous/xmlrpc.php'  # URL of the blog's xmlrpc.php
+wp_terms = {'post_tag': ['reflections', 'kuro5hin'],  # assigned as terms_names on every post created by process_file
+            'category': ['Blog Posts']}
+archive_base_url = "https://mako.cc/copyrighteous/extra/kuro5hin_archives/"  # substituted for %(base)s in footer_html
+footer_html = """<hr />\n<p><i>Originally posted as a diary entry on <a href="https://en.wikipedia.org/wiki/Kuro5hin">Kuro5hin</a>. Although Kuro5hin is now defunct, <a href="%(base)s%(filename)s">an archived copy of the post</a> includes a series of comments from the Kuro5hin community.</i></p>"""  # appended to each post body; process_file fills in base and filename
+##########################################################
+
+wp = Client(wp_xmlrpc_endpoint, wp_user, wp_password)  # module-level client that process_file uses to submit posts
+
+def process_file(input_text, filename):
+    """Parse one Kuro5hin diary page and submit it to WordPress as a pending post.
+
+    input_text is the page's raw HTML; filename is appended to archive_base_url in the footer link.
+    Raises IndexError if the title, metadata or body element cannot be found in the page.
+    """
+    sub = html.fromstring(input_text)
+
+    # title: the <title> text with the site suffix removed
+    title = sub.xpath("//title")[0].text
+    title = re.sub(r" \|\| kuro5hin\.org$", "", title)
+
+    # timestamp: found relative to the <font> holding the title; the title is bound as an XPath variable so quotes in it cannot break the query
+    sub_metadata = sub.xpath('//font[text()=$title]/../../../../font[2]', title=title)[0]
+    date_string = re.sub(r"^.*((Mon|Tue|Wed|Thu|Fri|Sat|Sun).*?EST).*$", r"\1", sub_metadata.text_content())
+    post_date = parser.parse(date_string)
+
+    # body: text placed directly in the entry's <font> (None when it opens with a tag) becomes its own paragraph
+    sub_text = sub_metadata.xpath("../../../tr[2]/td[2]/p/font")[0]
+    post_html = sub_text.text or ""
+    if not re.match(r'^\s*$', post_html):
+        post_html = "<p>%s</p>" % post_html.strip()
+
+    # append the serialized child elements, then drop <br> tags and the whitespace around <p> and </p>
+    post_html += "\n".join([html.tostring(x).strip().decode("utf-8") for x in sub_text.getchildren()])
+    post_html = re.sub(r'<[Bb][Rr]>', '', post_html)
+    post_html = re.sub(r'[\t ]*(<[Pp]>)\s*', r'\1', post_html)
+    post_html = re.sub(r'\s*(</[Pp]>)[\t ]*', r'\1', post_html)
+    post_html = post_html.strip()
+    post_html = post_html + "\n\n" + (footer_html % {'base': archive_base_url, 'filename': filename})
+
+    # DEBUG CODE: you might want to uncomment this while testing
+    # print("***********OUTPUT: %s" % date_string)
+    # print(post_html)
+    # return
+
+    post = WordPressPost()
+    post.title = title
+    post.content = post_html
+    post.date = post_date
+    post.terms_names = wp_terms
+    post.post_status = "pending"
+    wp.call(NewPost(post))
+    
+    
+for diary_path in sys.argv[1:]:  # every command-line argument names one saved diary page
+    with open(diary_path, encoding="latin1") as diary_file:
+        process_file(diary_file.read(), diary_path)
+
-- 
2.39.5