From 5de5b8f8fb9e044d3149b3c4e91369e43586cd5b Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Fri, 16 Feb 2018 18:32:08 -0800 Subject: [PATCH 1/1] initial import into git --- README | 54 +++++++++++++++++++++++++++++++ diary_parser.py | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 README create mode 100755 diary_parser.py diff --git a/README b/README new file mode 100644 index 0000000..7a6f18e --- /dev/null +++ b/README @@ -0,0 +1,54 @@ +This program will parse raw HTML pages of Kuro5hin diaries and post +them to a Wordpress blog using the Wordpress XMLRPC API. + +Requirements +============= + +To use the program you will need the following software: + +* Python 3 +* The python-wordpress-xmlrpc package: + https://pypi.python.org/pypi/python-wordpress-xmlrpc + +Of course, you will also need the Kuro5hin diary entries you want to +import. I grabbed mine from "What's Left of K5, AKA Mumble's Archive" +described here: + +https://kr5ddit.com/post/754 + +Using the Program +=================== + +This is how I used the data: + +1. + +I downloaded and unzipped this file: + +http://k5.semantic-db.org/diary-slurp/161942--archive-diaries--html-diaries--nested-format.zip + +2. + +My username is "makohill" so I searched through and copied diary entries from the location of the unzipped entries with a command like this one: + +grep -l -r 'HREF="/user/makohill">makohill' LOCATION_OF_ENTRIES|xargs -i cp {} . + +3. + +Once I did that, I modified and imported the data with a command like: + +./diary_parser.py 2002-12-26-9150-8083.html + +By default, the entries are posted with "pending" status so I could check them first. If you have many entries, you might want to tweak this. 
Details on the Wordpress XMLRPC API and the Python module I use is here: + +https://codex.wordpress.org/XML-RPC_WordPress_API/Posts +https://python-wordpress-xmlrpc.readthedocs.io/en/latest/index.html + +Copyright and License +====================== +© Benjamin Mako Hill, 2018 + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or (at +your option) any later version. diff --git a/diary_parser.py b/diary_parser.py new file mode 100755 index 0000000..c41cb95 --- /dev/null +++ b/diary_parser.py @@ -0,0 +1,84 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- +""" Script to parse raw HTML pages that reflect Kuro5hin diaries and repost them to a Wordpress blog using the Wordpress XML RPC API. """ + +# © Benjamin Mako Hill, 2018 +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +from subprocess import check_output +import sys +from lxml import html +import re +from dateutil import parser + +from wordpress_xmlrpc import Client, WordPressPost +from wordpress_xmlrpc.methods.posts import GetPosts, NewPost +from wordpress_xmlrpc.methods.users import GetUserInfo + +## EDIT THIS SECTION BEFORE RUNNING! (user, password command, endpoint, terms, archive URL and footer text below are all site-specific) +######################################################### + +wp_user = "mako" +wp_password = check_output(["gpg", "--quiet", "--batch", "--decrypt", "/home/mako/.copyrighteous_password.gpg.asc"]).decode("utf-8").strip() +wp_xmlrpc_endpoint = 'https://mako.cc/copyrighteous/xmlrpc.php' +wp_terms = {'post_tag': ['reflections', 'kuro5hin'], + 'category': ['Blog Posts']} +archive_base_url = "https://mako.cc/copyrighteous/extra/kuro5hin_archives/" +footer_html = """
\n

Originally posted as a diary entry on Kuro5hin. Although Kuro5hin is now defunct, an archived copy of the post includes a series of comments from the Kuro5hin community.

""" +########################################################## + +wp = Client(wp_xmlrpc_endpoint, wp_user, wp_password) + +def process_file(input_text, filename): + sub = html.fromstring(input_text) + + # grab the page title and drop the trailing " || kuro5hin.org" site suffix + title = sub.xpath("//title")[0].text + title = re.sub(" \|\| kuro5hin\.org$", "", title) + + # timestamp: from the font element whose text equals the title, go up four levels and take the second font child; keep only the "Mon..Sun ... EST" part of its text (NOTE(review): the pattern only matches EST, otherwise the whole text reaches the date parser -- confirm no entry uses another zone) + sub_metadata = sub.xpath('//font[text()="%s"]/../../../../font[2]' % title)[0] + date_string = re.sub(r"^.*((Mon|Tue|Wed|Thu|Fri|Sat|Sun).*?EST).*$", r"\1", sub_metadata.text_content()) + post_date = parser.parse(date_string) + + # post_html: the entry body is the first font element under tr[2]/td[2]/p, three levels up from the metadata node + sub_text = sub_metadata.xpath("../../../tr[2]/td[2]/p/font")[0] + + # step1: add any leading text that is not inside a child tag, wrapped by the format string below unless it is only whitespace (NOTE(review): lxml reports .text as None when there is no leading text, which re.match would reject -- confirm against real entries) + post_html = sub_text.text + if not re.match(r'^\s*$', post_html): + post_html = "

%s

" % post_html.strip() + + # add all the subtags: serialize each child element, then remove br tags and trim the whitespace around paragraph tags + post_html += "\n".join([html.tostring(x).strip().decode("utf-8") for x in sub_text.getchildren()]) + post_html = re.sub(r'<[Bb][Rr]>', '', post_html) + post_html = re.sub(r'[\t ]*(<[Pp]>)\s*', r'\1', post_html) + post_html = re.sub(r'\s*()[\t ]*', r'\1', post_html) + post_html = post_html.strip() + + post_html = post_html + "\n\n" + (footer_html % {'base' : archive_base_url, 'filename' : filename }) + + # DEBUG CODE: you might want to uncomment this while testing + # list of comments + # print("***********OUTPUT: %s" % date_string) + # print(post_html) + # return + + post = WordPressPost() + post.title = title + post.content = post_html + post.date = post_date + post.terms_names = wp_terms + post.post_status = "pending" # held for review instead of being published straight away + + wp.call(NewPost(post)) + + +for filename in sys.argv[1:]: + with open(filename, "r", encoding="latin1") as f: + process_file(f.read(), filename) + -- 2.39.5