#!/usr/bin/python3
# -*- coding: utf-8 -*-
""" Script to parse raw HTML pages that reflect Kuro5hin diaries and
repost them to a Wordpress blog using the Wordpress XML RPC API. """

# Recovered from a gitweb "blobdiff_plain" view in which this file was deleted
# (blob c41cb95, mode 100755):
#   https://projects.mako.cc/source/kuro5hin_to_wordpress/blobdiff_plain/5de5b8f8fb9e044d3149b3c4e91369e43586cd5b..67dc510ff5b8776de8d688f8ed5486d3d1359fe2:/diary_parser.py

# © Benjamin Mako Hill, 2018
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

from subprocess import check_output
import sys
from lxml import html
import re
from dateutil import parser

from wordpress_xmlrpc import Client, WordPressPost
from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
from wordpress_xmlrpc.methods.users import GetUserInfo

## EDIT THIS SECTION BEFORE RUNNING!
#########################################################

wp_user = "mako"
# The password is never stored in clear text: it is decrypted with gpg each run.
wp_password = check_output(["gpg", "--quiet", "--batch", "--decrypt", "/home/mako/.copyrighteous_password.gpg.asc"]).decode("utf-8").strip()
wp_xmlrpc_endpoint = 'https://mako.cc/copyrighteous/xmlrpc.php'
wp_terms = {'post_tag': ['reflections', 'kuro5hin'],
            'category': ['Blog Posts']}
archive_base_url = "https://mako.cc/copyrighteous/extra/kuro5hin_archives/"

# NOTE(review): the HTML markup of this footer was stripped when the file was
# extracted; only its text survived.  The tags below are a reconstruction. The
# %(base)s/%(filename)s placeholders are implied by the mapping that
# process_file() interpolates into this template, but the exact elements
# (leading <hr />, which phrase carries the link) must be confirmed against
# the original repository.
footer_html = """<hr />\n
<p>Originally posted as a diary entry on Kuro5hin. Although Kuro5hin is now defunct, <a href="%(base)s%(filename)s">an archived copy of the post</a> includes a series of comments from the Kuro5hin community.</p>
"""
##########################################################

wp = Client(wp_xmlrpc_endpoint, wp_user, wp_password)

# Suffix appended to every page <title>; raw string so the backslashes reach
# the regex engine instead of being treated as (invalid) string escapes.
_TITLE_SUFFIX_RE = re.compile(r" \|\| kuro5hin\.org$")


def _build_post_html(sub_text, filename):
    """Return the cleaned-up post body for the diary's text element.

    sub_text is the lxml element holding the diary body; filename is
    interpolated into the archive link of the footer.
    """
    # step1: add any material not in a sub_tag.  lxml reports a missing
    # leading text node as None, which would break the regex match and the
    # concatenation below, so normalise it to an empty string.
    post_html = sub_text.text or ""
    if not re.match(r'^\s*$', post_html):
        # NOTE(review): the wrapper markup was lost in extraction; "<p>...</p>"
        # is inferred from the paragraph clean-up regexes below.
        post_html = "<p>%s</p>" % post_html.strip()

    # add all the subtags (iterating an element yields its children)
    post_html += "\n".join(html.tostring(child).strip().decode("utf-8")
                           for child in sub_text)

    # drop hard line breaks and tighten whitespace around paragraph tags
    post_html = re.sub(r'<[Bb][Rr]>', '', post_html)
    post_html = re.sub(r'[\t ]*(<[Pp]>)\s*', r'\1', post_html)
    # NOTE(review): the group content was stripped in extraction; a closing
    # paragraph tag is the evident counterpart of the rule above.
    post_html = re.sub(r'\s*(</[Pp]>)[\t ]*', r'\1', post_html)
    post_html = post_html.strip()

    footer = footer_html % {'base': archive_base_url, 'filename': filename}
    return post_html + "\n\n" + footer


def process_file(input_text, filename):
    """Parse one saved diary page and submit it to WordPress.

    input_text is the raw HTML of the page and filename is the name the page
    is archived under (appended to archive_base_url in the footer link).

    The post is created with status "pending" through the module-level
    client.  Raises IndexError when the page lacks the expected <title>,
    metadata or body elements, and whatever dateutil raises when no parsable
    timestamp is found.
    """
    sub = html.fromstring(input_text)

    # grab the title of the post
    title = _TITLE_SUFFIX_RE.sub("", sub.xpath("//title")[0].text)

    # timestamp: the metadata sits relative to the <font> whose text is the
    # title.  The title is passed as an XPath variable so that quotes in it
    # cannot break the expression.
    sub_metadata = sub.xpath('//font[text()=$title]/../../../../font[2]',
                             title=title)[0]
    # Keep only the span from the weekday name through "EST".
    date_string = re.sub(r"^.*((Mon|Tue|Wed|Thu|Fri|Sat|Sun).*?EST).*$", r"\1",
                         sub_metadata.text_content())
    post_date = parser.parse(date_string)

    # post_html
    sub_text = sub_metadata.xpath("../../../tr[2]/td[2]/p/font")[0]
    post_html = _build_post_html(sub_text, filename)

    # DEBUG CODE: while testing you might want to print date_string and
    # post_html here and return before anything is sent to the blog.

    post = WordPressPost()
    post.title = title
    post.content = post_html
    post.date = post_date
    post.terms_names = wp_terms
    post.post_status = "pending"

    wp.call(NewPost(post))


for filename in sys.argv[1:]:
    # The saved pages are read as latin1.
    with open(filename, "r", encoding="latin1") as f:
        process_file(f.read(), filename)