projects.mako.cc - kuro5hin_to_wordpress/blob - diary_parser.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8  -*-
""" Script to parse raw HTML pages that reflect Kuro5hin diaries and repost them to a WordPress blog using the WordPress XML-RPC API. """
   4
   5 # © Benjamin Mako Hill, 2018
   6 #
   7 # This program is free software: you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation, either version 3 of the License, or
  10 # (at your option) any later version.
  11
  12 from subprocess import check_output
  13 import sys
  14 from lxml import html
  15 import re
  16 from dateutil import parser
  17
  18 from wordpress_xmlrpc import Client, WordPressPost
  19 from wordpress_xmlrpc.methods.posts import GetPosts, NewPost
  20 from wordpress_xmlrpc.methods.users import GetUserInfo
  21
  22 ## EDIT THIS SECTION BEFORE RUNNING!
  23 #########################################################
  24
  25 wp_user = "mako"
  26 wp_password = check_output(["gpg", "--quiet", "--batch", "--decrypt", "/home/mako/.copyrighteous_password.gpg.asc"]).decode("utf-8").strip()
  27 wp_xmlrpc_endpoint = 'https://mako.cc/copyrighteous/xmlrpc.php'
  28 wp_terms = {'post_tag': ['reflections', 'kuro5hin'],
  29             'category': ['Blog Posts']}
  30 archive_base_url = "https://mako.cc/copyrighteous/extra/kuro5hin_archives/"
  31 footer_html = """<hr />\n<p><i>Originally posted as a diary entry on <a href="https://en.wikipedia.org/wiki/Kuro5hin">Kuro5hin</a>. Although Kuro5hin is now defunct, <a href="%(base)s%(filename)s">an archived copy of the post</a> includes a series of comments from the Kuro5hin community.</i></p>"""
  32 ##########################################################
  33
  34 wp = Client(wp_xmlrpc_endpoint, wp_user, wp_password)
  35
  36 def process_file(input_text, filename):
  37     sub = html.fromstring(input_text)
  38
  39     # grab the title of the post
  40     title = sub.xpath("//title")[0].text
  41     title = re.sub(" \|\| kuro5hin\.org$", "", title)
  42
  43     # timestamp
  44     sub_metadata = sub.xpath('//font[text()="%s"]/../../../../font[2]' % title)[0]
  45     date_string = re.sub(r"^.*((Mon|Tue|Wed|Thu|Fri|Sat|Sun).*?EST).*$", r"\1", sub_metadata.text_content())
  46     post_date = parser.parse(date_string)
  47
  48     # post_html
  49     sub_text = sub_metadata.xpath("../../../tr[2]/td[2]/p/font")[0]
  50
  51     # step1: add any material not in a sub_tag
  52     post_html = sub_text.text
  53     if not re.match(r'^\s*$', post_html):
  54         post_html = "<p>%s</p>" % post_html.strip()
  55
  56     # add all the subtags
  57     post_html += "\n".join([html.tostring(x).strip().decode("utf-8") for x in sub_text.getchildren()])
  58     post_html = re.sub(r'<[Bb][Rr]>', '', post_html)
  59     post_html = re.sub(r'[\t ]*(<[Pp]>)\s*', r'\1', post_html)
  60     post_html = re.sub(r'\s*(</[Pp]>)[\t ]*', r'\1', post_html)
  61     post_html = post_html.strip()
  62
  63     post_html = post_html + "\n\n" + (footer_html % {'base' : archive_base_url, 'filename' : filename })
  64
  65     # DEBUG CODE: you might want to uncomment this while testing
  66     # list of comments
  67     # print("***********OUTPUT: %s" % date_string)
  68     # print(post_html)
  69     # return
  70
  71     post = WordPressPost()
  72     post.title = title
  73     post.content = post_html
  74     post.date = post_date
  75     post.terms_names = wp_terms
  76     post.post_status = "pending"
  77
  78     wp.call(NewPost(post))
  79
  80
  81 for filename in sys.argv[1:]:
  82     with open(filename, "r", encoding="latin1") as f:
  83         process_file(f.read(), filename)
  84