add two modified version and documentation master
authorBenjamin Mako Hill <mako@atdot.cc>
Fri, 11 Jan 2013 18:23:46 +0000 (13:23 -0500)
committerBenjamin Mako Hill <mako@atdot.cc>
Fri, 11 Jan 2013 18:23:46 +0000 (13:23 -0500)
README [new file with mode: 0644]
convert_rst_frag.py [new file with mode: 0755]
fix_comment.pl [new file with mode: 0755]
pyblosxom2wxr-copyrighteous.sh [new file with mode: 0755]
pyblosxom2wxr-revealingerrors.sh [moved from pyblosxom2wxr.sh with 97% similarity]

diff --git a/README b/README
new file mode 100644 (file)
index 0000000..e7da2b1
--- /dev/null
+++ b/README
@@ -0,0 +1,53 @@
+This repository has three branches with three different versions of the
+script.
+
+I've run this twice and I've found that every PyBlosxom instance is a
+little different and requires a different (sometimes very different)
+version of this script. If you want to use it, you will *need*
+to modify it to fit your needs. I've included three versions of this
+script in three different branches. Please use git to look at the three
+to pick and choose what you want.
+
+1. pyblosxom2wxr-snarfed.sh: This is the version available on
+   http://snarfed.org/pyblosxom2wxr
+
+   It assumes posts are HTML stored in files with a .txt extension
+   and treats posts and pages as distinct types.
+
+2. pyblosxom2wxr-revealingerrors.sh: converted http://revealingerrors.com
+   
+   This assumes only posts, in .txt files, but which are all in fact
+   markdown files. It also takes into account 'tags' using the tags.py
+   plugin and assumes that they are the second line of every file
+   following a "#tags" line.
+
+   REQUIRES: markdown_py
+   
+3. pyblosxom2wxr-copyrighteous.sh: converted http://mako.cc/copyrighteous
+
+   This deals with two types of files: .txt files which are standard
+   html pyblosxom files as well as .rst files which are restructured
+   text. It also looks for comma separated tags in the "#tags " line of
+   each file.
+
+   REQUIRES: python-docutils
+
+   HUGE WARNING: This version of the script changes the GUID or the
+   permalinks. It does *not* keep the links that worked with your old
+   version of PyBlosxom. I did this because I used date-based URLs and I
+   wanted to get rid of them. The script *also* writes a new file,
+   called "url_mapping_list" in the local directory. I use that to set
+   up a series of 301 permanent redirects with Apache so that people
+   going to the old links still arrive at the page. 
+
+Known Bugs / Workarounds
+-------------------------
+
+One bug I have not worked around is that comments are created with the
+'comment_author' field in the database including either the IP address
+or the IP address and a timestamp prepended to the name. I think
+this might be a bug in the Wordpress import code. Rather than debug it,
+I wrote a simple perl script (fix_comment.pl) which will fix the
+comments in the database. It's an ugly kluge but it worked for me.
+Please read it carefully before running it.
+
diff --git a/convert_rst_frag.py b/convert_rst_frag.py
new file mode 100755 (executable)
index 0000000..d52a948
--- /dev/null
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+from docutils.core import publish_parts
+import sys
+
+settings = { 'initial_header_level': 2,
+             'doctitle_xform': 1 }
+
+raw_rst = ''.join(sys.stdin.readlines())
+html = publish_parts(raw_rst, writer_name='html',
+                     settings_overrides=settings)['body']
+html = html.replace('\n', ' ')
+
+sys.stdout.write(html.encode('utf-8'))
+
+
+
diff --git a/fix_comment.pl b/fix_comment.pl
new file mode 100755 (executable)
index 0000000..9baec03
--- /dev/null
@@ -0,0 +1,55 @@
+#!/usr/bin/perl -w
+#
+# Strip the IP address (and optional timestamp) that ends up prefixed to
+# wp_comments.comment_author after importing pyblosxom2wxr output.
+# The database name, host and user below are hard-coded: edit them, and
+# read the two patterns carefully, before running this on your own data.
+
+use strict;
+use DBI;
+
+my $dbh = DBI->connect('DBI:mysql:copyrighteous:localhost', 'mako')
+    or die "Cannot connect: " . $DBI::errstr;
+
+# Read every (comment_ID, comment_author) pair up front so that the
+# SELECT is finished before any UPDATE is issued.
+my $select = $dbh->prepare(
+    q{SELECT comment_ID, comment_author from wp_comments})
+    or die "Cannot prepare: " . $dbh->errstr();
+$select->execute() or die "Cannot execute: " . $select->errstr();
+
+my @fields;
+while (my @row = $select->fetchrow_array()) {
+    push @fields, [@row];
+}
+$select->finish();
+
+# Prepared once with placeholders: author names that contain quotes or
+# backslashes can no longer break (or inject into) the UPDATE statement.
+my $update = $dbh->prepare(
+    q{UPDATE wp_comments SET comment_author = ? WHERE comment_ID = ?})
+    or die "Cannot prepare: " . $dbh->errstr();
+
+foreach my $line (@fields) {
+    my ($comment_id, $author) = @$line;
+    next unless defined $author;
+    my $new_username;
+
+    # datestamp method: "<anything>YYYY-MM-DD HH:MM:SS <name>"
+    if ($author =~ /^.*\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (.*)$/) {
+        $new_username = $1;
+    }
+    # just the ip address: "a.b.c.d <name>"
+    elsif ($author =~ /^\d+\.\d+\.\d+\.\d+ (.*)$/) {
+        $new_username = $1;
+    }
+    else {
+        next;    # neither prefix is present; leave the row alone
+    }
+
+    print "Updating: ", $comment_id, " ", $new_username, "\n";
+    $update->execute($new_username, $comment_id)
+        or die "Cannot execute: " . $update->errstr();
+}
+
+$dbh->disconnect();
diff --git a/pyblosxom2wxr-copyrighteous.sh b/pyblosxom2wxr-copyrighteous.sh
new file mode 100755 (executable)
index 0000000..623e337
--- /dev/null
@@ -0,0 +1,213 @@
+#!/bin/bash
+#
+# pyblosxom2wxr.sh
+#
+# http://projects.mako.cc/source/pyblosxom2wxr
+# Benjamin Mako Hill <mako@atdot.cc>
+#
+# http://snarfed.org/pyblosxom2wxr
+# Ryan Barrett <pyblosxom2wxr@ryanb.org>
+#
+# Version 0.2.1-cpy. This script is public domain.
+#
+# This script converts PyBlosxom posts and comments into a WXR (WordPress
+# eXtensible RSS) XML file that can be imported into a WordPress blog.
+#
+# Example usage:
+#
+# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
+#
+# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
+# should work with other versions too, but your mileage may vary.
+#
+# TODO: comment ordering
+
+# exit on error
+set -e
+
+# check args
+if [[ $# = "0" || $1 = "--help" ]]; then
+  echo 'Usage: pyblosxom2wxr.sh FILES...'
+  exit 1
+fi
+
+# category
+all_categories=""
+all_tags=""
+all_slugs=""
+
+# create a variable to include all the urls we've seen
+url_map_file="./url_mapping_list"
+rm $url_map_file; touch $url_map_file
+
+# comment id sequence number
+commentid=1
+
+# emit the WXR preamble; ${now} is stamped into the generator comment
+now=$(date --rfc-3339=seconds)
+cat <<EOF
+<?xml version="1.0" encoding="UTF-8"?> 
+
+<!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
+<rss version="2.0"
+  xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
+  xmlns:content="http://purl.org/rss/1.0/modules/content/"
+  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
+  xmlns:dc="http://purl.org/dc/elements/1.1/"
+  xmlns:wp="http://wordpress.org/export/1.0/">
+
+<channel>
+  <title></title>
+  <link></link>
+  <description></description>
+  <pubDate></pubDate>
+  <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
+  <language>en</language>
+  <wp:wxr_version>1.0</wp:wxr_version>
+  <wp:base_site_url></wp:base_site_url>
+  <wp:base_blog_url></wp:base_blog_url>
+EOF
+
+# convert blog posts
+for file in "$@"; do
+
+  # extract the filetype and then the payload
+  if [[ ${file##*.} = 'rst' ]]; then
+    payload=$(tail -n +4 "$file" | python ./convert_rst_frag.py)
+    fullname=$(basename "$file" ".rst")
+  elif [[ ${file##*.} = 'txt' ]]; then
+    filetype=".txt"
+    payload=$(tail -n +4 "$file"|perl -pe 's/\n/ /g')
+    fullname=$(basename "$file" ".txt")
+  fi
+
+  creator="mako"
+  dir=$(dirname "$file")
+  commentdir="$(dirname "$file")/comments"
+  title=$(head -n 1 "$file")
+  type="post"
+
+  # create a new slug and then save it to the url map file to create a
+  # set of redirects afteward
+  slug=$(echo $title | perl -p -e \
+    's/\s*(.*?)\s*$/\1/; s/<\/?\w+>//g; $_ = lc; tr/ /-/; s/[^A-Za-z0-9-]//g; s/-+/-/g')
+
+  # count to see if the slug has been before, and then increement it
+  slug_count=$(echo "$all_slugs" | perl -ne "print if /$slug(\-\d)?/" | wc -l)
+  if [[ $slug_count -gt 0 ]]; then
+    slug="$slug-$(expr $slug_count + 1)"
+  fi
+
+  # save the new slug to the list of slugs, and write it to the
+  # url_map_file
+  all_slugs=$(echo "$all_slugs"; echo "$slug" )
+  echo "$fullname $slug" >> $url_map_file
+
+  name=${fullname}
+
+  pubDate=$(date -uR -r $file)
+  date=$(date -r "$file" +'%F %T')
+  dateGmt=$(date -u -r "$file" +'%F %T')
+
+  if grep -q ']]>' "$file"; then
+    echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
+         "section invalid. WordPress handles this ok, but still, heads up." 1>&2
+  fi
+
+  cat << EOF
+<item>
+  <title>${title}</title>
+  <pubDate>${pubDate}</pubDate>
+  <guid isPermaLink="true">/${slug}</guid>
+  <description></description>
+  <content:encoded><![CDATA[${payload}]]></content:encoded>
+  <wp:post_date>${date}</wp:post_date>
+  <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
+  <wp:comment_status>open</wp:comment_status>
+  <wp:ping_status>open</wp:ping_status>
+  <wp:post_name>${slug}</wp:post_name>
+  <wp:status>publish</wp:status>
+  <wp:post_parent>0</wp:post_parent>
+  <wp:menu_order>0</wp:menu_order>
+  <wp:post_type>${type}</wp:post_type>
+  <wp:post_password></wp:post_password>
+  <wp:is_sticky>0</wp:is_sticky>
+  <dc:creator>${creator}</dc:creator>
+EOF
+  # split the tags
+  raw_tags=$(grep '#tags' "$file" |perl -pe 's/.tags // '|tr ',' "\n")
+  for tag in $raw_tags; do
+    tag=$(echo "$tag"|perl -p -e 's/^\s*(.*)\s*$/\1/')
+    echo "<category domain=\"post_tag\" nicename=\"${tag}\">${tag}</category>"
+    #echo "<category domain=\"category\" nicename=\"$category\">$category</category>"
+    all_tags=$(echo "$tag"; echo "$all_tags")
+  done;
+
+  # category="uncategorized"
+  # <category domain="category" nicename="$category">$category</category>
+
+  # other possible elements:
+#  <link>/${fullname}</link>
+#  <wp:post_id></wp:post_id>
+#  <excerpt:encoded></excerpt:encoded>
+
+  for cmtfile in ${commentdir}/"$fullname"-{all,[0-9]*}.cmt; do
+    if [[ -e "$cmtfile" ]]; then
+      set +e  # because the perl script below uses a non-zero exit code
+      tail -q -n +2 "$cmtfile" | \
+        sed -r '
+          s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
+          s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
+          s/<(\/)?author>/<\1wp:comment_author>/g;
+          s/<(\/)?link>/<\1wp:comment_author_url>/g;
+          s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
+          s/<(\/)?description>/<\1wp:comment_content>/g;
+          s/<(\/)?email>/<\1wp:comment_email>/g;
+          s/^<(ajax|cmt_date|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
+          s/^<\/?items>$//;
+          /^$/d' | \
+        perl -pe 'use HTML::Entities; decode_entities($_)' | \
+        perl -pe 'use POSIX qw(strftime);
+                  s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
+        perl -e '
+          my $id = '${commentid}';
+          while (<STDIN>) {
+             s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
+             print $_;
+          }
+          exit $id - '${commentid}';'
+      # TODO: this is a hack since exit codes are only 8 bits unsigned.
+      # this will break on posts with >255 comments.
+      let commentid+=$?
+      set -e
+    fi
+  done
+
+  cat << EOF
+</item>
+
+EOF
+done
+
+index=2
+#for category in $(echo "$all_categories"|sort|uniq); do
+#    cat << EOF
+#    <wp:category><wp:term_id>${index}</wp:term_id><wp:category_nicename>${category}</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
+#EOF
+#    index=$(expr $index + 1)
+#done
+
+for tag in $(echo "$all_tags"|sort|uniq); do
+    cat << EOF
+    <wp:tag><wp:term_id>${index}</wp:term_id><wp:tag_slug>${tag}</wp:tag_slug><wp:tag_name>${tag}</wp:tag_name></wp:tag>
+EOF
+    index=$(expr $index + 1)
+done
+
+# output footer
+cat << EOF
+</channel>
+</rss>
+EOF
+
similarity index 97%
rename from pyblosxom2wxr.sh
rename to pyblosxom2wxr-revealingerrors.sh
index 4e91ef54aa51e07f78a12b83c27a250176e504a4..02318c4e3fac2219f3910ef854ba884ce0b800b5 100755 (executable)
@@ -1,9 +1,14 @@
 #!/bin/bash
 #
 # pyblosxom2wxr.sh
+#
+# http://projects.mako.cc/source/pyblosxom2wxr
+# Benjamin Mako Hill <mako@atdot.cc>
+#
 # http://snarfed.org/pyblosxom2wxr
 # Ryan Barrett <pyblosxom2wxr@ryanb.org>
-# Version 0.2. This script is public domain.
+#
+# Version 0.2.1-re. This script is public domain.
 #
 # This script converts PyBlosxom posts and comments into a WXR (WordPress
 # eXtensible RSS) XML file that can be imported into a WordPress blog.

Benjamin Mako Hill || Want to submit a patch?