+#!/bin/bash
+#
+# pyblosxom2wxr.sh
+#
+# http://projects.mako.cc/source/pyblosxom2wxr
+# Benjamin Mako Hill <mako@atdot.cc>
+#
+# http://snarfed.org/pyblosxom2wxr
+# Ryan Barrett <pyblosxom2wxr@ryanb.org>
+#
+# Version 0.2.1-cpy. This script is public domain.
+#
+# This script converts PyBlosxom posts and comments into a WXR (WordPress
+# eXtensible RSS) XML file that can be imported into a WordPress blog.
+#
+# Example usage:
+#
+# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
+#
+# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
+# should work with other versions too, but your mileage may vary.
+#
+# TODO: comment ordering
+
+# exit on error
+set -e
+
+# check args
+if [[ $1 = "--help" ]]; then
+  # Help was requested explicitly: print it on stdout and report success.
+  echo 'Usage: pyblosxom2wxr.sh FILES...'
+  exit 0
+elif (( $# == 0 )); then
+  # Usage error: keep the message off stdout, which is normally redirected
+  # into the generated XML file (see the example usage in the header).
+  echo 'Usage: pyblosxom2wxr.sh FILES...' 1>&2
+  exit 1
+fi
+
+# category
+all_categories=""
+all_tags=""
+all_slugs=""
+
+# create a variable to include all the urls we've seen
+url_map_file="./url_mapping_list"
+rm $url_map_file; touch $url_map_file
+
+# comment id sequence number
+commentid=1
+
+# Emit the XML prolog and open the RSS channel. The current time is stamped
+# into the generator comment.
+now="$(date --rfc-3339=seconds)"
+cat << EOF
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
+<rss version="2.0"
+ xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
+ xmlns:content="http://purl.org/rss/1.0/modules/content/"
+ xmlns:wfw="http://wellformedweb.org/CommentAPI/"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:wp="http://wordpress.org/export/1.0/">
+
+<channel>
+ <title></title>
+ <link></link>
+ <description></description>
+ <pubDate></pubDate>
+ <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
+ <language>en</language>
+ <wp:wxr_version>1.0</wp:wxr_version>
+ <wp:base_site_url></wp:base_site_url>
+ <wp:base_blog_url></wp:base_blog_url>
+EOF
+
+# convert blog posts: every file named on the command line becomes one
+# WXR <item>, followed by its tags and its comments
+for file in "$@"; do
+
+  # Choose the converter by file extension. The first line of a post is its
+  # title; the body handed to the converter starts at line 4.
+  case "${file##*.}" in
+    rst)
+      payload=$(tail -n +4 "$file" | python ./convert_rst_frag.py)
+      fullname=$(basename "$file" ".rst")
+      ;;
+    txt)
+      payload=$(tail -n +4 "$file" | perl -pe 's/\n/ /g')
+      fullname=$(basename "$file" ".txt")
+      ;;
+    *)
+      # Any other extension used to fall through with payload and fullname
+      # still holding the previous post's values; skip the file instead.
+      echo "WARNING: skipping $file: expected a .rst or .txt file" 1>&2
+      continue
+      ;;
+  esac
+
+  creator="mako"
+  dir=$(dirname "$file")
+  commentdir="${dir}/comments"
+  title=$(head -n 1 "$file")
+  type="post"
+
+  # Build a slug from the title: trim it, drop simple HTML tags, lowercase
+  # it, turn runs of blanks into a dash, delete everything that is not
+  # alphanumeric or a dash, and squeeze repeated dashes. printf with a
+  # quoted argument keeps the shell from glob-expanding the title or
+  # treating it as an echo option.
+  slug=$(printf '%s\n' "$title" | perl -p -e \
+    's/\s*(.*?)\s*$/$1/; s/<\/?\w+>//g; $_ = lc; s/[ \t]+/-/g; s/[^A-Za-z0-9-]//g; s/-+/-/g')
+
+  # Make the slug unique among the slugs handed out so far by appending
+  # -2, -3, ... until there is no exact, whole-line match. A substring
+  # match would renumber unrelated slugs and could hand out a duplicate
+  # (titles "A 2" then "A" would both end up as "a-2").
+  base_slug=$slug
+  suffix=1
+  while grep -Fxq -- "$slug" <<< "$all_slugs"; do
+    suffix=$((suffix + 1))
+    slug="${base_slug}-${suffix}"
+  done
+
+  # Remember the slug and record "<original name> <slug>" in the url map
+  # file so that a set of redirects can be created afterwards.
+  all_slugs=$(printf '%s\n' "$all_slugs" "$slug")
+  printf '%s\n' "$fullname $slug" >> "$url_map_file"
+
+  name=${fullname}
+
+  # All post dates come from the file's modification time.
+  pubDate=$(date -uR -r "$file")
+  date=$(date -r "$file" +'%F %T')
+  dateGmt=$(date -u -r "$file" +'%F %T')
+
+  if grep -q ']]>' "$file"; then
+    echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
+      "section invalid. WordPress handles this ok, but still, heads up." 1>&2
+  fi
+
+  cat << EOF
+<item>
+ <title>${title}</title>
+ <pubDate>${pubDate}</pubDate>
+ <guid isPermaLink="true">/${slug}</guid>
+ <description></description>
+ <content:encoded><![CDATA[${payload}]]></content:encoded>
+ <wp:post_date>${date}</wp:post_date>
+ <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
+ <wp:comment_status>open</wp:comment_status>
+ <wp:ping_status>open</wp:ping_status>
+ <wp:post_name>${slug}</wp:post_name>
+ <wp:status>publish</wp:status>
+ <wp:post_parent>0</wp:post_parent>
+ <wp:menu_order>0</wp:menu_order>
+ <wp:post_type>${type}</wp:post_type>
+ <wp:post_password></wp:post_password>
+ <wp:is_sticky>0</wp:is_sticky>
+ <dc:creator>${creator}</dc:creator>
+EOF
+
+  # Split the "#tags" line(s) on commas and blanks, one tag per word, and
+  # read the words line by line so that tag text is never glob-expanded.
+  # NOTE(review): as before, a tag containing spaces becomes several tags.
+  while read -r tag; do
+    [[ -n "$tag" ]] || continue
+    echo "<category domain=\"post_tag\" nicename=\"${tag}\">${tag}</category>"
+    #echo "<category domain=\"category\" nicename=\"$category\">$category</category>"
+    all_tags=$(printf '%s\n' "$tag" "$all_tags")
+  done < <(grep '#tags' "$file" | perl -pe 's/.tags // ' | tr ', \t' '\n\n\n')
+
+  # category="uncategorized"
+  # <category domain="category" nicename="$category">$category</category>
+
+  # other possible elements:
+  # <link>/${fullname}</link>
+  # <wp:post_id></wp:post_id>
+  # <excerpt:encoded></excerpt:encoded>
+
+  for cmtfile in "${commentdir}/${fullname}"-{all,[0-9]*}.cmt; do
+    # A pattern that matched nothing is left as a literal word; skip it.
+    [[ -e "$cmtfile" ]] || continue
+
+    # Rewrite the comment file into <wp:comment> elements and number them.
+    # The converted XML must reach the script's stdout, so inside the
+    # command substitution the last perl's stdout is pointed at fd 4 (a
+    # copy of the script's stdout) while the number of ids it consumed is
+    # written to fd 3, which is what the command substitution captures.
+    # This replaces passing the count through the exit status, which
+    # wrapped around on posts with more than 255 comments.
+    {
+      used_ids=$(
+        tail -q -n +2 "$cmtfile" \
+          | sed -r '
+              s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
+              s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
+              s/<(\/)?author>/<\1wp:comment_author>/g;
+              s/<(\/)?link>/<\1wp:comment_author_url>/g;
+              s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
+              s/<(\/)?description>/<\1wp:comment_content>/g;
+              s/<(\/)?email>/<\1wp:comment_email>/g;
+              s/^<(ajax|cmt_date|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
+              s/^<\/?items>$//;
+              /^$/d' \
+          | perl -pe 'use HTML::Entities; decode_entities($_)' \
+          | perl -pe 'use POSIX qw(strftime);
+              s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' \
+          | perl -e '
+              my $first = shift @ARGV;
+              my $id = $first;
+              open(my $count_out, ">&=", 3) or die "cannot open fd 3: $!";
+              while (<STDIN>) {
+                s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
+                print $_;
+              }
+              print {$count_out} $id - $first;' "$commentid" 3>&1 1>&4
+      )
+    } 4>&1
+    commentid=$((commentid + ${used_ids:-0}))
+  done
+
+  cat << EOF
+</item>
+
+EOF
+done
+
+# Term ids are handed out sequentially, starting at 2.
+index=2
+#for category in $(echo "$all_categories"|sort|uniq); do
+# cat << EOF
+# <wp:category><wp:term_id>${index}</wp:term_id><wp:category_nicename>${category}</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
+#EOF
+# index=$(expr $index + 1)
+#done
+
+# Emit one <wp:tag> definition per distinct tag collected in all_tags.
+# The list is read line by line rather than through an unquoted $(...)
+# word list, so a tag such as "*" is not glob-expanded into file names.
+while read -r tag; do
+  # all_tags may be empty or contain blank lines; those are not tags.
+  [[ -n "$tag" ]] || continue
+  cat << EOF
+ <wp:tag><wp:term_id>${index}</wp:term_id><wp:tag_slug>${tag}</wp:tag_slug><wp:tag_name>${tag}</wp:tag_name></wp:tag>
+EOF
+  index=$((index + 1))
+done < <(printf '%s\n' "$all_tags" | sort | uniq)
+
+# output footer: close the channel and the rss document
+printf '%s\n' '</channel>' '</rss>'
+