From b718cd1999414cb05ab40db5a1cf106dc4521935 Mon Sep 17 00:00:00 2001
From: Benjamin Mako Hill
Date: Thu, 10 Jan 2013 17:32:27 -0500
Subject: [PATCH] the version directly from upstream

---
# Review notes. This is commentary only: it sits between the three-dash line
# and the diff, so it is not part of the commit and the payload below is left
# token-for-token as found (only line breaks and indentation were restored).
#
# Purpose
#   Adds pyblosxom2wxr.sh, a bash script that takes PyBlosxom entry files as
#   arguments and writes a single WXR document to stdout.
#
# Flow of the script, as far as the visible text shows
#   1. `set -e` is enabled; with no arguments, or with --help as the first
#      argument, a usage line is printed and the script exits 1.
#   2. A document header is emitted from a heredoc.
#   3. For every FILE argument:
#      - the first line of the file becomes the title;
#      - the basename (minus .txt) decides the type. If it starts with
#        YYYY-MM-DD it is a "post": the first 10 characters become the date
#        (00:00:00 -0800) and everything from offset 11 on is the post name.
#        Otherwise it is a "page": the date is taken from the first matching
#        line of ${dir}/../timestamps (its trailing -HH-MM rewritten to
#        " HH:MM -0500"), falling back to `stat --format=%y` of the file
#        when nothing matches;
#      - pubDate, date and dateGmt are derived from that string with
#        `date -d`;
#      - a warning goes to stderr if the file contains "]]>";
#      - an item heredoc is emitted using the values computed above.
#   4. For every existing comment file ${dir}/${fullname}-all.cmt or
#      ${dir}/${fullname}-[0-9]*.cmt a pipeline runs: `tail -n +2` drops the
#      first line, sed renames the PyBlosxom comment tags to wp:comment_*
#      names and deletes the listed unwanted tags and empty lines, one perl
#      stage decodes HTML entities, another passes the pubDate value to
#      localtime() and formats it as "%Y-%m-%d %H:%M:%S", and the last perl
#      stage replaces the "X" placeholder with a running comment id.
#   5. The last perl stage exits with the number of ids it handed out; the
#      script adds $? to commentid. That is why `set +e` / `set -e` bracket
#      the pipeline.
#   6. A closing heredoc is emitted per file, and a footer heredoc at the end.
#
# NOTE(review): the text of this patch looks damaged - confirm against the
# upstream copy at http://snarfed.org/pyblosxom2wxr before applying.
#   - Angle-bracket markup appears to have been stripped. The heredocs hold
#     only bare values (no XML elements), the sed replacements contain
#     closing tags such as <\/wp:comment_id> with no opening counterpart,
#     the first sed pattern reads s/^$/.../, the last perl stage contains an
#     empty capture group "()" and a `while ()` loop with no read from
#     STDIN (presumably `while (<>)`), and the From: header has no address.
#   - The hunk header declares 170 added lines, but only about 165 can be
#     recovered from the flattened text, so some line breaks were probably
#     lost together with the stripped markup. The patch is unlikely to apply
#     as it stands.
#   - `now` is assigned but has no visible use; it may have been referenced
#     inside the stripped header markup.
#
# NOTE(review): issues in the script itself, independent of the damage
#   - The comment counter travels through the exit status of the last perl
#     stage, so a single post with more than 255 comments wraps the counter
#     (already flagged by the TODO next to `let commentid+=$?`).
#   - ${timestamp_file} and ${dir} are expanded unquoted (grep argument and
#     the `for cmtfile` glob), so directories containing whitespace or glob
#     characters will be split or expanded.
#   - ${fullname} is interpolated into the grep -E pattern unescaped, and
#     the "." before "txt" is an unescaped regex dot, so page names with
#     regex metacharacters can match the wrong timestamps line.
#   - ${title} is placed into the item heredoc as-is; whether it is escaped
#     or wrapped by the surrounding markup cannot be told from this text.
#   - Uses GNU long options (date --rfc-3339, stat --format,
#     grep --max-count) and requires the perl module HTML::Entities.

 pyblosxom2wxr.sh | 170 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100755 pyblosxom2wxr.sh

diff --git a/pyblosxom2wxr.sh b/pyblosxom2wxr.sh
new file mode 100755
index 0000000..c3218d7
--- /dev/null
+++ b/pyblosxom2wxr.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+#
+# pyblosxom2wxr.sh
+# http://snarfed.org/pyblosxom2wxr
+# Ryan Barrett
+# Version 0.2. This script is public domain.
+#
+# This script converts PyBlosxom posts and comments into a WXR (WordPress
+# eXtensible RSS) XML file that can be imported into a WordPress blog.
+#
+# Example usage:
+#
+# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
+#
+# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
+# should work with other versions too, but your mileage may vary.
+#
+# TODO: comment ordering
+
+# exit on error
+set -e
+
+# check args
+if [[ $# = "0" || $1 = "--help" ]]; then
+  echo 'Usage: pyblosxom2wxr.sh FILES...'
+  exit 1
+fi
+
+# comment id sequence number
+commentid=1
+
+# output header
+now=`date --rfc-3339=seconds`
+cat << EOF
+
+
+
+
+
+
+
+
+
+
+  http://snarfed.org/pyblosxom2wxr?v=1.0
+  en
+  1.0
+
+
+
+
+EOF
+
+# convert comments
+for file in "$@"; do
+  fullname=`basename "$file" .txt`
+  dir=`dirname "$file"`
+  title=`head -n 1 "$file"`
+
+  # TODO: make this easier to customize
+  date_re="[0-9]{4}-[0-9]{2}-[0-9]{2}"
+  time_re="([0-9]{2})-([0-9]{2})"
+
+  # my pyblosxom posts have a date prefix, e.g. 2010-03-13. my pages don't.
+  if [[ "$fullname" =~ ^${date_re} ]]; then
+    type=post
+    name=${fullname:11}
+    datestr="${fullname::10} 00:00:00 -0800"
+  else
+    type=page
+    name=${fullname}
+
+    timestamp_file=${dir}/../timestamps
+    datestr=`grep --max-count=1 -E \
+      "^${date_re}-${time_re} (.+/)?${fullname}.txt\$" ${timestamp_file} | \
+      cut -f1 -d' ' | \
+      sed -r "s/-${time_re}\$/ \1:\2 -0500/"`
+
+    if [[ ${datestr} == '' ]]; then
+      datestr=`stat --format=%y "$file"`
+    fi
+  fi
+
+  pubDate=`date -uR -d "$datestr"`
+  date=`date -d "$datestr" +'%F %T'`
+  dateGmt=`date -u -d "$datestr" +'%F %T'`
+
+  # TODO: category support
+  category="uncategorized"
+
+  if grep -q ']]>' "$file"; then
+    echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
+      "section invalid. WordPress handles this ok, but still, heads up." 1>&2
+  fi
+
+  cat << EOF
+
+  ${title}
+  ${pubDate}
+  $category
+  /${fullname}
+
+
+  ${date}
+  ${dateGmt}
+  open
+  open
+  ${name}
+  publish
+  0
+  0
+  ${type}
+
+  0
+EOF
+
+  # other possible elements:
+# /${fullname}
+#
+#
+# ${creator}
+
+
+  for cmtfile in ${dir}/"$fullname"-{all,[0-9]*}.cmt; do
+    if [[ -e "$cmtfile" ]]; then
+      set +e  # because the perl script below uses a non-zero exit code
+      tail -q -n +2 "$cmtfile" | \
+        sed -r '
+          s/^$/\nX<\/wp:comment_id>/;
+          s/^<\/item>$/1<\/wp:comment_approved>\n<\/wp:comment>/;
+          s/<(\/)?author>/<\1wp:comment_author>/g;
+          s/<(\/)?link>/<\1wp:comment_author_url>/g;
+          s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
+          s/<(\/)?description>/<\1wp:comment_content>/g;
+          s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
+          s/^<\/?items>$//;
+          /^$/d' | \
+        perl -pe 'use HTML::Entities; decode_entities($_)' | \
+        perl -pe 'use POSIX qw(strftime);
+          s/^(.+)<\/pubDate>$/"" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
+        perl -e '
+          my $id = '${commentid}';
+          while () {
+            s/^()X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
+            print $_;
+          }
+          exit $id - '${commentid}';'
+      # TODO: this is a hack since exit codes are only 8 bits unsigned.
+      # this will break on posts with >255 comments.
+      let commentid+=$?
+      set -e
+    fi
+  done
+
+  cat << EOF
+
+
+EOF
+done
+
+# output footer
+cat << EOF
+
+
+EOF
-- 
2.39.5