From 63b7950374b42e5bb351e534dfa3c150c3d5a285 Mon Sep 17 00:00:00 2001 From: Benjamin Mako Hill Date: Fri, 11 Jan 2013 13:14:51 -0500 Subject: [PATCH] added upstream version and docs as a separate file --- README.snarfed | 58 +++++++++++++ pyblosxom2wxr-snarfed.sh | 170 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 228 insertions(+) create mode 100644 README.snarfed create mode 100755 pyblosxom2wxr-snarfed.sh diff --git a/README.snarfed b/README.snarfed new file mode 100644 index 0000000..c655079 --- /dev/null +++ b/README.snarfed @@ -0,0 +1,58 @@ +NOTE: The latest version of this file is online here: + http://snarfed.org/pyblosxom2wxr + +pyblosxom2wxr.sh is a shell script that migrates content from PyBlosxom +to WordPress. It converts PyBlosxom posts and comments into a WXR +(WordPress eXtensible RSS) file that can be imported into WordPress. + +Notes: + +- The post file extension is hard-coded to .txt, since that’s what + PyBlosxom expects. + +- Pages are supported as well as posts. pyblosxom2wxr assumes that post + filenames start with the date, in YYYY-MM-DD format, e.g. + 2010-07-28_my_post.txt. Files without a prefix in that format are + assumed to be pages. (This is hard coded but would be easy to change. + Search for the date_re variable.) + +- The filename is used as the WordPress post/page GUID, and the first + line of the file is extracted and used as the title. The second line + is assumed to be blank. If your files don’t follow that format, you’ll + want to preprocess them or tweak the script. + +- Categories are not (yet) supported. All posts and pages are assigned + to the “uncategorized” category in WordPress. + +- WordPress limits import files to 2MB, but pyblosxom2wxr can generate + output files larger than that. If that happens, you can split it + manually or with a tool like ChoppedPress. + +- By default, the last modified time of post and page files is used as + their timestamp. 
However, if you have a timestamps file from the + hardcodedates PyBlosxom plugin, it will be used instead. The default + path is ../timestamps; you can customize this by editing the + timestamp_file variable in the script. + +- If you use Markdown or another markup language where line breaks and + whitespace are meaningful, you’ll want to apply this patch to the + WordPress importer. + +- pyblosxom2wxr doesn’t assign post ids. It omits wp:post_id elements + in the output file. This makes WordPress allocate post ids itself. + +- However, WordPress won’t allocate comment ids itself, so pyblosxom2wxr + has to do that and populate them in wp:comment_id elements. This + means that importing a WXR file generated by pyblosxom2wxr may + overwrite any existing comments! + +- If you use PyBlosxom’s compact_comments.sh, comments imported from + -all.cmt files may not be ordered by date. See my page on extracting + compacted PyBlosxom comments for a workaround. + +Known bugs: + +- Posts with more than 256 comments are not supported well. Only the + last 256 comments will be imported, and will likely be ordered wrong. + See the TODO near the end of the script. + diff --git a/pyblosxom2wxr-snarfed.sh b/pyblosxom2wxr-snarfed.sh new file mode 100755 index 0000000..c3218d7 --- /dev/null +++ b/pyblosxom2wxr-snarfed.sh @@ -0,0 +1,170 @@ +#!/bin/bash +# +# pyblosxom2wxr.sh +# http://snarfed.org/pyblosxom2wxr +# Ryan Barrett +# Version 0.2. This script is public domain. +# +# This script converts PyBlosxom posts and comments into a WXR (WordPress +# eXtensible RSS) XML file that can be imported into a WordPress blog. +# +# Example usage: +# +# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml +# +# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It +# should work with other versions too, but your mileage may vary. +# +# TODO: comment ordering + +# exit on error +set -e + +# check args +if [[ $# = "0" || $1 = "--help" ]]; then + echo 'Usage: pyblosxom2wxr.sh FILES...' 
+ exit 1 +fi + +# comment id sequence number +commentid=1 + +# output header +now=`date --rfc-3339=seconds` +cat << EOF + + + + + + + + + + + http://snarfed.org/pyblosxom2wxr?v=1.0 + en + 1.0 + + + + +EOF + +# convert comments +for file in "$@"; do + fullname=`basename "$file" .txt` + dir=`dirname "$file"` + title=`head -n 1 "$file"` + + # TODO: make this easier to customize + date_re="[0-9]{4}-[0-9]{2}-[0-9]{2}" + time_re="([0-9]{2})-([0-9]{2})" + + # my pyblosxom posts have a date prefix, e.g. 2010-03-13. my pages don't. + if [[ "$fullname" =~ ^${date_re} ]]; then + type=post + name=${fullname:11} + datestr="${fullname::10} 00:00:00 -0800" + else + type=page + name=${fullname} + + timestamp_file=${dir}/../timestamps + datestr=`grep --max-count=1 -E \ + "^${date_re}-${time_re} (.+/)?${fullname}.txt\$" ${timestamp_file} | \ + cut -f1 -d' ' | \ + sed -r "s/-${time_re}\$/ \1:\2 -0500/"` + + if [[ ${datestr} == '' ]]; then + datestr=`stat --format=%y "$file"` + fi + fi + + pubDate=`date -uR -d "$datestr"` + date=`date -d "$datestr" +'%F %T'` + dateGmt=`date -u -d "$datestr" +'%F %T'` + + # TODO: category support + category="uncategorized" + + if grep -q ']]>' "$file"; then + echo "WARNING: $file contains the string ]]>, which makes its CDATA " \ + "section invalid. WordPress handles this ok, but still, heads up." 
1>&2 + fi + + cat << EOF + + ${title} + ${pubDate} + $category + /${fullname} + + + ${date} + ${dateGmt} + open + open + ${name} + publish + 0 + 0 + ${type} + + 0 +EOF + + # other possible elements: +# /${fullname} +# +# +# ${creator} + + + for cmtfile in ${dir}/"$fullname"-{all,[0-9]*}.cmt; do + if [[ -e "$cmtfile" ]]; then + set +e # because the perl script below uses a non-zero exit code + tail -q -n +2 "$cmtfile" | \ + sed -r ' + s/^$/\nX<\/wp:comment_id>/; + s/^<\/item>$/1<\/wp:comment_approved>\n<\/wp:comment>/; + s/<(\/)?author>/<\1wp:comment_author>/g; + s/<(\/)?link>/<\1wp:comment_author_url>/g; + s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g; + s/<(\/)?description>/<\1wp:comment_content>/g; + s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//; + s/^<\/?items>$//; + /^$/d' | \ + perl -pe 'use HTML::Entities; decode_entities($_)' | \ + perl -pe 'use POSIX qw(strftime); + s/^(.+)<\/pubDate>$/"" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \ + perl -e ' + my $id = '${commentid}'; + while () { + s/^()X(<\/wp:comment_id>)$/$1 . $id++ . $2/e; + print $_; + } + exit $id - '${commentid}';' + # TODO: this is a hack since exit codes are only 8 bits unsigned. + # this will break on posts with >255 comments. + let commentid+=$? + set -e + fi + done + + cat << EOF + + +EOF +done + +# output footer +cat << EOF + + +EOF -- 2.39.5