From: Benjamin Mako Hill Date: Fri, 11 Jan 2013 18:23:46 +0000 (-0500) Subject: add two modified version and documentation X-Git-Url: https://projects.mako.cc/source/pyblosxom2wxr/commitdiff_plain/f34e97609dfb44f29bfb8aceac0e1170707ba6e0?ds=sidebyside add two modified version and documentation --- diff --git a/README b/README new file mode 100644 index 0000000..e7da2b1 --- /dev/null +++ b/README @@ -0,0 +1,53 @@ +This repository has three branches with three different versions of the +script. + +I've run this twice and I've found that every PyBlosxom instance is a +little different and requires a different (sometimes very different) +version of this script. If you want to use it, you will *need* +to modify it to fit your needs. I've included three versions of this +script in three different branches. Please use git to look at the three +to pick and choose what you want. + +1. pyblosxom2wxr-snarfed.sh: This is the version available on + http://snarfed.org/pyblosxom2wxr + + It assumes posts are .txt files which contain HTML + and assumes both posts and pages as distinct types. + +2. pyblosxom2wxr-revealingerrors.sh: converted http://revealingerrors.com + + This assumes only posts, in .txt files, but which are all in fact + markdown files. It also takes into account 'tags' using the tags.py + plugin and assumes that they are the second line of every file + following a "#tags" line. + + REQUIRES: markdown_py + +3. pyblosxom2wxr-copyrighteous.sh: converted http://mako.cc/copyrighteous + + This deals with two types of files: .txt files which are standard + html pyblosxom files as well as .rst files which are restructured + text. It also looks for comma separated tags in the "#tags " line of + each file. + + REQUIRES: python-docutils + + HUGE WARNING: This version of the script changes the GUID or the + permalinks. It does *not* keep the links that worked with your old + version of PyBlosxom. I did this because I used date-based URLs and I + wanted to get rid of them. 
The script *also* writes a new file, + called "url_mapping_list" in the local directory. I use that to set + up a series of 301 permanent redirects with Apache so that people + going to the old links still arrive at the page. + +Known Bugs / Workaround +------------------------- + +One bug I have not worked around is that comments are created with the +'comment_author' field in the database including either the IP address +or the IP address and timestamp prepended to the name. I think +this might be a bug in the WordPress import code. Rather than debug it, +I wrote a simple perl script (fix_comment.pl) which will fix the +comments in the database. It's an ugly kluge but it worked for me. +Please read it carefully before running it. + diff --git a/convert_rst_frag.py b/convert_rst_frag.py new file mode 100755 index 0000000..d52a948 --- /dev/null +++ b/convert_rst_frag.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python + +from docutils.core import publish_parts +import sys + +settings = { 'initial_header_level': 2, + 'doctitle_xform': 1 } + +raw_rst = ''.join(sys.stdin.readlines()) +html = publish_parts(raw_rst, writer_name='html', + settings_overrides=settings)['body'] +html = html.replace('\n', ' ') + +sys.stdout.write(html.encode('utf-8')) + + + diff --git a/fix_comment.pl b/fix_comment.pl new file mode 100755 index 0000000..9baec03 --- /dev/null +++ b/fix_comment.pl @@ -0,0 +1,55 @@ +#!/usr/bin /perl -w + +use DBI; +my $dbh = DBI->connect('DBI:mysql:copyrighteous:localhost', 'mako') + or die "Cannot connect: " . $DBI::errstr; + + +$sql = qq`SELECT comment_ID, comment_author from wp_comments`; +$sth = $dbh->prepare($sql) or die "Cannot prepare: " . $dbh->errstr(); +$sth->execute() or die "Cannot execute: " . 
$sth->errstr(); + +my @row; +my @fields; +while(@row = $sth->fetchrow_array()) { + my @record = @row; + push(@fields, \@record); +} +$sth->finish(); + +# now process the fields + +my $new_username; +my $comment_id; +if (@fields != 0) { + foreach $line (@fields) { + # datestamp method + if (@$line[1] =~ /^.*\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} (.*)$/) { + $new_username = $1; + } + # just the ip address + elsif (@$line[1] =~ /^\d+\.\d+\.\d+\.\d+ (.*)$/) { + $new_username = $1; + } else { + next; + } + + $comment_id = @$line[0]; + #print "Old: ", @$line[1], "\n"; + print "Updating: ", $comment_id, " ", $new_username, "\n"; + + $sql = qq`UPDATE wp_comments SET comment_author = "$new_username" WHERE comment_ID = $comment_id`; + $sth = $dbh->prepare($sql) or die "Cannot prepare: " . + $dbh->errstr(); + $sth->execute() or die "Cannot execute: " . $sth->errstr(); + $sth->finish(); + #print "row $i - id is @$line[0], name is @$line[1]\n"; + } +} + + + + + + + diff --git a/pyblosxom2wxr-copyrighteous.sh b/pyblosxom2wxr-copyrighteous.sh new file mode 100755 index 0000000..623e337 --- /dev/null +++ b/pyblosxom2wxr-copyrighteous.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# +# pyblosxom2wxr.sh +# +# http://projects.mako.cc/source/pyblosxom2wxr +# Benjamin Mako Hill +# +# http://snarfed.org/pyblosxom2wxr +# Ryan Barrett +# +# Version 0.2.1-cpy. This script is public domain. +# +# This script converts PyBlosxom posts and comments into a WXR (WordPress +# eXtensible RSS) XML file that can be imported into a WordPress blog. +# +# Example usage: +# +# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml +# +# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It +# should work with other versions too, but your mileage may vary. +# +# TODO: comment ordering + +# exit on error +set -e + +# check args +if [[ $# = "0" || $1 = "--help" ]]; then + echo 'Usage: pyblosxom2wxr.sh FILES...' 
+ exit 1 +fi + +# category +all_categories="" +all_tags="" +all_slugs="" + +# create a variable to include all the urls we've seen +url_map_file="./url_mapping_list" +rm $url_map_file; touch $url_map_file + +# comment id sequence number +commentid=1 + +# output header +now=`date --rfc-3339=seconds` +cat << EOF + + + + + + + + + + + http://snarfed.org/pyblosxom2wxr?v=1.0 + en + 1.0 + + +EOF + +# convert blog posts +for file in "$@"; do + + # extract the filetype and then the payload + if [[ ${file##*.} = 'rst' ]]; then + payload=$(tail -n +4 "$file" | python ./convert_rst_frag.py) + fullname=$(basename "$file" ".rst") + elif [[ ${file##*.} = 'txt' ]]; then + filetype=".txt" + payload=$(tail -n +4 "$file"|perl -pe 's/\n/ /g') + fullname=$(basename "$file" ".txt") + fi + + creator="mako" + dir=$(dirname "$file") + commentdir="$(dirname "$file")/comments" + title=$(head -n 1 "$file") + type="post" + + # create a new slug and then save it to the url map file to create a + # set of redirects afteward + slug=$(echo $title | perl -p -e \ + 's/\s*(.*?)\s*$/\1/; s/<\/?\w+>//g; $_ = lc; tr/ /-/; s/[^A-Za-z0-9-]//g; s/-+/-/g') + + # count to see if the slug has been before, and then increement it + slug_count=$(echo "$all_slugs" | perl -ne "print if /$slug(\-\d)?/" | wc -l) + if [[ $slug_count -gt 0 ]]; then + slug="$slug-$(expr $slug_count + 1)" + fi + + # save the new slug to the list of slugs, and write it to the + # url_map_file + all_slugs=$(echo "$all_slugs"; echo "$slug" ) + echo "$fullname $slug" >> $url_map_file + + name=${fullname} + + pubDate=$(date -uR -r $file) + date=$(date -r "$file" +'%F %T') + dateGmt=$(date -u -r "$file" +'%F %T') + + if grep -q ']]>' "$file"; then + echo "WARNING: $file contains the string ]]>, which makes its CDATA " \ + "section invalid. WordPress handles this ok, but still, heads up." 
1>&2 + fi + + cat << EOF + + ${title} + ${pubDate} + /${slug} + + + ${date} + ${dateGmt} + open + open + ${slug} + publish + 0 + 0 + ${type} + + 0 + ${creator} +EOF + + # split the tags + raw_tags=$(grep '#tags' "$file" |perl -pe 's/.tags // '|tr ',' "\n") + for tag in $raw_tags; do + tag=$(echo "$tag"|perl -p -e 's/^\s*(.*)\s*$/\1/') + echo "${tag}" + #echo "$category" + all_tags=$(echo "$tag"; echo "$all_tags") + done; + + # category="uncategorized" + # $category + + # other possible elements: +# /${fullname} +# +# + + for cmtfile in ${commentdir}/"$fullname"-{all,[0-9]*}.cmt; do + if [[ -e "$cmtfile" ]]; then + set +e # because the perl script below uses a non-zero exit code + tail -q -n +2 "$cmtfile" | \ + sed -r ' + s/^$/\nX<\/wp:comment_id>/; + s/^<\/item>$/1<\/wp:comment_approved>\n<\/wp:comment>/; + s/<(\/)?author>/<\1wp:comment_author>/g; + s/<(\/)?link>/<\1wp:comment_author_url>/g; + s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g; + s/<(\/)?description>/<\1wp:comment_content>/g; + s/<(\/)?email>/<\1wp:comment_email>/g; + s/^<(ajax|cmt_date|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//; + s/^<\/?items>$//; + /^$/d' | \ + perl -pe 'use HTML::Entities; decode_entities($_)' | \ + perl -pe 'use POSIX qw(strftime); + s/^(.+)<\/pubDate>$/"" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \ + perl -e ' + my $id = '${commentid}'; + while () { + s/^()X(<\/wp:comment_id>)$/$1 . $id++ . $2/e; + print $_; + } + exit $id - '${commentid}';' + # TODO: this is a hack since exit codes are only 8 bits unsigned. + # this will break on posts with >255 comments. + let commentid+=$? 
+ set -e + fi + done + + cat << EOF + + +EOF +done + +index=2 +#for category in $(echo "$all_categories"|sort|uniq); do +# cat << EOF +# ${index}${category} +#EOF +# index=$(expr $index + 1) +#done + +for tag in $(echo "$all_tags"|sort|uniq); do + cat << EOF + ${index}${tag}${tag} +EOF + index=$(expr $index + 1) +done + +# output footer +cat << EOF + + +EOF + diff --git a/pyblosxom2wxr.sh b/pyblosxom2wxr-revealingerrors.sh similarity index 97% rename from pyblosxom2wxr.sh rename to pyblosxom2wxr-revealingerrors.sh index 4e91ef5..02318c4 100755 --- a/pyblosxom2wxr.sh +++ b/pyblosxom2wxr-revealingerrors.sh @@ -1,9 +1,14 @@ #!/bin/bash # # pyblosxom2wxr.sh +# +# http://projects.mako.cc/source/pyblosxom2wxr +# Benjamin Mako Hill +# # http://snarfed.org/pyblosxom2wxr # Ryan Barrett -# Version 0.2. This script is public domain. +# +# Version 0.2.1-re. This script is public domain. # # This script converts PyBlosxom posts and comments into a WXR (WordPress # eXtensible RSS) XML file that can be imported into a WordPress blog.