projects.mako.cc - pyblosxom2wxr/blob - pyblosxom2wxr.sh

   1 #!/bin/bash
   2 #
   3 # pyblosxom2wxr.sh
   4 # http://snarfed.org/pyblosxom2wxr
   5 # Ryan Barrett <pyblosxom2wxr@ryanb.org>
   6 # Version 0.2. This script is public domain.
   7 #
   8 # This script converts PyBlosxom posts and comments into a WXR (WordPress
   9 # eXtensible RSS) XML file that can be imported into a WordPress blog.
  10 #
  11 # Example usage:
  12 #
  13 # $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
  14 #
  15 # pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
  16 # should work with other versions too, but your mileage may vary.
  17 #
  18 # TODO: comment ordering
  19
  20 # exit on error
  21 set -e
  22
  23 # check args
  24 if [[ $# = "0" || $1 = "--help" ]]; then
  25   echo 'Usage: pyblosxom2wxr.sh FILES...'
  26   exit 1
  27 fi
  28
  29 # comment id sequence number
  30 commentid=1
  31
  32 # output header
  33 now=`date --rfc-3339=seconds`
  34 cat << EOF
  35 <?xml version="1.0" encoding="UTF-8"?>
  36
  37 <!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
  38 <rss version="2.0"
  39   xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
  40   xmlns:content="http://purl.org/rss/1.0/modules/content/"
  41   xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  42   xmlns:dc="http://purl.org/dc/elements/1.1/"
  43   xmlns:wp="http://wordpress.org/export/1.0/">
  44
  45 <channel>
  46   <title></title>
  47   <link></link>
  48   <description></description>
  49   <pubDate></pubDate>
  50   <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
  51   <language>en</language>
  52   <wp:wxr_version>1.0</wp:wxr_version>
  53   <wp:base_site_url></wp:base_site_url>
  54   <wp:base_blog_url></wp:base_blog_url>
  55   <wp:category></wp:category>
  56
  57 EOF
  58
  59 # convert comments
  60 for file in "$@"; do
  61   fullname=`basename "$file" .txt`
  62   dir=`dirname "$file"`
  63   title=`head -n 1 "$file"`
  64
  65   # TODO: make this easier to customize
  66   date_re="[0-9]{4}-[0-9]{2}-[0-9]{2}"
  67   time_re="([0-9]{2})-([0-9]{2})"
  68
  69   # my pyblosxom posts have a date prefix, e.g. 2010-03-13. my pages don't.
  70   if [[ "$fullname" =~ ^${date_re} ]]; then
  71     type=post
  72     name=${fullname:11}
  73     datestr="${fullname::10} 00:00:00 -0800"
  74   else
  75     type=page
  76     name=${fullname}
  77
  78     timestamp_file=${dir}/../timestamps
  79     datestr=`grep --max-count=1 -E \
  80                "^${date_re}-${time_re} (.+/)?${fullname}.txt\$" ${timestamp_file} | \
  81         cut -f1 -d' ' | \
  82         sed -r "s/-${time_re}\$/ \1:\2 -0500/"`
  83
  84     if [[ ${datestr} == '' ]]; then
  85       datestr=`stat --format=%y "$file"`
  86     fi
  87   fi
  88
  89   pubDate=`date -uR -d "$datestr"`
  90   date=`date -d "$datestr" +'%F %T'`
  91   dateGmt=`date -u -d "$datestr" +'%F %T'`
  92
  93   # TODO: category support
  94   category="uncategorized"
  95
  96   if grep -q ']]>' "$file"; then
  97     echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
  98          "section invalid. WordPress handles this ok, but still, heads up." 1>&2
  99   fi
 100
 101   cat << EOF
 102 <item>
 103   <title>${title}</title>
 104   <pubDate>${pubDate}</pubDate>
 105   <category domain="category" nicename="$category">$category</category>
 106   <guid isPermaLink="true">/${fullname}</guid>
 107   <description></description>
 108   <content:encoded><![CDATA[`tail -n +3 "$file"`]]></content:encoded>
 109   <wp:post_date>${date}</wp:post_date>
 110   <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
 111   <wp:comment_status>open</wp:comment_status>
 112   <wp:ping_status>open</wp:ping_status>
 113   <wp:post_name>${name}</wp:post_name>
 114   <wp:status>publish</wp:status>
 115   <wp:post_parent>0</wp:post_parent>
 116   <wp:menu_order>0</wp:menu_order>
 117   <wp:post_type>${type}</wp:post_type>
 118   <wp:post_password></wp:post_password>
 119   <wp:is_sticky>0</wp:is_sticky>
 120 EOF
 121
 122   # other possible elements:
 123 #  <link>/${fullname}</link>
 124 #  <wp:post_id></wp:post_id>
 125 #  <excerpt:encoded></excerpt:encoded>
 126 #  <dc:creator>${creator}</dc:creator>
 127
 128
 129   for cmtfile in ${dir}/"$fullname"-{all,[0-9]*}.cmt; do
 130     if [[ -e "$cmtfile" ]]; then
 131       set +e  # because the perl script below uses a non-zero exit code
 132       tail -q -n +2 "$cmtfile" | \
 133         sed -r '
 134           s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
 135           s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
 136           s/<(\/)?author>/<\1wp:comment_author>/g;
 137           s/<(\/)?link>/<\1wp:comment_author_url>/g;
 138           s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
 139           s/<(\/)?description>/<\1wp:comment_content>/g;
 140           s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
 141           s/^<\/?items>$//;
 142           /^$/d' | \
 143         perl -pe 'use HTML::Entities; decode_entities($_)' | \
 144         perl -pe 'use POSIX qw(strftime);
 145                   s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
 146         perl -e '
 147           my $id = '${commentid}';
 148           while (<STDIN>) {
 149              s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
 150              print $_;
 151           }
 152           exit $id - '${commentid}';'
 153       # TODO: this is a hack since exit codes are only 8 bits unsigned.
 154       # this will break on posts with >255 comments.
 155       let commentid+=$?
 156       set -e
 157     fi
 158   done
 159
 160   cat << EOF
 161 </item>
 162
 163 EOF
 164 done
 165
 166 # output footer
 167 cat << EOF
 168 </channel>
 169 </rss>
 170 EOF