projects.mako.cc - pyblosxom2wxr/blob - pyblosxom2wxr.sh

   1 #!/bin/bash
   2 #
   3 # pyblosxom2wxr.sh
   4 # http://snarfed.org/pyblosxom2wxr
   5 # Ryan Barrett <pyblosxom2wxr@ryanb.org>
   6 # Version 0.2. This script is public domain.
   7 #
   8 # This script converts PyBlosxom posts and comments into a WXR (WordPress
   9 # eXtensible RSS) XML file that can be imported into a WordPress blog.
  10 #
  11 # Example usage:
  12 #
  13 # $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
  14 #
  15 # pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
  16 # should work with other versions too, but your mileage may vary.
  17 #
  18 # TODO: comment ordering
  19
  20 # exit on error
  21 set -e
  22
  23 # check args
  24 if [[ $# = "0" || $1 = "--help" ]]; then
  25   echo 'Usage: pyblosxom2wxr.sh FILES...'
  26   exit 1
  27 fi
  28
  29 # category
  30 all_categories=""
  31 all_tags=""
  32
  33 # comment id sequence number
  34 commentid=1
  35
  36 # output header
  37 now=`date --rfc-3339=seconds`
  38 cat << EOF
  39 <?xml version="1.0" encoding="UTF-8"?>
  40
  41 <!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
  42 <rss version="2.0"
  43   xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
  44   xmlns:content="http://purl.org/rss/1.0/modules/content/"
  45   xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  46   xmlns:dc="http://purl.org/dc/elements/1.1/"
  47   xmlns:wp="http://wordpress.org/export/1.0/">
  48
  49 <channel>
  50   <title></title>
  51   <link></link>
  52   <description></description>
  53   <pubDate></pubDate>
  54   <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
  55   <language>en</language>
  56   <wp:wxr_version>1.0</wp:wxr_version>
  57   <wp:base_site_url></wp:base_site_url>
  58   <wp:base_blog_url></wp:base_blog_url>
  59 EOF
  60
  61
  62 # convert blog posts
  63 for file in "$@"; do
  64   creator="mako"
  65   fullname=`basename "$file" .txt`
  66   dir=`dirname "$file"`
  67   commentdir="$(dirname "$file")/comments"
  68   title=`head -n 1 "$file"`
  69   type="post"
  70
  71   name=${fullname}
  72
  73   pubDate=$(date -uR -r $file)
  74   date=$(date -r "$file" +'%F %T')
  75   dateGmt=$(date -u -r "$file" +'%F %T')
  76
  77   if grep -q ']]>' "$file"; then
  78     echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
  79          "section invalid. WordPress handles this ok, but still, heads up." 1>&2
  80   fi
  81
  82   cat << EOF
  83 <item>
  84   <title>${title}</title>
  85   <pubDate>${pubDate}</pubDate>
  86   <guid isPermaLink="true">/${fullname}</guid>
  87   <description></description>
  88   <content:encoded><![CDATA[$(tail -n +4 "$file" | markdown_py)]]></content:encoded>
  89   <wp:post_date>${date}</wp:post_date>
  90   <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
  91   <wp:comment_status>open</wp:comment_status>
  92   <wp:ping_status>open</wp:ping_status>
  93   <wp:post_name>${name}</wp:post_name>
  94   <wp:status>publish</wp:status>
  95   <wp:post_parent>0</wp:post_parent>
  96   <wp:menu_order>0</wp:menu_order>
  97   <wp:post_type>${type}</wp:post_type>
  98   <wp:post_password></wp:post_password>
  99   <wp:is_sticky>0</wp:is_sticky>
 100   <dc:creator>${creator}</dc:creator>
 101 EOF
 102
 103   # split the tags
 104   raw_tags=$(grep '#tags' "$file" |perl -pe 's/.tags // '|tr ',' "\n")
 105   for tag in $raw_tags; do
 106     echo "<category domain=\"post_tag\" nicename=\"${tag}\">${tag}</category>"
 107     #echo "<category domain=\"category\" nicename=\"$category\">$category</category>"
 108   done;
 109   all_tags="$all_tags$raw_tags"
 110
 111   # category="uncategorized"
 112   # <category domain="category" nicename="$category">$category</category>
 113
 114   # other possible elements:
 115 #  <link>/${fullname}</link>
 116 #  <wp:post_id></wp:post_id>
 117 #  <excerpt:encoded></excerpt:encoded>
 118
 119
 120   for cmtfile in ${commentdir}/"$fullname"-{all,[0-9]*}.cmt; do
 121     if [[ -e "$cmtfile" ]]; then
 122       set +e  # because the perl script below uses a non-zero exit code
 123       tail -q -n +2 "$cmtfile" | \
 124         sed -r '
 125           s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
 126           s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
 127           s/<(\/)?author>/<\1wp:comment_author>/g;
 128           s/<(\/)?link>/<\1wp:comment_author_url>/g;
 129           s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
 130           s/<(\/)?description>/<\1wp:comment_content>/g;
 131           s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
 132           s/^<\/?items>$//;
 133           /^$/d' | \
 134         perl -pe 'use HTML::Entities; decode_entities($_)' | \
 135         perl -pe 'use POSIX qw(strftime);
 136                   s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
 137         perl -e '
 138           my $id = '${commentid}';
 139           while (<STDIN>) {
 140              s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
 141              print $_;
 142           }
 143           exit $id - '${commentid}';'
 144       # TODO: this is a hack since exit codes are only 8 bits unsigned.
 145       # this will break on posts with >255 comments.
 146       let commentid+=$?
 147       set -e
 148     fi
 149   done
 150
 151   cat << EOF
 152 </item>
 153
 154 EOF
 155 done
 156
 157 index=2
 158 #for category in $(echo "$all_categories"|sort|uniq); do
 159 #    cat << EOF
 160 #    <wp:category><wp:term_id>${index}</wp:term_id><wp:category_nicename>${category}</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
 161 #EOF
 162 #    index=$(expr $index + 1)
 163 #done
 164
 165 for tag in $(echo "$all_tags"|sort|uniq); do
 166     cat << EOF
 167     <wp:tag><wp:term_id>${index}</wp:term_id><wp:tag_slug>${tag}</wp:tag_slug><wp:tag_name>${tag}</wp:tag_name></wp:tag>
 168 EOF
 169     index=$(expr $index + 1)
 170 done
 171
 172
 173 # output footer
 174 cat << EOF
 175 </channel>
 176 </rss>
 177 EOF