projects.mako.cc - pyblosxom2wxr/blob - pyblosxom2wxr-copyrighteous.sh

   1 #!/bin/bash
   2 #
   3 # pyblosxom2wxr.sh
   4 #
   5 # http://projects.mako.cc/source/pyblosxom2wxr
   6 # Benjamin Mako Hill <mako@atdot.cc>
   7 #
   8 # http://snarfed.org/pyblosxom2wxr
   9 # Ryan Barrett <pyblosxom2wxr@ryanb.org>
  10 #
  11 # Version 0.2.1-cpy. This script is public domain.
  12 #
  13 # This script converts PyBlosxom posts and comments into a WXR (WordPress
  14 # eXtensible RSS) XML file that can be imported into a WordPress blog.
  15 #
  16 # Example usage:
  17 #
  18 # $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
  19 #
  20 # pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
  21 # should work with other versions too, but your mileage may vary.
  22 #
  23 # TODO: comment ordering
  24
  25 # exit on error
  26 set -e
  27
  28 # check args
  29 if [[ $# = "0" || $1 = "--help" ]]; then
  30   echo 'Usage: pyblosxom2wxr.sh FILES...'
  31   exit 1
  32 fi
  33
  34 # category
  35 all_categories=""
  36 all_tags=""
  37 all_slugs=""
  38
  39 # create a variable to include all the urls we've seen
  40 url_map_file="./url_mapping_list"
  41 rm $url_map_file; touch $url_map_file
  42
  43 # comment id sequence number
  44 commentid=1
  45
  46 # output header
  47 now=`date --rfc-3339=seconds`
  48 cat << EOF
  49 <?xml version="1.0" encoding="UTF-8"?>
  50
  51 <!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
  52 <rss version="2.0"
  53   xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
  54   xmlns:content="http://purl.org/rss/1.0/modules/content/"
  55   xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  56   xmlns:dc="http://purl.org/dc/elements/1.1/"
  57   xmlns:wp="http://wordpress.org/export/1.0/">
  58
  59 <channel>
  60   <title></title>
  61   <link></link>
  62   <description></description>
  63   <pubDate></pubDate>
  64   <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
  65   <language>en</language>
  66   <wp:wxr_version>1.0</wp:wxr_version>
  67   <wp:base_site_url></wp:base_site_url>
  68   <wp:base_blog_url></wp:base_blog_url>
  69 EOF
  70
  71 # convert blog posts
  72 for file in "$@"; do
  73
  74   # extract the filetype and then the payload
  75   if [[ ${file##*.} = 'rst' ]]; then
  76     payload=$(tail -n +4 "$file" | python ./convert_rst_frag.py)
  77     fullname=$(basename "$file" ".rst")
  78   elif [[ ${file##*.} = 'txt' ]]; then
  79     filetype=".txt"
  80     payload=$(tail -n +4 "$file"|perl -pe 's/\n/ /g')
  81     fullname=$(basename "$file" ".txt")
  82   fi
  83
  84   creator="mako"
  85   dir=$(dirname "$file")
  86   commentdir="$(dirname "$file")/comments"
  87   title=$(head -n 1 "$file")
  88   type="post"
  89
  90   # create a new slug and then save it to the url map file to create a
  91   # set of redirects afteward
  92   slug=$(echo $title | perl -p -e \
  93     's/\s*(.*?)\s*$/\1/; s/<\/?\w+>//g; $_ = lc; tr/ /-/; s/[^A-Za-z0-9-]//g; s/-+/-/g')
  94
  95   # count to see if the slug has been before, and then increement it
  96   slug_count=$(echo "$all_slugs" | perl -ne "print if /$slug(\-\d)?/" | wc -l)
  97   if [[ $slug_count -gt 0 ]]; then
  98     slug="$slug-$(expr $slug_count + 1)"
  99   fi
 100
 101   # save the new slug to the list of slugs, and write it to the
 102   # url_map_file
 103   all_slugs=$(echo "$all_slugs"; echo "$slug" )
 104   echo "$fullname $slug" >> $url_map_file
 105
 106   name=${fullname}
 107
 108   pubDate=$(date -uR -r $file)
 109   date=$(date -r "$file" +'%F %T')
 110   dateGmt=$(date -u -r "$file" +'%F %T')
 111
 112   if grep -q ']]>' "$file"; then
 113     echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
 114          "section invalid. WordPress handles this ok, but still, heads up." 1>&2
 115   fi
 116
 117   cat << EOF
 118 <item>
 119   <title>${title}</title>
 120   <pubDate>${pubDate}</pubDate>
 121   <guid isPermaLink="true">/${slug}</guid>
 122   <description></description>
 123   <content:encoded><![CDATA[${payload}]]></content:encoded>
 124   <wp:post_date>${date}</wp:post_date>
 125   <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
 126   <wp:comment_status>open</wp:comment_status>
 127   <wp:ping_status>open</wp:ping_status>
 128   <wp:post_name>${slug}</wp:post_name>
 129   <wp:status>publish</wp:status>
 130   <wp:post_parent>0</wp:post_parent>
 131   <wp:menu_order>0</wp:menu_order>
 132   <wp:post_type>${type}</wp:post_type>
 133   <wp:post_password></wp:post_password>
 134   <wp:is_sticky>0</wp:is_sticky>
 135   <dc:creator>${creator}</dc:creator>
 136 EOF
 137
 138   # split the tags
 139   raw_tags=$(grep '#tags' "$file" |perl -pe 's/.tags // '|tr ',' "\n")
 140   for tag in $raw_tags; do
 141     tag=$(echo "$tag"|perl -p -e 's/^\s*(.*)\s*$/\1/')
 142     echo "<category domain=\"post_tag\" nicename=\"${tag}\">${tag}</category>"
 143     #echo "<category domain=\"category\" nicename=\"$category\">$category</category>"
 144     all_tags=$(echo "$tag"; echo "$all_tags")
 145   done;
 146
 147   # category="uncategorized"
 148   # <category domain="category" nicename="$category">$category</category>
 149
 150   # other possible elements:
 151 #  <link>/${fullname}</link>
 152 #  <wp:post_id></wp:post_id>
 153 #  <excerpt:encoded></excerpt:encoded>
 154
 155   for cmtfile in ${commentdir}/"$fullname"-{all,[0-9]*}.cmt; do
 156     if [[ -e "$cmtfile" ]]; then
 157       set +e  # because the perl script below uses a non-zero exit code
 158       tail -q -n +2 "$cmtfile" | \
 159         sed -r '
 160           s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
 161           s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
 162           s/<(\/)?author>/<\1wp:comment_author>/g;
 163           s/<(\/)?link>/<\1wp:comment_author_url>/g;
 164           s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
 165           s/<(\/)?description>/<\1wp:comment_content>/g;
 166           s/<(\/)?email>/<\1wp:comment_email>/g;
 167           s/^<(ajax|cmt_date|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
 168           s/^<\/?items>$//;
 169           /^$/d' | \
 170         perl -pe 'use HTML::Entities; decode_entities($_)' | \
 171         perl -pe 'use POSIX qw(strftime);
 172                   s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
 173         perl -e '
 174           my $id = '${commentid}';
 175           while (<STDIN>) {
 176              s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
 177              print $_;
 178           }
 179           exit $id - '${commentid}';'
 180       # TODO: this is a hack since exit codes are only 8 bits unsigned.
 181       # this will break on posts with >255 comments.
 182       let commentid+=$?
 183       set -e
 184     fi
 185   done
 186
 187   cat << EOF
 188 </item>
 189
 190 EOF
 191 done
 192
 193 index=2
 194 #for category in $(echo "$all_categories"|sort|uniq); do
 195 #    cat << EOF
 196 #    <wp:category><wp:term_id>${index}</wp:term_id><wp:category_nicename>${category}</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
 197 #EOF
 198 #    index=$(expr $index + 1)
 199 #done
 200
 201 for tag in $(echo "$all_tags"|sort|uniq); do
 202     cat << EOF
 203     <wp:tag><wp:term_id>${index}</wp:term_id><wp:tag_slug>${tag}</wp:tag_slug><wp:tag_name>${tag}</wp:tag_name></wp:tag>
 204 EOF
 205     index=$(expr $index + 1)
 206 done
 207
 208 # output footer
 209 cat << EOF
 210 </channel>
 211 </rss>
 212 EOF
 213