4 # http://snarfed.org/pyblosxom2wxr
5 # Ryan Barrett <pyblosxom2wxr@ryanb.org>
6 # Version 0.2. This script is public domain.
8 # This script converts PyBlosxom posts and comments into a WXR (WordPress
9 # eXtensible RSS) XML file that can be imported into a WordPress blog.
13 # $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
15 # pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
16 # should work with other versions too, but your mileage may vary.
18 # TODO: comment ordering
# Show the usage line when the script is called with no arguments or with
# --help as its first argument.
24 if [[ $# = "0" || $1 = "--help" ]]; then
25 echo 'Usage: pyblosxom2wxr.sh FILES...'
29 # comment id sequence number
# Timestamp of this run in RFC 3339 format with seconds precision (a GNU date
# option). It is referenced as ${now} in the generator comment of the header.
# NOTE(review): the file header says "Version 0.2" while the generator strings
# in the channel header say 1.0 -- confirm which one is intended.
33 now=`date --rfc-3339=seconds`
35 <?xml version="1.0" encoding="UTF-8"?>
37 <!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
39 xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
40 xmlns:content="http://purl.org/rss/1.0/modules/content/"
41 xmlns:wfw="http://wellformedweb.org/CommentAPI/"
42 xmlns:dc="http://purl.org/dc/elements/1.1/"
43 xmlns:wp="http://wordpress.org/export/1.0/">
48 <description></description>
50 <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
51 <language>en</language>
52 <wp:wxr_version>1.0</wp:wxr_version>
53 <wp:base_site_url></wp:base_site_url>
54 <wp:base_blog_url></wp:base_blog_url>
55 <wp:category></wp:category>
# Post identifier: the base name of the input file with a trailing .txt removed.
61 fullname=`basename "$file" .txt`
# The first line of the post file is used as the post title.
63 title=`head -n 1 "$file"`
65 # TODO: make this easier to customize
# Extended-regex fragments: date_re matches YYYY-MM-DD; time_re captures two
# 2-digit groups joined by a dash, which the sed expression further down
# rewrites as "\1:\2" (presumably hours and minutes).
66 date_re="[0-9]{4}-[0-9]{2}-[0-9]{2}"
67 time_re="([0-9]{2})-([0-9]{2})"
69 # my pyblosxom posts have a date prefix, e.g. 2010-03-13. my pages don't.
70 if [[ "$fullname" =~ ^${date_re} ]]; then
# Use the first 10 characters of the name (the date prefix) as the date, at
# midnight with a fixed -0800 UTC offset.
# NOTE(review): this offset differs from the -0500 used in the timestamps-file
# lookup -- confirm both are intentional.
73 datestr="${fullname::10} 00:00:00 -0800"
# Timestamps file located one directory above ${dir}.
78 timestamp_file=${dir}/../timestamps
# Take the first line (--max-count=1) of the timestamps file that looks like
# "<date>-<HH>-<MM> [path/]<fullname>.txt", then turn a trailing "-HH-MM" into
# " HH:MM -0500".
# NOTE(review): the sed pattern is anchored on a line ending in -HH-MM, so the
# file-name part has to be gone by the time sed runs -- verify.
# NOTE(review): ${timestamp_file} is passed to grep unquoted, so a path
# containing whitespace would be split into several arguments.
79 datestr=`grep --max-count=1 -E \
80 "^${date_re}-${time_re} (.+/)?${fullname}.txt\$" ${timestamp_file} | \
82 sed -r "s/-${time_re}\$/ \1:\2 -0500/"`
# No date found so far: fall back to the file's last modification time
# (stat format %y).
84 if [[ ${datestr} == '' ]]; then
85 datestr=`stat --format=%y "$file"`
# Three renderings of the same instant for the item template:
#   pubDate - RFC 2822 style (-R), forced to UTC (-u)
#   date    - "YYYY-MM-DD HH:MM:SS" in the local time zone
#   dateGmt - "YYYY-MM-DD HH:MM:SS" in UTC
89 pubDate=`date -uR -d "$datestr"`
90 date=`date -d "$datestr" +'%F %T'`
91 dateGmt=`date -u -d "$datestr" +'%F %T'`
93 # TODO: category support
# Until then every post is emitted with this one hard-coded category.
94 category="uncategorized"
# Emit one WXR <item> for the post.
# First warn on stderr if the post file contains the CDATA terminator "]]>",
# because the post text is embedded in a CDATA section in <content:encoded>.
# The item template is filled from ${title}, ${pubDate}, ${category},
# ${fullname}, ${date}, ${dateGmt}, ${name} and ${type}; the embedded content
# is the post file from its third line onward (tail -n +3).
96 if grep -q ']]>' "$file"; then
97 echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
98 "section invalid. WordPress handles this ok, but still, heads up." 1>&2
103 <title>${title}</title>
104 <pubDate>${pubDate}</pubDate>
105 <category domain="category" nicename="$category">$category</category>
106 <guid isPermaLink="true">/${fullname}</guid>
107 <description></description>
108 <content:encoded><![CDATA[`tail -n +3 "$file"`]]></content:encoded>
109 <wp:post_date>${date}</wp:post_date>
110 <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
111 <wp:comment_status>open</wp:comment_status>
112 <wp:ping_status>open</wp:ping_status>
113 <wp:post_name>${name}</wp:post_name>
114 <wp:status>publish</wp:status>
115 <wp:post_parent>0</wp:post_parent>
116 <wp:menu_order>0</wp:menu_order>
117 <wp:post_type>${type}</wp:post_type>
118 <wp:post_password></wp:post_password>
119 <wp:is_sticky>0</wp:is_sticky>
122 # other possible elements:
123 # <link>/${fullname}</link>
124 # <wp:post_id></wp:post_id>
125 # <excerpt:encoded></excerpt:encoded>
126 # <dc:creator>${creator}</dc:creator>
# Convert the PyBlosxom comment files of this post into <wp:comment> elements.
# Candidate files are ${dir}/<fullname>-all.cmt and ${dir}/<fullname>-<N...>.cmt
# (N starting with a digit). The -e test skips a pattern that matched no file
# (bash leaves such a pattern as a literal word unless nullglob is set).
# NOTE(review): ${dir} is unquoted in the glob, so a directory name containing
# whitespace would be split into several words.
#
# Pipeline stages, in order:
#   1. tail drops the first line of the comment file.
#   2. <item>, <author>, <link>, <ipaddress> and <description> are renamed to
#      their wp:comment_* counterparts. Each <item> gets a placeholder comment
#      id "X", each </item> gets wp:comment_approved set to 1, and lines that
#      start with one of the listed PyBlosxom-only tags are blanked.
#   3. HTML entities are decoded (HTML::Entities).
#   4. <pubDate>, treated as epoch seconds by localtime(), becomes a local-time
#      "YYYY-MM-DD HH:MM:SS" <wp:comment_date>.
#   5. Each placeholder id X is replaced with a running number that starts at
#      ${commentid}. This last perl stage exits with the number of ids it
#      handed out, which is why errexit is switched off (set +e) first.
129 for cmtfile in ${dir}/"$fullname"-{all,[0-9]*}.cmt; do
130 if [[ -e "$cmtfile" ]]; then
131 set +e # because the perl script below uses a non-zero exit code
132 tail -q -n +2 "$cmtfile" | \
134 s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
135 s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
136 s/<(\/)?author>/<\1wp:comment_author>/g;
137 s/<(\/)?link>/<\1wp:comment_author_url>/g;
138 s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
139 s/<(\/)?description>/<\1wp:comment_content>/g;
140 s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
143 perl -pe 'use HTML::Entities; decode_entities($_)' | \
144 perl -pe 'use POSIX qw(strftime);
145 s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' | \
147 my $id = '${commentid}';
149 s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
152 exit $id - '${commentid}';'
153 # TODO: this is a hack since exit codes are only 8 bits unsigned.
154 # this will break on posts with >255 comments.