c3218d7d7d77c9b955f3368895f02cab74b092fd
[pyblosxom2wxr] / pyblosxom2wxr.sh
#!/bin/bash
#
# pyblosxom2wxr.sh
# http://snarfed.org/pyblosxom2wxr
# Ryan Barrett <pyblosxom2wxr@ryanb.org>
# Version 0.2. This script is public domain.
#
# This script converts PyBlosxom posts and comments into a WXR (WordPress
# eXtensible RSS) XML file that can be imported into a WordPress blog.
#
# Example usage:
#
# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
#
# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
# should work with other versions too, but your mileage may vary.
#
# TODO: comment ordering

# exit on error
set -e

# check args: at least one input file is required; --help as the first
# argument prints the usage text instead of converting anything
if (( $# == 0 )) || [[ "$1" == "--help" ]]; then
  # stdout carries the generated XML (usually redirected to a file), so the
  # usage text goes to stderr where the user will actually see it
  echo 'Usage: pyblosxom2wxr.sh FILES...' >&2
  exit 1
fi

# comment id sequence number; it keeps counting across all input files so
# every exported comment gets a distinct id
commentid=1

# Prints the XML declaration, a generator comment stamped with the current
# time, the <rss> root element with its namespace declarations, and the
# opening <channel> element with empty site metadata.
# Outputs: the WXR header on stdout.
print_wxr_header() {
  local now
  # --rfc-3339 is a GNU date option
  now=$(date --rfc-3339=seconds)

  cat <<EOF
<?xml version="1.0" encoding="UTF-8"?>

<!-- generator="pyblosxom2wxr/1.0" created="${now}" -->
<rss version="2.0"
  xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:wp="http://wordpress.org/export/1.0/">

<channel>
  <title></title>
  <link></link>
  <description></description>
  <pubDate></pubDate>
  <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
  <language>en</language>
  <wp:wxr_version>1.0</wp:wxr_version>
  <wp:base_site_url></wp:base_site_url>
  <wp:base_blog_url></wp:base_blog_url>
  <wp:category></wp:category>

EOF
}

# output header
print_wxr_header

# Regex fragments (POSIX ERE): the date prefix of post file names and the
# hour/minute suffix of entries in the timestamps file. time_re captures the
# hour and the minute as groups 1 and 2.
# TODO: make this easier to customize
date_re="[0-9]{4}-[0-9]{2}-[0-9]{2}"
time_re="([0-9]{2})-([0-9]{2})"

#######################################
# Looks up the date of a page in a timestamps file whose lines look like
# "YYYY-MM-DD-HH-MM [dir/]name.txt".
# Globals:   date_re, time_re (read)
# Arguments: $1 - path to the timestamps file
#            $2 - page name, i.e. the file's basename without .txt
# Outputs:   "YYYY-MM-DD HH:MM -0500" for the first matching line; nothing
#            when no line matches or the file is not readable
#######################################
lookup_page_datestr() {
  local timestamp_file=$1
  local fullname=$2
  local escaped_name

  [[ -r "$timestamp_file" ]] || return 0

  # The name becomes part of a regex, so backslash-escape its metacharacters;
  # otherwise a page called "a.b" would also match an entry for "aXb.txt".
  escaped_name=$(printf '%s' "$fullname" | sed 's/[][\.^$*+?(){}|]/\\&/g')

  grep --max-count=1 -E -- \
      "^${date_re}-${time_re} (.+/)?${escaped_name}\\.txt\$" \
      "$timestamp_file" \
    | cut -f1 -d' ' \
    | sed -r "s/-${time_re}\$/ \1:\2 -0500/"
}

#######################################
# Counts the comments in a PyBlosxom .cmt file: lines that are exactly
# "<item>", ignoring the first line of the file.
# Arguments: $1 - path to the .cmt file
# Outputs:   the count on stdout
#######################################
count_comment_items() {
  awk 'NR > 1 && $0 == "<item>" { n++ } END { print n + 0 }' < "$1"
}

#######################################
# Copies a .cmt file to stdout without its first line, replacing every
# "<item>" line with "<wp:comment>" followed by a <wp:comment_id> line
# carrying consecutive ids.
# Arguments: $1 - id for the first comment
#            $2 - path to the .cmt file
#######################################
number_comment_items() {
  awk -v id="$1" '
    NR == 1 { next }
    $0 == "<item>" {
      print "<wp:comment>"
      print "<wp:comment_id>" (id++) "</wp:comment_id>"
      next
    }
    { print }' < "$2"
}

#######################################
# Converts one PyBlosxom .cmt file into <wp:comment> elements.
# Globals:   commentid (read; advanced by the number of comments written)
# Arguments: $1 - path to the .cmt file
# Outputs:   the converted comments on stdout
#######################################
convert_comments() {
  local cmtfile=$1
  local count

  # Count up front instead of passing the count back through an exit status,
  # which is limited to 8 bits and broke on files with more than 255 comments.
  count=$(count_comment_items "$cmtfile")

  # Stages: number the comments; rename the PyBlosxom tags to their WXR
  # equivalents and drop the fields that are not exported; decode HTML
  # entities; rewrite <pubDate>, which is handed to localtime() and therefore
  # treated as epoch seconds, as a local-time <wp:comment_date>.
  # NOTE(review): decoding entities may leave bare & or < characters inside
  # the comment elements - confirm the importer tolerates that.
  number_comment_items "$commentid" "$cmtfile" \
    | sed -r '
        s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
        s/<(\/)?author>/<\1wp:comment_author>/g;
        s/<(\/)?link>/<\1wp:comment_author_url>/g;
        s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
        s/<(\/)?description>/<\1wp:comment_content>/g;
        s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
        s/^<\/?items>$//;
        /^$/d' \
    | perl -pe 'use HTML::Entities; decode_entities($_)' \
    | perl -pe 'use POSIX qw(strftime);
                s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;'

  commentid=$(( commentid + count ))
}

#######################################
# Converts one PyBlosxom entry file, plus any comment files next to it, into
# a WXR <item> element. The first line of the file is the title and the body
# starts at the third line.
# Globals:   date_re (read); commentid (advanced via convert_comments)
# Arguments: $1 - path to the entry's .txt file
# Outputs:   the <item> element on stdout; a warning on stderr when the file
#            contains "]]>"
#######################################
convert_entry() {
  local file=$1
  local fullname dir title post_type name datestr
  local pub_date post_date post_date_gmt category cmtfile

  fullname=$(basename -- "$file" .txt)
  dir=$(dirname -- "$file")
  title=$(head -n 1 -- "$file")

  # my pyblosxom posts have a date prefix, e.g. 2010-03-13. my pages don't.
  if [[ "$fullname" =~ ^${date_re} ]]; then
    post_type=post
    # skip the 10-character date and the separator that follows it
    name=${fullname:11}
    datestr="${fullname::10} 00:00:00 -0800"
  else
    post_type=page
    name=$fullname
    datestr=$(lookup_page_datestr "${dir}/../timestamps" "$fullname")
    if [[ -z "$datestr" ]]; then
      # no recorded timestamp: fall back to the file's modification time
      datestr=$(stat --format=%y -- "$file")
    fi
  fi

  pub_date=$(date -uR -d "$datestr")
  post_date=$(date -d "$datestr" +'%F %T')
  post_date_gmt=$(date -u -d "$datestr" +'%F %T')

  # TODO: category support
  category="uncategorized"

  if grep -q -F -- ']]>' "$file"; then
    printf '%s\n' "WARNING: $file contains the string ]]>, which makes its CDATA section invalid. WordPress handles this ok, but still, heads up." >&2
  fi

  # NOTE(review): the title is written as-is, without XML escaping.
  cat <<EOF
<item>
  <title>${title}</title>
  <pubDate>${pub_date}</pubDate>
  <category domain="category" nicename="$category">$category</category>
  <guid isPermaLink="true">/${fullname}</guid>
  <description></description>
  <content:encoded><![CDATA[$(tail -n +3 -- "$file")]]></content:encoded>
  <wp:post_date>${post_date}</wp:post_date>
  <wp:post_date_gmt>${post_date_gmt}</wp:post_date_gmt>
  <wp:comment_status>open</wp:comment_status>
  <wp:ping_status>open</wp:ping_status>
  <wp:post_name>${name}</wp:post_name>
  <wp:status>publish</wp:status>
  <wp:post_parent>0</wp:post_parent>
  <wp:menu_order>0</wp:menu_order>
  <wp:post_type>${post_type}</wp:post_type>
  <wp:post_password></wp:post_password>
  <wp:is_sticky>0</wp:is_sticky>
EOF

  # other possible elements:
  #  <link>/${fullname}</link>
  #  <wp:post_id></wp:post_id>
  #  <excerpt:encoded></excerpt:encoded>
  #  <dc:creator>${creator}</dc:creator>

  # Comment files sit next to the entry as NAME-all.cmt or NAME-<digit>*.cmt.
  # A pattern that matches nothing stays literal, hence the existence check.
  for cmtfile in "$dir/$fullname"-{all,[0-9]*}.cmt; do
    if [[ -e "$cmtfile" ]]; then
      convert_comments "$cmtfile"
    fi
  done

  printf '</item>\n\n'
}

# convert posts, pages and their comments
for file in "$@"; do
  convert_entry "$file"
done

# Prints the closing tags for the <channel> and <rss> elements that the
# header opened.
print_wxr_footer() {
  printf '%s\n' '</channel>' '</rss>'
}

# output footer
print_wxr_footer

Benjamin Mako Hill || Want to submit a patch?