add two modified versions and documentation
[pyblosxom2wxr] / pyblosxom2wxr-revealingerrors.sh
#!/bin/bash
#
# pyblosxom2wxr.sh
#
# http://projects.mako.cc/source/pyblosxom2wxr
# Benjamin Mako Hill <mako@atdot.cc>
#
# http://snarfed.org/pyblosxom2wxr
# Ryan Barrett <pyblosxom2wxr@ryanb.org>
#
# Version 0.2.1-re. This script is public domain.
#
# This script converts PyBlosxom posts and comments into a WXR (WordPress
# eXtensible RSS) XML file that can be imported into a WordPress blog.
#
# Example usage:
#
# $ ./pyblosxom2wxr.sh post1.txt post2.txt > posts.xml
#
# pyblosxom2wxr has been tested with PyBlosxom 1.4 and WordPress 2.9 and 3.0. It
# should work with other versions too, but your mileage may vary.
#
# TODO: comment ordering

# exit on error
set -e

# check args
usage='Usage: pyblosxom2wxr.sh FILES...'
if [[ "${1:-}" == "--help" ]]; then
  # Help was asked for explicitly: print it on stdout and report success.
  echo "$usage"
  exit 0
fi
if (( $# == 0 )); then
  # stdout is normally redirected into the WXR file (see the usage example in
  # the header), so the complaint must go to stderr to be seen at all.
  echo "$usage" >&2
  exit 1
fi

# Accumulators filled in while the posts are converted.
all_categories=""
all_tags=""

# Next unused comment id; ids are handed out sequentially across all posts.
commentid=1

# WXR prologue. Only the generator comment is dynamic (creation timestamp),
# so it is printed separately and the rest is a literal, unexpanded heredoc.
now=$(date --rfc-3339=seconds)
printf '%s\n' \
  '<?xml version="1.0" encoding="UTF-8"?> ' \
  '' \
  "<!-- generator=\"pyblosxom2wxr/1.0\" created=\"${now}\" -->"
cat << 'EOF'
<rss version="2.0"
  xmlns:excerpt="http://wordpress.org/export/1.0/excerpt/"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns:wfw="http://wellformedweb.org/CommentAPI/"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:wp="http://wordpress.org/export/1.0/">

<channel>
  <title></title>
  <link></link>
  <description></description>
  <pubDate></pubDate>
  <generator>http://snarfed.org/pyblosxom2wxr?v=1.0</generator>
  <language>en</language>
  <wp:wxr_version>1.0</wp:wxr_version>
  <wp:base_site_url></wp:base_site_url>
  <wp:base_blog_url></wp:base_blog_url>
EOF


# convert blog posts

#######################################
# Emit the WXR <item> element for one PyBlosxom post: metadata, rendered
# body, per-post tags and any comments stored next to it.
# Globals:
#   all_tags  - appended to; newline-separated list of every tag seen so far
#   commentid - read and advanced; next unused <wp:comment_id>
# Arguments:
#   $1 - path to the post file. Line 1 is the title, a line containing
#        "#tags a,b" lists the tags, and the body starts at line 4.
# Outputs:
#   <item> XML on stdout; a warning on stderr if the post contains "]]>".
# Returns:
#   non-zero if a required command (date, markdown_py, mktemp, ...) fails.
#######################################
convert_post() {
  local file=$1
  local creator="mako"
  local type="post"
  local fullname dir commentdir title name body
  local pubDate date dateGmt
  local raw_tags tag cmtfile next_id_file

  fullname=$(basename -- "$file" .txt)
  dir=$(dirname -- "$file")
  commentdir="${dir}/comments"
  title=$(head -n 1 -- "$file")
  name=${fullname}

  # All three timestamps are derived from the file's modification time.
  pubDate=$(date -uR -r "$file")
  date=$(date -r "$file" +'%F %T')
  dateGmt=$(date -u -r "$file" +'%F %T')

  if grep -q -- ']]>' "$file"; then
    echo "WARNING: $file contains the string ]]>, which makes its CDATA " \
         "section invalid. WordPress handles this ok, but still, heads up." 1>&2
  fi

  # Render the body up front: a failing markdown_py then aborts the run
  # (under set -e) instead of silently producing a post with no content.
  body=$(tail -n +4 -- "$file" | markdown_py)

  cat << EOF
<item>
  <title>${title}</title>
  <pubDate>${pubDate}</pubDate>
  <guid isPermaLink="true">/${fullname}</guid>
  <description></description>
  <content:encoded><![CDATA[${body}]]></content:encoded>
  <wp:post_date>${date}</wp:post_date>
  <wp:post_date_gmt>${dateGmt}</wp:post_date_gmt>
  <wp:comment_status>open</wp:comment_status>
  <wp:ping_status>open</wp:ping_status>
  <wp:post_name>${name}</wp:post_name>
  <wp:status>publish</wp:status>
  <wp:post_parent>0</wp:post_parent>
  <wp:menu_order>0</wp:menu_order>
  <wp:post_type>${type}</wp:post_type>
  <wp:post_password></wp:post_password>
  <wp:is_sticky>0</wp:is_sticky>
  <dc:creator>${creator}</dc:creator>
EOF

  # split the tags: "#tags a,b" becomes one tag per line
  raw_tags=$(grep -- '#tags' "$file" | sed 's/.tags //' | tr ',' '\n')
  # Unquoted on purpose: word splitting drops the blanks left by "a, b".
  for tag in $raw_tags; do
    printf '%s\n' \
      "<category domain=\"post_tag\" nicename=\"${tag}\">${tag}</category>"
  done
  # Terminate every post's contribution with a newline so that the last tag
  # of this post is not fused with the first tag of the next one.
  if [[ -n "$raw_tags" ]]; then
    all_tags+="${raw_tags}"$'\n'
  fi

  # category="uncategorized"
  # <category domain="category" nicename="$category">$category</category>

  # other possible elements:
  #  <link>/${fullname}</link>
  #  <wp:post_id></wp:post_id>
  #  <excerpt:encoded></excerpt:encoded>

  # Comment files are <post>-all.cmt or <post>-<digits...>.cmt. A pattern
  # that matches nothing stays literal and is skipped by the -e test.
  for cmtfile in "$commentdir/$fullname"-{all,[0-9]*}.cmt; do
    [[ -e "$cmtfile" ]] || continue

    # The last pipeline stage numbers the comments and writes the next free
    # id to this file. An exit status cannot carry it: it is only 8 bits
    # wide, so posts with more than 255 comments would wrap around.
    next_id_file=$(mktemp)
    tail -q -n +2 "$cmtfile" \
      | sed -r '
          s/^<item>$/<wp:comment>\n<wp:comment_id>X<\/wp:comment_id>/;
          s/^<\/item>$/<wp:comment_approved>1<\/wp:comment_approved>\n<\/wp:comment>/;
          s/<(\/)?author>/<\1wp:comment_author>/g;
          s/<(\/)?link>/<\1wp:comment_author_url>/g;
          s/<(\/)?ipaddress>/<\1wp:comment_author_IP>/g;
          s/<(\/)?description>/<\1wp:comment_content>/g;
          s/^<(ajax|cmt_date|email|openid_url|parent|post|secretToken|source|title|w3cdate)>.+$//;
          s/^<\/?items>$//;
          /^$/d' \
      | perl -pe 'use HTML::Entities; decode_entities($_)' \
      | perl -pe 'use POSIX qw(strftime);
                  s/^<pubDate>(.+)<\/pubDate>$/"<wp:comment_date>" . (strftime "%Y-%m-%d %H:%M:%S", localtime($1)) . "<\/wp:comment_date>"/e;' \
      | perl -e '
          my ($id, $next_id_path) = @ARGV;
          while (<STDIN>) {
            s/^(<wp:comment_id>)X(<\/wp:comment_id>)$/$1 . $id++ . $2/e;
            print $_;
          }
          open(my $out, ">", $next_id_path)
            or die "cannot write $next_id_path: $!";
          print $out $id;
          close($out);
        ' "$commentid" "$next_id_file"
    commentid=$(< "$next_id_file")
    rm -f -- "$next_id_file"
  done

  printf '</item>\n\n'
}

for file in "$@"; do
  convert_post "$file"
done

# First term id handed out below.
# NOTE(review): presumably starts at 2 because id 1 is taken by WordPress's
# default category - confirm.
index=2
#for category in $(echo "$all_categories"|sort|uniq); do
#    cat << EOF
#    <wp:category><wp:term_id>${index}</wp:term_id><wp:category_nicename>${category}</wp:category_nicename><wp:category_parent></wp:category_parent><wp:cat_name><![CDATA[${category}]]></wp:cat_name></wp:category>
#EOF
#    index=$(expr $index + 1)
#done

#######################################
# Emit one <wp:tag> element per distinct tag collected from the posts.
# Globals:
#   all_tags - read; whitespace/newline-separated tag words
#   index    - read and advanced; term id given to the next tag
# Outputs:
#   <wp:tag> lines on stdout, tags in byte-wise sorted order.
#######################################
emit_tag_terms() {
  local tag
  # Split into single words BEFORE de-duplicating: " beta" and "beta" are
  # different lines but the same tag, and must not be emitted twice.
  # Reading line by line (instead of an unquoted $(...)) also keeps a tag
  # such as "*" from being expanded as a glob.
  while IFS= read -r tag; do
    [[ -n "$tag" ]] || continue
    printf '    <wp:tag><wp:term_id>%s</wp:term_id><wp:tag_slug>%s</wp:tag_slug><wp:tag_name>%s</wp:tag_name></wp:tag>\n' \
      "$index" "$tag" "$tag"
    index=$((index + 1))
  done < <(printf '%s' "$all_tags" | tr -s ' \t\n' '\n\n\n' | LC_ALL=C sort -u)
}

emit_tag_terms


# Close the <channel> and <rss> elements opened by the header.
printf '%s\n' '</channel>' '</rss>'

Benjamin Mako Hill || Want to submit a patch?