--- /dev/null
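+# Fill in the feed URL for each blogger listed in bloggers.yml by scraping
+# each blog's advertised <link rel="alternate"> tags.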
+from lxml import html
+import yaml
+import urllib2
+import urlparse
+
+with open('bloggers.yml') as f:
+    users = yaml.safe_load(f.read())
+
+def fetch_links(url):
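+    """Return the feed URL advertised by the page at `url`.
+
+    Prefers an Atom feed that is not a per-post comments feed; otherwise
+    falls back to the first RSS/Atom/RDF <link> found on the page."""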
+    tree = html.fromstring(urllib2.urlopen(url).read())
+    links = tree.xpath(
+        '//link[@rel="alternate"][contains(@type, "rss") or ' +
+        'contains(@type, "atom") or contains(@type, "rdf")]')
+    candidates = [l for l in links if
+                  'atom' in l.attrib['type'] and
+                  'comments' not in l.attrib['href'].lower() and
+                  'comments' not in l.attrib.get('title', '')]
+    if candidates:
+        return candidates[0].attrib['href']
+    return links[0].attrib['href']
+
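+# Each entry in u['links'] is [title, blog URL] or [title, blog URL, feed URL];
+# normalise the title and fill in (or overwrite) the feed URL slot.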
+for (name, u) in users.items():
+ print "[%s]" % name
+ for e in u['links']:
+ (title, url) = e[0:2]
+ print " - %s:" % title.strip()
+ e[0] = e[0].strip()
+ if len(e) < 3:
+ e.append(None)
+ link = fetch_links(url)
+ if not link.startswith('http:'):
+ link = urlparse.urljoin(url, link)
+ print " %s" % (link,)
+ e[2] = link
+
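+# Write the augmented structure back over the original file.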
+with open('bloggers.yml', 'w') as f:
+    yaml.safe_dump(users, f)
--- /dev/null
+#!/usr/bin/python
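+# Convert a saved HTML copy of the participant table at /tmp/iron-blogger.html
+# into the YAML structure keyed by username.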
+from lxml import html
+import yaml
+
+tree = html.fromstring(open('/tmp/iron-blogger.html').read())
+
+who = {}
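+# Each table row is: username | one or more blog links | start date.
+# Skip the header row and collect the rest keyed by username.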
+for tr in list(tree.xpath('//tr'))[1:]:
+    username = str(tr.xpath('td[1]/tt/text()')[0])
+    links = tr.xpath('td[2]/a')
+    links = [(l.text, l.attrib['href']) for l in links]
+    start = str(tr.xpath('td[3]/text()')[0]).strip()
+    who[username] = dict(links=links, start=start)
+
+print yaml.safe_dump(who)