From 04d130367c78c5be70f413013094a9f43cf07cf6 Mon Sep 17 00:00:00 2001
From: Nelson Elhage <nelhage@mit.edu>
Date: Mon, 28 Dec 2009 17:06:06 -0500
Subject: [PATCH] Initial import scripts

---
 atom.py   | 37 +++++++++++++++++++++++++++++++++++++
 import.py | 15 +++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 atom.py
 create mode 100644 import.py

diff --git a/atom.py b/atom.py
new file mode 100644
index 0000000..cbd0d3c
--- /dev/null
+++ b/atom.py
@@ -0,0 +1,37 @@
+from lxml import html
+import yaml
+import urllib2
+import urlparse
+
+with open('bloggers.yml') as src:  # roster of bloggers, keyed by name
+    users = yaml.safe_load(src)
+
+def fetch_links(url):
+    """Return the href of the feed <link> on the page at url, preferring a non-comments Atom feed."""
+    tree = html.fromstring(urllib2.urlopen(url).read())
+    links = tree.xpath(
+        '//link[@rel="alternate"][contains(@type, "rss") or ' +
+        'contains(@type, "atom") or contains(@type, "rdf")]')
+    # Match "comments" case-insensitively in both href and title so comment feeds are skipped.
+    candidates = [el for el in links if 'atom' in el.attrib['type'] and
+                  'comments' not in el.attrib['href'].lower() and
+                  'comments' not in el.attrib.get('title', '').lower()]
+    # Otherwise fall back to the first feed of any type; IndexError if the page lists none.
+    return (candidates or links)[0].attrib['href']
+
+# Look up each blog's feed URL and record it as the third element of its entry.
+for name, user in users.items():
+    print "[%s]" % name
+    for entry in user['links']:
+        title, url = entry[0:2]
+        entry[0] = title.strip()
+        print " - %s:" % entry[0]
+        if len(entry) < 3:
+            entry.append(None)
+        feed = fetch_links(url)  # href may be relative to the page URL
+        feed = feed if feed.startswith('http:') else urlparse.urljoin(url, feed)
+        print "   %s" % (feed,)
+        entry[2] = feed
+
+with open('bloggers.yml', 'w') as out:
+    yaml.safe_dump(users, out)
diff --git a/import.py b/import.py
new file mode 100644
index 0000000..54edbe8
--- /dev/null
+++ b/import.py
@@ -0,0 +1,15 @@
+#!/usr/bin/python
+from lxml import html
+import yaml
+
+# Scrape the saved HTML table into a {username: {links, start}} mapping and print it as YAML.
+with open('/tmp/iron-blogger.html') as f:
+    tree = html.fromstring(f.read())
+who = {}
+for tr in list(tree.xpath('//tr'))[1:]:  # skip the first row (presumably the table header)
+    username = str(tr.xpath('td[1]/tt/text()')[0])
+    links = [(a.text, a.attrib['href']) for a in tr.xpath('td[2]/a')]
+    start = str(tr.xpath('td[3]/text()')[0]).strip()
+    who[username] = dict(links=links, start=start)
+
+print yaml.safe_dump(who)
-- 
2.39.5