--- /dev/null
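+# Fill in the feed URL for each blogger listed in bloggers.yml by scraping
+# each blog's advertised <link rel="alternate"> tags.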
+from lxml import html
+import yaml
+import urllib2
+import urlparse
+
+with open('bloggers.yml') as f:
+    users = yaml.safe_load(f.read())
+
+def fetch_links(url):
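+    """Return the feed URL advertised by the page at `url`.
+
+    Prefers an Atom feed that is not a per-post comments feed; otherwise
+    falls back to the first RSS/Atom/RDF <link> found on the page."""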
+    tree = html.fromstring(urllib2.urlopen(url).read())
+    links = tree.xpath(
+        '//link[@rel="alternate"][contains(@type, "rss") or ' +
+        'contains(@type, "atom") or contains(@type, "rdf")]')
+    candidates = [l for l in links if
+                  'atom' in l.attrib['type'] and
+                  'comments' not in l.attrib['href'].lower() and
+                  'comments' not in l.attrib.get('title', '')]
+    if candidates:
+        return candidates[0].attrib['href']
+    return links[0].attrib['href']
+
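+# Each entry in u['links'] is [title, blog URL] or [title, blog URL, feed URL];
+# normalise the title and fill in (or overwrite) the feed URL slot.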
+for (name, u) in users.items():
+ print "[%s]" % name
+ for e in u['links']:
+ (title, url) = e[0:2]
+ print " - %s:" % title.strip()
+ e[0] = e[0].strip()
+ if len(e) < 3:
+ e.append(None)
+ link = fetch_links(url)
+ if not link.startswith('http:'):
+ link = urlparse.urljoin(url, link)
+ print " %s" % (link,)
+ e[2] = link
+
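+# Write the augmented structure back over the original file.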
+with open('bloggers.yml', 'w') as f:
+    yaml.safe_dump(users, f)
--- /dev/null
+#!/usr/bin/python
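+# Convert a saved HTML copy of the participant table at /tmp/iron-blogger.html
+# into the YAML structure keyed by username.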
+from lxml import html
+import yaml
+
+tree = html.fromstring(open('/tmp/iron-blogger.html').read())
+
+who = {}
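+# Each table row is: username | one or more blog links | start date.
+# Skip the header row and collect the rest keyed by username.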
+for tr in list(tree.xpath('//tr'))[1:]:
+    username = str(tr.xpath('td[1]/tt/text()')[0])
+    links = tr.xpath('td[2]/a')
+    links = [(l.text, l.attrib['href']) for l in links]
+    start = str(tr.xpath('td[3]/text()')[0]).strip()
+    who[username] = dict(links=links, start=start)
+
+print yaml.safe_dump(who)