From 04d130367c78c5be70f413013094a9f43cf07cf6 Mon Sep 17 00:00:00 2001
From: Nelson Elhage
Date: Mon, 28 Dec 2009 17:06:06 -0500
Subject: [PATCH] Initial import scripts

---
Note: the file contents below were revised in review; the `index` blob ids
are carried over from the original patch and were not regenerated.

 atom.py   | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 import.py | 22 +++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 atom.py
 create mode 100644 import.py

diff --git a/atom.py b/atom.py
new file mode 100644
index 0000000..cbd0d3c
--- /dev/null
+++ b/atom.py
@@ -0,0 +1,70 @@
+"""Fill in the feed URL for every blog listed in bloggers.yml.
+
+For each link of each user, fetch the blog page, pick a feed advertised by
+its <link rel="alternate"> tags, store the feed URL as the third element of
+the link entry, and write the result back to bloggers.yml.
+"""
+from lxml import html
+import yaml
+import urllib2
+import urlparse
+
+with open('bloggers.yml') as f:
+    users = yaml.safe_load(f.read())
+
+
+def fetch_links(url):
+    """Return the href of the preferred feed link on the page at url.
+
+    An Atom feed whose href and title do not mention comments is preferred;
+    otherwise the first advertised feed is used.  The href is returned as
+    written in the page, so it may be relative.  Returns None when the page
+    advertises no feed link with an href.
+    """
+    response = urllib2.urlopen(url)
+    try:
+        tree = html.fromstring(response.read())
+    finally:
+        # Release the connection even if reading or parsing fails.
+        response.close()
+    links = tree.xpath(
+        '//link[@rel="alternate"][contains(@type, "rss") or ' +
+        'contains(@type, "atom") or contains(@type, "rdf")]')
+    # A feed link without an href cannot be followed; dropping it here also
+    # keeps the lookups below from raising KeyError.
+    links = [l for l in links if l.attrib.get('href')]
+    if not links:
+        return None
+    candidates = [l for l in links if
+                  'atom' in l.attrib['type'] and
+                  'comments' not in l.attrib['href'].lower() and
+                  # Compare case-insensitively, like the href check above.
+                  'comments' not in l.attrib.get('title', '').lower()]
+    if candidates:
+        return candidates[0].attrib['href']
+    return links[0].attrib['href']
+
+
+for (name, u) in users.items():
+    print "[%s]" % name
+    for e in u['links']:
+        (title, url) = e[0:2]
+        title = title.strip()
+        print " - %s:" % title
+        e[0] = title
+        if len(e) < 3:
+            e.append(None)
+        link = fetch_links(url)
+        if link is None:
+            # Keep going: one page without a feed must not abort the run
+            # before bloggers.yml is rewritten.
+            print " (no feed found)"
+            continue
+        # Only relative hrefs need resolving against the page URL.
+        if not link.startswith(('http:', 'https:')):
+            link = urlparse.urljoin(url, link)
+        print " %s" % (link,)
+        e[2] = link
+
+with open('bloggers.yml', 'w') as f:
+    yaml.safe_dump(users, f)
diff --git a/import.py b/import.py
new file mode 100644
index 0000000..54edbe8
--- /dev/null
+++ b/import.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+"""Convert the saved roster page /tmp/iron-blogger.html to YAML on stdout.
+
+Every table row after the first becomes one entry keyed by username, holding
+that user's (text, href) links and the stripped text of the third cell.
+"""
+from lxml import html
+import yaml
+
+with open('/tmp/iron-blogger.html') as f:
+    tree = html.fromstring(f.read())
+
+who = {}
+# The first <tr> is skipped -- presumably the table's header row.
+for tr in tree.xpath('//tr')[1:]:
+    username = str(tr.xpath('td[1]/tt/text()')[0])
+    links = tr.xpath('td[2]/a')
+    links = [(l.text, l.attrib['href']) for l in links]
+    start = str(tr.xpath('td[3]/text()')[0]).strip()
+    who[username] = dict(links=links, start=start)
+
+print yaml.safe_dump(who)
-- 
2.30.2