projects.mako.cc - iron-blogger/blob - import-feeds.py

   1 from lxml import html
   2 import yaml
   3 import urllib2
   4 import urlparse
   5
   6 with open('bloggers.yml') as f:
   7     users = yaml.safe_load(f.read())
   8
   9 def fetch_links(url):
  10     tree = html.fromstring(urllib2.urlopen(url).read())
  11     links = tree.xpath(
  12         '//link[@rel="alternate"][contains(@type, "rss") or ' +
  13         'contains(@type, "atom") or contains(@type, "rdf")]')
  14     candidates = [l for l in links if
  15                   'atom' in l.attrib['type'] and
  16                   'comments' not in l.attrib['href'].lower() and
  17                   'comments' not in l.attrib.get('title','')]
  18     if candidates:
  19         return candidates[0].attrib['href']
  20     return links[0].attrib['href']
  21
  22 for (name, u) in users.items():
  23     print "[%s]" % name
  24     for e in u['links']:
  25         (title, url) = e[0:2]
  26         print " - %s:" % title.strip()
  27         e[0] = e[0].strip()
  28         if len(e) == 3:
  29             continue
  30         link = fetch_links(url)
  31         if not link.startswith('http:'):
  32             link = urlparse.urljoin(url, link)
  33         print "   %s" % (link,)
  34         e.append(link)
  35
  36 with open('bloggers.yml', 'w') as f:
  37     yaml.safe_dump(users, f)