projects.mako.cc - iron-blogger/blobdiff - scan-feeds.py
Skip cbayley because she was away from a computer all week
[iron-blogger] / scan-feeds.py
index 6373316005a104baa2666ed68b980d23708e7831..bf60aefbaeec5a3b4e6b02bda4bffe89ce2f99cc 100755 (executable)
@@ -2,13 +2,18 @@
 import yaml
 import feedparser
 import datetime
 import yaml
 import feedparser
 import datetime
+import sys
 from dateutil.parser import parse
 import dateutil.tz as tz
 
# Roster of participating bloggers, keyed by username (repo-local config).
with open('bloggers.yml') as f:
    users = yaml.safe_load(f)
# Resume from the previous scan when a report exists, so posts accumulate
# across runs; fall back to an empty log on the very first run.
try:
    with open('out/report.yml') as f:
        log = yaml.safe_load(f)
except IOError:
    log = {}

# Naive datetime used as the epoch for week numbering.
# NOTE(review): presumably the start of week 0 of the competition — confirm
# against the week computation in parse_feeds.
START = datetime.datetime(2009, 12, 21, 6)
def parse_published(pub):
    """Parse a feed timestamp string into a naive datetime in local time."""
    localized = parse(pub).astimezone(tz.tzlocal())
    return localized.replace(tzinfo=None)
def get_date(post):
    """Return the best available timestamp string for a feed entry.

    Prefers 'published', then 'created', then 'updated'; returns None
    when the entry carries none of them.
    """
    return next((post[key]
                 for key in ('published', 'created', 'updated')
                 if key in post),
                None)
def get_link(post):
    """Return the entry's primary URL (feedparser's resolved 'link')."""
    return getattr(post, 'link')
 def parse_feeds(weeks, uri):
     feed = feedparser.parse(uri)
 
 def parse_feeds(weeks, uri):
     feed = feedparser.parse(uri)
+    if not feed.entries:
+        print >>sys.stderr, "WARN: no entries for ", uri
     for post in feed.entries:
         date = parse_published(get_date(post))
 
     for post in feed.entries:
         date = parse_published(get_date(post))
 
@@ -44,19 +41,17 @@ def parse_feeds(weeks, uri):
 
         while len(weeks) <= wn:
             weeks.append([])
 
         while len(weeks) <= wn:
             weeks.append([])
-        weeks[wn].append(dict(
-                date=date,
-                title=post.title,
-                url=get_link(post)))
+
+        post = dict(date=date,
+                    title=post.title,
+                    url=get_link(post))
+        if post['url'] not in [p['url'] for p in weeks[wn]]:
+            weeks[wn].append(post)
 
# Fold every blogger's feeds into their running week-by-week post log.
# setdefault keeps weeks already present in a resumed log and registers an
# empty history for bloggers seen for the first time.
for (username, info) in users.items():
    weeks = log.setdefault(username, [])
    for link in info['links']:
        parse_feeds(weeks, link[2])

# Write the updated report back out for the next run to resume from.
with open('out/report.yml', 'w') as f:
    yaml.safe_dump(log, f)

Benjamin Mako Hill || Want to submit a patch?