view backend/AbstractFeedUpdater.py @ 206:f74fe7cb5091

When updating feeds, only ever create new Feed objects for entries that are younger than the current expire date. This ensures that we do not see old, read, expired entries again.
author dirk
date Sat, 02 Jun 2012 04:30:04 +0200
parents e604c32f67aa
children 524cbf9e413c
line wrap: on
line source


from datetime import datetime
from urllib2 import ProxyHandler
import AbstractBackend
import feedparser
import logging

# Lowest HTTP status code that is treated as a fetch error (4xx/5xx).
STATUS_ERROR = 400
# Module-level logger shared by all FeedUpdater implementations.
log = logging.getLogger("FeedUpdater")

class AbstractFeedUpdater(object):
    '''
    Abstract base class for FeedUpdater implementations - handles all the parsing of the feed.
    Subclasses need to implement creating and storing the new feed entries
    (_processEntry) and advancing the feed's update date (_incrementFeedUpdateDate).
    '''

    def __init__(self, preferences):
        # preferences supplies proxy configuration and the entry expiry horizon.
        self.preferences = preferences

    def update(self, feed):
        '''Fetch the given feed, initialize its title if needed and process all
        entries that are not yet expired.

        Raises FeedUpdateException when the server responds with an HTTP error status.
        '''
        self.feed = feed
        log.info("updating " + feed.rss_url)
        result = self._retrieveFeed()
        self._setFeedTitle(result)
        self._processEntries(result)

    def _retrieveFeed(self):
        '''Download and parse the feed, honoring an optional HTTP proxy.

        Returns the feedparser result dict; raises FeedUpdateException on an
        HTTP status >= STATUS_ERROR.
        '''
        if self.preferences.isProxyConfigured():
            proxyUrl = "http://%s:%i" % (self.preferences.proxyHost(), self.preferences.proxyPort())
            proxyHandler = ProxyHandler({"http" : proxyUrl})
            result = feedparser.parse(self.feed.rss_url, handlers=[proxyHandler])
        else:
            # when updating to python3 see http://code.google.com/p/feedparser/issues/detail?id=260
            result = feedparser.parse(self.feed.rss_url)
        # NOTE: result["bozo"] flags an ill-formed feed; it is deliberately ignored
        # here because many real-world feeds are slightly malformed but still usable.
        status = result["status"]
        if status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        '''Normalize every entry and hand the non-expired ones to the subclass.'''
        for entry in feedDict.entries:
            self._normalize(entry)
            if not self._isExpired(entry):
                self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        '''Fill in attributes that optional feed formats may omit.'''
        self._normalizeId(entry)
        self._normalizePublishedDate(entry)
        self._normalizeUpdatedDate(entry)
        self._normalizeSummary(entry)

    def _normalizeId(self, entry):
        # Fall back to the entry link when the feed provides no unique id.
        if not hasattr(entry, "id"):
            entry.id = entry.link

    def _normalizePublishedDate(self, entry):
        # Some feeds only carry an "updated" date; reuse it as the published date.
        if not hasattr(entry, "published"):
            if hasattr(entry, "updated"):
                entry.published = entry.updated

    def _normalizeUpdatedDate(self, entry):
        '''Replace the time tuple in updated_parsed with a datetime object.'''
        if not hasattr(entry, "updated_parsed") or entry.updated_parsed is None:
            # TODO try to parse the entry.updated date string
            entry.updated_parsed = datetime.today()
        else:
            entry.updated_parsed = datetime(*entry.updated_parsed[:6])

    def _normalizeSummary(self, entry):
        # Prefer the first content element; otherwise use an empty summary.
        if not hasattr(entry, "summary"):
            if hasattr(entry, "content"):
                entry.summary = entry.content[0].value
            else:
                entry.summary = ""

    def _isExpired(self, entry):
        '''Return True when the entry is older than the configured expire date.'''
        expireDate = AbstractBackend.calculateExpireDate(self.preferences)
        return entry.updated_parsed < expireDate

    def _processEntry(self, entry):
        '''Create and store a new feed entry - subclasses must override.'''
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        '''Advance the feed's next update date - subclasses must override.'''
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        '''Initialize the feed title from the parsed feed, falling back to the URL.'''
        if self.feed.title is None:
            # dict.has_key() is deprecated and gone in python3; "in" works in both.
            if "title" in feedDict.feed:
                self.feed.title = feedDict.feed.title
            else:
                self.feed.title = self.feed.rss_url


class FeedUpdateException(Exception):
    '''Raised when a feed cannot be retrieved, e.g. on an HTTP error status.'''