view backend/AbstractFeedUpdater.py @ 197:e604c32f67aa

normalize the published date if the feed contains none
author dirk
date Tue, 24 Jan 2012 10:08:45 +0100
parents 2f2016a10f7d
children f74fe7cb5091
line wrap: on
line source


from datetime import datetime
import feedparser
import logging
from urllib2 import ProxyHandler

# HTTP status codes at or above this value are treated as a failed retrieval.
STATUS_ERROR = 400
# Logger shared by everything in this module, registered under the name "FeedUpdater".
log = logging.getLogger("FeedUpdater")

class AbstractFeedUpdater(object):
    '''
    Abstract base class for FeedUpdater implementations - handles all the parsing of the feed.
    Subclasses need to implement creating and storing the new feed entries by
    overriding _processEntry and _incrementFeedUpdateDate.
    '''

    def __init__(self, preferences):
        '''
        Remember the preferences; they are asked for the proxy settings
        (isProxyConfigured(), proxyHost(), proxyPort()) when a feed is retrieved.
        '''
        self.preferences = preferences

    def update(self, feed):
        '''
        Retrieve the given feed, set its title if it has none yet and pass every
        normalized entry to _processEntry.

        The feed object must provide the attributes rss_url and title.
        Raises FeedUpdateException if the feed could not be retrieved.
        '''
        self.feed = feed
        # let the logging framework do the formatting, it is skipped if INFO is disabled
        log.info("updating %s", feed.rss_url)
        result = self._retrieveFeed()
        self._setFeedTitle(result)
        self._processEntries(result)

    def _retrieveFeed(self):
        '''
        Download and parse self.feed.rss_url, going through the configured HTTP proxy if there is one.

        Returns the result of feedparser.parse().
        Raises FeedUpdateException on an HTTP error status or if nothing could be retrieved at all.
        '''
        if self.preferences.isProxyConfigured():
            proxyUrl = "http://%s:%i" % (self.preferences.proxyHost(), self.preferences.proxyPort())
            proxyHandler = ProxyHandler({"http" : proxyUrl})
            result = feedparser.parse(self.feed.rss_url, handlers=[proxyHandler])
        else:
            # when updating to python3 see http://code.google.com/p/feedparser/issues/detail?id=260
            result = feedparser.parse(self.feed.rss_url)
        # bozo flags a feed that is not well-formed. Such feeds are deliberately still
        # processed, many real world feeds are slightly broken but usable.
        status = result.get("status")
        if status is None:
            # feedparser only reports a status if it got an HTTP response. Without one
            # (e.g. host unreachable) fail with our own exception instead of a KeyError,
            # but only if there is really nothing to process.
            if result.get("bozo") and not result.get("entries"):
                raise FeedUpdateException("feed could not be retrieved: %r" % (result.get("bozo_exception"),))
        elif status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        '''
        Normalize each entry of the parsed feed and pass it to _processEntry,
        afterwards call _incrementFeedUpdateDate once.
        '''
        for entry in feedDict.entries:
            self._normalize(entry)
            self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        '''
        Fill in the entry attributes that feeds commonly omit (id, published,
        updated_parsed, summary). The entry is modified in place.
        '''
        self._normalizeId(entry)
        self._normalizePublishedDate(entry)
        self._normalizeUpdatedDate(entry)
        self._normalizeSummary(entry)

    def _normalizeId(self, entry):
        '''
        Use the link as id if the entry has no id.
        '''
        if not hasattr(entry, "id"):
            entry.id = entry.link

    def _normalizePublishedDate(self, entry):
        '''
        Use the updated date as published date if the entry has no published date.
        An entry that has neither is left without a published attribute.
        '''
        if not hasattr(entry, "published") and hasattr(entry, "updated"):
            entry.published = entry.updated

    def _normalizeUpdatedDate(self, entry):
        '''
        Replace entry.updated_parsed (a time tuple) by a datetime.
        Entries without a parsed updated date get the current time.
        '''
        parsed = getattr(entry, "updated_parsed", None)
        if parsed is None:
            # TODO try to parse the entry.updated date string
            # NOTE(review): today() is naive local time while the tuples delivered by
            # feedparser are presumably UTC - confirm which one the storage expects
            entry.updated_parsed = datetime.today()
        else:
            # only year .. second are needed, the remaining tuple fields are weekday etc.
            entry.updated_parsed = datetime(*parsed[:6])

    def _normalizeSummary(self, entry):
        '''
        Use the first content element as summary if the entry has no summary,
        fall back to an empty string if there is no content either.
        '''
        if not hasattr(entry, "summary"):
            if hasattr(entry, "content"):
                entry.summary = entry.content[0].value
            else:
                entry.summary = ""

    def _processEntry(self, entry):
        '''
        Abstract - called once for every normalized entry of the feed.
        '''
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        '''
        Abstract - called once after all entries of the feed have been processed.
        '''
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        '''
        Give self.feed a title if it has none: the title from the parsed feed,
        or the feed url if the parsed feed has no title.
        '''
        if self.feed.title is None:
            # "in" instead of has_key(), has_key() does not exist in python3 anymore
            if "title" in feedDict.feed:
                self.feed.title = feedDict.feed.title
            else:
                self.feed.title = self.feed.rss_url


class FeedUpdateException(Exception):
    '''
    Signals that a feed could not be updated, the message carries the reason.
    '''