view backend/AbstractFeedUpdater.py @ 166:04c3b9796b89

feedparser uses the proxy now if one is configured. To implement this the FeedUpdater had to change a bit - sqlalchemy backend is not yet refactored.
author dirk
date Sat, 03 Sep 2011 04:12:35 +0200
parents 86f828096aaf
children a3c945ce434c
line wrap: on
line source


from datetime import datetime
import feedparser
import logging
from urllib2 import ProxyHandler

# HTTP status codes at or above this value make _retrieveFeed raise a FeedUpdateException.
STATUS_ERROR = 400
# Module-wide logger, registered under the name "FeedUpdater".
log = logging.getLogger("FeedUpdater")

class AbstractFeedUpdater(object):
    '''
    Abstract base class for FeedUpdater implementations - handles all the parsing of the feed.
    Subclasses need to implement creating and storing the new feed entries by
    overriding _processEntry and _incrementFeedUpdateDate.
    '''

    @staticmethod
    def parseFeed(url):
        '''
        Parse the feed at url through a fixed HTTP proxy and return the feedparser result.

        NOTE(review): the proxy address is a hard-coded placeholder and the
        preferences are not consulted here; update() goes through _retrieveFeed
        instead. Confirm whether any caller still depends on this method.
        '''
        proxy = ProxyHandler({"http": "http://your.proxy.here:8080/"})
        return feedparser.parse(url, handlers=[proxy])

    def __init__(self, preferences):
        '''
        Keep the preferences object; it is asked for the proxy settings
        (isProxyConfigured, proxyHost, proxyPort) when a feed is retrieved.
        '''
        self.preferences = preferences

    def update(self, feed):
        '''
        Fetch feed.rss_url, hand every entry to the subclass and fill in a missing feed title.

        The feed is remembered as self.feed for the duration of the update.
        Raises FeedUpdateException if the feed could not be retrieved.
        '''
        self.feed = feed
        # Let the logging framework do the formatting, only if the record is emitted.
        log.info("updating %s", feed.rss_url)
        result = self._retrieveFeed()
        self._processEntries(result)
        self._setFeedTitle(result)

    def _retrieveFeed(self):
        '''
        Download and parse self.feed.rss_url, using the configured HTTP proxy if there is one.

        Returns the feedparser result. Raises FeedUpdateException when the server
        answers with a status >= STATUS_ERROR, or when no HTTP status is available
        and feedparser reported an error without delivering any entries.
        '''
        if self.preferences.isProxyConfigured():
            proxyUrl = "http://%s:%i" % (self.preferences.proxyHost(), self.preferences.proxyPort())
            proxyHandler = ProxyHandler({"http": proxyUrl})
            result = feedparser.parse(self.feed.rss_url, handlers=[proxyHandler])
        else:
            result = feedparser.parse(self.feed.rss_url)
        # The bozo flag (feed is not well-formed) is deliberately not treated as an
        # error on its own: many malformed feeds still yield usable entries.
        status = result.get("status")
        if status is None:
            # Without an HTTP response (e.g. connection failure) there is no
            # "status" key; report the underlying error instead of a KeyError.
            error = result.get("bozo_exception")
            if error is not None and not result.get("entries"):
                raise FeedUpdateException("could not retrieve feed: " + str(error))
        elif status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        '''
        Normalize each entry of feedDict.entries and pass it to _processEntry,
        then call _incrementFeedUpdateDate once after all entries were handled.
        '''
        for entry in feedDict.entries:
            self._normalize(entry)
            self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        '''
        Make sure entry has the attributes id, updated_parsed and summary.

        - id falls back to the entry's link
        - updated_parsed becomes a datetime; the current time is used when the
          entry carries no (or no parseable) date
        - summary falls back to the first content element, then to ""
        '''
        if not hasattr(entry, "id"):
            entry.id = entry.link
        # A present-but-None date is treated like a missing one; slicing None
        # would otherwise raise a TypeError.
        parsed = getattr(entry, "updated_parsed", None)
        if parsed is None:
            entry.updated_parsed = datetime.today()
        else:
            entry.updated_parsed = datetime(*parsed[:6])
        if not hasattr(entry, "summary"):
            if hasattr(entry, "content"):
                entry.summary = entry.content[0].value
            else:
                entry.summary = ""

    def _processEntry(self, entry):
        '''
        Abstract hook: called with every normalized entry of the feed.
        '''
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        '''
        Abstract hook: called once after all entries of the feed were processed.
        '''
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        '''
        Give self.feed a title if it has none yet: the title announced by the
        feed itself, or the feed's URL when the feed does not announce one.
        '''
        if self.feed.title is not None:
            return
        if "title" in feedDict.feed:
            self.feed.title = feedDict.feed["title"]
        else:
            self.feed.title = self.feed.rss_url


class FeedUpdateException(Exception):
    '''
    Raised when a feed could not be updated, e.g. because the server answered with an HTTP error status.
    '''