view backend/AbstractFeedUpdater.py @ 205:adf7f617bda9

Make the name of the design document configurable via a command-line switch. When cloning the feedworm DB, the design document no longer has the same name as the database.
author dirk
date Sat, 02 Jun 2012 04:24:49 +0200
parents e604c32f67aa
children f74fe7cb5091
line wrap: on
line source


from datetime import datetime
import feedparser
import logging
from urllib2 import ProxyHandler

STATUS_ERROR = 400
log = logging.getLogger("FeedUpdater")

class AbstractFeedUpdater(object):
    '''
    Abstract base class for FeedUpdater implementations - handles retrieving
    and parsing the feed and normalizing the parsed entries so downstream
    code can rely on a consistent entry shape.

    Subclasses must override _processEntry() and _incrementFeedUpdateDate()
    to create and store the new feed entries.
    '''

    def __init__(self, preferences):
        # preferences supplies the proxy configuration queried in
        # _retrieveFeed (isProxyConfigured / proxyHost / proxyPort).
        self.preferences = preferences

    def update(self, feed):
        '''
        Fetch the given feed, set its title if still unset, and process
        all of its entries. Raises FeedUpdateException when the server
        answers with an HTTP error status.
        '''
        self.feed = feed
        # lazy %-style args: the string is only built if INFO is enabled
        log.info("updating %s", feed.rss_url)
        result = self._retrieveFeed()
        self._setFeedTitle(result)
        self._processEntries(result)

    def _retrieveFeed(self):
        '''
        Download and parse the feed, honoring an optional HTTP proxy.
        Returns the feedparser result dict; raises FeedUpdateException
        for an HTTP status >= 400.
        '''
        if self.preferences.isProxyConfigured():
            proxyUrl = "http://%s:%i" % (self.preferences.proxyHost(), self.preferences.proxyPort())
            proxyHandler = ProxyHandler({"http" : proxyUrl})
            result = feedparser.parse(self.feed.rss_url, handlers=[proxyHandler])
        else:
            # when updating to python3 see http://code.google.com/p/feedparser/issues/detail?id=260
            result = feedparser.parse(self.feed.rss_url)
        # result["bozo"] flags a malformed feed; many real-world feeds are
        # slightly malformed yet still usable, so it is deliberately ignored.
        # NOTE(review): "status" is only present for http/https fetches --
        # a non-HTTP URL would raise KeyError here; confirm all rss_urls
        # are HTTP(S) before relying on this.
        status = result["status"]
        if status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        # Normalize every entry before handing it to the subclass so that
        # _processEntry can rely on id, published, updated_parsed and
        # summary being present.
        for entry in feedDict.entries:
            self._normalize(entry)
            self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        '''Ensure the attributes all downstream code relies on exist.'''
        self._normalizeId(entry)
        self._normalizePublishedDate(entry)
        self._normalizeUpdatedDate(entry)
        self._normalizeSummary(entry)

    def _normalizeId(self, entry):
        # Some feeds omit the entry id; fall back to the entry link.
        if not hasattr(entry, "id"):
            entry.id = entry.link

    def _normalizePublishedDate(self, entry):
        # Atom feeds may only carry an updated date; reuse it as published.
        if not hasattr(entry, "published") and hasattr(entry, "updated"):
            entry.published = entry.updated

    def _normalizeUpdatedDate(self, entry):
        '''Replace feedparser's time tuple with a datetime instance.'''
        if getattr(entry, "updated_parsed", None) is None:
            # TODO try to parse the entry.updated date string
            entry.updated_parsed = datetime.today()
        else:
            entry.updated_parsed = datetime(*entry.updated_parsed[:6])

    def _normalizeSummary(self, entry):
        # Prefer an explicit summary, then the first content element,
        # then an empty string so the attribute is always present.
        if not hasattr(entry, "summary"):
            if hasattr(entry, "content"):
                entry.summary = entry.content[0].value
            else:
                entry.summary = ""

    def _processEntry(self, entry):
        '''Create and store one feed entry. Abstract.'''
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        '''Advance the feed's next-update timestamp. Abstract.'''
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        # Only fill in the title once; fall back to the feed URL when the
        # feed itself does not provide a title.
        if self.feed.title is None:
            self.feed.title = feedDict.feed.get("title", self.feed.rss_url)


class FeedUpdateException(Exception):
    '''Raised when a feed cannot be retrieved or parsed successfully.'''