Mercurial > hg > Feedworm
view backend/AbstractFeedUpdater.py @ 206:f74fe7cb5091
When updating feeds, only ever create new Feed objects for entries that are younger than the current expire date. This ensures that we do not see old, read, expired entries again.
author | dirk |
---|---|
date | Sat, 02 Jun 2012 04:30:04 +0200 |
parents | e604c32f67aa |
children | 524cbf9e413c |
line wrap: on
line source
from datetime import datetime
from urllib2 import ProxyHandler

import AbstractBackend
import feedparser
import logging

# Any HTTP status at or above this value is treated as a failed fetch.
STATUS_ERROR = 400

log = logging.getLogger("FeedUpdater")


class AbstractFeedUpdater(object):
    '''
    Abstract base class for FeedUpdater implementations - handles all the
    parsing of the feed. Subclasses need to implement creating and storing
    the new feed entries (_processEntry) and persisting the feed's update
    date (_incrementFeedUpdateDate).
    '''

    def __init__(self, preferences):
        # preferences supplies proxy settings and the entry expiry window
        self.preferences = preferences

    def update(self, feed):
        '''Fetch the given feed, set its title if missing, and process all entries.

        Raises FeedUpdateException if the fetch fails with an HTTP error status.
        '''
        self.feed = feed
        # lazy %-style logging args: the message is only built if INFO is enabled
        log.info("updating %s", feed.rss_url)
        result = self._retrieveFeed()
        self._setFeedTitle(result)
        self._processEntries(result)

    def _retrieveFeed(self):
        '''Download and parse the feed, honoring the configured HTTP proxy.

        Returns the feedparser result dict; raises FeedUpdateException when
        the server answered with an error status (>= STATUS_ERROR).
        '''
        if self.preferences.isProxyConfigured():
            proxyUrl = "http://%s:%i" % (self.preferences.proxyHost(), self.preferences.proxyPort())
            proxyHandler = ProxyHandler({"http": proxyUrl})
            result = feedparser.parse(self.feed.rss_url, handlers=[proxyHandler])
        else:
            # when updating to python3 see http://code.google.com/p/feedparser/issues/detail?id=260
            result = feedparser.parse(self.feed.rss_url)
        # NOTE: "bozo" is set when the feed is NOT well-formed; we deliberately
        # tolerate bozo feeds instead of raising here.
        # if result["bozo"] > 0:
        #     raise FeedUpdateException()
        # "status" is absent for some sources (e.g. local files or parse
        # failures before any HTTP exchange) -- default to 0 so those are
        # not mistaken for HTTP errors.
        status = result.get("status", 0)
        if status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        '''Normalize every entry and hand the non-expired ones to the subclass.'''
        for entry in feedDict.entries:
            self._normalize(entry)
            if not self._isExpired(entry):
                self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        '''Fill in attributes that feedparser may leave missing on an entry.'''
        self._normalizeId(entry)
        self._normalizePublishedDate(entry)
        self._normalizeUpdatedDate(entry)
        self._normalizeSummary(entry)

    def _normalizeId(self, entry):
        '''Fall back to the entry link when the feed provides no id.'''
        if not hasattr(entry, "id"):
            entry.id = entry.link

    def _normalizePublishedDate(self, entry):
        '''Use the updated date as the published date when the latter is missing.'''
        if not hasattr(entry, "published"):
            if hasattr(entry, "updated"):
                entry.published = entry.updated

    def _normalizeUpdatedDate(self, entry):
        '''Convert updated_parsed (a time struct) to a datetime, defaulting to now.'''
        if not hasattr(entry, "updated_parsed") or entry.updated_parsed is None:
            # TODO try to parse the entry.updated date string
            entry.updated_parsed = datetime.today()
        else:
            # feedparser yields a 9-tuple time struct; keep year..second only
            entry.updated_parsed = datetime(*entry.updated_parsed[:6])

    def _normalizeSummary(self, entry):
        '''Derive a summary from the first content element, or default to "".'''
        if not hasattr(entry, "summary"):
            if hasattr(entry, "content"):
                entry.summary = entry.content[0].value
            else:
                entry.summary = ""

    def _isExpired(self, entry):
        '''Return True when the entry is older than the configured expire date.

        Entries older than the expire date are skipped entirely so that old,
        already-read, expired entries never reappear after an update.
        '''
        expireDate = AbstractBackend.calculateExpireDate(self.preferences)
        return entry.updated_parsed < expireDate

    def _processEntry(self, entry):
        '''Create and store one feed entry; subclasses must override.'''
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        '''Advance the feed's next-update date; subclasses must override.'''
        # message corrected: it previously referred to "_incrementNextUpdateDate"
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        '''Initialize the feed title from the parsed feed, falling back to the URL.'''
        if self.feed.title is None:
            # "in" instead of the deprecated dict.has_key()
            if "title" in feedDict.feed:
                self.feed.title = feedDict.feed.title
            else:
                self.feed.title = self.feed.rss_url


class FeedUpdateException(Exception):
    '''Raised when a feed cannot be fetched or updated.'''
    pass