Mercurial > hg > Feedworm
view backend/AbstractFeedUpdater.py @ 257:75b81da8d7a5
convert the feed entry timestamps to arango compatible date strings in migration
author | Dirk Olmes <dirk@xanthippe.ping.de> |
---|---|
date | Tue, 12 Mar 2019 02:38:41 +0100 |
parents | 8e73a8ae863f |
children |
line wrap: on
line source
# -*- coding: utf-8 -*-
import AbstractBackend
import feedparser
import logging
from datetime import datetime

try:
    from urllib2 import ProxyHandler
except ImportError:
    # Python 3 moved ProxyHandler into urllib.request
    from urllib.request import ProxyHandler

# HTTP status codes at or above this value are treated as a failed update
STATUS_ERROR = 400

log = logging.getLogger("FeedUpdater")


class AbstractFeedUpdater(object):
    """
    Abstract base class for FeedUpdater implementations - handles all the
    parsing of the feed. Subclasses need to implement creating and storing
    the new feed entries (_processEntry and _incrementFeedUpdateDate).
    """

    def __init__(self, preferences):
        """Remember the preferences object used for proxy and expiry settings."""
        self.preferences = preferences

    def update(self, feed):
        """
        Retrieve the feed, set its title if it has none yet and process all
        entries that are not expired.

        Raises FeedUpdateException if the feed could not be retrieved.
        """
        self.feed = feed
        log.info("updating %s", feed.rss_url)
        result = self._retrieveFeed()
        self._setFeedTitle(result)
        self._processEntries(result)

    def _retrieveFeed(self):
        """
        Download and parse self.feed.rss_url, going through the proxy from
        the preferences if one is configured and enabled.

        Returns the feedparser result. Raises FeedUpdateException if there is
        no HTTP status on the result or if the status is >= STATUS_ERROR.
        """
        # when updating to python3 see http://code.google.com/p/feedparser/issues/detail?id=260
        handlers = []
        if self.preferences.isProxyConfigured() and self.preferences.useProxy():
            proxyUrl = '{0}:{1}'.format(self.preferences.proxyHost(), self.preferences.proxyPort())
            handlers.append(ProxyHandler({'http': proxyUrl, 'https': proxyUrl}))

        # handlers has to be passed by keyword: the second positional
        # parameter of feedparser.parse is the etag, not the handler list
        result = feedparser.parse(self.feed.rss_url, handlers=handlers)

        # bozo flags a feed that is not well-formed. Such feeds are only
        # logged, not rejected, so whatever could be parsed is still processed.
        if result.bozo > 0:
            log.warning('result contains bozo')
            log.warning(result)

        # the status is missing if no HTTP response was received at all
        status = getattr(result, 'status', None)
        if status is None:
            raise FeedUpdateException("no HTTP status for " + str(self.feed.rss_url))
        if status >= STATUS_ERROR:
            raise FeedUpdateException("HTTP status " + str(status))
        return result

    def _processEntries(self, feedDict):
        """
        Normalize every entry of the parsed feed and hand those that are not
        expired to _processEntry, then call _incrementFeedUpdateDate once.
        """
        for entry in feedDict.entries:
            self._normalize(entry)
            if not self._isExpired(entry):
                self._processEntry(entry)
        self._incrementFeedUpdateDate()

    def _normalize(self, entry):
        """Fill in the id, published, updated_parsed and summary of the entry."""
        self._normalizeId(entry)
        self._normalizePublishedDate(entry)
        self._normalizeUpdatedDate(entry)
        self._normalizeSummary(entry)

    def _normalizeId(self, entry):
        """Use the entry's link as its id if it has no id."""
        if not hasattr(entry, "id"):
            entry.id = entry.link

    def _normalizePublishedDate(self, entry):
        """Use the updated value as published value if only the former is present."""
        if not hasattr(entry, "published") and hasattr(entry, "updated"):
            entry.published = entry.updated

    def _normalizeUpdatedDate(self, entry):
        """
        Replace entry.updated_parsed with a datetime built from the first six
        fields of the parsed time. The current date and time is used if the
        entry has no parsed updated date.
        """
        parsed = getattr(entry, "updated_parsed", None)
        if parsed is None:
            # TODO: try to parse the entry.updated date string
            entry.updated_parsed = datetime.today()
        else:
            entry.updated_parsed = datetime(*parsed[:6])

    def _normalizeSummary(self, entry):
        """
        Make sure the entry has a summary: fall back to the value of the first
        content element or to the empty string if there is no content.
        """
        if hasattr(entry, "summary"):
            return
        content = getattr(entry, "content", None)
        # content may be present but empty, do not index into it blindly
        entry.summary = content[0].value if content else ""

    def _isExpired(self, entry):
        """
        Return True if the (normalized) updated date of the entry is before
        the expire date calculated from the preferences.
        """
        expireDate = AbstractBackend.calculateExpireDate(self.preferences)
        return entry.updated_parsed < expireDate

    def _processEntry(self, entry):
        """Abstract: called for each normalized entry that is not expired."""
        raise NotImplementedError("_processEntry is abstract, subclasses must override")

    def _incrementFeedUpdateDate(self):
        """Abstract: called once after all entries of the feed were processed."""
        raise NotImplementedError("_incrementFeedUpdateDate is abstract, subclasses must override")

    def _setFeedTitle(self, feedDict):
        """
        Set the title of a feed that has none yet: use the title from the
        parsed feed or fall back to the feed's URL.
        """
        if self.feed.title is not None:
            return
        if 'title' in feedDict.feed:
            self.feed.title = feedDict.feed.title
        else:
            self.feed.title = self.feed.rss_url


class FeedUpdateException(Exception):
    """Raised when a feed could not be retrieved."""
    pass