# -*- coding: utf-8 -*-
"""
    plnt.sync
    ~~~~~~~~~

    Does the synchronization.  Called by "manage-plnt.py sync"

    :copyright: (c) 2009 by the Werkzeug Team, see AUTHORS for more details.
    :license: BSD.
"""
import sys
import feedparser
from time import time
from datetime import datetime
from werkzeug import escape
from plnt.database import Blog, Entry, session
from plnt.utils import strip_tags, nl2p


# content types whose value is already markup and must not be escaped again.
HTML_MIMETYPES = set(['text/html', 'application/xhtml+xml'])


def _entry_title(feed_entry):
    """Return the title of a parsed feed entry, or an empty value.

    HTML titles have their tags stripped, every other title is HTML
    escaped.
    """
    if 'title_detail' in feed_entry:
        detail = feed_entry.title_detail
        title = detail.get('value') or ''
        if detail.get('type') in HTML_MIMETYPES:
            return strip_tags(title)
        return escape(title)
    # without a title_detail the type is unknown; treat the title like
    # any other non-HTML title so that every code path stores escaped
    # text.
    return escape(feed_entry.get('title') or '')


def _entry_text(feed_entry):
    """Return the body of a parsed feed entry, or ``''`` if it has none.

    The first ``content`` element is preferred, ``summary_detail`` is the
    fallback.
    """
    detail = None
    content = feed_entry.get('content')
    if content:
        # an empty content list simply falls through to the summary.
        detail = content[0]
    if not detail:
        detail = feed_entry.get('summary_detail')
    if not detail:
        return ''

    # if we have an html text we use that, otherwise we HTML escape the
    # text and use that one.  We also handle XHTML with our tag soup
    # parser for the moment.
    value = detail.get('value') or ''
    if detail.get('type') not in HTML_MIMETYPES:
        # NOTE(review): escape() is applied to the *output* of nl2p(); if
        # nl2p() emits markup it ends up escaped too -- confirm that this
        # order is intended.
        value = escape(nl2p(value))
    return value


def _entry_dates(feed_entry):
    """Return ``(pub_date, updated)`` as datetime objects.

    Returns `None` if the entry carries no usable date at all.  This is
    rather complex because different feeds do different stuff.
    """
    published = feed_entry.get('published_parsed') or \
                feed_entry.get('created_parsed') or \
                feed_entry.get('date_parsed')
    updated = feed_entry.get('updated_parsed') or published
    published = published or updated
    if not published:
        return None
    # convert the time tuples to datetime objects.
    return datetime(*published[:6]), datetime(*updated[:6])


def sync():
    """
    Performs a synchronization.  Every feed entry of every blog is stored
    as a new `Entry` or used to update the stored one with the same guid.
    Entries that are already synchronized and have not been updated since
    aren't touched anymore.  Entries without guid, title, text or date
    are skipped.
    """
    for blog in Blog.query.all():
        # parse the feed.  feedparser.parse will never raise an exception
        # but the bozo bit might be defined.
        feed = feedparser.parse(blog.feed_url)

        for feed_entry in feed.entries:
            # get the guid.  either the id if specified, otherwise the
            # link.  if none is available we skip the entry.
            guid = feed_entry.get('id') or feed_entry.get('link')
            if not guid:
                continue

            # get title, url and text.  skip if no title or no text is
            # given.  if the link is missing we use the blog link.
            title = _entry_title(feed_entry)
            if not title:
                continue
            url = feed_entry.get('link') or blog.blog_url
            text = _entry_text(feed_entry)
            if not text.strip():
                continue

            # if we don't have a pub_date we skip.
            dates = _entry_dates(feed_entry)
            if dates is None:
                continue
            pub_date, updated = dates

            # get an old entry for the guid to check if we need to update
            # or recreate the item.  this is done only now so that
            # entries skipped above never cost a database query.
            old_entry = Entry.query.filter_by(guid=guid).first()
            if old_entry and updated <= old_entry.last_update:
                continue

            # create a new entry object based on the data collected or
            # update the old one.
            db_entry = old_entry or Entry()
            db_entry.blog = blog
            db_entry.guid = guid
            db_entry.title = title
            db_entry.url = url
            db_entry.text = text
            db_entry.pub_date = pub_date
            db_entry.last_update = updated
            session.add(db_entry)

    # persist everything that was added or changed above.
    session.commit()