# Aggrebot - Python MetaWeblog aggregator bot
# Copyright (C) 2008  Graham Binns
#
# Email: graham.binns@gmail.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>
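
"""Aggrebot polls the feeds listed in sites.conf and reposts new entries.

For each site section in sites.conf the bot fetches the site's feed
with feedparser and reposts anything newer than the recorded
last_updated timestamp to a MetaWeblog-capable blog, using the XML-RPC
settings in the [core] section of aggrebot.conf. Run the module
directly (e.g. python aggrebot.py).
"""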

__version__ = '0.1'

import feedparser
import logging
import os

from datetime import datetime, timedelta
from ConfigParser import ConfigParser
from logging import getLogger

from cloglib.blogpost import BlogPost
from cloglib.metaweblog import MetaWeblog

feedparser.USER_AGENT = (
    "Aggrebot/%s +http://launchpad.net/aggrebot" % __version__)

DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
config_parser = ConfigParser()
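
# For reference, a rough sketch of the two config files this module
# reads; the option names are the ones used below, the values are
# placeholders:
#
#   # aggrebot.conf
#   [core]
#   xmlrpc_url = http://example.com/xmlrpc
#   xmlrpc_blog = 1
#   xmlrpc_user = aggrebot
#   xmlrpc_password = secret
#   xmlrpc_publish = true
#   categories = aggregated
#
#   # sites.conf
#   [some-site]
#   feed_url = http://example.com/feed/
#   site_url = http://example.com/
#   author = Some Author
#   categories = python, launchpad
#   update_interval = 3600
#
# The bot writes last_run_time ([core]), last_checked and last_updated
# (per site) back to these files in DATETIME_FORMAT, e.g.
# 2008-03-12 15:04:05.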

post_template = """\
<p><a href="%(site_url)s">%(author)s</a> wrote:</p>
%(post_body)s
<p><a href="%(post_link)s">Read more...</a></p>
"""


def get_sites():
    """Return a list of the names of the sites to be aggregated."""
    # Any config section that isn't core is treated as a site;
    # ConfigParser.sections() never includes the DEFAULT section.
    sites = [
        section for section in config_parser.sections() if section != 'core']

    return sites


def save_config(filename, sections):
    """Save the configuration to a file.

    :param filename: The name of the file to save to.
    :param sections: A list of the sections in the current configuration
        to save.
    """
    new_config = ConfigParser()

    # Loop through the sections, add them to the new config as necessary
    # and copy the current options and values across.
    for section in sections:
        if not new_config.has_section(section):
            new_config.add_section(section)

        for option, value in config_parser.items(section):
            new_config.set(section, option, value)

    config_fp = open(filename, 'w')
    try:
        new_config.write(config_fp)
    finally:
        config_fp.close()


def get_entries(site):
    """Return a list of the feed entries for a site."""
    # Last updated is the last updated date of the blog, not of Aggrebot.
    if not config_parser.has_option(site, 'last_updated'):
        last_updated = None
    else:
        last_updated = datetime.strptime(
            config_parser.get(site, 'last_updated'), DATETIME_FORMAT)

    feed_url = config_parser.get(site, 'feed_url')
    parsed_feed = feedparser.parse(feed_url)

    # If we don't have a last_updated date we don't want to aggregate
    # too many posts, so we just nab the first entry, if there is one.
    # Otherwise, we grab those whose updated_parsed date is greater
    # than last_updated. That's not the ideal way to avoid reposts,
    # but it's a start.
    if last_updated is None:
        entries = parsed_feed.entries[:1]
    else:
        entries = [
            entry for entry in parsed_feed.entries
                if datetime(*entry['updated_parsed'][:6]) > last_updated]

    return entries


def get_categories(site):
    """Return a list of the categories for posts from a given site."""
    # Load the default categories, if they exist.
    if config_parser.has_option('core', 'categories'):
        category_string = config_parser.get('core', 'categories')

        categories = [
            category.strip() for category in category_string.split(',')]
    else:
        categories = []

    # Load the categories from the site config, if they exist.
    if config_parser.has_option(site, 'categories'):
        category_string = config_parser.get(site, 'categories')

        categories = categories + [
            category.strip() for category in category_string.split(',')]

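    # E.g. core categories "aggregated" plus site categories
    # "python, launchpad" merge to ['aggregated', 'python', 'launchpad']
    # (placeholder names).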
    return categories


def post_entry(site, entry):
    """Post an entry to the remote site."""
    post_body = post_template % {
        'site_url': config_parser.get(site, 'site_url'),
        'author': config_parser.get(site, 'author'),
        'post_body': entry['summary'],
        'post_link': entry['link']}
    subject = "[%s]: %s" % (
        config_parser.get(site, 'author'), entry['title'])
    categories = get_categories(site)

    post = BlogPost(
        subject=subject, body=post_body, categories=categories)
    post_struct = post.toMetaWeblogStruct()
    post_struct['pubDate'] = entry['updated']

    # We use the metaweblog module to make the post.
    # XXX 2008-03-12 gmb:
    #     This should be changed when cloglib is more flexible.
    metaweblog = MetaWeblog(
        config_parser.get('core', 'xmlrpc_url'),
        config_parser.getint('core', 'xmlrpc_blog'),
        config_parser.get('core', 'xmlrpc_user'),
        config_parser.get('core', 'xmlrpc_password'))

    post_id = metaweblog.newPost(
        post_struct, config_parser.getboolean('core', 'xmlrpc_publish'))

    # XXX 2008-03-12 gmb: post['title'] might be WP / MT specific,
    # which is BAD.
    post = metaweblog.getPost(post_id)
    logging.info("Posted entry %s; url=%s" % (
        post['title'], post['link']))

    # We set last_updated to ensure that we don't try adding this
    # post again.
    last_updated = datetime(*entry['updated_parsed'][:6])

    # We update the site to say when we last checked and updated it.
    config_parser.set(
        site, 'last_checked',
        datetime.strftime(datetime.now(), DATETIME_FORMAT))
    config_parser.set(
        site, 'last_updated',
        datetime.strftime(last_updated, DATETIME_FORMAT))


def update_site(site):
    """Update a single site.

    :param site: The name of the site as used in sites.conf.
    """
    # If the site doesn't exist then, well, that's a bit weird.
    if not config_parser.has_section(site):
        logging.warning("Site %s doesn't exist in sites.conf." % site)
        return

    # We only update if the site hasn't been checked within the last
    # update_interval seconds. We default to 1 hour for the interval.
    if not config_parser.has_option(site, 'update_interval'):
        update_interval = 3600
    else:
        update_interval = config_parser.getint(site, 'update_interval')

    if not config_parser.has_option(site, 'last_checked'):
        last_checked = None
    else:
        last_checked = datetime.strptime(
            config_parser.get(site, 'last_checked'), DATETIME_FORMAT)

    now = datetime.now()
    if (last_checked is not None
        and now < last_checked + timedelta(seconds=update_interval)):
        logging.info(
            "Not checking %s: it was checked less than %i seconds ago." %
            (site, update_interval))
        return

    entries = get_entries(site)
    logging.info("Aggregating %i entries from %s." % (len(entries), site))

    # Now we actually post the entries to the remote blog. We process
    # the list of entries in reverse. RSS feeds usually list the most
    # recent entry first, which is exactly the order we don't want to
    # process the entries in. Of course, there's always the chance that
    # someone won't DTRT...
    for entry in reversed(entries):
        post_entry(site, entry)


def main():
    """Carry out the main processing."""
    logging.info("Aggrebot version %s." % __version__)

    # We store our last run time in the configuration file. Eww.
    if config_parser.has_option('core', 'last_run_time'):
        last_run_time = datetime.strptime(
            config_parser.get('core', 'last_run_time'), DATETIME_FORMAT)
        logging.info("Aggrebot last ran at %s." %
            datetime.strftime(last_run_time, DATETIME_FORMAT))
    else:
        last_run_time = None

    logging.info("Loading sites.")

    sites = get_sites()
    logging.info("%i sites loaded." % len(sites))

    # Update the sites.
    for site in sites:
        update_site(site)

    # We save the current time as our last run time. This could mean
    # missing some posts but it's a tradeoff between that and possible
    # double-postings.
    config_parser.set(
        'core', 'last_run_time',
        datetime.strftime(datetime.now(), DATETIME_FORMAT))

    # Finally, we write the config to the config files.
    logging.info("Saving config.")
    save_config('aggrebot.conf', ['core'])
    save_config('sites.conf', sites)


if __name__ == '__main__':
    # Bit of a cheat, but we assume everything's in one directory.
    base_path = os.path.dirname(__file__)
    config_parser.read([
        os.path.join(base_path, 'aggrebot.conf'),
        os.path.join(base_path, 'sites.conf')])

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s')

    main()