# Aggrebot - Python MetaWeblog aggregator bot
# Copyright (C) 2008  Graham Binns
#
# Email: graham.binns@gmail.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>
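
"""Aggrebot polls the feeds listed in sites.conf and reposts new entries.

For each site section in sites.conf the bot fetches the site's feed
with feedparser and reposts anything newer than the recorded
last_updated timestamp to a MetaWeblog-capable blog, using the XML-RPC
settings in the [core] section of aggrebot.conf. Run the module
directly (e.g. python aggrebot.py).
"""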

__version__ = '0.1'

import feedparser
import logging
import os

from datetime import datetime, timedelta
from ConfigParser import ConfigParser
from logging import getLogger

from cloglib.blogpost import BlogPost
from cloglib.metaweblog import MetaWeblog

feedparser.USER_AGENT = (
    "Aggrebot/%s +http://launchpad.net/aggrebot" % __version__)

DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
config_parser = ConfigParser()
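
# For reference, a rough sketch of the two config files this module
# reads; the option names are the ones used below, the values are
# placeholders:
#
#   # aggrebot.conf
#   [core]
#   xmlrpc_url = http://example.com/xmlrpc
#   xmlrpc_blog = 1
#   xmlrpc_user = aggrebot
#   xmlrpc_password = secret
#   xmlrpc_publish = true
#   categories = aggregated
#
#   # sites.conf
#   [some-site]
#   feed_url = http://example.com/feed/
#   site_url = http://example.com/
#   author = Some Author
#   categories = python, launchpad
#   update_interval = 3600
#
# The bot writes last_run_time ([core]), last_checked and last_updated
# (per site) back to these files in DATETIME_FORMAT, e.g.
# 2008-03-12 15:04:05.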

post_template = """\
<p><a href="%(site_url)s">%(author)s</a> wrote:</p>
%(post_body)s
<p><a href="%(post_link)s">Read more...</a></p>
"""


def get_sites():
    """Return a list of the names of the sites to be aggregated."""
    # Any config section that isn't core is treated as a site;
    # ConfigParser.sections() never includes the DEFAULT section.
    sites = [
        section for section in config_parser.sections() if section != 'core']

    return sites


def save_config(filename, sections):
    """Save the configuration to a file.

    :param filename: The name of the file to save to.
    :param sections: A list of the sections in the current configuration
        to save.
    """
    new_config = ConfigParser()

    # Loop through the sections, add them to the new config as necessary
    # and copy the current options and values across.
    for section in sections:
        if not new_config.has_section(section):
            new_config.add_section(section)

        for option, value in config_parser.items(section):
            new_config.set(section, option, value)

    config_fp = open(filename, 'w')
    try:
        new_config.write(config_fp)
    finally:
        config_fp.close()


def get_entries(site):
    """Return a list of the feed entries for a site."""
    # Last updated is the last updated date of the blog, not of Aggrebot.
    if not config_parser.has_option(site, 'last_updated'):
        last_updated = None
    else:
        last_updated = datetime.strptime(
            config_parser.get(site, 'last_updated'), DATETIME_FORMAT)

    feed_url = config_parser.get(site, 'feed_url')
    parsed_feed = feedparser.parse(feed_url)

    # If we don't have a last_updated date we don't want to aggregate
    # too many posts, so we just nab the first entry, if there is one.
    # Otherwise, we grab those whose updated_parsed date is greater
    # than last_updated. That's not the ideal way to avoid reposts,
    # but it's a start.
    if last_updated is None:
        entries = parsed_feed.entries[:1]
    else:
        entries = [
            entry for entry in parsed_feed.entries
                if datetime(*entry['updated_parsed'][:6]) > last_updated]

    return entries


def get_categories(site):
    """Return a list of the categories for posts from a given site."""
    # Load the default categories, if they exist.
    if config_parser.has_option('core', 'categories'):
        category_string = config_parser.get('core', 'categories')

        categories = [
            category.strip() for category in category_string.split(',')]
    else:
        categories = []

    # Load the categories from the site config, if they exist.
    if config_parser.has_option(site, 'categories'):
        category_string = config_parser.get(site, 'categories')

        categories = categories + [
            category.strip() for category in category_string.split(',')]

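    # E.g. core categories "aggregated" plus site categories
    # "python, launchpad" merge to ['aggregated', 'python', 'launchpad']
    # (placeholder names).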
    return categories


def post_entry(site, entry):
    """Post an entry to the remote site."""
    post_body = post_template % {
        'site_url': config_parser.get(site, 'site_url'),
        'author': config_parser.get(site, 'author'),
        'post_body': entry['summary'],
        'post_link': entry['link']}
    subject = "[%s]: %s" % (
        config_parser.get(site, 'author'), entry['title'])
    categories = get_categories(site)

    post = BlogPost(
        subject=subject, body=post_body, categories=categories)
    post_struct = post.toMetaWeblogStruct()
    post_struct['pubDate'] = entry['updated']

    # We use the metaweblog module to make the post.
    # XXX 2008-03-12 gmb:
    #     This should be changed when cloglib is more flexible.
    metaweblog = MetaWeblog(
        config_parser.get('core', 'xmlrpc_url'),
        config_parser.getint('core', 'xmlrpc_blog'),
        config_parser.get('core', 'xmlrpc_user'),
        config_parser.get('core', 'xmlrpc_password'))

    post_id = metaweblog.newPost(
        post_struct, config_parser.getboolean('core', 'xmlrpc_publish'))

    # XXX 2008-03-12 gmb: post['title'] might be WP / MT specific,
    # which is BAD.
    post = metaweblog.getPost(post_id)
    logging.info("Posted entry %s; url=%s" % (
        post['title'], post['link']))

    # We set last_updated to ensure that we don't try adding this
    # post again.
    last_updated = datetime(*entry['updated_parsed'][:6])

    # We update the site to say when we last checked and updated it.
    config_parser.set(
        site, 'last_checked',
        datetime.strftime(datetime.now(), DATETIME_FORMAT))
    config_parser.set(
        site, 'last_updated',
        datetime.strftime(last_updated, DATETIME_FORMAT))


def update_site(site):
    """Update a single site.

    :param site: The name of the site as used in sites.conf.
    """
    # If the site doesn't exist then, well, that's a bit weird.
    if not config_parser.has_section(site):
        logging.warning("Site %s doesn't exist in sites.conf." % site)
        return

    # We only update if the site hasn't been checked within the last
    # update_interval seconds. We default to 1 hour for the interval.
    if not config_parser.has_option(site, 'update_interval'):
        update_interval = 3600
    else:
        update_interval = config_parser.getint(site, 'update_interval')

    if not config_parser.has_option(site, 'last_checked'):
        last_checked = None
    else:
        last_checked = datetime.strptime(
            config_parser.get(site, 'last_checked'), DATETIME_FORMAT)

    now = datetime.now()
    if (last_checked is not None
        and now < last_checked + timedelta(seconds=update_interval)):
        logging.info(
            "Not checking %s: it was checked less than %i seconds ago." %
            (site, update_interval))
        return

    entries = get_entries(site)
    logging.info("Aggregating %i entries from %s." % (len(entries), site))

    # Now we actually post the entries to the remote blog. We process
    # the list of entries in reverse. RSS feeds usually list the most
    # recent entry first, which is exactly the order we don't want to
    # process the entries in. Of course, there's always the chance that
    # someone won't DTRT...
    for entry in reversed(entries):
        post_entry(site, entry)


def main():
    """Carry out the main processing."""
    logging.info("Aggrebot version %s." % __version__)

    # We store our last run time in the configuration file. Eww.
    if config_parser.has_option('core', 'last_run_time'):
        last_run_time = datetime.strptime(
            config_parser.get('core', 'last_run_time'), DATETIME_FORMAT)
        logging.info("Aggrebot last ran at %s." %
            datetime.strftime(last_run_time, DATETIME_FORMAT))
    else:
        last_run_time = None

    logging.info("Loading sites.")

    sites = get_sites()
    logging.info("%i sites loaded." % len(sites))

    # Update the sites.
    for site in sites:
        update_site(site)

    # We save the current time as our last run time. This could mean
    # missing some posts but it's a tradeoff between that and possible
    # double-postings.
    config_parser.set(
        'core', 'last_run_time',
        datetime.strftime(datetime.now(), DATETIME_FORMAT))

    # Finally, we write the config to the config files.
    logging.info("Saving config.")
    save_config('aggrebot.conf', ['core'])
    save_config('sites.conf', sites)


if __name__ == '__main__':
    # Bit of a cheat, but we assume everything's in one directory.
    base_path = os.path.dirname(__file__)
    config_parser.read([
        os.path.join(base_path, 'aggrebot.conf'),
        os.path.join(base_path, 'sites.conf')])

    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s')

    main()