2
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
3
'''Convert websites into LRF files.'''
5
import sys, tempfile, shutil, os, logging, imp, inspect, re
6
from urlparse import urlsplit
8
from calibre import __appname__, setup_cli_handlers, CommandLineError, strftime
9
from calibre.ebooks.lrf import option_parser as lrf_option_parser
10
from calibre.ebooks.lrf.html.convert_from import process_file
12
from calibre.web.fetch.simple import create_fetcher
14
from calibre.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile, create_class
15
from calibre.ebooks.lrf.web import builtin_profiles, available_profiles
def option_parser():
    '''Build the command line option parser for web2lrf.

    Returns an OptionParser pre-populated with the LRF conversion options
    (from lrf_option_parser) plus the web-download specific options below.
    NOTE(review): the ``def`` wrapper and ``return parser`` were lost in the
    mangled source (line-number gaps at 17-18 and 51-52); both main() and
    process_profile() call option_parser(), so they are restored here.
    '''
    parser = lrf_option_parser(usage='''%prog [options] website_profile\n\n'''
                          '''%prog downloads a site from the web and converts it '''
                          '''into a LRF file for use with the SONY Reader. '''
                          '''website_profile is one of '''+str(available_profiles)+\
                          ''' If you specify a website_profile of default or do not specify '''
                          '''it, you must specify the --url option.'''
                          )
    parser.add_option('-u', '--url', dest='url', default=None,
                      help='The URL to download. You only need to specify this if you are not specifying a website_profile.')
    parser.add_option('--user-profile', default=None,
                      help='Path to a python file containing a user created profile. For help visit http://%s.kovidgoyal.net/wiki/UserProfiles'%__appname__)
    parser.add_option('--username', dest='username', default=None,
                      help='Specify the username to be used while downloading. Only used if the profile supports it.')
    parser.add_option('--password', dest='password', default=None,
                      help='Specify the password to be used while downloading. Only used if the profile supports it.')
    parser.add_option('--timeout', help='Timeout in seconds to wait for a response from the server. Default: %d s'%DefaultProfile.timeout,
                      default=None, type='int', dest='timeout')
    # Bug fix: the three help strings below interpolated DefaultProfile.timeout
    # (copy-paste slip); they now report the default that matches their dest.
    parser.add_option('-r', '--max-recursions', help='Maximum number of levels to recurse i.e. depth of links to follow. Default %d'%DefaultProfile.max_recursions,
                      default=None, type='int', dest='max_recursions')
    parser.add_option('-n', '--max-files', default=None, type='int', dest='max_files',
                      help='The maximum number of files to download. This only applies to files from <a href> tags. Default is %d'%DefaultProfile.max_files)
    parser.add_option('--delay', default=None, dest='delay', type='int',
                      help='Minimum interval in seconds between consecutive fetches. Default is %d s'%DefaultProfile.delay)
    parser.add_option('--dont-download-stylesheets', action='store_true', default=None,
                      help='Do not download CSS stylesheets.', dest='no_stylesheets')
    parser.add_option('--match-regexp', dest='match_regexps', default=[], action='append',
                      help='Only links that match this regular expression will be followed. This option can be specified multiple times, in which case as long as a link matches any one regexp, it will be followed. By default all links are followed.')
    parser.add_option('--filter-regexp', default=[], action='append', dest='filter_regexps',
                      help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored.By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
    parser.add_option('--keep-downloaded-files', default=False, action='store_true',
                      help='''Do not delete the downloaded files after creating the LRF''')
    return parser
def fetch_website(options, logger):
    '''Download options.url into a fresh temporary directory.

    Returns a 2-tuple of (path to the downloaded root HTML file, path to the
    temporary download directory). The caller owns the directory and must
    remove it when done.
    '''
    tdir = tempfile.mkdtemp(prefix=__appname__+'_', suffix='_web2lrf')
    # Restored: original line 55 was lost in the mangled source. Without it
    # the fetcher never learns where to write and tdir leaks unused.
    # NOTE(review): assumes create_fetcher reads options.dir — confirm.
    options.dir = tdir
    fetcher = create_fetcher(options, logger)
    fetcher.preprocess_regexps = options.preprocess_regexps
    return fetcher.start_fetch(options.url), tdir
def create_lrf(htmlfile, options, logger):
    '''Convert the downloaded htmlfile into an LRF/LRS file.

    Fills in a default author, resolves options.output to an absolute path
    (deriving it from options.title when not given) and hands off to the
    HTML->LRF converter.
    '''
    if not options.author or options.author.lower() == 'unknown':
        options.author = __appname__
    # Restored if/else: the mangled source lost lines 64 and 66; without the
    # branch the title-derived path would unconditionally clobber a
    # user-supplied --output.
    if options.output:
        options.output = os.path.abspath(os.path.expanduser(options.output))
    else:
        options.output = os.path.abspath(os.path.expanduser(options.title + ('.lrs' if options.lrs else '.lrf')))
    process_file(htmlfile, options, logger)
71
def process_profile(args, options, logger=None):
75
level = logging.DEBUG if options.verbose else logging.INFO
76
logger = logging.getLogger('web2lrf')
77
setup_cli_handlers(logger, level)
80
if len(args) == 2 and re.search(r'class\s+\S+\(\S+\)\s*\:', args[1]):
81
profile = create_class(args[1])
83
if options.user_profile is not None:
84
path = os.path.abspath(options.user_profile)
85
name = os.path.splitext(os.path.basename(path))[0]
86
res = imp.find_module(name, [os.path.dirname(path)])
87
module = imp.load_module(name, *res)
88
classes = inspect.getmembers(module,
89
lambda x : inspect.isclass(x) and issubclass(x, DefaultProfile)\
90
and x is not DefaultProfile and x is not FullContentProfile)
92
raise CommandLineError('Invalid user profile '+path)
93
builtin_profiles.append(classes[0][1])
94
available_profiles.append(name)
101
if isinstance(args[1], basestring):
102
if args[1] != 'default':
103
index = available_profiles.index(args[1])
105
raise CommandLineError('Unknown profile: %s\nValid profiles: %s'%(args[1], available_profiles))
107
raise CommandLineError('Only one profile at a time is allowed.')
108
profile = DefaultProfile if index == -1 else builtin_profiles[index]
112
profile = profile(logger, options.verbose, options.username, options.password)
113
if profile.browser is not None:
114
options.browser = profile.browser
116
for opt in ('url', 'timeout', 'max_recursions', 'max_files', 'delay', 'no_stylesheets'):
117
val = getattr(options, opt)
119
setattr(options, opt, getattr(profile, opt))
122
options.url = profile.url
125
raise CommandLineError('You must specify the --url option or a profile from one of: %s'%(available_profiles,))
127
if not options.title:
128
title = profile.title
130
title = urlsplit(options.url).netloc
131
options.title = title + strftime(profile.timefmt)
133
options.match_regexps += profile.match_regexps
134
options.preprocess_regexps = profile.preprocess_regexps
135
options.filter_regexps += profile.filter_regexps
137
options.encoding = profile.encoding if options.encoding is None else options.encoding
139
if len(args) == 2 and args[1] != 'default':
140
options.anchor_ids = False
142
htmlfile, tdir = fetch_website(options, logger)
143
options.encoding = 'utf-8'
145
if not options.output:
146
title = options.title.encode(sys.getfilesystemencoding()) if isinstance(options.title, unicode) else options.title
147
options.output = os.path.join(cwd, options.title+('.lrs' if options.lrs else '.lrf'))
148
if not os.path.isabs(options.output):
149
options.output = os.path.join(cwd, options.output)
151
option_parser().parse_args(profile.html2lrf_options, options)
154
os.chdir(os.path.dirname(htmlfile))
155
create_lrf(os.path.basename(htmlfile), options, logger)
163
if tdir and os.path.isdir(tdir):
164
if options.keep_downloaded_files:
165
print 'Downloaded files in ', tdir
170
def main(args=sys.argv, logger=None):
171
parser = option_parser()
172
options, args = parser.parse_args(args)
173
if len(args) > 2 or (len(args) == 1 and not options.user_profile):
177
process_profile(args, options, logger=logger)
178
except CommandLineError, err:
179
print >>sys.stderr, err
182
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())