#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re
import time
from threading import Thread
from Queue import Queue, Empty

from calibre import as_unicode, random_user_agent
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source

def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    # Normalize the encoding and strip control characters before handing
    # the markup to html5lib, which builds an lxml tree
    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder='lxml',
                          namespaceHTMLElements=False).getroot()

def CSSSelect(expr):
    # Compile a CSS selector into a reusable lxml XPath object
    from cssselect import HTMLTranslator
    from lxml.etree import XPath
    return XPath(HTMLTranslator().css_to_xpath(expr))

def astext(node):
    # All visible text under node, excluding the tail text after it
    from lxml import etree
    return etree.tostring(node, method='text', encoding=unicode,
                          with_tail=False).strip()
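
# A minimal sketch of how the three helpers above compose when scraping a
# page (the sample markup is hypothetical, not actual Edelweiss HTML):
#
#   root = parse_html(b'<div class="sku attGroup">9780316044981</div>')
#   for div in CSSSelect('div.sku.attGroup')(root):
#       print(astext(div))  # -> 9780316044981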

class Worker(Thread):  # {{{

    def __init__(self, sku, url, relevance, result_queue, br, timeout, log, plugin):
        Thread.__init__(self)
        self.daemon = True
        self.url, self.br, self.log, self.timeout = url, br, log, timeout
        self.result_queue, self.plugin, self.sku = result_queue, plugin, sku
        self.relevance = relevance

    def run(self):
        try:
            raw = self.br.open_novisit(self.url, timeout=self.timeout).read()
        except:
            self.log.exception('Failed to load details page: %r'%self.url)
            return

        try:
            mi = self.parse(raw)
            mi.source_relevance = self.relevance
            self.plugin.clean_downloaded_metadata(mi)
            self.result_queue.put(mi)
        except:
            self.log.exception('Failed to parse details page: %r'%self.url)

    def parse(self, raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_only_date, UNDEFINED_DATE
        root = parse_html(raw)
        sku = CSSSelect('div.sku.attGroup')(root)[0]
        info = sku.getparent()
        top = info.getparent().getparent()
        banner = top.find('div')
        spans = banner.findall('span')

        # Title: the first span, plus any following spans styled at 12pt
        # (two part titles are split across styled spans)
        title = ''
        for i, span in enumerate(spans):
            if i == 0 or '12pt' in span.get('style', ''):
                title += astext(span)
            else:
                break

        # Authors: the last span, with parenthesized role annotations removed
        authors = [re.sub(r'\(.*\)', '', x).strip() for x in astext(spans[-1]).split(',')]
        mi = Metadata(title.strip(), authors)

        # Identifiers
        isbns = [check_isbn(x.strip()) for x in astext(sku).split(',')]
        for isbn in isbns:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        # Prefer the longest ISBN, i.e. ISBN-13 over ISBN-10
        isbns = sorted(isbns, key=lambda x:len(x) if x else 0, reverse=True)
        if isbns and isbns[0]:
            mi.isbn = isbns[0]
        mi.set_identifier('edelweiss', self.sku)

        # Tags, from the BISAC subject list
        bisac = CSSSelect('div.bisac.attGroup')(root)
        if bisac:
            bisac = astext(bisac[0])
            mi.tags = [x.strip() for x in bisac.split(',')]
            # Strip the leading ampersand from continued subject entries
            mi.tags = [t[1:].strip() if t.startswith('&') else t for t in mi.tags]

        # Publisher
        pub = CSSSelect('div.supplier.attGroup')(root)
        if pub:
            pub = astext(pub[0])
            if pub:
                mi.publisher = pub

        # Published date, from the ship date field
        pub = CSSSelect('div.shipDate.attGroupItem')(root)
        if pub:
            pub = astext(pub[0])
            # The field is typically 'Label: date'; partition(':')[0::2]
            # keeps the text before and after the colon
            parts = pub.partition(':')[0::2]
            pub = parts[1] or parts[0]
            try:
                q = parse_only_date(pub, assume_utc=True)
                if q.year != UNDEFINED_DATE.year:
                    mi.pubdate = q
            except:
                self.log.exception('Error parsing published date: %r'%pub)

        # Comments, assembled from the overview, contributor and quotes sections
        comm = ''
        general = CSSSelect('div#pd-general-overview-content')(root)
        if general:
            q = self.render_comments(general[0])
            if q != '<p>No title summary available. </p>':
                comm += q
        general = CSSSelect('div#pd-general-contributor-content')(root)
        if general:
            comm += self.render_comments(general[0])
        general = CSSSelect('div#pd-general-quotes-content')(root)
        if general:
            comm += self.render_comments(general[0])
        if comm:
            mi.comments = comm

        # Cover: swap the medium jacket image for the larger flyout version
        img = CSSSelect('img.title-image[src]')(root)
        if img:
            href = img[0].get('src').replace('jacket_covers/medium/',
                                             'jacket_covers/flyout/')
            self.plugin.cache_identifier_to_cover_url(self.sku, href)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

        return mi

    def render_comments(self, desc):
        from lxml import etree
        from calibre.library.comments import sanitize_comments_html
        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        # Neutralize links: drop the href and turn the anchor into a span
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = etree.tostring(desc, method='html', encoding=unicode).strip()

        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        #desc = re.sub('\n+', '\n', desc)
        #desc = re.sub(' +', ' ', desc)
        # Remove HTML comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
# }}}
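
# render_comments() reduces the description markup to bare tags; for example
# the attribute-stripping regex turns <p class="story" style="x">text</p>
# into <p>text</p> (illustrative markup, not actual Edelweiss output), after
# which sanitize_comments_html() normalizes the fragment for the library.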

class Edelweiss(Source):

    name = 'Edelweiss'
    description = _('Downloads metadata and covers from Edelweiss - A catalog updated by book publishers')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'identifier:edelweiss'])
    supports_gzip_transfer_encoding = True
    has_html_comments = True

    @property
    def user_agent(self):
        # Pass in an index to random_user_agent() to test with a particular
        # user agent
        return random_user_agent()

    def _get_book_url(self, sku):
        if sku:
            return 'http://edelweiss.abovethetreeline.com/ProductDetailPage.aspx?sku=%s'%sku

    def get_book_url(self, identifiers):  # {{{
        sku = identifiers.get('edelweiss', None)
        if sku:
            return 'edelweiss', sku, self._get_book_url(sku)
    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        sku = identifiers.get('edelweiss', None)
        if not sku:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                sku = self.cached_isbn_to_identifier(isbn)
        return self.cached_identifier_to_cover_url(sku)
    # }}}
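
    # Example: with identifiers={'isbn': '9780316044981'} and no edelweiss
    # id, the ISBN is first mapped to a cached sku, which is then mapped to
    # a cover URL; either lookup may return None if identify() has not run.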

    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
        from urllib import urlencode
        BASE_URL = 'http://edelweiss.abovethetreeline.com/CatalogOverview.aspx?'
        # The CatalogOverview search form may expect additional fixed fields;
        # only the generic per-row condition/keyword fields are filled in here
        params = {}
        for num in (0, 1, 2, 3, 4, 5, 6, 200, 201, 202, 204):
            params['condition%d'%num] = 1
            params['keywords%d'%num] = ''
        title_key, author_key = 'keywords200', 'keywords201'

        isbn = check_isbn(identifiers.get('isbn', None))
        found = False
        if isbn is not None:
            params['isbn'] = isbn
            found = True
        elif title or authors:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                params[title_key] = ' '.join(title_tokens)
                found = True
            author_tokens = list(self.get_author_tokens(authors,
                    only_first_author=True))
            if author_tokens:
                params[author_key] = ' '.join(author_tokens)
                found = True

        if not found:
            return None

        # urlencode() cannot handle unicode in python 2, encode manually
        for k in (title_key, author_key, 'isbn'):
            v = params.get(k)
            if isinstance(v, unicode):
                params[k] = v.encode('utf-8')

        return BASE_URL+urlencode(params)
    # }}}
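
    # For a title/author search the generated URL looks roughly like
    # .../CatalogOverview.aspx?condition0=1&keywords0=&...&keywords200=great+gatsby&keywords201=fitzgerald
    # (field order varies and any fixed form fields are omitted; illustration only)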

    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
            identifiers={}, timeout=30):
        from urlparse import parse_qs

        book_url = self._get_book_url(identifiers.get('edelweiss', None))
        br = self.browser
        if book_url:
            # We already have an edelweiss sku, go straight to the details page
            entries = [(book_url, identifiers['edelweiss'])]
        else:
            entries = []
            query = self.create_query(log, title=title, authors=authors,
                    identifiers=identifiers)
            if not query:
                log.error('Insufficient metadata to construct query')
                return
            try:
                raw = br.open_novisit(query, timeout=timeout).read()
            except Exception as e:
                log.exception('Failed to make identify query: %r'%query)
                return as_unicode(e)

            try:
                root = parse_html(raw)
            except Exception as e:
                log.exception('Failed to parse identify results')
                return as_unicode(e)

            for entry in CSSSelect('div.listRow div.listRowMain')(root):
                a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "ProductDetailPage.aspx")]')
                if not a:
                    continue
                href = a[0].get('href')
                prefix, qs = href.partition('?')[0::2]
                sku = parse_qs(qs).get('sku', None)
                if not prefix.endswith('ProductDetailPage.aspx') or not sku:
                    continue
                sku = sku[-1]  # parse_qs() returns a list of values

                # Cache the ISBN(s) shown for this sku
                div = CSSSelect('div.sku.attGroup')(entry)
                if div:
                    text = astext(div[0])
                    isbns = [check_isbn(x.strip()) for x in text.split(',')]
                    for isbn in isbns:
                        if isbn:
                            self.cache_isbn_to_identifier(isbn, sku)
                # Cache the cover URL, preferring the larger flyout image
                for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
                    self.cache_identifier_to_cover_url(sku, img.get('src').replace('/thumbnail/', '/flyout/'))

                div = CSSSelect('div.format.attGroup')(entry)
                text = astext(div[0]).lower()
                if 'audio' in text or 'mp3' in text:  # Audio-book, ignore
                    continue
                entries.append((self._get_book_url(sku), sku))

        if (not entries and identifiers and title and authors and
                not abort.is_set()):
            # The identifier search found nothing, retry with only
            # title/author
            return self.identify(log, result_queue, abort, title=title,
                    authors=authors, timeout=timeout)

        if not entries:
            return

        workers = [Worker(sku, url, i, result_queue, br.clone_browser(), timeout, log, self)
                   for i, (url, sku) in enumerate(entries[:5])]
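        # At most 5 result pages are scraped concurrently; each Worker fetches
        # and parses one details page and puts a Metadata object on result_queue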
        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break
    # }}}

    def download_cover(self, log, result_queue, abort,  # {{{
            title=None, authors=None, identifiers={}, timeout=30):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
    # }}}

if __name__ == '__main__':
    from calibre.ebooks.metadata.sources.test import (
        test_identify_plugin, title_test, authors_test, comments_test, pubdate_test)
    tests = [
        (  # Multiple authors and two part title and no general description
            {'identifiers':{'edelweiss':'0321180607'}},
            [title_test(
                "XQuery from the Experts: A Guide to the W3C XML Query Language"
                , exact=True), authors_test([
                'Howard Katz', 'Don Chamberlin', 'Denise Draper', 'Mary Fernandez',
                'Michael Kay', 'Jonathan Robie', 'Michael Rys', 'Jerome Simeon',
                'Jim Tivy', 'Philip Wadler']), pubdate_test(2003, 8, 22),
             comments_test('Jérôme Siméon'), lambda mi: bool(mi.comments and 'No title summary' not in mi.comments)
            ]
        ),

        (  # An isbn not present in edelweiss
            {'identifiers':{'isbn': '9780316044981'}, 'title':'The Heroes',
             'authors':['Joe Abercrombie']},
            [title_test('The Heroes', exact=True),
             authors_test(['Joe Abercrombie'])]
        ),

        (
            {'title':'The Great Gatsby', 'authors':['F. Scott Fitzgerald']},
            [title_test('The great gatsby', exact=True),
             authors_test(['F. Scott Fitzgerald']), pubdate_test(2004, 9, 29)]
        ),
    ]
    start, stop = 0, len(tests)

    tests = tests[start:stop]
    test_identify_plugin(Edelweiss.name, tests)
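
    # This harness needs the calibre environment, so it is typically run via
    # calibre-debug, e.g.:
    #   calibre-debug -e edelweiss.py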