42
42
doc = html.fromstring(f.read())
44
44
# Amazon has two results pages.
45
is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')
46
# Horizontal grid of books.
48
data_xpath = '//div[contains(@class, "result")]'
49
cover_xpath = './/div[@class="productTitle"]//img/@src'
50
# Vertical list of books.
52
data_xpath = '//div[contains(@class, "product")]'
53
cover_xpath = './div[@class="productImage"]/a/img/@src'
45
# 20110725: seems that is_shot is gone.
46
# is_shot = doc.xpath('boolean(//div[@id="shotgunMainResults"])')
47
# # Horizontal grid of books.
49
# data_xpath = '//div[contains(@class, "result")]'
50
# format_xpath = './/div[@class="productTitle"]/text()'
51
# cover_xpath = './/div[@class="productTitle"]//img/@src'
52
# # Vertical list of books.
54
data_xpath = '//div[contains(@class, "result") and contains(@class, "product")]'
55
format_xpath = './/span[@class="format"]/text()'
56
cover_xpath = './/img[@class="productImage"]/@src'
55
59
for data in doc.xpath(data_xpath):
63
# Even though we are searching digital-text only Amazon will still
64
# put in results for non Kindle books (author pages). So we need
65
# to explicitly check if the item is a Kindle book and ignore it
67
format = ''.join(data.xpath(format_xpath))
68
if 'kindle' not in format.lower():
59
71
# We must have an asin otherwise we can't easily reference the
61
asin = ''.join(data.xpath('./@name'))
73
asin = ''.join(data.xpath("@name"))
64
75
cover_url = ''.join(data.xpath(cover_xpath))
66
title = ''.join(data.xpath('.//div[@class="productTitle"]/a/text()'))
77
title = ''.join(data.xpath('.//div[@class="title"]/a/text()'))
67
78
price = ''.join(data.xpath('.//div[@class="newPrice"]/span/text()'))
81
# author = format.split(' von ')[-1]
83
author = ''.join(data.xpath('.//div[@class="title"]/span[@class="ptBrand"]/text()'))
84
author = author.split('by ')[-1]
72
89
s.cover_url = cover_url.strip()
73
90
s.title = title.strip()
91
s.author = author.strip()
74
92
s.price = price.strip()
75
93
s.detail_item = asin.strip()
79
# Amazon UK does not include the author on the grid layout
81
self.get_details(s, timeout)
82
if s.formats != 'Kindle':
85
author = ''.join(data.xpath('.//div[@class="productTitle"]/span[@class="ptBrand"]/text()'))
86
s.author = author.split(' by ')[-1].strip()