1
# -*- coding: utf-8 -*-
4
'''Read content from an eReader PDB file with a 132-byte header created by Dropbook.'''
8
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
9
__docformat__ = 'restructuredtext en'
16
import os
import re
import struct
import zlib

from calibre import CurrentDir
from calibre.ebooks import DRMError
from calibre.ebooks.compression.palmdoc import decompress_doc
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdb.ereader import EreaderError
from calibre.ebooks.pdb.formatreader import FormatReader
from calibre.ebooks.pml.pmlconverter import footnote_sidebar_to_html
from calibre.ebooks.pml.pmlconverter import pml_to_html
25
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.
    '''

    def __init__(self, raw):
        '''
        Parse the header record.

        :param raw: bytes of the header record; it must be at least 54
            bytes long, otherwise ``struct.error`` is raised.
        '''
        def u16(offset):
            # Every field read here is a big-endian unsigned 16-bit integer.
            return struct.unpack_from('>H', raw, offset)[0]

        self.version = u16(0)
        self.non_text_offset = u16(12)
        self.has_metadata = u16(24)
        self.footnote_rec = u16(28)
        self.sidebar_rec = u16(30)
        self.bookmark_offset = u16(32)
        self.image_data_offset = u16(40)
        self.metadata_offset = u16(44)
        self.footnote_offset = u16(48)
        self.sidebar_offset = u16(50)
        self.last_data_offset = u16(52)

        # Derived counts: text pages occupy the records after the header
        # record up to (not including) non_text_offset; image pages span
        # from image_data_offset up to (not including) metadata_offset.
        self.num_text_pages = self.non_text_offset - 1
        self.num_image_pages = self.metadata_offset - self.image_data_offset
50
class Reader132(FormatReader):
52
def __init__(self, header, stream, log, encoding=None):
    '''
    Load every section of the PDB file and validate the eReader header.

    :param header: PDB file header; must provide ``num_sections`` and
        ``section_data(index)``.
    :param stream: open book file, handed to ``get_metadata`` to read the
        book's metadata.
    :param log: logger used for debug output.
    :param encoding: text encoding override; ``None`` makes the text
        decoders fall back to cp1252.
    :raises DRMError: for header versions 260 and 272.
    :raises EreaderError: for any other version that is not 2 or 10.
    '''
    # self.log and self.sections are used below and by the other methods,
    # so they have to exist before the first debug call / section lookup.
    self.log = log
    self.encoding = encoding

    self.log.debug('132 byte header version found.')

    self.sections = [header.section_data(i) for i in range(header.num_sections)]

    # Section 0 is the eReader header record.
    self.header_record = HeaderRecord(self.section_data(0))

    version = self.header_record.version
    if version not in (2, 10):
        if version in (260, 272):
            raise DRMError('eReader DRM is not supported.')
        raise EreaderError('Unknown book version %i.' % version)

    # Imported here rather than at module level, as in the original code.
    from calibre.ebooks.metadata.pdb import get_metadata
    self.mi = get_metadata(stream, False)
73
def section_data(self, number):
    '''
    Return the raw data of section ``number`` as loaded into
    ``self.sections``.

    :raises IndexError: if ``number`` is past the last loaded section.
    '''
    return self.sections[number]
76
def decompress_text(self, number):
    '''
    Decompress and decode the text stored in section ``number``.

    Header version 2 sections are PalmDoc compressed, version 10 sections
    are zlib compressed. The bytes are decoded with ``self.encoding`` (or
    cp1252 when it is ``None``); undecodable bytes are replaced.

    :return: the decoded text, or ``None`` for any other header version.
    '''
    version = self.header_record.version
    if version == 2:
        decompress = decompress_doc
    elif version == 10:
        decompress = zlib.decompress
    else:
        return None

    encoding = 'cp1252' if self.encoding is None else self.encoding
    return decompress(self.section_data(number)).decode(encoding, 'replace')
82
def get_image(self, number):
# Look up the image stored in section ``number``.
#
# Callers in this file unpack the result as ``name, img``.
# NOTE(review): the statement guarded by the range check below, the
# extraction of the image payload and the ``return`` are not present in
# this copy of the method -- restore them from the upstream source
# before relying on it.
83
# Image sections run from header_record.image_data_offset up to and
# including image_data_offset + num_image_pages - 1; any other number
# takes the guarded branch.
if number < self.header_record.image_data_offset or number > self.header_record.image_data_offset + self.header_record.num_image_pages - 1:
85
data = self.section_data(number)
86
# The image name is the 32-byte field starting at offset 4 of the section,
# with NUL characters stripped from both ends.
name = data[4:4 + 32].strip('\x00')
90
def get_text_page(self, number):
# Return the decoded text of text page ``number``.
# NOTE(review): the three prose lines below are this method's docstring
# text; the quote delimiters around them are not present in this copy.
92
Only palmdoc and zlib compressed are supported. The text is
93
assumed to be encoded as Windows-1252. The encoding is part of
94
the eReader file spec and should always be this encoding.
96
# Text pages are numbered 1 through header_record.num_text_pages inclusive.
# NOTE(review): the statement guarded by this bounds check is missing from
# this copy, so the out-of-range behaviour cannot be told from here.
if number not in range(1, self.header_record.num_text_pages + 1):
99
# In-range pages are decompressed and decoded by decompress_text().
return self.decompress_text(number)
101
def extract_content(self, output_dir):
    '''
    Convert the book to HTML and write it, together with its images and
    an OPF file, into ``output_dir``.

    The text pages, followed by the footnote and sidebar pages when the
    header declares any, are written to ``index.html`` encoded as UTF-8.
    Every image is written into the ``images/`` sub directory under the
    name returned by ``get_image``.

    :param output_dir: destination directory; created if it is missing.
    :return: path of the OPF file as returned by ``create_opf``.
    '''
    output_dir = os.path.abspath(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    header = self.header_record
    encoding = 'cp1252' if self.encoding is None else self.encoding

    def notes_to_html(kind, offset, count):
        # The record at ``offset`` lists the note ids, each terminated by a
        # NUL character; the following ``count - 1`` records hold the note
        # bodies in the same order.
        ids = re.findall(r'\w+(?=\x00)', self.section_data(offset).decode(encoding))
        chunks = []
        for idx, rec in enumerate(range(offset + 1, offset + count)):
            self.log.debug('Extracting %s page %i' % (kind, rec))
            chunks.append(footnote_sidebar_to_html(ids[idx], self.decompress_text(rec)))
        return chunks

    # Collect the pieces in a list and join once instead of growing a
    # string with += for every page.
    parts = [u'<html><head><title></title></head><body>']

    for i in range(1, header.num_text_pages + 1):
        self.log.debug('Extracting text page %i' % i)
        parts.append(pml_to_html(self.get_text_page(i)))

    if header.footnote_rec > 0:
        parts.append('<br /><h1>%s</h1>' % _('Footnotes'))
        parts.extend(notes_to_html('footnote', header.footnote_offset, header.footnote_rec))

    if header.sidebar_rec > 0:
        parts.append('<br /><h1>%s</h1>' % _('Sidebar'))
        parts.extend(notes_to_html('sidebar', header.sidebar_offset, header.sidebar_rec))

    parts.append('</body></html>')
    html = u''.join(parts)

    with CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    # Names of the written images, needed for the OPF manifest.
    images = []
    with CurrentDir(images_dir):
        for i in range(0, header.num_image_pages):
            name, img = self.get_image(header.image_data_offset + i)
            images.append(name)
            # NOTE(review): ``name`` comes straight from the book file and
            # is used as a file name unchanged -- confirm it cannot contain
            # path separators.
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/' % name)
                imgf.write(img)

    opf_path = self.create_opf(output_dir, images)

    return opf_path
153
def create_opf(self, output_dir, images):
    '''
    Write ``metadata.opf`` into ``output_dir``.

    The manifest lists ``index.html`` plus every entry of ``images``
    below ``images/``; the spine contains only ``index.html``. The book
    metadata comes from ``self.mi``.

    :param output_dir: directory the OPF file is written to.
    :param images: file names of the images inside ``images/``.
    :return: path of the written ``metadata.opf``.
    '''
    with CurrentDir(output_dir):
        opf = OPFCreator(output_dir, self.mi)

        manifest = [('index.html', None)]
        manifest.extend((os.path.join('images/', i), None) for i in images)

        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

    return os.path.join(output_dir, 'metadata.opf')
171
This is primarily used for debugging and 3rd party tools to
172
get the PML markup that comprises the text in the file.
176
for i in range(1, self.header_record.num_text_pages + 1):
177
pml += self.get_text_page(i)
181
def dump_images(self, output_dir):
183
This is primarily used for debugging and 3rd party tools to
184
get the images in the file.
186
if not os.path.exists(output_dir):
187
os.makedirs(output_dir)
189
with CurrentDir(output_dir):
190
for i in range(0, self.header_record.num_image_pages):
191
name, img = self.get_image(self.header_record.image_data_offset + i)
192
with open(name, 'wb') as imgf: