1
# -*- coding: utf-8 -*-
4
Read content from an eReader PDB file with a 202-byte header created by Makebook.
7
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
8
__docformat__ = 'restructuredtext en'
13
from calibre import CurrentDir
14
from calibre.ebooks.metadata.opf2 import OPFCreator
15
from calibre.ebooks.pml.pmlconverter import pml_to_html
16
from calibre.ebooks.compression.palmdoc import decompress_doc
17
from calibre.ebooks.pdb.formatreader import FormatReader
18
from calibre.ebooks.pdb.ereader import EreaderError
20
class HeaderRecord(object):
    '''
    The first record in the file is always the header record. It holds
    information related to the location of text, images, and so on
    in the file. This is used in conjunction with the sections
    defined in the file header.

    Attributes read from the raw record:

    * ``version`` -- big-endian unsigned 16-bit value at bytes 0-1.
    * ``non_text_offset`` -- big-endian unsigned 16-bit value at bytes 8-9.
    * ``num_text_pages`` -- ``non_text_offset - 1``.
    '''

    def __init__(self, raw):
        '''
        :param raw: Byte string holding the header record; at least 10 bytes.
        :raises struct.error: If ``raw`` is too short for either field.
        '''
        # unpack_from reads in place instead of slicing a copy first.
        self.version, = struct.unpack_from('>H', raw, 0)
        self.non_text_offset, = struct.unpack_from('>H', raw, 8)

        # One less than the offset of the first non-text record.
        self.num_text_pages = self.non_text_offset - 1
35
class Reader202(FormatReader):
37
def __init__(self, header, stream, log, encoding=None):
    '''
    Load every section of the file and validate the header record.

    :param header: PDB file header; must provide ``num_sections`` and
                   ``section_data(i)``.
    :param stream: Stream the book is read from; passed to ``get_metadata``.
    :param log: Logger; stored as ``self.log``.
    :param encoding: Text encoding override stored as ``self.encoding``;
                     ``None`` makes ``decompress_text`` use cp1252.
    :raises EreaderError: If the header record version is not 4.
    '''
    # Both attributes must exist before the debug call and the section
    # loading below use them.
    self.log = log
    self.encoding = encoding

    self.log.debug('202 byte header version found.')

    self.sections = [header.section_data(i) for i in range(header.num_sections)]

    # Section 0 is the header record describing the rest of the file.
    self.header_record = HeaderRecord(self.section_data(0))

    if self.header_record.version != 4:
        raise EreaderError('Unknown book version %i.' % self.header_record.version)

    # Kept function-local as in the original; presumably this avoids an
    # import cycle -- confirm before moving it to module level.
    from calibre.ebooks.metadata.pdb import get_metadata
    self.mi = get_metadata(stream, False)
55
def section_data(self, number):
    '''
    Return the raw data of section ``number``.

    :param number: Index into ``self.sections``.
    :raises IndexError: If ``number`` is outside the loaded sections.
    '''
    return self.sections[number]
58
def decompress_text(self, number):
    '''
    Return section ``number`` as decoded text.

    Every byte of the section is XORed with 0xA5, the result is passed
    through ``decompress_doc`` and then decoded with ``self.encoding``
    (cp1252 when it is None). Undecodable bytes are replaced.

    :param number: Index of the section to decode.
    '''
    # bytearray yields ints on both Python 2 and 3. Iterating a byte
    # string and calling chr(ord(x)) raises TypeError on Python 3.
    masked = bytearray(self.section_data(number))
    unmasked = bytes(bytearray(b ^ 0xA5 for b in masked))

    codec = 'cp1252' if self.encoding is None else self.encoding
    return decompress_doc(unmasked).decode(codec, 'replace')
61
def get_image(self, number):
    '''
    Return ``(name, data)`` for the image stored in section ``number``.

    Both values are None when the section does not start with the
    ``PNG`` marker, so callers can always unpack the result.

    :param number: Index of the section to inspect.
    '''
    name = None
    img = None

    data = self.section_data(number)
    # Byte literals are plain str on Python 2 and are required to match
    # byte strings on Python 3.
    if data.startswith(b'PNG'):
        # The NUL-padded name occupies the 32 bytes after the 4-byte marker.
        name = data[4:4 + 32].strip(b'\x00')
        # NOTE(review): the payload is assumed to start at byte 62 of the
        # record, per the eReader image record layout -- confirm against
        # the format description.
        img = data[62:]

    return name, img
72
def get_text_page(self, number):
    '''
    Only palmdoc compression is supported. The text is xored with 0xA5 and
    assumed to be encoded as Windows-1252. The encoding is part of
    the eReader file spec and should always be this encoding.

    :param number: 1-based text page number.
    :return: The decoded page text, or an empty string when ``number``
             is outside ``1..num_text_pages``.
    '''
    # A chained comparison avoids building or scanning a range just to
    # test membership.
    if not 1 <= number <= self.header_record.num_text_pages:
        return ''

    return self.decompress_text(number)
83
def extract_content(self, output_dir):
    '''
    Write the book into ``output_dir`` as ``index.html``, an ``images/``
    directory and ``metadata.opf``.

    :param output_dir: Destination directory; created when missing.
    :return: The path returned by ``create_opf``.
    '''
    output_dir = os.path.abspath(output_dir)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Collect the fragments and join once instead of growing a string
    # page by page.
    parts = [u'<html><head><title></title></head><body>']
    for i in range(1, self.header_record.num_text_pages + 1):
        self.log.debug('Extracting text page %i' % i)
        parts.append(pml_to_html(self.get_text_page(i)))
    parts.append(u'</body></html>')
    html = u''.join(parts)

    with CurrentDir(output_dir):
        with open('index.html', 'wb') as index:
            self.log.debug('Writing text to index.html')
            index.write(html.encode('utf-8'))

    images_dir = os.path.join(output_dir, 'images/')
    if not os.path.exists(images_dir):
        os.makedirs(images_dir)

    # Names of the images written, handed to create_opf for the manifest.
    images = []
    with CurrentDir(images_dir):
        # Every section from non_text_offset onwards is tried as an image.
        for i in range(self.header_record.non_text_offset, len(self.sections)):
            name, img = self.get_image(i)
            if not name:
                # Nothing usable in this section; skip it.
                continue
            images.append(name)
            with open(name, 'wb') as imgf:
                self.log.debug('Writing image %s to images/' % name)
                imgf.write(img)

    opf_path = self.create_opf(output_dir, images)

    return opf_path
119
def create_opf(self, output_dir, images):
    '''
    Write ``metadata.opf`` into ``output_dir``.

    The manifest lists ``index.html`` plus one ``images/<name>`` entry per
    item of ``images``; the spine contains only ``index.html``.

    :param output_dir: Directory the OPF is written to.
    :param images: Iterable of image file names inside ``images/``.
    :return: Path of the written ``metadata.opf``.
    '''
    with CurrentDir(output_dir):
        opf = OPFCreator(output_dir, self.mi)

        manifest = [('index.html', None)]
        manifest.extend((os.path.join('images/', i), None) for i in images)

        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])
        with open('metadata.opf', 'wb') as opffile:
            # NOTE(review): OPFCreator.render(stream) is assumed to be the
            # call that serialises the OPF -- confirm against
            # calibre.ebooks.metadata.opf2.
            opf.render(opffile)

    return os.path.join(output_dir, 'metadata.opf')
137
def dump_pml(self):
    '''
    This is primarily used for debugging and 3rd party tools to
    get the PML markup that comprises the text in the file.

    :return: The text of pages ``1..num_text_pages`` concatenated in
             order; an empty string when there are no text pages.
    '''
    last_page = self.header_record.num_text_pages
    return ''.join(self.get_text_page(i) for i in range(1, last_page + 1))
147
def dump_images(self, output_dir):
    '''
    This is primarily used for debugging and 3rd party tools to
    get the images in the file.

    :param output_dir: Directory the images are written to; created
                       when missing. Each image is stored under the
                       name returned by ``get_image``.
    '''
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    with CurrentDir(output_dir):
        # The header record of this format only provides non_text_offset;
        # it never sets num_image_pages or image_data_offset, so reading
        # them raised AttributeError. Walk the same section range that
        # extract_content uses instead.
        for i in range(self.header_record.non_text_offset, len(self.sections)):
            name, img = self.get_image(i)
            if not name:
                # Nothing usable in this section; skip it.
                continue
            with open(name, 'wb') as imgf:
                imgf.write(img)