~kovid/calibre/trunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
# -*- coding: utf-8 -*-

__license__   = 'GPL v3'
__copyright__ = '2011, John Schember <john at nachtimwald.com>'
__docformat__ = 'restructuredtext en'

'''
Generates and writes an APNX page mapping file.
'''

import struct

from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.mobi.reader.headers import MetadataHeader
from calibre.utils.logging import default_log
from calibre import prints
from calibre.constants import DEBUG

class APNXBuilder(object):
    '''
    Create an APNX file using a pseudo page mapping.
    '''

    def write_apnx(self, mobi_file_path, apnx_path, accurate=True, page_count=0):
        '''
        If you want a fixed number of pages (such as from a custom column) then
        pass in a value to page_count, otherwise a count will be estimated
        using either the fast or accurate algorithm.
        '''
        import uuid
        apnx_meta = { 'guid': str(uuid.uuid4()).replace('-', '')[:8], 'asin':
                '', 'cdetype': 'EBOK', 'format': 'MOBI_7', 'acr': '' }

        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
            if ident != 'BOOKMOBI':
                # Check that this is really a MOBI file.
                raise Exception(_('Not a valid MOBI file. Reports identity of %s') % ident)
            apnx_meta['acr'] = str(PdbHeaderReader(mf).name())

        # We'll need the PDB name, the MOBI version, and some metadata to make FW 3.4 happy with KF8 files...
        with open(mobi_file_path, 'rb') as mf:
            mh = MetadataHeader(mf, default_log)
            if mh.mobi_version == 8:
                apnx_meta['format'] = 'MOBI_8'
            else:
                apnx_meta['format'] = 'MOBI_7'
            if mh.exth is None or not mh.exth.cdetype:
                apnx_meta['cdetype'] = 'EBOK'
            else:
                apnx_meta['cdetype'] = str(mh.exth.cdetype)
            if mh.exth is None or not mh.exth.uuid:
                apnx_meta['asin'] = ''
            else:
                apnx_meta['asin'] = str(mh.exth.uuid)

        # Get the pages depending on the chosen parser
        pages = []
        if page_count:
            pages = self.get_pages_exact(mobi_file_path, page_count)
        else:
            if accurate:
                try:
                    pages = self.get_pages_accurate(mobi_file_path)
                except:
                    # Fall back to the fast parser if we can't
                    # use the accurate one. Typically this is
                    # due to the file having DRM.
                    pages = self.get_pages_fast(mobi_file_path)
            else:
                pages = self.get_pages_fast(mobi_file_path)

        if not pages:
            raise Exception(_('Could not generate page mapping.'))

        # Generate the APNX file from the page mapping.
        apnx = self.generate_apnx(pages, apnx_meta)

        # Write the APNX.
        with open(apnx_path, 'wb') as apnxf:
            apnxf.write(apnx)

    def generate_apnx(self, pages, apnx_meta):
        apnx = ''

        if DEBUG:
            prints('APNX META: guid:', apnx_meta['guid'])
            prints('APNX META: ASIN:', apnx_meta['asin'])
            prints('APNX META: CDE:', apnx_meta['cdetype'])
            prints('APNX META: format:', apnx_meta['format'])
            prints('APNX META: Name:', apnx_meta['acr'])

        # Updated header if we have a KF8 file...
        if apnx_meta['format'] == 'MOBI_8':
            content_header = '{"contentGuid":"%(guid)s","asin":"%(asin)s","cdeType":"%(cdetype)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' % apnx_meta
        else:
            # My 5.1.x Touch & 3.4 K3 seem to handle the 'extended' header fine for legacy mobi files, too. But, since they still handle this one too, let's try not to break old devices, and keep using the simple header ;).
            content_header = '{"contentGuid":"%(guid)s","asin":"%(asin)s","cdeType":"%(cdetype)s","fileRevisionId":"1"}' % apnx_meta
        page_header = '{"asin":"%(asin)s","pageMap":"(1,a,1)"}' % apnx_meta

        if DEBUG:
            prints('APNX Content Header:', content_header)

        apnx += struct.pack('>I', 65537)
        apnx += struct.pack('>I', 12 + len(content_header))
        apnx += struct.pack('>I', len(content_header))
        apnx += content_header
        apnx += struct.pack('>H', 1)
        apnx += struct.pack('>H', len(page_header))
        apnx += struct.pack('>H', len(pages))
        apnx += struct.pack('>H', 32)
        apnx += page_header

        # Write page values to APNX.
        for page in pages:
            apnx += struct.pack('>I', page)

        return apnx

    def get_pages_exact(self, mobi_file_path, page_count):
        '''
        Given a specified page count (such as from a custom column),
        create our array of pages for the apnx file by dividing by
        the content size of the book.
        '''
        pages = []
        count = 0

        with open(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        chars_per_page = int(text_length / page_count)
        while count < text_length:
            pages.append(count)
            count += chars_per_page

        if len(pages) > page_count:
            # Rounding created extra page entries
            pages = pages[:page_count]

        return pages

    def get_pages_fast(self, mobi_file_path):
        '''
        2300 characters of uncompressed text per page. This is
        not meant to map 1 to 1 to a print book but to be a
        close enough measure.

        A test book was chosen and the characters were counted
        on one page. This number was round to 2240 then 60
        characters of markup were added to the total giving
        2300.

        Uncompressed text length is used because it's easily
        accessible in MOBI files (part of the header). Also,
        It's faster to work off of the length then to
        decompress and parse the actual text.
        '''
        text_length = 0
        pages = []
        count = 0

        with open(mobi_file_path, 'rb') as mf:
            phead = PdbHeaderReader(mf)
            r0 = phead.section_data(0)
            text_length = struct.unpack('>I', r0[4:8])[0]

        while count < text_length:
            pages.append(count)
            count += 2300

        return pages

    def get_pages_accurate(self, mobi_file_path):
        '''
        A more accurate but much more resource intensive and slower
        method to calculate the page length.

        Parses the uncompressed text. In an average paper back book
        There are 32 lines per page and a maximum of 70 characters
        per line.

        Each paragraph starts a new line and every 70 characters
        (minus markup) in a paragraph starts a new line. The
        position after every 30 lines will be marked as a new
        page.

        This can be make more accurate by accounting for
        <div class="mbp_pagebreak" /> as a new page marker.
        And <br> elements as an empty line.
        '''
        pages = []

        # Get the MOBI html.
        mr = MobiReader(mobi_file_path, default_log)
        if mr.book_header.encryption_type != 0:
            # DRMed book
            return self.get_pages_fast(mobi_file_path)
        mr.extract_text()

        # States
        in_tag = False
        in_p = False
        check_p = False
        closing = False
        p_char_count = 0

        # Get positions of every line
        # A line is either a paragraph starting
        # or every 70 characters in a paragraph.
        lines = []
        pos = -1
        # We want this to be as fast as possible so we
        # are going to do one pass across the text. re
        # and string functions will parse the text each
        # time they are called.
        #
        # We can can use .lower() here because we are
        # not modifying the text. In this case the case
        # doesn't matter just the absolute character and
        # the position within the stream.
        for c in mr.mobi_html.lower():
            pos += 1

            # Check if we are starting or stopping a p tag.
            if check_p:
                if c == '/':
                    closing = True
                    continue
                elif c == 'p':
                    if closing:
                        in_p = False
                    else:
                        in_p = True
                        lines.append(pos - 2)
                check_p = False
                closing = False
                continue

            if c == '<':
                in_tag = True
                check_p = True
                continue
            elif c == '>':
                in_tag = False
                check_p = False
                continue

            if in_p and not in_tag:
                p_char_count += 1
                if p_char_count == 70:
                    lines.append(pos)
                    p_char_count = 0

        # Every 30 lines is a new page
        for i in xrange(0, len(lines), 32):
            pages.append(lines[i])

        return pages