~ubuntu-branches/debian/sid/calibre/sid

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import (unicode_literals, division, absolute_import,
                        print_function)

__license__   = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import struct
from collections import OrderedDict, namedtuple

from calibre.ebooks.mobi.utils import (decint, count_set_bits,
        decode_string)

TagX = namedtuple('TagX', 'tag num_of_values bitmask eof')
PTagX = namedtuple('PTagX', 'tag value_count value_bytes num_of_values')
INDEX_HEADER_FIELDS = (
            'len', 'nul1', 'type', 'gen', 'start', 'count', 'code',
            'lng', 'total', 'ordt', 'ligt', 'nligt', 'ncncx'
    ) + tuple('unknown%d'%i for i in xrange(27)) + ('ocnt', 'oentries',
            'ordt1', 'ordt2', 'tagx')


class InvalidFile(ValueError):
    pass

def check_signature(data, signature):
    if data[:len(signature)] != signature:
        raise InvalidFile('Not a valid %r section'%signature)

class NotAnINDXRecord(InvalidFile):
    pass

class NotATAGXSection(InvalidFile):
    pass

def format_bytes(byts):
    byts = bytearray(byts)
    byts = [hex(b)[2:] for b in byts]
    return ' '.join(byts)

def parse_indx_header(data):
    check_signature(data, b'INDX')
    words = INDEX_HEADER_FIELDS
    num = len(words)
    values = struct.unpack(bytes('>%dL' % num), data[4:4*(num+1)])
    ans = dict(zip(words, values))
    ordt1, ordt2 = ans['ordt1'], ans['ordt2']
    ans['ordt1_raw'], ans['ordt2_raw'] = [], []
    ans['ordt_map'] = ''

    if ordt1 > 0 and data[ordt1:ordt1+4] == b'ORDT':
        # I dont know what this is, but using it seems to be unnecessary, so
        # just leave it as the raw bytestring
        ans['ordt1_raw'] = data[ordt1+4:ordt1+4+ans['oentries']]
    if ordt2 > 0 and data[ordt2:ordt2+4] == b'ORDT':
        ans['ordt2_raw'] = raw = bytearray(data[ordt2+4:ordt2+4+2*ans['oentries']])
        if ans['code'] == 65002:
            # This appears to be EBCDIC-UTF (65002) encoded. I can't be
            # bothered to write a decoder for this (see
            # http://www.unicode.org/reports/tr16/) Just how stupid is Amazon?
            # Instead, we use a weird hack that seems to do the trick for all
            # the books with this type of ORDT record that I have come across.
            # Some EBSP book samples in KF8 format from Amazon have this type
            # of encoding.
            # Basically we try to interpret every second byte as a printable
            # ascii character. If we cannot, we map to the ? char.

            parsed = bytearray(ans['oentries'])
            for i in xrange(0, 2*ans['oentries'], 2):
                parsed[i//2] = raw[i+1] if 0x20 < raw[i+1] < 0x7f else ord(b'?')
            ans['ordt_map'] = bytes(parsed).decode('ascii')
        else:
            ans['ordt_map'] = '?'*ans['oentries']

    return ans


class CNCX(object):  # {{{

    '''
    Parses the records that contain the compiled NCX (all strings from the
    NCX). Presents a simple offset : string mapping interface to access the
    data.
    '''

    def __init__(self, records, codec):
        self.records = OrderedDict()
        record_offset = 0
        for raw in records:
            pos = 0
            while pos < len(raw):
                length, consumed = decint(raw[pos:])
                if length > 0:
                    try:
                        self.records[pos+record_offset] = raw[
                            pos+consumed:pos+consumed+length].decode(codec)
                    except:
                        byts = raw[pos:]
                        r = format_bytes(byts)
                        print ('CNCX entry at offset %d has unknown format %s'%(
                            pos+record_offset, r))
                        self.records[pos+record_offset] = r
                        pos = len(raw)
                pos += consumed+length
            record_offset += 0x10000

    def __getitem__(self, offset):
        return self.records.get(offset)

    def get(self, offset, default=None):
        return self.records.get(offset, default)

    def __bool__(self):
        return bool(self.records)
    __nonzero__ = __bool__

    def iteritems(self):
        return self.records.iteritems()
# }}}

def parse_tagx_section(data):
    check_signature(data, b'TAGX')

    tags = []
    first_entry_offset, = struct.unpack_from(b'>L', data, 4)
    control_byte_count, = struct.unpack_from(b'>L', data, 8)

    for i in xrange(12, first_entry_offset, 4):
        vals = list(bytearray(data[i:i+4]))
        tags.append(TagX(*vals))
    return control_byte_count, tags

def get_tag_map(control_byte_count, tagx, data, strict=False):
    ptags = []
    ans = {}
    control_bytes = list(bytearray(data[:control_byte_count]))
    data = data[control_byte_count:]

    for x in tagx:
        if x.eof == 0x01:
            control_bytes = control_bytes[1:]
            continue
        value = control_bytes[0] & x.bitmask
        if value != 0:
            value_count = value_bytes = None
            if value == x.bitmask:
                if count_set_bits(x.bitmask) > 1:
                    # If all bits of masked value are set and the mask has more
                    # than one bit, a variable width value will follow after
                    # the control bytes which defines the length of bytes (NOT
                    # the value count!) which will contain the corresponding
                    # variable width values.
                    value_bytes, consumed = decint(data)
                    data = data[consumed:]
                else:
                    value_count = 1
            else:
                # Shift bits to get the masked value.
                mask = x.bitmask
                while mask & 0b1 == 0:
                    mask >>= 1
                    value >>= 1
                value_count = value
            ptags.append(PTagX(x.tag, value_count, value_bytes,
                x.num_of_values))

    for x in ptags:
        values = []
        if x.value_count is not None:
            # Read value_count * values_per_entry variable width values.
            for _ in xrange(x.value_count * x.num_of_values):
                byts, consumed = decint(data)
                data = data[consumed:]
                values.append(byts)
        else:  # value_bytes is not None
            # Convert value_bytes to variable width values.
            total_consumed = 0
            while total_consumed < x.value_bytes:
                # Does this work for values_per_entry != 1?
                byts, consumed = decint(data)
                data = data[consumed:]
                total_consumed += consumed
                values.append(byts)
            if total_consumed != x.value_bytes:
                err = ("Error: Should consume %s bytes, but consumed %s" %
                        (x.value_bytes, total_consumed))
                if strict:
                    raise ValueError(err)
                else:
                    print(err)
        ans[x.tag] = values
    # Test that all bytes have been processed
    if data.replace(b'\0', b''):
        err = ("Warning: There are unprocessed index bytes left: %s" %
                format_bytes(data))
        if strict:
            raise ValueError(err)
        else:
            print(err)

    return ans

def parse_index_record(table, data, control_byte_count, tags, codec,
        ordt_map, strict=False):
    header = parse_indx_header(data)
    idxt_pos = header['start']
    if data[idxt_pos:idxt_pos+4] != b'IDXT':
        print ('WARNING: Invalid INDX record')
    entry_count = header['count']

    # loop through to build up the IDXT position starts
    idx_positions= []
    for j in xrange(entry_count):
        pos, = struct.unpack_from(b'>H', data, idxt_pos + 4 + (2 * j))
        idx_positions.append(pos)
    # The last entry ends before the IDXT tag (but there might be zero fill
    # bytes we need to ignore!)
    idx_positions.append(idxt_pos)

    # For each entry in the IDXT build up the tag map and any associated
    # text
    for j in xrange(entry_count):
        start, end = idx_positions[j:j+2]
        rec = data[start:end]
        # Sometimes (in the guide table if the type attribute has non ascii
        # values) the ident is UTF-16 encoded. Try to handle that.
        try:
            ident, consumed = decode_string(rec, codec=codec, ordt_map=ordt_map)
        except UnicodeDecodeError:
            ident, consumed = decode_string(rec, codec='utf-16', ordt_map=ordt_map)
        if u'\x00' in ident:
            try:
                ident, consumed = decode_string(rec, codec='utf-16',
                        ordt_map=ordt_map)
            except UnicodeDecodeError:
                ident = ident.replace('u\x00', u'')
        rec = rec[consumed:]
        tag_map = get_tag_map(control_byte_count, tags, rec, strict=strict)
        table[ident] = tag_map
    return header

def read_index(sections, idx, codec):
    table, cncx = OrderedDict(), CNCX([], codec)

    data = sections[idx][0]

    indx_header = parse_indx_header(data)
    indx_count = indx_header['count']

    if indx_header['ncncx'] > 0:
        off = idx + indx_count + 1
        cncx_records = [x[0] for x in sections[off:off+indx_header['ncncx']]]
        cncx = CNCX(cncx_records, codec)

    tag_section_start = indx_header['tagx']
    control_byte_count, tags = parse_tagx_section(data[tag_section_start:])

    for i in xrange(idx + 1, idx + 1 + indx_count):
        # Index record
        data = sections[i][0]
        parse_index_record(table, data, control_byte_count, tags, codec,
                indx_header['ordt_map'])
    return table, cncx