~cedric-lebrouster/ocb-server/ocb-7.0-bug-1322191-db_maxconn

3216.1.1 by P. Christeas
pyPdf: upgrade from upstream git: 4abdca42a7d8a4
1
import re
2
import datetime
3
import decimal
4
from generic import PdfObject
5
from xml.dom import getDOMImplementation
6
from xml.dom.minidom import parseString
7
8
# XML namespace URIs used when looking up metadata nodes in the RDF tree.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
13
14
# The PDFX namespace is a completely undocumented namespace used to place
# "custom metadata" properties: arbitrary metadata properties with no
# semantic or documented meaning.  Elements in the namespace are
# key/value-style storage, where the element name is the key and the content
# is the value.  Keys are transformed into valid XML identifiers by replacing
# each invalid identifier character with \u2182 followed by the unicode hex
# ID of the original character, so a key like "my car" becomes
# "my\u21820020car".
#
# \u2182 is the unicode character \u{ROMAN NUMERAL TEN THOUSAND}.
#
# Intentional use of the pdfx namespace is strongly discouraged.  A custom
# data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
35
36
# Matches dates of increasing precision: a four-digit year, optionally
# followed by -MM, then -D (one or more digits), then
# Thh:mm[:ss[.fraction]] with a mandatory time zone designator that is either
# "Z" or a +hh:mm / -hh:mm offset.  The pattern is only anchored at the start
# of the string (it is used with .match()).
#
# The decimal point of the fractional seconds is escaped so that only a
# literal "." introduces a fraction; the pattern is a raw string so that
# backslash escapes reach the regex engine untouched.
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)
51
52
##
53
# An object that represents Adobe XMP metadata.
54
class XmpInformation(PdfObject):
55
56
    def __init__(self, stream):
        """Wrap ``stream`` and parse the XML it contains.

        ``stream.getData()`` is parsed with minidom; the first ``RDF`` element
        in the RDF namespace becomes ``self.rdfRoot``.  An empty per-instance
        cache for converted property values is created last.
        """
        self.stream = stream
        document = parseString(stream.getData())
        rdf_nodes = document.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")
        self.rdfRoot = rdf_nodes[0]
        self.cache = {}
61
62
    def writeToStream(self, stream, encryption_key):
        """Write the wrapped metadata stream out by delegating to it."""
        wrapped = self.stream
        wrapped.writeToStream(stream, encryption_key)
64
65
    def getElement(self, aboutUri, namespace, name):
        """Yield every node called ``name`` in ``namespace`` for a resource.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
        ``aboutUri`` are inspected.  For each of them the matching attribute
        node (if present) is yielded first, followed by the matching
        descendant elements.
        """
        descriptions = self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
        for description in descriptions:
            if description.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            attribute = description.getAttributeNodeNS(namespace, name)
            if attribute is not None:
                yield attribute
            for node in description.getElementsByTagNameNS(namespace, name):
                yield node
73
74
    def getNodesInNamespace(self, aboutUri, namespace):
        """Yield all attributes and direct children that live in ``namespace``.

        Only ``rdf:Description`` elements whose ``rdf:about`` attribute equals
        ``aboutUri`` are inspected.  Attribute nodes are yielded before child
        nodes for each description.
        """
        descriptions = self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description")
        for description in descriptions:
            if description.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            for index in range(description.attributes.length):
                attribute = description.attributes.item(index)
                if attribute.namespaceURI == namespace:
                    yield attribute
            for node in description.childNodes:
                if node.namespaceURI == namespace:
                    yield node
84
85
    def _getText(self, element):
        """Return the concatenated data of ``element``'s direct text children.

        Only immediate children of type TEXT_NODE contribute; text nested in
        child elements is ignored.  Returns "" when there is no text child.
        """
        return "".join(
            node.data
            for node in element.childNodes
            if node.nodeType == node.TEXT_NODE
        )
91
92
    def _converter_string(value):
        """Identity converter: the text value is handed back untouched."""
        return value
94
95
    def _converter_date(value):
        """Convert an ISO 8601 date string into a naive ``datetime`` in UTC.

        A missing month or day defaults to 1 and missing time fields default
        to 0.  When the string carries a ``+hh:mm`` / ``-hh:mm`` designator,
        that offset is subtracted from the parsed value so the result is
        expressed in UTC; the returned object has no tzinfo attached.
        Fractional seconds are truncated to whole microseconds.

        Raises AttributeError when ``value`` does not begin with a four-digit
        year, because the ``iso8601`` pattern then fails to match.
        """
        m = iso8601.match(value)
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        microseconds = (second - whole_seconds) * 1000000
        # datetime() only accepts integers, so the Decimal values must be
        # converted explicitly.
        dt = datetime.datetime(year, month, day, hour, minute,
                               int(whole_seconds), int(microseconds))
        tzd = m.group("tzd") or "Z"
        if tzd != "Z":
            # Take the sign from the designator itself: deriving it from the
            # hour value loses it for offsets such as "+00:30".
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # local time = UTC + offset, hence UTC = local time - offset.
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)
115
116
    def _getter_bag(namespace, name, converter):
        """Build a property getter for a value stored as an ``rdf:Bag``.

        The returned function collects ``converter(text)`` for every ``rdf:li``
        inside any ``rdf:Bag`` found under the nodes yielded by
        ``self.getElement("", namespace, name)`` and memoises the resulting
        list in ``self.cache[namespace][name]``.
        """
        def get(self):
            # Test for key presence rather than truthiness so that an empty
            # result is served from the cache instead of being recomputed.
            ns_cache = self.cache.get(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                for bag in element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag"):
                    for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._getText(item)))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get
134
135
    def _getter_seq(namespace, name, converter):
        """Build a property getter for a value stored as an ``rdf:Seq``.

        For each node yielded by ``self.getElement("", namespace, name)`` the
        returned function collects ``converter(text)`` for every ``rdf:li``
        inside its ``rdf:Seq`` elements; a node without any ``rdf:Seq``
        contributes its own converted text instead.  The resulting list is
        memoised in ``self.cache[namespace][name]``.
        """
        def get(self):
            # Test for key presence rather than truthiness so that an empty
            # result is served from the cache instead of being recomputed.
            ns_cache = self.cache.get(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if seqs:
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            retval.append(converter(self._getText(item)))
                else:
                    # Plain (non-collection) form: the node itself is the value.
                    retval.append(converter(self._getText(element)))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get
156
157
    def _getter_langalt(namespace, name, converter):
        """Build a property getter for a language-alternative (``rdf:Alt``) value.

        The returned function produces a dictionary mapping the ``xml:lang``
        attribute of every ``rdf:li`` inside an ``rdf:Alt`` to its converted
        text.  A node without any ``rdf:Alt`` stores its own converted text
        under the key "x-default".  The dictionary is memoised in
        ``self.cache[namespace][name]``.
        """
        def get(self):
            # Test for key presence rather than truthiness so that an empty
            # result is served from the cache instead of being recomputed.
            ns_cache = self.cache.get(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if alts:
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = converter(self._getText(item))
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))
            self.cache.setdefault(namespace, {})[name] = retval
            return retval
        return get
177
178
    def _getter_single(namespace, name, converter):
        """Build a property getter for a simple, single-valued property.

        The returned function takes the first node yielded by
        ``self.getElement("", namespace, name)``: an attribute node supplies
        its ``nodeValue``, any other node its direct text.  The value is passed
        through ``converter`` unless no node was found, in which case the
        result is None.  The result is memoised in
        ``self.cache[namespace][name]``.
        """
        def get(self):
            # Test for key presence rather than truthiness so that a cached
            # None or empty string is reused instead of being recomputed.
            ns_cache = self.cache.get(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                # Only the first matching node is of interest.
                break
            if value is not None:
                value = converter(value)
            self.cache.setdefault(namespace, {})[name] = value
            return value
        return get
196
197
    ##
198
    # Contributors to the resource (other than the authors).  An unsorted
199
    # array of names.
200
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
201
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
202
203
    ##
204
    # Text describing the extent or scope of the resource.
205
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
206
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
207
208
    ##
209
    # A sorted array of names of the authors of the resource, listed in order
210
    # of precedence.
211
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
212
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
213
214
    ##
215
    # A sorted array of dates (datetime.datetime instances) of significance to
216
    # the resource.  The dates and times are in UTC.
217
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
218
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
219
220
    ##
221
    # A language-keyed dictionary of textual descriptions of the content of the
222
    # resource.
223
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
224
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
225
226
    ##
227
    # The mime-type of the resource.
228
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
229
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
230
231
    ##
232
    # Unique identifier of the resource.
233
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
234
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
235
236
    ##
237
    # An unordered array specifying the languages used in the resource.
238
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
239
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
240
241
    ##
242
    # An unordered array of publisher names.
243
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
244
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
245
246
    ##
247
    # An unordered array of text descriptions of relationships to other
248
    # documents.
249
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
250
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
251
252
    ##
253
    # A language-keyed dictionary of textual descriptions of the rights the
254
    # user has to this resource.
255
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
256
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
257
258
    ##
259
    # Unique identifier of the work from which this resource was derived.
260
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
261
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
262
263
    ##
264
    # An unordered array of descriptive phrases or keywords that specify the
265
    # topic of the content of the resource.
266
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
267
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
268
269
    ##
270
    # A language-keyed dictionary of the title of the resource.
271
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
272
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
273
274
    ##
275
    # An unordered array of textual descriptions of the document type.
276
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
277
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
278
279
    ##
280
    # An unformatted text string representing document keywords.
281
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
282
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
283
284
    ##
285
    # The PDF file version, for example 1.0, 1.3.
286
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
287
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
288
289
    ##
290
    # The name of the tool that created the PDF document.
291
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
292
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
293
294
    ##
295
    # The date and time the resource was originally created.  The date and
296
    # time are returned as a UTC datetime.datetime object.
297
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
298
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
299
    
300
    ##
301
    # The date and time the resource was last modified.  The date and time
302
    # are returned as a UTC datetime.datetime object.
303
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
304
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
305
306
    ##
307
    # The date and time that any metadata for this resource was last
308
    # changed.  The date and time are returned as a UTC datetime.datetime
309
    # object.
310
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
311
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
312
313
    ##
314
    # The name of the first known tool used to create the resource.
315
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
316
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
317
318
    ##
319
    # The common identifier for all versions and renditions of this resource.
320
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
321
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
322
323
    ##
324
    # An identifier for a specific incarnation of a document, updated each
325
    # time a file is saved.
326
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
327
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
328
329
    def custom_properties(self):
        """Return the custom (pdfx) metadata properties as a dictionary.

        Every attribute or child node in the pdfx namespace yielded by
        ``getNodesInNamespace`` contributes one entry.  The key is the node's
        local name with each U+2182 escape (the marker followed by four hex
        digits) replaced by the character it encodes; the value is the
        attribute value or the element's direct text.  The dictionary is built
        once and stored on the instance as ``_custom_properties``.
        """
        if not hasattr(self, "_custom_properties"):
            try:
                # Python 2: chr() only covers 0-255 and returns a byte string.
                to_char = unichr
            except NameError:
                to_char = chr
            properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName
                start = 0
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find(u"\u2182", start)
                    if idx == -1:
                        break
                    key = key[:idx] + to_char(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                    # Resume after the decoded character so that a decoded
                    # U+2182 is not mistaken for another escape marker.
                    start = idx + 1
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                properties[key] = value
            # Publish only once complete, so a failure part-way through does
            # not leave a partial dictionary cached on the instance.
            self._custom_properties = properties
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a dictionary of key/value items for custom metadata
    # properties.
    custom_properties = property(custom_properties)
354
355
4682.2.2 by Xavier Morel
[IMP] compare to None by identity
356
357
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: