1
# -*- coding: iso-8859-15 -*-
3
"""Application-specific settings for the document."""
8
import xml.etree.cElementTree as ET
10
from elementtree.cElementTree import ElementTree as ET
13
# Maps an ODF file extension to the MIME subtype suffix of that format.
# Prefix values with "application/vnd.oasis.opendocument." (see odf_prefix)
# to get the full MIME types.
odf_formats = {
    'odt': 'text',
    'ods': 'spreadsheet',
    'odp': 'presentation',
    'odg': 'graphics',
    'odc': 'chart',
    'odf': 'formula',
    'odi': 'image',
    'odm': 'text-master',
    'ott': 'text-template',
    'ots': 'spreadsheet-template',
    'otp': 'presentation-template',
    'otg': 'graphics-template',
}

# Common leading part of every ODF MIME type.
odf_prefix = "application/vnd.oasis.opendocument."
24
# Exceptions for this module
26
class ReCompileError(Exception):
    """Raised if a regular expression cannot be compiled."""
30
class PathNotFoundError(Exception):
    """Raised if a file reference contains a nonexistent path."""
36
class Settings(object):
    """Holder for an XML settings string parsed into an element tree.

    Attributes:
        root: the root element returned by ``ET.fromstring`` for the
            text given to the constructor.
    """

    def __init__(self, text):
        """Parse *text* (an XML string) and keep its root element.

        Any parse error raised by ``ET.fromstring`` propagates to the
        caller unchanged.
        """
        self.root = ET.fromstring(text)
43
# The Document tree and associated methods
46
"""The ODF document object."""
47
# Map attribute names to file names
48
file_map = {'mimetype': 'mimetype',
49
'manifest': 'META-INF/manifest.xml',
50
'content': 'content.xml',
51
'styles': 'styles.xml',
53
'settings': 'settings.xml'}
56
file='', # Document file name
57
mimetype='', # Mimetype string
58
manifest='', # Lists the contents of the ODF file
59
content='', # Content data (the text)
60
styles='', # Formatting data
62
settings='', # Application-specific data
63
additional={}, # Additional bundled files (e.g. images)
64
file_dates={} # File dates for all files and directories
67
# Get all method parameters
70
# Process all Document files
71
for key, filename in self.__class__.file_map.items():
72
if key not in args or 0 == len(args[key]):
73
setattr(self, key, '')
74
elif not filename or '.xml' != filename[-4:]:
75
setattr(self, key, args[key])
78
# Parse the XML string and set it as an ElementTree object
79
setattr(self, key, ET.XML(args[key]))
81
print >>sys.stderr, sysargs[key]
84
self.additional = additional
85
self.file_dates = file_dates
87
if not hasattr(self, 'file'):
90
def __del__(self): # XXX is this still necessary?
93
This was originally here to unlink each DOM component.
96
for key in self.__class__.file_map:
97
attr = getattr(self, key)
98
if not isinstance(attr, basestring):
101
# ---------------------------
102
# Extract objects from the document
104
def getComponentAsString(self, component_name, #pretty_printing=False,
106
"""Return document component as Unicode string."""
107
if component_name not in self.__class__.file_map:
109
filename = self.__class__.file_map[component_name]
110
attr = getattr(self, component_name)
111
if isinstance(attr, basestring):
114
# return attr.toprettyxml(encoding)
115
return ET.tostring(attr, encoding=encoding)
117
def getEmbeddedObjects(self, filter=None, ignore_case=False):
    r"""Return a dictionary of the objects embedded in the document.

    A more general form of getImages. By default, this should return
    all embedded objects; the list/dictionary can also be filtered
    for a certain type, e.g. image files.

    The filter currently supports UNIX glob patterns like "*a[bc]?.png"
    and/or correct regular expressions like ".*a[bc].\.png$".

    Only entries of ``self.additional`` stored under ``Pictures/`` are
    considered. The returned dictionary maps each file name, with the
    ``Pictures/`` prefix removed, to its content. ``filter`` and
    ``ignore_case`` are handed to ``get_search_for_filter``; the callable
    it returns decides, per stripped file name, whether the entry is kept.
    """
    # TODO: support other embedded objects
    search = get_search_for_filter(filter, ignore_case)
    prefix = 'Pictures/'
    skip = len(prefix)
    return dict((filename[skip:], content)
                for filename, content in self.additional.items()
                if filename.startswith(prefix)
                and search(filename[skip:]))
135
def getElementsByType(self, elementtype):
136
"""Extract all elements of a given type from the document.
138
For example, formulas or code.
144
"""Return the author of this document if available."""
147
for node in self.meta.getElementsByTagName("dc:creator"):
148
if (node.firstChild.nodeType == node.TEXT_NODE) and node.firstChild.data:
149
author = node.firstChild.data
154
def getExtension(self):
    """Return the ODF extension for this document's mimetype.

    The lookup is delegated to ``get_extension`` with ``self.mimetype``.
    """
    return get_extension(self.mimetype)
158
# ---------------------------
159
# Convert the document to other formats
161
def toXml(self, pretty_printing=False, encoding=None):
    """Return the content of the document as an XML string.

    ``self.content`` is an ElementTree element, so it is serialized with
    ``ET.tostring`` (elements have no ``toxml``/``toprettyxml`` methods;
    those belong to the DOM API).

    Parameters:
        pretty_printing: when true, the serialized XML is re-parsed with
            ``xml.dom.minidom`` and returned indented, one node per line.
        encoding: passed on to the serializer; ``None`` selects the
            serializer's default.
    """
    serialized = ET.tostring(self.content, encoding=encoding)
    if not pretty_printing:
        return serialized
    # ElementTree cannot indent its output here, so borrow minidom's
    # pretty printer. Imported locally: only this branch needs it.
    from xml.dom import minidom
    return minidom.parseString(serialized).toprettyxml(encoding=encoding)
167
def toText(self, skip_blank_lines=True):
    """Return the content of the document as a plain-text Unicode string.

    The ``text`` of every element in ``self.content`` is collected in
    document order and joined with ``os.linesep``.

    Parameters:
        skip_blank_lines: when true (the default), elements whose text is
            empty or ``None`` are left out. When false, every element
            contributes a line; elements without text contribute an
            empty one.
    """
    # Newer ElementTree releases provide ``iter``; older ones only have
    # ``getiterator``. Use whichever this element offers.
    walk = getattr(self.content, "iter", None) or self.content.getiterator
    if skip_blank_lines:
        pieces = (node.text for node in walk() if node.text)
    else:
        # ``node.text`` is None for elements without text, which join()
        # rejects; substitute an empty line instead.
        pieces = (node.text or u"" for node in walk())
    # Prepending u"" makes the separator (and thus the result) Unicode.
    return (u"" + os.linesep).join(pieces)
173
def toHtml(self, title="", encoding="utf-8"):
174
"""Return an UTF-8 encoded HTML representation of the document."""
176
# First, convert to ET operations
178
# - Scrape up meta tags and add to headnode
179
# '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
180
# '<meta type="Generator" content="python-odftools" />'
181
# - Title for the page, if applicable
182
# - Convert self.styles to CSS and add to headnode as a <style type="text/css"> element
183
# - see cssutils at the Python cheeseshop
184
# - Fix the unit test
187
# - Support encodings other than UTF-8, and maybe Unicode
188
# - Allow named elements
189
# - A more natural way of doing the doctype declaration, if possible
191
attrs_odf2html = {"style-name": "class"}
205
htmldoc = ET.Element("html")
206
headnode = ET.SubElement(htmldoc, "head")
207
titlenode = ET.SubElement(headnode, "title")
208
titlenode.text = title
209
# ENH: add meta etc. nodes to the head as needed
211
docbody = self.content.find("office:body")
213
bodynode = translate_nodes(docbody, tags_odf2html, attrs_odf2html)
215
bodynode = ET.SubElement(htmldoc, "body")
217
doctypestr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n'
218
htmlstr = ET.tostring(htmldoc, encoding=encoding) # .split("\n", 1)[1] # XXX kill 1st line?
219
return "\n".join((doctypestr, htmlstr))
221
def replace(self, search, replace):
222
"""Replace all occurences of search in content by replace.
224
Regular expressions are fully supported for search and replace.
226
Returns the number of replacements made.
232
import re, sre_constants
235
_replace = re.compile(search).sub
236
search = lambda x, y: find(x, y)
237
except (sre_constants.error, TypeError), v:
238
print >>sys.stderr, 'Warning: could not compile regular expression:', v
242
for node in self.content.getiterator():
245
replaced = _replace(replace, node.text)
246
if replaced != node.text:
249
except (sre_constants.error, TypeError), v:
250
print >>sys.stderr, 'Warning: could not compile regular expression:', v