2
# -*- encoding: utf-8 -*-
3
##############################################################################
7
# 2005 pyopenoffice.py Martin Simon (http://www.bezirksreiter.de)
8
# 2005 Fabien Pinckaers, TINY SPRL. (http://tiny.be)
10
# WARNING: This program as such is intended to be used by professional
11
# programmers who take the whole responsibility of assessing all potential
12
# consequences resulting from its eventual inadequacies and bugs
13
# End users who are looking for a ready-to-use solution with commercial
14
# guarantees and support are strongly advised to contact a Free Software
17
# This program is Free Software; you can redistribute it and/or
18
# modify it under the terms of the GNU General Public License
19
# as published by the Free Software Foundation; either version 2
20
# of the License, or (at your option) any later version.
22
# This program is distributed in the hope that it will be useful,
23
# but WITHOUT ANY WARRANTY; without even the implied warranty of
24
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25
# GNU General Public License for more details.
27
# You should have received a copy of the GNU General Public License
28
# along with this program; if not, write to the Free Software
29
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
31
##############################################################################
34
Tiny SXW2RML - The Open ERP's report engine
36
Tiny SXW2RML is part of the Tiny report project.
37
Tiny Report is a module that allows you to render high-quality PDF documents
38
from an OpenOffice template (.sxw) and any relational database.
40
The whole source code is distributed under the terms of the
43
(c) 2005 pyopenoffice.py Martin Simon (http://www.bezirksreiter.de)
44
(c) 2005-TODAY, Fabien Pinckaers - Tiny sprl
53
import xml.dom.minidom
54
from reportlab.lib.units import toLength
58
"""General DOM API utilities."""
59
# Store the raw content string and precompile the value/unit pattern.
# The pattern captures two groups: a prefix ending in a digit, and a unit
# that is one of pt, cm, mm, inch or in.
# NOTE(review): the `file` argument is not used in the lines visible here.
def __init__(self,content_string="",file=""):
60
self.content_string = content_string
61
self.re_digits = re.compile(r"(.*?\d)(pt|cm|mm|inch|in)")
63
# NOTE(review): the statements that follow findall() (handling of the match
# list and the return value) are not visible in this extract -- confirm
# against the full file before relying on the return shape.
def _unitTuple(self,string):
64
"""Split values and units to a tuple."""
65
# findall() returns one (value, unit) pair per match of self.re_digits.
temp = self.re_digits.findall(string)
71
def stringPercentToFloat(self,string):
    """Convert a percentage string to a fraction.

    Every "%" character is removed from ``string`` and the remainder is
    parsed as a float and divided by 100, so "115%" yields 1.15.

    Raises ValueError if the remaining text is not a valid float literal.
    """
    return float(string.replace("%", "")) / 100
75
# Scans the direct children of `parent` for element nodes whose nodeName
# equals `name`, then passes `children` and `attr_dict` on to
# _selectForAttributes() and returns its result.
# NOTE(review): the creation of `children` and the statements that fill it
# are not visible in this extract -- confirm against the full file.
# NOTE(review): `attr_dict={}` is a mutable default argument; the visible
# lines only pass it along and do not modify it.
def findChildrenByName(self,parent,name,attr_dict={}):
76
"""Helper functions. Does not work recursively.
77
Optional: also test for certain attribute/value pairs."""
79
for c in parent.childNodes:
80
if c.nodeType == c.ELEMENT_NODE and c.nodeName == name:
85
return self._selectForAttributes(nodelist=children,attr_dict=attr_dict)
87
# Compares attributes of a node `n` with the expected values in `attr_dict`
# and appends nodes to `selected_nodes`.
# NOTE(review): the loop that binds `n` (presumably over `nodelist`), the
# initialisation of `selected_nodes`, the handling of a mismatch and the
# return statement are not visible in this extract -- confirm.
def _selectForAttributes(self,nodelist,attr_dict):
92
for a in attr_dict.keys():
93
# getAttribute() is compared against the expected value for this name.
if n.getAttribute(a) != attr_dict[a]:
94
# at least one incorrect attribute value?
97
selected_nodes.append(n)
100
# Parses a comma-separated string into a pair of ints built from its first
# two fields.
# NOTE(review): original lines around these statements are missing from
# this extract (possibly error handling) -- confirm against the full file.
def _stringToTuple(self,s):
101
"""Helper function."""
103
# string.split() is the Python 2 `string` module function.
temp = string.split(s,",")
104
return int(temp[0]),int(temp[1])
108
# Formats the first two items of `t` as "a,b" and passes the text through
# openOfficeStringUtf8().
# NOTE(review): lines before and after the return are missing from this
# extract (possibly a type check and a fallback) -- confirm.
def _tupleToString(self,t):
110
return self.openOfficeStringUtf8("%s,%s" % (t[0],t[1]))
114
# Converts a length string using reportlab's toLength() and rounds the result.
# NOTE(review): `v` is used below but its assignment (presumably from
# `value`) is not visible in this extract; the handling of non-length
# values, the "inch" -> "in" substitution hinted at by the comment, and the
# return statement are also missing -- confirm against the full file.
def _lengthToFloat(self,value):
116
# Strings that do not match the value/unit pattern take this branch.
if not self.re_digits.search(v):
120
# OO files use "inch" instead of "in" in Reportlab units
125
# round() is applied to whatever toLength() returns for `v`.
c = round(toLength(v))
130
# Produces a UTF-8 encoded version of `string`.
# `unicode` input is encoded directly; any other input is first decoded as
# cp1252 and then encoded to UTF-8.  `unicode` is a Python 2 builtin.
# NOTE(review): `tempstring` is assigned but no `return tempstring` is
# visible in this extract -- confirm that the full file returns it.
def openOfficeStringUtf8(self,string):
131
if type(string) == unicode:
132
return string.encode("utf-8")
133
tempstring = unicode(string,"cp1252").encode("utf-8")
136
class DomApi(DomApiGeneral):
137
"""This class provides a DOM-API for XML-Files from an SXW-Archive."""
138
# Parses content.xml and styles.xml (given as strings) with minidom and
# prepares the style lookup tables used by the other methods.
# NOTE(review): several original lines are missing from this extract; in
# particular the initialisation of `self.style_dict`, which buildStyleDict()
# writes to, is not visible here -- confirm against the full file.
def __init__(self,xml_content,xml_styles):
139
DomApiGeneral.__init__(self)
140
self.content_dom = xml.dom.minidom.parseString(xml_content)
141
self.styles_dom = xml.dom.minidom.parseString(xml_styles)
142
body = self.content_dom.getElementsByTagName("office:body")
143
# First office:body element; when there is none, `body and body[0]` leaves
# the empty node list itself in self.body.
self.body = body and body[0]
147
self.style_properties_dict = {}
149
# ******** always use the following order:
150
self.buildStyleDict()
151
self.buildStylePropertiesDict()
152
# `<>` is the Python 2 spelling of `!=`.
# self.page_master is only assigned when one of the two element kinds exists.
if self.styles_dom.getElementsByTagName("style:page-master").__len__()<>0:
153
self.page_master = self.styles_dom.getElementsByTagName("style:page-master")[0]
154
if self.styles_dom.getElementsByTagName("style:page-layout").__len__()<>0 :
155
self.page_master = self.styles_dom.getElementsByTagName("style:page-layout")[0]
156
# [0] raises IndexError when content.xml has no office:document-content element.
self.document = self.content_dom.getElementsByTagName("office:document-content")[0]
158
def buildStylePropertiesDict(self):
    """Fill ``self.style_properties_dict`` for every known style.

    For each key of ``self.style_dict`` the result of
    ``self.getStylePropertiesDict(key)`` is stored under the same key.
    ``self.style_dict`` must already be populated when this is called.
    """
    # Iterate over a snapshot of the keys, as the original ``.keys()`` list did.
    for style_name in list(self.style_dict):
        self.style_properties_dict[style_name] = self.getStylePropertiesDict(style_name)
162
# Merges `updatedict` into `dict`, resolving percentage values such as "115%"
# against the value already stored in `dict` under the same key.
# NOTE(review): several branch lines (if/else headers and an early return
# hinted at by the first comment) are missing from this extract, so the
# exact conditions guarding the statements below cannot be read from here.
# NOTE(review): the parameter name `dict` shadows the builtin.
def updateWithPercents(self,dict,updatedict):
163
"""Sometimes you find values like "115%" in the style hierarchy."""
165
# no style hierarchies for this style? =>
167
# Work on a shallow copy so the caller's `updatedict` is not rewritten.
new_updatedict = copy.copy(updatedict)
168
for u in new_updatedict.keys():
170
# has_key() is Python 2 only.
if new_updatedict[u].find("""%""") != -1 and dict.has_key(u):
171
# Split the existing value into its numeric part and its unit.
number = float(self.re_digits.search(dict[u]).group(1))
172
unit = self.re_digits.search(dict[u]).group(2)
173
new_number = self.stringPercentToFloat(new_updatedict[u]) * number
175
new_number = int(new_number)
176
# no floats allowed for "pt"
177
# OOo just takes the int, does not round (try it out!)
178
new_updatedict[u] = "%s%s" % (new_number,unit)
180
dict[u] = new_updatedict[u]
182
dict[u] = new_updatedict[u]
183
dict.update(new_updatedict)
185
# NOTE(review): the docstring mentions self.style_properties_hierarchical,
# but the visible code reads self.style_properties_dict.
# NOTE(review): three original lines are missing inside the second loop
# (around the creation of `temp` and before `c.setAttribute`); as shown, `c`
# is the result of findChildrenByName(), so how a single element is selected
# from it cannot be read from this extract -- confirm against the full file.
def normalizeStyleProperties(self):
186
"""Transfer all style:style-properties attributes from the
187
self.style_properties_hierarchical dict to the automatic-styles
188
from content.xml. Use this function to preprocess content.xml for
189
XSLT transformations etc.Do not try to implement this function
190
with XSlT - believe me, it's a terrible task..."""
191
styles_styles = self.styles_dom.getElementsByTagName("style:style")
192
# [0] raises IndexError when content.xml has no office:automatic-styles element.
automatic_styles = self.content_dom.getElementsByTagName("office:automatic-styles")[0]
193
# Deep copies of the styles.xml style nodes are appended to content.xml.
for s in styles_styles:
194
automatic_styles.appendChild(s.cloneNode(deep=1))
195
content_styles = self.content_dom.getElementsByTagName("style:style")
196
# these are the content_styles with styles_styles added!!!
197
for s in content_styles:
198
c = self.findChildrenByName(s,"style:properties")
200
# some derived automatic styles do not have "style:properties":
201
temp = self.content_dom.createElement("style:properties")
203
c = self.findChildrenByName(s,"style:properties")
205
# Looked up by the UTF-8 encoded style name; raises KeyError for a name
# that is not in self.style_properties_dict.
dict = self.style_properties_dict[(s.getAttribute("style:name")).encode("utf-8")] or {}
206
for attribute in dict.keys():
207
c.setAttribute(self.openOfficeStringUtf8(attribute),self.openOfficeStringUtf8(dict[attribute]))
209
# Inserts a new <transferredfromstylesxml> element before self.body and
# appends deep copies of the page master and of the first text:outline-style
# node from styles.xml to it.
# NOTE(review): original lines are missing around the appendChild calls
# (possibly error handling); as shown, `outline_styles[0]` raises IndexError
# when styles.xml has no text:outline-style element, and self.page_master is
# only set by __init__ when a page-master/page-layout element exists.
def transferStylesXml(self):
210
"""Transfer certain sub-trees from styles.xml to the normalized content.xml
211
(see above). It is not necessary to do this - for example - with paragraph styles.
212
the "normalized" style properties contain all information needed for
213
further processing."""
214
# TODO: What about table styles etc.?
215
outline_styles = self.styles_dom.getElementsByTagName("text:outline-style")
216
t = self.content_dom.createElement("transferredfromstylesxml")
217
self.document.insertBefore(t,self.body)
218
# Re-read the node just inserted through the body's previous sibling.
t_new = self.body.previousSibling
220
page_master = self.page_master
221
t_new.appendChild(page_master.cloneNode(deep=1))
222
t_new.appendChild(outline_styles[0].cloneNode(deep=1))
226
def normalizeLength(self):
    """Normalize all lengths to floats (i.e: 1 inch = 72).

    Walks every element of ``self.content_dom`` and replaces each attribute
    value with the string form of ``self._lengthToFloat(value)``.

    Always use this after "normalizeContent" and "transferStyles"!
    """
    # TODO: The complex attributes of table cell styles are not transferred yet.
    for element in self.content_dom.getElementsByTagName("*"):
        # Use the public ``attributes`` map rather than the private ``_attrs``
        # dict, which minidom leaves as None on elements without attributes.
        # The names are snapshotted before any value is rewritten.
        for attr_name in list(element.attributes.keys()):
            value = element.getAttribute(attr_name)
            # setAttribute() needs a string, so format the result first.
            element.setAttribute(attr_name, "%s" % self._lengthToFloat(value))
239
def normalizeTableColumns(self):
    """Handle this strange table:number-columns-repeated attribute.

    Every ``table:table-column`` element of ``self.content_dom`` that carries
    ``table:number-columns-repeated="N"`` loses that attribute and gets N-1
    deep copies of itself inserted directly before it, so the document ends
    up with N explicit column elements.

    Raises ValueError if the attribute value is not an integer.
    """
    columns = self.content_dom.getElementsByTagName("table:table-column")
    for column in columns:
        if not column.hasAttribute("table:number-columns-repeated"):
            continue
        repeat = int(column.getAttribute("table:number-columns-repeated"))
        # Drop the attribute before cloning so the copies do not carry it.
        column.removeAttribute("table:number-columns-repeated")
        for _ in range(repeat - 1):
            column.parentNode.insertBefore(column.cloneNode(deep=1), column)
249
def buildStyleDict(self):
    """Store all style:style-nodes from content.xml and styles.xml in self.style_dict.

    Keys are the UTF-8 encoded ``style:name`` attribute values; values are
    the DOM nodes themselves.  styles.xml is read first, so a style of the
    same name in content.xml replaces it.

    Caution: in this dict the nodes from two dom apis are merged!
    """
    for dom in (self.styles_dom, self.content_dom):
        for style_node in dom.getElementsByTagName("style:style"):
            name = style_node.getAttribute("style:name").encode("utf-8")
            self.style_dict[name] = style_node
259
return self.content_dom.toxml(encoding="utf-8")
261
# Builds the attribute dictionary for the style named `style_name`, starting
# from the parent style's dictionary (recursive call) when the style has a
# style:parent-style-name attribute.
# NOTE(review): the initialisation of `res`, the loop that binds `c`
# (presumably over `childs`) and the return statement are not visible in
# this extract -- confirm against the full file.
def getStylePropertiesDict(self,style_name):
264
# Raises KeyError when `style_name` is not a key of self.style_dict.
if self.style_dict[style_name].hasAttribute("style:parent-style-name"):
265
parent = self.style_dict[style_name].getAttribute("style:parent-style-name").encode("utf-8")
266
res = self.getStylePropertiesDict(parent)
268
childs = self.style_dict[style_name].childNodes
270
# find(...) > 0 accepts names containing "properties" anywhere except at
# position 0.
if c.nodeType == c.ELEMENT_NODE and c.nodeName.find("properties")>0 :
271
# `_attrs` is a private minidom attribute.
for attr in c._attrs.keys():
272
# Values written here overwrite those taken over from the parent style.
res[attr] = c.getAttribute(attr).encode("utf-8")
275
class PyOpenOffice(object):
276
"""This is the main class which provides all functionality."""
277
# Remembers the `save_pict` flag.
# NOTE(review): other methods read self.path and self.images, but their
# initialisation is not visible in this extract (original lines are missing
# around the assignment below) -- confirm against the full file.
def __init__(self, path='.', save_pict=False):
279
self.save_pict = save_pict
282
# Opens the archive `fname` as a zip file, reads content.xml and styles.xml
# from it, and stores archive members under "Pictures/" in self.images.
# NOTE(review): the loop that binds `a` (presumably over the archive's member
# names), the condition guarding the open() call, the write/close of `f` and
# the return statement are not visible in this extract -- confirm.
def oo_read(self,fname):
283
z = zipfile.ZipFile(fname,"r")
284
content = z.read('content.xml')
285
style = z.read('styles.xml')
288
# Members named "Pictures/<at least two characters>".
if a[:9]=='Pictures/' and len(a)>10:
289
pic_content = z.read(a)
290
# Keyed by the member name without the "Pictures/" prefix.
self.images[a[9:]] = pic_content
292
# Only the base name of the member is used for the output path under self.path.
f=open(os.path.join(self.path, os.path.basename(a)),"wb")
298
# Applies a list of (pattern, replacement) pairs to `content` with re.sub().
# NOTE(review): the opening/closing of the `regex` list and the return
# statement are not visible in this extract -- confirm against the full file.
# NOTE(review): the replacement strings use "$1"/"$2"; Python's re.sub()
# writes those characters literally (group references are spelled \1 or
# \g<1>) -- confirm whether this is intended.
def oo_replace(self,content):
300
(r"<para[^>]*/>", ""),
301
#(r"<text:ordered-list.*?>(.*?)</text:ordered-list>", "$1"),
302
#(r"<text:unordered-list.*?>(.*?)</text:unordered-list>", "$1"),
303
(r"<para(.*)>(.*?)<text:line-break[^>]*/>", "<para$1>$2</para><para$1>"),
305
for key,val in regex:
306
content = re.sub(key, val, content)
309
# Reads the archive, runs the regex clean-up on the content and then applies
# the DomApi normalisation steps in a fixed order.
# NOTE(review): the creation of `dom` (presumably a DomApi built from `c` and
# `s`) and the return statement are not visible in this extract -- confirm.
def unpackNormalize(self,sourcefile):
310
# oo_read() is unpacked into two values, named `c` and `s` here.
c,s = self.oo_read(sourcefile)
311
c = self.oo_replace(c)
313
dom.normalizeStyleProperties()
314
dom.transferStylesXml()
315
# The normalizeLength docstring asks for it to run after the style transfer.
dom.normalizeLength()
316
dom.normalizeTableColumns()
320
# Converts `sxw_file` through PyOpenOffice.unpackNormalize() and an lxml XSLT
# built from a parsed stylesheet, then creates an <images> element with one
# base64-encoded <image name=...> node per entry of tool.images.
# NOTE(review): the creation of `f` and `result`, where the <image> nodes are
# attached, and the return statement are not visible in this extract --
# confirm against the full file.
def sxw2rml(sxw_file, xsl, output='.', save_pict=False):
321
# Imported inside the function, so lxml is only needed when it is called.
from lxml import etree
322
# The StringIO module exists in Python 2 only.
from StringIO import StringIO
324
tool = PyOpenOffice(output, save_pict = save_pict)
325
res = tool.unpackNormalize(sxw_file)
328
styledoc = etree.parse(f)
329
style = etree.XSLT(styledoc)
334
root = etree.XPathEvaluator(result)("/document/stylesheet")
338
images = etree.Element("images")
339
for img in tool.images:
340
node = etree.Element('image', name=img)
341
# base64.encodestring() is a deprecated alias that was removed in Python 3.9.
node.text = base64.encodestring(tool.images[img])
351
# Command-line entry point: parses the options, picks the XSL stylesheet from
# the archive's mimetype and runs sxw2rml().
# NOTE(review): several original lines are missing from this extract (the
# argument-count check guarding parser.error(), the assignments of `fname`
# and `f`, and the output of `result`) -- confirm against the full file.
if __name__ == "__main__":
353
# NOTE(review): "-v" is declared without an action, so optparse's default
# "store" action applies and the option expects a value.
parser = optparse.OptionParser(
354
version="Tiny Report v%s" % __version__,
355
usage = 'tiny_sxw2rml.py [options] file.sxw')
356
parser.add_option("-v", "--verbose", default=False, dest="verbose", help="enable basic debugging")
357
parser.add_option("-o", "--output", dest="output", default='.', help="directory of image output")
358
(opt, args) = parser.parse_args()
360
parser.error("incorrect number of arguments")
367
# Default stylesheet; replaced below for OpenDocument text files.
xsl_file = 'normalized_oo2rml.xsl'
368
z = zipfile.ZipFile(fname,"r")
369
mimetype = z.read('mimetype')
370
if mimetype.split('/')[-1] == 'vnd.oasis.opendocument.text' :
371
xsl_file = 'normalized_odt2rml.xsl'
372
# The stylesheet is looked up next to the script; file() is a Python 2 builtin.
xsl = file(os.path.join(os.getcwd(), os.path.dirname(sys.argv[0]), xsl_file)).read()
373
# save_pict is fixed to False here.
result = sxw2rml(f, xsl, output=opt.output, save_pict=False)
376
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4: