~vcs-imports/py-odftools/trunk

Viewing changes to odftools/components/settings.py

Committer: eric.talevich
Date: 2007-07-17 01:32:15 UTC
Revision ID: vcs-imports@canonical.com-20070717013215-zrfrecfiohvz7mg7

Start a class for each of the 5 XML-based document components, throw the files for components into a new subdirectory, chew up everything and make it ugly. HTML export is broken and I don't like the command-line options.

files added:
odftools/components

odftools/components/.content.py.swp

odftools/components/__init__.py

odftools/components/content.py

odftools/components/manifest.py

odftools/components/meta.py

odftools/components/settings.py

odftools/components/styles.py

files modified:
odftools/document.py

Show diffs side-by-side

added added

removed removed

odftools/components/settings.py

# -*- coding: iso-8859-15 -*-

"""Application-specific settings for the document."""

import os, sys

# Bind ``ET`` to an ElementTree *module* (it must expose fromstring, XML,
# tostring, Element and SubElement). Prefer the C accelerator, then the
# standard pure-Python module, then the historical standalone packages.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    try:
        import xml.etree.ElementTree as ET
    except ImportError:
        try:
            import cElementTree as ET
        except ImportError:
            # Import the module itself; importing the ElementTree *class*
            # under this name would break every ET.<function>() call.
            from elementtree import ElementTree as ET

# Common leading part of every OpenDocument MIME type.
odf_prefix = "application/vnd.oasis.opendocument."

# File extension -> MIME subtype. Prefix a value with odf_prefix
# ("application/vnd.oasis.opendocument.") to get the full MIME type.
odf_formats = dict(
    odt='text',
    ods='spreadsheet',
    odp='presentation',
    odg='graphics',
    odc='chart',
    odf='formula',
    odi='image',
    odm='text-master',
    ott='text-template',
    ots='spreadsheet-template',
    otp='presentation-template',
    otg='graphics-template',
)

# Exceptions for this module

class ReCompileError(Exception):
    """Raised when a regular expression cannot be compiled."""

class PathNotFoundError(Exception):
    """Raised when a file reference contains a nonexistent path."""

# Main class

class Settings(object):
    """Application-specific settings component of a document.

    The XML passed to the constructor is parsed once and its root element
    is exposed as ``root``.
    """

    def __init__(self, text):
        """Parse the XML string ``text`` and keep the resulting root element."""
        parsed_root = ET.fromstring(text)
        self.root = parsed_root

# The Document tree and associated methods

class Document:

    """The ODF document object.

    Every XML component listed in ``file_map`` (manifest, content, styles,
    meta, settings) is stored on the instance as a parsed ElementTree
    element. The mimetype, and any component that was not supplied or could
    not be parsed, is stored as a plain string ('' when absent).

    """

    # Map attribute names to file names
    file_map = {'mimetype': 'mimetype',
                'manifest': 'META-INF/manifest.xml',
                'content': 'content.xml',
                'styles': 'styles.xml',
                'meta': 'meta.xml',
                'settings': 'settings.xml'}

    # ElementTree matches namespaced tags in "{uri}local" form only; a
    # prefixed name such as "office:body" never matches a parsed element.
    _OFFICE_BODY = '{urn:oasis:names:tc:opendocument:xmlns:office:1.0}body'
    _DC_CREATOR = '{http://purl.org/dc/elements/1.1/}creator'

    # Text type and (bytes, text) string types on both Python 2 and 3.
    _TEXT_TYPE = type(u'')
    _STRING_TYPES = (type(b''), type(u''))

    def __init__(self,
                 file='',          # Document file name
                 mimetype='',      # Mimetype string
                 manifest='',      # Lists the contents of the ODF file
                 content='',       # Content data (the text)
                 styles='',        # Formatting data
                 meta='',          # Metadata
                 settings='',      # Application-specific data
                 additional=None,  # Additional bundled files (e.g. images)
                 file_dates=None   # File dates for all files and directories
                 ):
        """Build the document from the raw contents of its member files.

        Components whose file name ends in '.xml' are parsed into
        ElementTree elements; everything else is stored unchanged. A
        component that fails to parse is reported on stderr and stored
        as '' so that the instance is always fully populated.

        ``additional`` and ``file_dates`` default to a fresh empty dict for
        each instance.

        """
        supplied = {'mimetype': mimetype,
                    'manifest': manifest,
                    'content': content,
                    'styles': styles,
                    'meta': meta,
                    'settings': settings}

        # Process all Document files
        for key, filename in self.__class__.file_map.items():
            data = supplied.get(key)
            if not data:
                setattr(self, key, '')
            elif not filename.endswith('.xml'):
                setattr(self, key, data)
            else:
                try:
                    # Parse the XML string and set it as an ElementTree object
                    setattr(self, key, ET.XML(data))
                except Exception as e:
                    # Deliberately broad: the parse error class differs
                    # between ElementTree versions. Best effort: report
                    # the offending data and carry on with an empty value.
                    sys.stderr.write('%s\n%s\n' % (data, e))
                    setattr(self, key, '')

        self.additional = {} if additional is None else additional
        self.file_dates = {} if file_dates is None else file_dates
        self.file = file if file else None

    def __del__(self):  # XXX is this still necessary?
        """Clean up.

        This was originally here to unlink each DOM component; it now just
        drops the instance's references to the parsed components. It is
        safe to run on a partially constructed instance.

        """
        for key in self.__class__.file_map:
            component = self.__dict__.get(key, '')
            if not isinstance(component, self._STRING_TYPES):
                del self.__dict__[key]

    # ---------------------------
    # Internal helpers

    def _element(self, key):
        """Return the parsed component ``key``, or None if it is missing
        or stored as a plain string."""
        component = getattr(self, key, None)
        if component is None or isinstance(component, self._STRING_TYPES):
            return None
        return component

    def _iter_nodes(self, element):
        """Iterate over ``element`` and all of its descendants."""
        walker = getattr(element, 'iter', None)
        if walker is None:
            # Older ElementTree releases only provide getiterator().
            walker = element.getiterator
        return walker()

    # ---------------------------
    # Extract objects from the document

    def getComponentAsString(self, component_name, encoding=None):
        """Return a document component in string form.

        Returns "" for a name that is not in ``file_map``, the stored string
        for components held as strings, and otherwise the result of
        ``ET.tostring(component, encoding=encoding)``.

        """
        if component_name not in self.__class__.file_map:
            return ""
        attr = getattr(self, component_name, "")
        if isinstance(attr, self._STRING_TYPES):
            return attr
        return ET.tostring(attr, encoding=encoding)

    def getEmbeddedObjects(self, filter=None, ignore_case=False):
        r"""Return a dictionary of the objects embedded in the document.

        A more general form of getImages. By default, this should return
        all embedded objects; the list/dictionary can also be filtered
        for a certain type, e.g. image files.

        The filter currently supports UNIX glob patterns like "*a[bc]?.png"
        and/or correct regular expressions like ".*a[bc].\.png$".

        Only entries of ``self.additional`` whose name starts with
        'Pictures/' are considered; the returned keys have that prefix
        removed.

        """
        # TODO: support other embedded objects
        # NOTE(review): get_search_for_filter is defined outside this class;
        # it is assumed to return a callable taking a file name -- confirm.
        search = get_search_for_filter(filter, ignore_case)
        prefix = 'Pictures/'
        skip = len(prefix)
        return dict((filename[skip:], content)
                    for filename, content in self.additional.items()
                    if filename.startswith(prefix)
                    and search(filename[skip:]))

    def getElementsByType(self, elementtype):
        """Extract all elements of a given type from the document.

        For example, formulas or code.

        Not implemented yet: currently returns None.

        """
        pass

    def getAuthor(self):
        """Return the author of this document if available.

        The author is the text of the first non-empty dc:creator element in
        the meta component; '' is returned when there is none.

        """
        meta = self._element('meta')
        if meta is None:
            return ''
        for node in self._iter_nodes(meta):
            if node.tag == self._DC_CREATOR and node.text:
                return node.text
        return ''

    def getExtension(self):
        """Return ODF extension for given mimetype."""
        # get_extension is defined outside this class.
        return get_extension(self.mimetype)

    # ---------------------------
    # Convert the document to other formats

    def toXml(self, pretty_printing=False, encoding=None):
        """Return the content of the document as an XML string.

        The content element is serialized and re-read with minidom so that
        pretty printing is available. With ``encoding=None`` minidom returns
        a text string; with an encoding it returns encoded bytes. If the
        content is not parsed (stored as a string) it is returned as is.

        """
        content = self._element('content')
        if content is None:
            return getattr(self, 'content', '')

        from xml.dom import minidom
        dom = minidom.parseString(ET.tostring(content))
        try:
            if pretty_printing:
                # encoding must be passed by keyword: the first positional
                # parameter of toprettyxml() is the indent string.
                return dom.toprettyxml(encoding=encoding)
            return dom.toxml(encoding)
        finally:
            dom.unlink()

    def toText(self, skip_blank_lines=True):
        """Return the content of the document as a plain-text Unicode string.

        The ``text`` of every element in the content tree is joined with
        ``os.linesep``. Elements without text are skipped unless
        ``skip_blank_lines`` is false, in which case they contribute an
        empty line.

        """
        content = self._element('content')
        if content is None:
            return self._TEXT_TYPE()
        # "or u''" keeps join() from failing on elements whose text is None.
        textlist = (node.text or u'' for node in self._iter_nodes(content)
                    if not skip_blank_lines or node.text)
        return self._TEXT_TYPE(os.linesep).join(textlist)

    def toHtml(self, title="", encoding="utf-8"):
        """Return an HTML representation of the document.

        The result is the HTML 4.01 doctype line followed by the serialized
        HTML tree, encoded with ``encoding`` (UTF-8 by default).

        """
        # TODO:
        # First, convert to ET operations
        # Then,
        # - Scrape up meta tags and add to headnode
        #     '<meta http-equiv="content-type" content="text/html; charset=UTF-8">'
        #     '<meta type="Generator" content="python-odftools" />'
        # - Title for the page, if applicable
        # - Convert self.styles to CSS and add to headnode as a <style type="text/css"> element
        #     - see cssutils at the Python cheeseshop
        # - Fix the unit test

        # ENH:
        # - Support encodings other than UTF-8, and maybe Unicode
        # - Allow named elements
        # - A more natural way of doing the doctype declaration, if possible

        attrs_odf2html = {"style-name": "class"}
        tags_odf2html = {
                "a": "a",
                "body": "body",
                "p": "p",
                "span": "span",
                "table": "table",
                "h": "h1",
                "table-row": "tr",
                "table-cell": "td",
                "image": "img",
                "list": "ol",
                "list-item": "li"}

        htmldoc = ET.Element("html")
        headnode = ET.SubElement(htmldoc, "head")
        titlenode = ET.SubElement(headnode, "title")
        titlenode.text = title
        # ENH: add meta etc. nodes to the head as needed

        content = self._element('content')
        docbody = None
        if content is not None:
            docbody = content.find(self._OFFICE_BODY)

        bodynode = None
        # "is not None": an element without children is falsy.
        if docbody is not None:
            # NOTE(review): translate_nodes is defined outside this class;
            # it is assumed to return the translated <body> element -- confirm.
            bodynode = translate_nodes(docbody, tags_odf2html, attrs_odf2html)
        if hasattr(bodynode, 'tag'):
            htmldoc.append(bodynode)
        else:
            ET.SubElement(htmldoc, "body")

        doctypestr = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">\n'
        newline = "\n"
        htmlstr = ET.tostring(htmldoc, encoding=encoding)  # XXX kill 1st line?
        if not isinstance(htmlstr, type(doctypestr)):
            # tostring() produced bytes while the literals are text
            # (Python 3): encode them so the pieces can be joined.
            doctypestr = doctypestr.encode(encoding)
            newline = newline.encode(encoding)
        return newline.join((doctypestr, htmlstr))

    def replace(self, search, replace):
        """Replace all occurrences of search in content by replace.

        Regular expressions are fully supported for search and replace.
        Only the ``text`` of each element is processed.

        Returns the number of replacements made. If the pattern cannot be
        compiled a warning is written to stderr and 0 is returned; if the
        replacement cannot be applied a warning is written and the number
        of replacements made so far is returned.

        """
        if not search:
            return 0

        import re

        try:
            substitute = re.compile(search).subn
        except (re.error, TypeError) as v:
            sys.stderr.write(
                'Warning: could not compile regular expression: %s\n' % (v,))
            return 0

        content = self._element('content')
        if content is None:
            return 0

        count = 0
        for node in self._iter_nodes(content):
            if not node.text:
                continue
            try:
                replaced, hits = substitute(replace, node.text)
            except (re.error, TypeError, IndexError) as v:
                # e.g. the replacement template refers to an unknown group
                sys.stderr.write(
                    'Warning: could not apply replacement: %s\n' % (v,))
                return count
            if replaced != node.text:
                node.text = replaced
                count += hits
        return count

253

Older »