3
# Author: David Goodger
4
# Contact: goodger@python.org
5
# Revision: $Revision: 3532 $
6
# Date: $Date: 2005-06-20 20:59:29 +0200 (Mon, 20 Jun 2005) $
7
# Copyright: This program has been placed in the public domain.
10
unicode2subfiles.py -- produce character entity files (reStructuredText
11
substitutions) from the W3C master unicode.xml file.
13
This program extracts character entity and entity set information from a
14
unicode.xml file and produces multiple reStructuredText files (in the current
15
directory) containing substitutions. Entity sets are from ISO 8879 & ISO
16
9573-13 (combined), MathML, and HTML4. One or two files are produced for each
17
entity set; a second file with a "-wide.txt" suffix is produced if there are
18
wide-Unicode characters in the set.
20
The input file, unicode.xml, is maintained as part of the MathML 2
21
Recommendation XML source, and is available from
22
<http://www.w3.org/2003/entities/xml/>.
29
from xml.parsers.expat import ParserCreate
32
usage_msg = """Usage: %s [unicode.xml]"""
34
def usage(prog, status=0, msg=None):
35
print >>sys.stderr, usage_msg % prog
37
print >>sys.stderr, msg
47
'Too many arguments (%s): only 1 expected.' % (len(argv) - 1))
49
inpath = 'unicode.xml'
50
if not os.path.isfile(inpath):
51
usage(argv[0], 1, 'No such file: "%s".' % inpath)
56
grouper = CharacterEntitySetExtractor(infile)
61
class CharacterEntitySetExtractor:
64
Extracts character entity information from unicode.xml file, groups it by
65
entity set, and writes out reStructuredText substitution files.
68
unwanted_entity_sets = ['stix', # unknown, buggy set
72
.. This data file has been placed in the public domain.
73
.. Derived from the Unicode character mappings available from
74
<http://www.w3.org/2003/entities/xml/>.
75
Processed by unicode2rstsubs.py, part of Docutils:
76
<http://docutils.sourceforge.net>.
79
def __init__(self, infile):
81
"""Input unicode.xml file."""
83
self.parser = self.setup_parser()
87
"""Stack of element names. Last is current element."""
90
"""Mapping of charent set name to set dict."""
93
"""Current character's "id" attribute value."""
95
self.descriptions = {}
96
"""Mapping of character ID to description."""
98
def setup_parser(self):
    """
    Create and return an expat parser wired to this object.

    The parser's start-tag, end-tag and character-data callbacks are bound
    to the identically named methods of this instance.  The parser is
    returned (not stored), so the caller decides where to keep it.
    """
    parser = ParserCreate()
    parser.StartElementHandler = self.StartElementHandler
    parser.EndElementHandler = self.EndElementHandler
    parser.CharacterDataHandler = self.CharacterDataHandler
    # The caller assigns the result to ``self.parser``; without this return
    # that attribute would be ``None`` and parsing could never start.
    return parser
106
self.parser.ParseFile(self.infile)
108
def StartElementHandler(self, name, attributes):
    """
    Expat callback for an opening tag.

    Push `name` onto the ``self.elements`` stack, then forward the call to
    the optional ``<name>_start(name, attributes)`` method if this object
    defines one.  Elements without such a method are only recorded on the
    stack.
    """
    self.elements.append(name)
    # Dispatch by naming convention, e.g. <character> -> character_start().
    handler = getattr(self, name + '_start', None)
    if handler is not None:
        handler(name, attributes)
114
def EndElementHandler(self, name):
    """
    Expat callback for a closing tag.

    Check that `name` closes the element currently on top of the
    ``self.elements`` stack, pop it, then forward the call to the optional
    ``<name>_end(name)`` method if this object defines one.
    """
    # The message must reference ``self.elements`` (the stack); a misspelled
    # attribute here would mask the assertion with an AttributeError.
    assert self.elements[-1] == name, \
           'unknown end-tag %r (%r)' % (name, self.elements)
    # Leave the element: the enclosing element becomes current again.
    self.elements.pop()
    handler = getattr(self, name + '_end', None)
    if handler is not None:
        handler(name)
122
def CharacterDataHandler(self, data):
    """
    Expat callback for character data.

    Forward `data` to the optional ``<element>_data(data)`` method, where
    ``<element>`` is the name on top of the ``self.elements`` stack.  Text
    inside elements without such a method is ignored.
    """
    handler = getattr(self, self.elements[-1] + '_data', None)
    if handler is not None:
        handler(data)
127
def character_start(self, name, attributes):
    """
    Handle the start of a ``character`` element.

    Remember the element's ``id`` attribute in ``self.charid`` so that the
    handlers for nested elements know which character they belong to.
    Raises ``KeyError`` if the element has no ``id`` attribute.
    """
    self.charid = attributes['id']
130
def entity_start(self, name, attributes):
131
set = self.entity_set_name(attributes['set'])
134
if not self.sets.has_key(set):
135
print 'bad set: %r' % set
137
entity = attributes['id']
138
assert (not self.sets[set].has_key(entity)
139
or self.sets[set][entity] == self.charid), \
140
('sets[%r][%r] == %r (!= %r)'
141
% (set, entity, self.sets[set][entity], self.charid))
142
self.sets[set][entity] = self.charid
144
def description_data(self, data):
    """
    Collect text found inside a ``description`` element.

    `data` is appended to the description stored for the current character
    (``self.charid``).  Appending rather than assigning matters because the
    text of one element may be delivered in several chunks.
    """
    charid = self.charid
    self.descriptions[charid] = self.descriptions.get(charid, '') + data
148
entity_set_name_pat = re.compile(r'[0-9-]*(.+)$')
149
"""Pattern to strip ISO numbers off the beginning of set names."""
151
def entity_set_name(self, name):
    """
    Return lowercased and standard-number-free entity set name.
    Return ``None`` for unwanted entity sets.

    As a side effect, ``self.sets`` is given an (initially empty) dict for
    every wanted set name, ready to receive that set's entities.
    """
    match = self.entity_set_name_pat.match(name)
    if match is None:
        # The pattern needs at least one character of name; an empty
        # string cannot identify a set, so treat it as unwanted.
        return None
    name = match.group(1).lower()
    if name in self.unwanted_entity_sets:
        return None
    self.sets.setdefault(name, {})
    return name
163
def write_sets(self):
    """
    Write the output for every collected entity set.

    ``self.write_set`` is called once per key of ``self.sets``.  The names
    are visited in sorted order so that repeated runs process the sets in
    the same sequence, independent of dictionary ordering.
    """
    for set_name in sorted(self.sets):
        self.write_set(set_name)
169
def write_set(self, set_name, wide=None):
171
outname = set_name + '-wide.txt'
173
outname = set_name + '.txt'
174
outfile = open(outname, 'w')
175
print 'writing file "%s"' % outname
176
print >>outfile, self.header
177
set = self.sets[set_name]
178
entities = [(e.lower(), e) for e in set.keys()]
181
for _, entity_name in entities:
182
longest = max(longest, len(entity_name))
184
for _, entity_name in entities:
185
has_wide = self.write_entity(
186
set, set_name, entity_name, outfile, longest, wide) or has_wide
187
if has_wide and not wide:
188
self.write_set(set_name, 1)
190
def write_entity(self, set, set_name, entity_name, outfile, longest,
192
charid = set[entity_name]
194
for code in charid[1:].split('-'):
195
if int(code, 16) > 0xFFFF:
196
return 1 # wide-Unicode character
197
codes = ' '.join(['U+%s' % code for code in charid[1:].split('-')])
198
print >>outfile, ('.. %-*s unicode:: %s .. %s'
199
% (longest + 2, '|' + entity_name + '|',
200
codes, self.descriptions[charid]))
203
if __name__ == '__main__':