1
#characterProcessing.py
2
#A part of NonVisual Desktop Access (NVDA)
3
#Copyright (C) 2010-2011 NV Access Inc, World Light Information Limited, Hong Kong Blind Union
4
#This file is covered by the GNU General Public License.
5
#See the file COPYING for more details.
12
from logHandler import log
15
class LocaleDataMap(object):
16
"""Allows access to locale-specific data objects, dynamically loading them if needed on request"""
18
def __init__(self,localeDataFactory):
20
@param localeDataFactory: the factory to create data objects for the requested locale.
22
self._localeDataFactory=localeDataFactory
25
def fetchLocaleData(self,locale):
27
Fetches a data object for the given locale.
28
This may mean that the data object is first created and stored if it does not yet exist in the map.
29
The locale is also simplified (the country is dropped) if the full locale cannot be used to create a data object.
30
@param locale: the locale of the data object requested
32
@return: the data object for the given locale
36
localeList.append(locale.split('_')[0])
38
data=self._dataMap.get(l)
41
data=self._localeDataFactory(l)
47
raise LookupError(locale)
49
def invalidateLocaleData(self, locale):
50
"""Invalidate the data object (if any) for the given locale.
51
This will cause a new data object to be created when this locale is next requested.
52
@param locale: The locale for which the data object should be invalidated.
56
del self._dataMap[locale]
60
class CharacterDescriptions(object):
62
Represents a map of characters to one or more descriptions (examples) for that character.
63
The data is loaded from a file from the requested locale.
66
def __init__(self,locale):
68
@param locale: The characterDescriptions.dic file will be found by using this locale.
72
fileName=os.path.join('locale',locale,'characterDescriptions.dic')
73
if not os.path.isfile(fileName):
74
raise LookupError(fileName)
75
f = codecs.open(fileName,"r","utf_8_sig",errors="replace")
77
if line.isspace() or line.startswith('#'):
79
line=line.rstrip('\r\n')
83
self._entries[key] = temp
85
log.warning("can't parse line '%s'" % line)
86
log.debug("Loaded %d entries." % len(self._entries))
89
def getCharacterDescription(self, character):
91
Looks up the given character and returns a list containing all the description strings found.
93
return self._entries.get(character)
95
# Module-level LocaleDataMap whose factory is the CharacterDescriptions class.
# The module-level getCharacterDescription() fetches its per-locale data object from this map.
_charDescLocaleDataMap=LocaleDataMap(CharacterDescriptions)
97
def getCharacterDescription(locale,character):
99
Finds a description or examples for the given character, which makes sense in the given locale.
100
@param locale: the locale (language[_COUNTRY]) the description should be for.
102
@param character: the character whose description should be retrieved.
103
@type character: string
104
@return: the found description for the given character
105
@rtype: list of strings
108
l=_charDescLocaleDataMap.fetchLocaleData(locale)
110
if not locale.startswith('en'):
111
return getCharacterDescription('en',character)
112
raise LookupError("en")
113
desc=l.getCharacterDescription(character)
114
if not desc and not locale.startswith('en'):
115
desc=getCharacterDescription('en',character)
118
# Speech symbol levels
124
SPEECH_SYMBOL_LEVEL_LABELS = {
125
SYMLVL_NONE: _("none"),
126
SYMLVL_SOME: _("some"),
127
SYMLVL_MOST: _("most"),
128
SYMLVL_ALL: _("all"),
129
SYMLVL_CHAR: _("character"),
131
# The none/some/most/all symbol levels, in that order.
# SYMLVL_CHAR is not part of this tuple; it is only appended to SPEECH_SYMBOL_LEVELS.
# NOTE(review): presumably these are the levels a user may select in settings - confirm against callers.
CONFIGURABLE_SPEECH_SYMBOL_LEVELS = (SYMLVL_NONE, SYMLVL_SOME, SYMLVL_MOST, SYMLVL_ALL)
132
# Every symbol level: the configurable levels followed by SYMLVL_CHAR.
SPEECH_SYMBOL_LEVELS = CONFIGURABLE_SPEECH_SYMBOL_LEVELS + (SYMLVL_CHAR,)
134
# Speech symbol preserve modes
139
class SpeechSymbol(object):
140
__slots__ = ("identifier", "pattern", "replacement", "level", "preserve", "displayName")
142
def __init__(self, identifier, pattern=None, replacement=None, level=None, preserve=None, displayName=None):
143
self.identifier = identifier
144
self.pattern = pattern
145
self.replacement = replacement
147
self.preserve = preserve
148
self.displayName = displayName
152
for attr in self.__slots__:
153
attrs.append("{name}={val!r}".format(
154
name=attr, val=getattr(self, attr)))
155
return "SpeechSymbol(%s)" % ", ".join(attrs)
157
class SpeechSymbols(object):
159
Contains raw information about the pronunciation of symbols.
160
It does not handle inheritance of data from other sources, processing of text, etc.
161
This is all handled by L{SpeechSymbolProcessor}.
167
self.complexSymbols = collections.OrderedDict()
168
self.symbols = collections.OrderedDict()
171
def load(self, fileName, allowComplexSymbols=True):
172
"""Load symbol information from a file.
173
@param fileName: The name of the file from which to load symbol information.
175
@param allowComplexSymbols: Whether to allow complex symbols.
176
@type allowComplexSymbols: bool
177
@raise IOError: If the file cannot be read.
179
self.fileName = fileName
180
with codecs.open(fileName, "r", "utf_8_sig", errors="replace") as f:
183
if line.isspace() or line.startswith("#"):
184
# Whitespace or comment.
186
line = line.rstrip("\r\n")
188
if line == "complexSymbols:" and allowComplexSymbols:
189
handler = self._loadComplexSymbol
190
elif line == "symbols:":
191
handler = self._loadSymbol
193
# This is a line within a section, so handle it according to which section we're in.
198
log.warning(u"Invalid line in file {file}: {line}".format(
199
file=fileName, line=line))
201
def _loadComplexSymbol(self, line):
203
identifier, pattern = line.split("\t")
206
self.complexSymbols[identifier] = pattern
208
def _loadSymbolField(self, input, inputMap=None):
215
return inputMap[input]
219
IDENTIFIER_ESCAPES_INPUT = {
229
IDENTIFIER_ESCAPES_OUTPUT = {v: k for k, v in IDENTIFIER_ESCAPES_INPUT.iteritems()}
237
LEVEL_OUTPUT = {v:k for k, v in LEVEL_INPUT.iteritems()}
239
"never": SYMPRES_NEVER,
240
"always": SYMPRES_ALWAYS,
241
"norep": SYMPRES_NOREP,
243
PRESERVE_OUTPUT = {v: k for k, v in PRESERVE_INPUT.iteritems()}
245
def _loadSymbol(self, line):
246
line = line.split("\t")
247
identifier = replacement = level = preserve = displayName = None
248
if line[-1].startswith("#"):
249
# Regardless of how many fields there are,
250
# if the last field is a comment, it is the display name.
251
displayName = line[-1][1:].lstrip()
255
identifier = next(line)
257
# Empty identifier is not allowed.
259
if identifier.startswith("\\") and len(identifier) >= 2:
260
identifier = self.IDENTIFIER_ESCAPES_INPUT.get(identifier[1], identifier[1]) + identifier[2:]
261
replacement = self._loadSymbolField(next(line))
262
except StopIteration:
263
# These fields are mandatory.
266
level = self._loadSymbolField(next(line), self.LEVEL_INPUT)
267
preserve = self._loadSymbolField(next(line), self.PRESERVE_INPUT)
268
except StopIteration:
269
# These fields are optional. Defaults will be used for unspecified fields.
271
self.symbols[identifier] = SpeechSymbol(identifier, None, replacement, level, preserve, displayName)
273
def save(self, fileName=None):
274
"""Save symbol information to a file.
275
@param fileName: The name of the file to which to save symbol information,
276
C{None} to use the file name last passed to L{load} or L{save}.
278
@raise IOError: If the file cannot be written.
279
@raise ValueError: If C{fileName} is C{None}
280
and L{load} or L{save} has not been called.
283
self.fileName = fileName
285
fileName = self.fileName
287
raise ValueError("No file name")
289
with codecs.open(fileName, "w", "utf_8_sig", errors="replace") as f:
290
if self.complexSymbols:
291
f.write(u"complexSymbols:\r\n")
292
for identifier, pattern in self.complexSymbols.iteritems():
293
f.write(u"%s\t%s\r\n" % (identifier, pattern))
297
f.write(u"symbols:\r\n")
298
for symbol in self.symbols.itervalues():
299
f.write(u"%s\r\n" % self._saveSymbol(symbol))
301
def _saveSymbolField(self, output, outputMap=None):
307
return outputMap[output]
311
def _saveSymbol(self, symbol):
312
identifier = symbol.identifier
314
identifier = u"\\%s%s" % (
315
self.IDENTIFIER_ESCAPES_OUTPUT[identifier[0]], identifier[1:])
318
fields = [identifier,
319
self._saveSymbolField(symbol.replacement),
320
self._saveSymbolField(symbol.level, self.LEVEL_OUTPUT),
321
self._saveSymbolField(symbol.preserve, self.PRESERVE_OUTPUT)
323
# Strip optional fields with default values.
324
for field in reversed(fields[2:]):
327
if symbol.displayName:
328
fields.append("# %s" % symbol.displayName)
329
return u"\t".join(fields)
331
def _getSpeechSymbolsForLocale(locale):
332
builtin = SpeechSymbols()
334
builtin.load(os.path.join("locale", locale, "symbols.dic"))
336
raise LookupError("No symbol information for locale %s" % locale)
337
user = SpeechSymbols()
339
# Don't allow users to specify complex symbols
340
# because an error will cause the whole processor to fail.
341
user.load(os.path.join(globalVars.appArgs.configPath, "symbols-%s.dic" % locale),
342
allowComplexSymbols=False)
344
# An empty user SpeechSymbols is okay.
348
class SpeechSymbolProcessor(object):
350
Handles processing of symbol pronunciation for a locale.
351
Pronunciation information is taken from one or more L{SpeechSymbols} instances.
354
#: Caches symbol data for locales.
355
localeSymbols = LocaleDataMap(_getSpeechSymbolsForLocale)
357
def __init__(self, locale):
359
@param locale: The locale for which symbol pronunciation should be processed.
364
# We need to merge symbol data from several sources.
365
sources = self.sources = []
366
builtin, user = self.localeSymbols.fetchLocaleData(locale)
367
self.userSymbols = user
369
sources.append(builtin)
371
# Always use English as a base.
373
# Only the builtin data.
374
sources.append(self.localeSymbols.fetchLocaleData("en")[0])
376
# The computed symbol information from all sources.
377
symbols = self.computedSymbols = collections.OrderedDict()
378
# An indexable list of complex symbols for use in building/executing the regexp.
379
complexSymbolsList = self._computedComplexSymbolsList = []
380
# A list of simple symbol identifiers for use in building the regexp.
381
simpleSymbolIdentifiers = []
382
# Single character symbols.
385
# Add all complex symbols first, as they take priority.
386
for source in sources:
387
for identifier, pattern in source.complexSymbols.iteritems():
388
if identifier in symbols:
391
symbol = SpeechSymbol(identifier, pattern)
392
symbols[identifier] = symbol
393
complexSymbolsList.append(symbol)
395
# Supplement the data for complex symbols and add all simple symbols.
396
for source in sources:
397
for identifier, sourceSymbol in source.symbols.iteritems():
399
symbol = symbols[identifier]
400
# We're updating an already existing symbol.
402
# This is a new simple symbol.
403
# (All complex symbols have already been added.)
404
symbol = symbols[identifier] = SpeechSymbol(identifier)
405
simpleSymbolIdentifiers.append(identifier)
406
if len(identifier) == 1:
407
characters.add(identifier)
408
# If fields weren't explicitly specified, inherit the value from later sources.
409
if symbol.replacement is None:
410
symbol.replacement = sourceSymbol.replacement
411
if symbol.level is None:
412
symbol.level = sourceSymbol.level
413
if symbol.preserve is None:
414
symbol.preserve = sourceSymbol.preserve
415
if symbol.displayName is None:
416
symbol.displayName = sourceSymbol.displayName
418
# Set defaults for any fields not explicitly set.
419
for symbol in symbols.values():
420
if symbol.replacement is None:
421
# Symbols without a replacement specified are useless.
422
log.warning(u"Replacement not defined in locale {locale} for symbol: {symbol}".format(
423
symbol=symbol.identifier, locale=self.locale))
424
del symbols[symbol.identifier]
426
complexSymbolsList.remove(symbol)
430
if symbol.level is None:
431
symbol.level = SYMLVL_ALL
432
if symbol.preserve is None:
433
symbol.preserve = SYMPRES_NEVER
434
if symbol.displayName is None:
435
symbol.displayName = symbol.identifier
437
characters = "".join(characters)
438
# The simple symbols must be ordered longest first so that the longer symbols will match.
439
simpleSymbolIdentifiers.sort(key=lambda identifier: len(identifier), reverse=True)
443
# Strip repeated spaces from the end of the line to stop them from being picked up by the repeated-characters pattern below.
444
r"(?P<rstripSpace> +$)",
445
# Repeated characters: the same character occurring 4 or more times in a row.
446
r"(?P<repeated>(?P<repTmp>[%s])(?P=repTmp){3,})" % re.escape("".join(characters))
449
# Each complex symbol has its own named group so we know which symbol matched.
451
u"(?P<c{index}>{pattern})".format(index=index, pattern=symbol.pattern)
452
for index, symbol in enumerate(complexSymbolsList))
454
# These are all handled in one named group.
455
# Because the symbols are just text, we know which symbol matched just by looking at the matched text.
456
patterns.append(ur"(?P<simple>{})".format(
457
"|".join(re.escape(identifier) for identifier in simpleSymbolIdentifiers)
459
pattern = "|".join(patterns)
461
self._regexp = re.compile(pattern, re.UNICODE)
462
except re.error as e:
463
log.error("Invalid complex symbol regular expression in locale %s: %s" % (locale, e))
466
def _regexpRepl(self, m):
469
if group == "rstripSpace":
472
elif group == "repeated":
473
# Repeated character.
475
symbol = self.computedSymbols[text[0]]
476
if self._level >= symbol.level:
477
return u" {count} {char} ".format(count=len(text), char=symbol.replacement)
482
# One of the defined symbols.
484
if group == "simple":
486
symbol = self.computedSymbols[text]
489
index = int(group[1:])
490
symbol = self._computedComplexSymbolsList[index]
491
if symbol.preserve == SYMPRES_ALWAYS or (symbol.preserve == SYMPRES_NOREP and self._level < symbol.level):
495
if self._level >= symbol.level and symbol.replacement:
496
return u" {repl}{suffix}".format(repl=symbol.replacement, suffix=suffix)
500
def processText(self, text, level):
502
return self._regexp.sub(self._regexpRepl, text)
504
def updateSymbol(self, newSymbol):
505
"""Update information for a symbol if it has changed.
506
If there is a change, the changed information will be added to the user's symbol data.
507
These changes do not take effect until the symbol processor is reinitialised.
508
@param newSymbol: The symbol to update.
509
@type newSymbol: L{SpeechSymbol}
510
@return: Whether there was a change.
513
identifier = newSymbol.identifier
514
oldSymbol = self.computedSymbols[identifier]
515
if oldSymbol is newSymbol:
518
userSymbol = self.userSymbols.symbols[identifier]
520
userSymbol = SpeechSymbol(identifier)
523
if newSymbol.pattern != oldSymbol.pattern:
524
userSymbol.pattern = newSymbol.pattern
526
if newSymbol.replacement != oldSymbol.replacement:
527
userSymbol.replacement = newSymbol.replacement
529
if newSymbol.level != oldSymbol.level:
530
userSymbol.level = newSymbol.level
532
if newSymbol.preserve != oldSymbol.preserve:
533
userSymbol.preserve = newSymbol.preserve
535
if newSymbol.displayName != oldSymbol.displayName:
536
userSymbol.displayName = newSymbol.displayName
542
# Do this in case the symbol wasn't in userSymbols before.
543
self.userSymbols.symbols[identifier] = userSymbol
546
# Module-level LocaleDataMap whose factory is the SpeechSymbolProcessor class.
# processSpeechSymbols() and processSpeechSymbol() fetch their per-locale processor from this map.
_localeSpeechSymbolProcessors = LocaleDataMap(SpeechSymbolProcessor)
548
def processSpeechSymbols(locale, text, level):
549
"""Process some text, converting symbols according to desired pronunciation.
550
@param locale: The locale of the text.
552
@param text: The text to process.
554
@param level: The symbol level to use; one of the SYMLVL_* constants.
557
ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
559
if not locale.startswith("en_"):
560
return processSpeechSymbols("en", text, level)
562
return ss.processText(text, level)
564
def processSpeechSymbol(locale, symbol):
565
"""Process a single symbol according to desired pronunciation.
566
@param locale: The locale of the symbol.
568
@param symbol: The symbol.
572
ss = _localeSpeechSymbolProcessors.fetchLocaleData(locale)
574
if not locale.startswith("en_"):
575
return processSpeechSymbol("en", symbol)
578
return ss.computedSymbols[symbol].replacement