1
# Copyright (c) 2007 Carnegie Mellon University
3
# You may copy and modify this freely under the same terms as
"""Corpus classes for acoustic model training.

This module provides classes for representing a corpus of utterances
for acoustic modeling.  The Corpus class implements the iterator
protocol, acting as a list of Utterance objects.
"""

__author__ = "David Huggins-Daines <dhuggins@cs.cmu.edu>"
__version__ = "$Revision: 10058 $"
17
class Resource(object):
    """Resource associated with an utterance in a speech corpus.

    Any utterance has an arbitrary set of resources associated with
    it.  These are things such as waveforms, acoustic feature files,
    transcriptions and other forms of supervision, etc.

    This base class defines no behaviour of its own; it only gives the
    concrete resource classes a common ancestor.
    """
26
class FileResourceIterator(object):
    """
    Iterator over items in a FileResource.

    Each step takes the next entry from the resource's control file and
    builds the path ``base_dir/<file ID><file_ext>``.  If the resource has
    a ``data_type``, the path is passed to it and the result is returned;
    otherwise the path string itself is returned.
    """
    def __init__(self, resource):
        """
        Start iterating over a resource.

        @param resource: Resource to iterate over; it must provide the
                         ctl_file, base_dir, file_ext and data_type
                         attributes
        @ptype resource: FileResource
        """
        self.res = resource
        self.ctl = iter(resource.ctl_file)

    def __iter__(self):
        """Return self so the iterator can be used directly in a loop."""
        return self

    def __next__(self):
        """
        Return the next item of the resource.

        @raise StopIteration: when the control file is exhausted
        """
        # Imported locally so this class does not depend on a
        # module-level import being present.
        import os

        # This will raise StopIteration for us at EOF
        entry = next(self.ctl)
        # Control entries expose their file ID as ``fileid``; any other
        # entry (e.g. a plain string) is used as the file ID itself.
        fileid = getattr(entry, "fileid", entry)
        path = os.path.join(self.res.base_dir, fileid + self.res.file_ext)
        if self.res.data_type:
            return self.res.data_type(path)
        return path

    # Python 2 spelling of the iterator protocol.
    next = __next__
46
class FileResource(Resource):
    """
    File-based corpus resource: one file per control file entry, located
    under a base directory and sharing a filename extension.
    """
    def __init__(self, ctl_file, base_dir, file_ext, data_type=None):
        """
        Initialize a file-based resource.

        @param ctl_file: Control file resource on which this is based
        @ptype ctl_file: iterator(CtlEntry)
        @param base_dir: Base directory to prepend to control entries
        @ptype base_dir: string
        @param file_ext: Filename extension to append to control entries
        @ptype file_ext: string
        @param data_type: Class to construct from entries.
        @ptype data_type: type
        """
        self.ctl_file = ctl_file
        self.base_dir = base_dir
        self.file_ext = file_ext
        self.data_type = data_type

    def __iter__(self):
        """Return a new iterator over the items of this resource."""
        return FileResourceIterator(self)
65
class CtlEntry(object):
    """Entry in a control file"""
    def __init__(self, str):
        """
        Parse one control file line.

        A line of four whitespace-separated fields is read as
        ``fileid start_frame end_frame uttid``; the two frame fields are
        converted to integers.  Any other line is taken verbatim as both
        the file ID and the utterance ID.

        @param str: Text of the control file line (the name shadows the
                    builtin, but is kept for keyword-argument callers)
        @ptype str: string
        @raise ValueError: if a four-field line has non-integer frames
        """
        # NOTE(review): the four-field test is reconstructed from the
        # four-way unpacking below -- confirm against the control format.
        fields = str.split()
        if len(fields) == 4:
            self.fileid, self.sf, self.ef, self.uttid = fields
            self.sf = int(self.sf)
            self.ef = int(self.ef)
        else:
            self.fileid = self.uttid = str
            # NOTE(review): defaults for entries without frame bounds are
            # assumed (0 / -1, presumably "whole file") -- confirm with
            # the code that consumes sf/ef.
            self.sf = 0
            self.ef = -1
78
class ListResourceIterator(object):
    """
    Iterator over items in a ListResource.

    Reads the resource's file one line at a time.  Trailing whitespace
    (including the newline) is stripped from each line; the result is
    passed to the resource's ``data_type`` if it has one, and returned as
    a plain string otherwise.
    """
    def __init__(self, resource):
        """
        Open the resource's file for reading.

        @param resource: Resource providing file_name and data_type
        @ptype resource: ListResource
        """
        self.fh = open(resource.file_name)
        self.data_type = resource.data_type

    def __iter__(self):
        """Return self so the iterator can be used directly in a loop."""
        return self

    def __next__(self):
        """
        Return the next item of the list.

        @raise StopIteration: at end of file, and on every later call
        """
        # The handle is dropped once the file is exhausted, so repeated
        # calls after EOF keep raising StopIteration instead of failing
        # on a closed file.
        if self.fh is None:
            raise StopIteration
        line = self.fh.readline()
        if line == "":
            # readline() returns "" only at EOF (a blank line is "\n").
            self.fh.close()
            self.fh = None
            raise StopIteration
        item = line.rstrip()
        if self.data_type:
            return self.data_type(item)
        return item

    # Python 2 spelling of the iterator protocol.
    next = __next__
95
class ListResource(Resource):
    """
    Corpus resource consisting of lines in a text file, of some data
    type.  This includes things like control and transcript files.
    """
    def __init__(self, file_name, data_type=None):
        """
        Initialize a listing-based resource.

        If no data_type argument is specified, each item in the list
        will be returned as a string.

        @param file_name: File to read resource from
        @ptype file_name: string
        @param data_type: Class implementing the data type of each item
        @ptype data_type: type
        """
        self.data_type = data_type
        self.file_name = file_name

    def __iter__(self):
        """Return a new iterator over the items of this resource."""
        return ListResourceIterator(self)
118
class CorpusIterator(object):
    """
    Iterator over elements in a Corpus.

    Keeps one iterator per named resource of the corpus and advances all
    of them in lockstep.  Each step produces a dictionary mapping the
    resource name to that resource's next item.
    """
    def __init__(self, corpus, part=1, npart=1):
        """
        Create an iterator for every resource of a corpus.

        @param corpus: Corpus whose ``resources`` mapping is iterated
        @ptype corpus: Corpus
        @param part: Index of the requested part of the corpus
        @ptype part: int
        @param npart: Number of parts the corpus is divided into
        @ptype npart: int
        """
        # NOTE(review): how part/npart select a slice of the corpus is
        # not visible in the available source; they are accepted for
        # interface compatibility and the whole corpus is iterated.
        self.corpus = corpus
        self.iters = {}
        for k, v in corpus.resources.items():
            self.iters[k] = iter(v)

    def __iter__(self):
        """Return self so the iterator can be used directly in a loop."""
        return self

    def __next__(self):
        """
        Return the next element as a dict of resource name to item.

        @raise StopIteration: as soon as any resource is exhausted
        """
        # NOTE(review): the dict-per-step shape is reconstructed from the
        # visible loop over self.iters -- confirm against callers.
        utt = {}
        for k, v in self.iters.items():
            utt[k] = next(v)
        return utt

    # Python 2 spelling of the iterator protocol.
    next = __next__
137
class Corpus(object):
    """Corpus of speech data."""
    def __init__(self, ctl_file):
        """
        Create a corpus from a control file.

        The control file is registered as the resource named 'ctl', with
        each of its lines parsed into a CtlEntry.

        @param ctl_file: Name of the control file listing the utterances
        @ptype ctl_file: string
        """
        self.ctl = ListResource(ctl_file, CtlEntry)
        self.resources = {'ctl': self.ctl}

    def __iter__(self):
        """Return a new iterator over the elements of this corpus."""
        return CorpusIterator(self)

    def add_resource(self, name, res):
        """
        Register a resource under a name, replacing any resource already
        stored under that name.

        @param name: Name of the resource
        @ptype name: string
        @param res: Iterable resource to associate with the corpus
        @ptype res: Resource
        """
        self.resources[name] = res