4
#===============================================================================
6
#===============================================================================
9
#4.0.264 - version update
18
from fnmatch import fnmatch
20
from progress_bar import ProgressBar
22
# Module-level logger. Its dotted name is assembled from the base name of the
# launching script (sys.argv[0] with directory and extension removed), the
# fixed components 'manager' and 'filtergraph', and this module's __name__.
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
29
'reversedcolumns': False,
36
reversedcolumns = False
44
def open(self,parent,cfg):
46
self.stage = cfg['stage']
47
self.p.stage = cfg['stage']
49
self.errors.append([__name__,'missing','[%s] \"stage=<missing>\"'%(__name__.split(',')[-1])])
50
logger.error('%s\t%s',*self.errors[-1][1:])
52
self.roottype = cfg['roottype'].strip(',').split(',')[0]
53
self.p.roottype = cfg['roottype'].strip(',').split(',')[0]
55
self.errors.append([__name__,'missing','[%s] \"roottype=<missing>\"'%(__name__.split(',')[-1])])
56
logger.error('%s\t%s',*self.errors[-1][1:])
57
if not self.p.cfg['languagepairs']:
58
self.errors.append([__name__,'missing','[%s] requires \"languagepairs=<missing>\"'%(__name__.split(',')[-1])])
59
logger.error('%s\t%s',*self.errors[-1][1:])
60
self.encoding = 'utf-8-sig' if 'utf8' in cfg['encoding'].lower().replace('-','') else cfg['encoding']
61
self.maxinput = 0 if int(cfg['maxinput']) < 1 else int(cfg['maxinput'])
62
self.reversedcolumns = cfg['reversedcolumns']
66
'''Open the file and read contents into buffer'''
67
for k in self.p.cfinput.keys():
68
# 1) test and lock output file
69
if not self.p.cfoutput[k]['lock']:
70
del(self.p.cfoutput[k])
72
# 2) open input files, read to buffers
74
self.p.cfinput[k]['tempbuff'] = [line.strip('\x0A\x09') for line in codecs.open(self.p.cfinput[k]['tempname'],'r',self.encoding).readlines()]
76
logger.exception('%s\t%s - %s',*['failed',self.p.cfinput[k]['tempname'],str(e)])
77
if k in self.p.cfinput: del self.p.cfinput[k]
81
langs = list(self.p.languages)
82
if self.reversedcolumns:
85
for i in range(len(langs)):
87
kk = tuple(self.p.cfinput[k]['filelist'][cf.stage+1:cf.kind]+['tm']+[self.p.languages[srcidx]]+[self.p.languages[i]]+[self.p.cfinput[k]['filelist'][cf.basename]])
88
self.p.cfoutput[kk]['tempbuff'] = [line.split('\t')[srcidx] for line in self.p.cfinput[k]['tempbuff']]
90
kk = tuple(self.p.cfinput[k]['filelist'][cf.stage+1:cf.kind]+['tm']+[self.p.languages[srcidx]]+[self.p.languages[i]]+[self.p.cfinput[k]['filelist'][cf.basename]])
91
self.p.cfoutput[kk]['tempbuff'] = [line.split('\t')[i] for line in self.p.cfinput[k]['tempbuff']]
92
self.p.cfinput[k]['tempbuff'] = []
93
logger.debug('%s\t%s',*['file',self.p.cfinput[k]['tempname']])
94
# 3) test & cleanup if fail
95
if not len(self.p.cfoutput) == len(langs):
96
for k in self.p.cfoutput.keys():
97
del(self.p.cfoutput[k])
98
logger.warn('%s\tinput/output %s',*['mismatch',os.sep.join(self.p.filelist)])
102
'''clear input object'''
103
for k in self.p.cfinput.keys():
104
del(self.p.cfinput[k])
110
'''returns True on error'''
111
if tuple(self.p.filelist) in self.seen.keys(): return -2
112
# use wildcard patterns to update self.p.languages
113
if self.p.filelist[cf.kind].lower().startswith('tm') or self.p.filelist[cf.kind].lower().startswith('lm'):
114
# update self.p.languages and self.p.toolchain based on filelist
115
if sum([1 for pattern in self.p.targetpatterns if '?' in pattern or '*' in pattern]):
116
listdirs = os.listdir(os.sep.join(self.p.filelist[:cf.rdrlang]))
117
for pattern in self.p.targetpatterns:
118
for lang in [dir for dir in listdirs if not dir[0] == '.' and fnmatch(dir,pattern)]:
119
if not lang in self.p.languages: self.p.languages.append(lang)
120
for lang in self.p.languages:
121
for plugintype in ['aligner','filter','postfilter']:
122
if not lang in self.p.toolchain[plugintype]: self.p.toolchain[plugintype][lang] = {}
123
srckey = tuple(self.p.filelist[cf.stage+1:])
124
src = os.sep.join(self.p.cfinput[srckey]['filelist'])
125
# test and lock input file (usually srclang)
126
if not self.p.cfinput[srckey]['lock']:
127
del(self.p.cfinput[srckey])
129
# save local temp file
130
if _getfile(src,self.p.cfinput[srckey]['tempname']):
131
del(self.p.cfinput[srckey])
133
# add primary file to "seen" list
134
self.seen[tuple(self.p.cfinput[srckey]['filelist'])] = None
135
logger.debug('%s\t%s',*['file',src])
137
def flagrecord(self,filelist):
    '''Make the file named by filelist read-only.

    filelist -- sequence of path components; they are joined with os.sep
                to form the path of the file.

    The mode is set to read permission for user, group and others (0444).
    A path that does not exist is left alone. Returns None. Any OSError
    raised by os.chmod is not caught here.
    '''
    path = os.sep.join(filelist)
    if os.path.exists(path):
        # Permission bits are flags, so combine them with bitwise OR.
        os.chmod(path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
141
def unflagrecord(self,filelist):
    '''Make the file named by filelist writable by its owner again.

    filelist -- sequence of path components; they are joined with os.sep
                to form the path of the file.

    The mode is set to read/write for the user and read-only for group and
    others (0644). A path that does not exist is left alone. Returns None.
    Any OSError raised by os.chmod is not caught here.
    '''
    path = os.sep.join(filelist)
    if os.path.exists(path):
        # Permission bits are flags, so combine them with bitwise OR.
        os.chmod(path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH)
145
def getcorpusattributes(self,rootfolder,roottype,stage):
146
fs,root = self.getroot(rootfolder)
147
path = os.sep.join([fs,root,roottype,stage])
149
for (dirpath, dirnames, filenames) in os.walk(path):
150
for i in [i for i in reversed(range(len(dirnames))) if dirnames[i].startswith('.') or dirnames[i].startswith('tm') or dirnames[i].startswith('lm')]:
151
dirnames.remove(dirnames[i])
152
result.append(dirpath.replace(path,'').strip(os.sep).split(os.sep) if dirpath.replace(path,'').strip(os.sep) else [])
153
if filenames: result.append(dirpath.replace(path,'').strip(os.sep).split(os.sep))
156
def getallinputs(self,rootfolder,roottype,stage):
157
fs,root = self.getroot(rootfolder)
159
for corpusattribute in self.getcorpusattributes(rootfolder,roottype,stage):
160
path = os.sep.join([fs,root,roottype,stage]+corpusattribute).rstrip(os.sep)
161
pathlist = [fs,root,roottype,stage]+corpusattribute
162
result[tuple(pathlist)] = []
163
for dir in [dir for dir in os.listdir(path) if not dir.startswith('.')]:
164
if os.path.isfile(os.sep.join([path,dir])):
165
result[tuple(pathlist)].append(dir)
166
elif dir.startswith('tm') or dir.startswith('lm'):
167
for (dirpath, dirnames, filenames) in os.walk(os.sep.join([path,dir])):
168
if not filenames: continue
169
result[tuple(pathlist+dirpath.replace(os.sep.join(pathlist),'').lstrip(os.sep).split(os.sep))] = filenames
170
if not result[tuple(pathlist)]: del(result[tuple(pathlist)])
173
def buildqueue(self,srcqueue,searchpaths):
174
progress = ProgressBar()
175
sys.stdout.write("Paths:\n")
176
for path in searchpaths:
177
sys.stdout.write(" %s\n"%(os.sep.join(path)))
178
totalsearchpaths = 1 if len(searchpaths) == 0 else len(searchpaths)
179
progress.reset(0,totalsearchpaths,72,mode='fixed',char='#')
180
sys.stdout.write('Searching...\n')
181
sys.stdout.write(str(progress)+'\r')
185
for path in searchpaths:
186
logger.debug('%s\t%s',*['search',os.sep.join(path)])
187
filenames.extend([self.path2list(filelist,path[cf.root]) for filelist in glob.glob(os.sep.join(path))])
188
progress.increment_amount()
189
sys.stdout.write(str(progress)+'\r')
190
# dedupe and check read-only
193
with codecs.open(srcqueue,'w','utf8') as fq:
194
for filelist in [filelist for filelist in filenames if filelist]:
195
if tuple(filelist) in keep: continue
196
if not os.access(os.sep.join(filelist),os.W_OK):
197
if self.p.resetreadonly:
198
self.unflagrecord(filelist)
200
logger.debug('%s\t%s',*['read-only',os.sep.join(filelist)])
202
keep[tuple(filelist)] = None
203
fq.write('%s\n'%('\t'.join(filelist).encode('utf8')))
204
if self.maxinput and len(keep) == self.maxinput: break
205
with open(srcqueue+os.extsep+'count','w') as fq:
206
fq.write('%s\n'%(len(keep)))
207
sys.stdout.write('\n')
208
sys.stdout.write("Found: %d file set(s)\n"%(len(keep)) if len(keep) else "Files not found\n")
212
def path2list(self,path,root):
213
'''Converts filename to list of levels.'''
214
if fnmatch(os.sep.join(path.split(os.sep)[1:root.count(os.sep)+2]),root):
215
fs,workingpath = path.rstrip('\\/').split(os.sep.join(path.split(os.sep)[1:root.count(os.sep)+2]))
216
return [fs.rstrip(os.sep),]+[os.sep.join(path.split(os.sep)[1:root.count(os.sep)+2]).strip(os.sep),]+workingpath.lstrip(os.sep).split(os.sep)
220
def getroot(self,rootfolder):
221
'''Splits rootfolder to file system and rootfolder'''
222
folder = os.path.abspath(os.path.expanduser(self.normalizeroot(rootfolder)))
223
if fnmatch(folder,'?:*'):
224
# DOS drive letter : folder
225
fss, folder = folder.split(':')
227
elif fnmatch(folder,'*:*'):
228
# ssh hostname : folder
229
fss, folder = folder.split(':')
231
elif fnmatch(folder,'\\\\*\\*'):
232
# MS Windows UNC \ folder
233
fss = '\\\\'+folder.lstrip(os.sep).split(os.sep)[0]
234
folder = folder.replace(fss,'')
238
folder = os.path.normpath(folder)
239
if folder == '.': folder = '*'
240
folder = folder.lstrip('\\/')
241
return [unicode(fss), unicode(folder)]
243
def normalizeroot(self,rootfolder):
    '''Return rootfolder with every backslash and forward slash replaced by os.sep.

    rootfolder -- path string that may mix '\\' and '/' separators.
    '''
    # Backslashes are converted first, then forward slashes, so both styles
    # end up as the platform separator.
    return rootfolder.replace('\\',os.sep).replace('/',os.sep)
246
def _getfile(orig,dest):
248
os.makedirs(os.path.dirname(dest))
250
if not e.errno == 17: return True
251
shutil.copyfile(orig,dest)
254
'''Command prompt help.'''
255
return "\n%s\n\tUsage:\n\tfrom %s import reader\n"%(
256
os.path.basename(sys.argv[0]),
257
os.path.splitext(os.path.basename(sys.argv[0]))[0]
260
licensetxt=u'''CorpusFiltergraph™ v4.0
261
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.
263
This program is free software: you can redistribute it and/or modify
264
it under the terms of the GNU Lesser General Public License as published by
265
the Free Software Foundation, either version 3 of the License, or
266
(at your option) any later version.
268
This program is distributed in the hope that it will be useful,
269
but WITHOUT ANY WARRANTY; without even the implied warranty of
270
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
271
GNU Lesser General Public License for more details.
273
You should have received a copy of the GNU Lesser General Public License
274
along with this program. If not, see http://www.gnu.org/licenses/.
276
For more information, please contact Precision Translation Tools Co., Ltd.
277
at: http://www.precisiontranslationtools.com'''
279
if __name__ == "__main__":
282
sys.stdout.write(usage().encode('utf8')+'\n')