~jtv/corpusfiltergraph/cross-python

kk = tuple(self.p.cfinput[k]['filelist'][cf.stage+1:cf.kind]+['tm']+[self.p.languages[srcidx]]+[self.p.languages[i]]+[self.p.cfinput[k]['filelist'][cf.basename]])

self.p.cfoutput[kk]['tempbuff'] = [line.split('\t')[srcidx] for line in self.p.cfinput[k]['tempbuff']]

else:

kk = tuple(self.p.cfinput[k]['filelist'][cf.stage+1:cf.kind]+['tm']+[self.p.languages[srcidx]]+[self.p.languages[i]]+[self.p.cfinput[k]['filelist'][cf.basename]])

self.p.cfoutput[kk]['tempbuff'] = [line.split('\t')[i] for line in self.p.cfinput[k]['tempbuff']]

self.p.cfinput[k]['tempbuff'] = []

logger.debug('%s\t%s',*['file',self.p.cfinput[k]['tempname']])

# 3) test & cleanup if fail

if not len(self.p.cfoutput) == len(langs):

for k in self.p.cfoutput.keys():

del(self.p.cfoutput[k])

logger.warn('%s\tinput/output %s',*['mismatch',os.sep.join(self.p.filelist)])

return -2

100

101

def flush(self):
    '''Empty the input object by deleting each of its entries.'''
    pending = self.p.cfinput
    # work from a snapshot of the keys: entries are removed while looping
    for key in list(pending.keys()):
        del pending[key]

105

106

def close(self):
    '''Nothing to release; always returns None.'''
    return None

108

109

# Prepare the current input file set (self.p.filelist) for processing:
# skip sets already seen, expand wildcard language patterns for tm/lm
# inputs, check the entry's 'lock' value, copy the source file to its
# temporary name and remember the set in self.seen.
# NOTE(review): the docstring below says "returns True on error", but every
# error exit visible here returns -2 and the success path falls off the end
# (None) -- confirm which contract the callers rely on.
# NOTE(review): indentation was lost in this copy; the nesting of the
# language/toolchain loops must be checked against the original file.
def getrecord(self):

110

'''returns True on error'''

111

# a file set that was already handled is rejected with -2
if tuple(self.p.filelist) in self.seen.keys(): return -2

112

# use wildcard patterns to update self.p.languages
# (only when the "kind" level of the file list starts with 'tm' or 'lm')

113

if self.p.filelist[cf.kind].lower().startswith('tm') or self.p.filelist[cf.kind].lower().startswith('lm'):

114

# update self.p.languages and self.p.toolchain based on filelist

115

# true when at least one target pattern contains a '?' or '*' wildcard
if sum([1 for pattern in self.p.targetpatterns if '?' in pattern or '*' in pattern]):

116

# names found in the folder built from the levels before cf.rdrlang
listdirs = os.listdir(os.sep.join(self.p.filelist[:cf.rdrlang]))

117

for pattern in self.p.targetpatterns:

118

# names starting with '.' are ignored; the rest must match the pattern
for lang in [dir for dir in listdirs if not dir[0] == '.' and fnmatch(dir,pattern)]:

119

if not lang in self.p.languages: self.p.languages.append(lang)

120

# give every known language an (initially empty) entry per plugin type
for lang in self.p.languages:

121

for plugintype in ['aligner','filter','postfilter']:

122

if not lang in self.p.toolchain[plugintype]: self.p.toolchain[plugintype][lang] = {}

123

# the input entry is keyed by the file list levels after cf.stage
srckey = tuple(self.p.filelist[cf.stage+1:])

124

src = os.sep.join(self.p.cfinput[srckey]['filelist'])

125

# test and lock input file (usually srclang)
# a false 'lock' value drops the entry and rejects the file set

126

if not self.p.cfinput[srckey]['lock']:

127

del(self.p.cfinput[srckey])

128

return -2

129

# save local temp file
# a true result from _getfile is treated as a failed copy

130

if _getfile(src,self.p.cfinput[srckey]['tempname']):

131

del(self.p.cfinput[srckey])

132

return -2

133

# add primary file to "seen" list
# (self.seen is used as a set: only the keys matter, values are None)

134

self.seen[tuple(self.p.cfinput[srckey]['filelist'])] = None

135

logger.debug('%s\t%s',*['file',src])

136

137

def flagrecord(self,filelist):
    '''Make the file named by filelist (a list of path levels) read-only.

    The mode is set to read permission for user, group and others.
    Nothing happens when the file does not exist.
    '''
    target = os.sep.join(filelist)
    if not os.path.exists(target):
        return
    os.chmod(target, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

140

141

def unflagrecord(self,filelist):
    '''Make the file named by filelist (a list of path levels) writable again.

    The mode is set to read/write for the user and read for group and
    others. Nothing happens when the file does not exist.
    '''
    target = os.sep.join(filelist)
    if not os.path.exists(target):
        return
    writable = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
    os.chmod(target, writable)

144

145

def getcorpusattributes(self,rootfolder,roottype,stage):
    '''List the folders found below <fs>/<root>/<roottype>/<stage>.

    Each folder is reported as the list of its path levels relative to
    that base (the base itself is reported as []). Folders whose names
    start with '.', 'tm' or 'lm' are neither reported nor descended into.
    A folder that contains files is reported a second time.
    '''
    fs,root = self.getroot(rootfolder)
    path = os.sep.join([fs,root,roottype,stage])
    skipped = ('.','tm','lm')
    result = []
    for dirpath, dirnames, filenames in os.walk(path):
        # prune in place so that os.walk does not enter these folders
        dirnames[:] = [name for name in dirnames if not name.startswith(skipped)]
        relative = dirpath.replace(path,'').strip(os.sep)
        if relative:
            result.append(relative.split(os.sep))
        else:
            result.append([])
        if filenames:
            result.append(relative.split(os.sep))
    return result

155

156

def getallinputs(self,rootfolder,roottype,stage):
    '''Map every input folder to the list of file names it holds.

    Keys are tuples of path levels (fs, root, roottype, stage, ...).
    For each folder returned by getcorpusattributes the plain files
    (names not starting with '.') are collected; sub-folders whose names
    start with 'tm' or 'lm' are walked and every folder with files in it
    gets its own key. Folders without plain files are left out.
    '''
    fs,root = self.getroot(rootfolder)
    base = [fs,root,roottype,stage]
    result = {}
    for corpusattribute in self.getcorpusattributes(rootfolder,roottype,stage):
        pathlist = base+corpusattribute
        key = tuple(pathlist)
        prefix = os.sep.join(pathlist)
        path = prefix.rstrip(os.sep)
        result[key] = []
        for entry in os.listdir(path):
            if entry.startswith('.'):
                continue
            entrypath = os.sep.join([path,entry])
            if os.path.isfile(entrypath):
                result[key].append(entry)
            elif entry.startswith('tm') or entry.startswith('lm'):
                for dirpath, dirnames, filenames in os.walk(entrypath):
                    if filenames:
                        levels = dirpath.replace(prefix,'').lstrip(os.sep).split(os.sep)
                        result[tuple(pathlist+levels)] = filenames
        # drop folders that hold no plain files of their own
        if not result[key]:
            del result[key]
    return result

172

173

def buildqueue(self,srcqueue,searchpaths):
    '''Search the file system and write the queue of input file sets.

    searchpaths is a list of search patterns, each given as a list of
    path levels. Every pattern is joined with os.sep and expanded with
    glob; each match is converted to a list of levels with path2list.
    The matches are sorted and de-duplicated. A match that is not
    writable is made writable again when self.p.resetreadonly is set and
    skipped otherwise. The kept file lists are written to srcqueue, one
    per line with tab-separated levels, UTF-8 encoded; their number is
    written to srcqueue + os.extsep + 'count'. When self.maxinput is set
    the queue stops growing at that many entries.

    Progress and a summary are printed to sys.stdout.
    Returns the number of file sets queued.
    '''
    progress = ProgressBar()
    sys.stdout.write("Paths:\n")
    for path in searchpaths:
        sys.stdout.write(" %s\n"%(os.sep.join(path)))
    # use a span of 1 when there is nothing to search
    totalsearchpaths = len(searchpaths) or 1
    progress.reset(0,totalsearchpaths,72,mode='fixed',char='#')
    sys.stdout.write('Searching...\n')
    sys.stdout.write(str(progress)+'\r')
    sys.stdout.flush()
    # search file system
    filenames = []
    for path in searchpaths:
        pattern = os.sep.join(path)
        logger.debug('%s\t%s','search',pattern)
        filenames.extend([self.path2list(found,path[cf.root]) for found in glob.glob(pattern)])
        progress.increment_amount()
        sys.stdout.write(str(progress)+'\r')
    # dedupe and check read-only
    filenames.sort()
    keep = set()
    with codecs.open(srcqueue,'w','utf8') as fq:
        for filelist in filenames:
            # path2list returns [] for paths outside the root
            if not filelist: continue
            key = tuple(filelist)
            if key in keep: continue
            fullname = os.sep.join(filelist)
            if not os.access(fullname,os.W_OK):
                if self.p.resetreadonly:
                    self.unflagrecord(filelist)
                else:
                    logger.debug('%s\t%s','read-only',fullname)
                    continue
            keep.add(key)
            # the codecs writer does the UTF-8 encoding itself: hand it
            # text, not pre-encoded bytes (which it would try to decode
            # as ASCII first and fail on non-ASCII file names)
            fq.write(u'%s\n'%(u'\t'.join(filelist)))
            if self.maxinput and len(keep) == self.maxinput: break
    with open(srcqueue+os.extsep+'count','w') as fq:
        fq.write('%s\n'%(len(keep)))
    sys.stdout.write('\n')
    sys.stdout.write("Found: %d file set(s)\n"%(len(keep)) if len(keep) else "Files not found\n")
    sys.stdout.flush()
    return len(keep)

211

212

def path2list(self,path,root):
    '''Converts filename to list of levels.

    The first level of path is taken as the file system part. The next
    levels (as many as root has) form the root part, which must match the
    wildcard pattern root. The result is
    [file system, root part, remaining level, remaining level, ...].
    Returns [] when the root part is missing or does not match root.
    '''
    # root may span several levels: compare as many levels as it has
    depth = root.count(os.sep)+2
    rootpart = os.sep.join(path.split(os.sep)[1:depth])
    if not rootpart or not fnmatch(rootpart,root):
        return []
    # split on the first occurrence only: the root name may show up again
    # further down the path
    fs,workingpath = path.rstrip('\\/').split(rootpart,1)
    return [fs.rstrip(os.sep),rootpart.strip(os.sep)]+workingpath.lstrip(os.sep).split(os.sep)

219

220

def getroot(self,rootfolder):
    '''Splits rootfolder to file system and rootfolder

    rootfolder is normalized, user-expanded and made absolute first.
    Returns a two-item list [file system, folder] of text strings:
      - "x:folder" (DOS drive letter or ssh hostname): the part up to and
        including the first ':' is the file system;
      - "\\\\host\\folder" (MS Windows UNC): "\\\\host" is the file system;
      - anything else (NFS mount / folder): the file system is ''.
    The folder is normalized with os.path.normpath, stripped of leading
    slashes and backslashes, and replaced by '*' when nothing is left.
    '''
    try:
        text = unicode
    except NameError:
        # no separate unicode type in this interpreter
        text = str
    folder = os.path.abspath(os.path.expanduser(self.normalizeroot(rootfolder)))
    if fnmatch(folder,'*:*'):
        # DOS drive letter : folder, or ssh hostname : folder
        # split once only, the folder part may contain colons itself
        fss, folder = folder.split(':',1)
        fss = fss + ':'
    elif fnmatch(folder,'\\\\*\\*'):
        # MS Windows UNC \ folder
        fss = '\\\\'+folder.lstrip(os.sep).split(os.sep)[0]
        folder = folder.replace(fss,'')
    else:
        # NFS mount / folder
        fss = ''
    folder = os.path.normpath(folder)
    # normpath turns an empty folder into '.': use a wildcard instead
    if folder == '.': folder = '*'
    folder = folder.lstrip('\\/')
    return [text(fss), text(folder)]

242

243

def normalizeroot(self,rootfolder):
    '''Return rootfolder with every backslash and slash replaced by os.sep.'''
    normalized = rootfolder
    for separator in ('\\','/'):
        normalized = normalized.replace(separator,os.sep)
    return normalized

245

246

def _getfile(orig,dest):

247

try:

248

os.makedirs(os.path.dirname(dest))

249

except OSError,e:

250

if not e.errno == 17: return True

251

shutil.copyfile(orig,dest)

252

253

def usage():
    '''Command prompt help.

    Returns the help text, built from the base name of sys.argv[0] and
    from that name without its extension.
    '''
    script = os.path.basename(sys.argv[0])
    module = os.path.splitext(script)[0]
    return "\n%s\n\tUsage:\n\tfrom %s import reader\n"%(script,module)

259

260

# License notice of the program (GNU Lesser General Public License,
# version 3 or later), kept as a unicode literal.
# NOTE(review): the stand-alone numbers inside the literal look like
# line-number residue from the page this copy was taken from -- confirm the
# exact text against the original file before changing the string.
licensetxt=u'''CorpusFiltergraph™ v4.0

261

262

263

This program is free software: you can redistribute it and/or modify

264

it under the terms of the GNU Lesser General Public License as published by

265

the Free Software Foundation, either version 3 of the License, or

266

(at your option) any later version.

267

268

This program is distributed in the hope that it will be useful,

269

but WITHOUT ANY WARRANTY; without even the implied warranty of

270

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

271

GNU Lesser General Public License for more details.

272

273

You should have received a copy of the GNU Lesser General Public License

274

along with this program. If not, see http://www.gnu.org/licenses/.

275

276

For more information, please contact Precision Translation Tools Co., Ltd.

277

at: http://www.precisiontranslationtools.com'''

278

279

if __name__ == "__main__":
    # run as a script: print the usage text and nothing else
    import os
    import sys
    helptext = usage().encode('utf8')
    sys.stdout.write(helptext+'\n')

Older »