~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/plugins/reader-tab.py

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
New layout; the code still needs to be updated to use the new layout.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#! /usr/bin/env python
 
2
# -*- coding: utf8 -*-
 
3
 
 
4
#===============================================================================
 
5
# Author: Tom Hoar
 
6
#===============================================================================
 
7
 
 
8
#version:
 
9
#4.0.264 - version update
 
10
 
 
11
import sys
 
12
import os
 
13
import stat
 
14
import common as cf
 
15
import logging
 
16
import shutil
 
17
import codecs
 
18
from fnmatch import fnmatch
 
19
import glob
 
20
from progress_bar import ProgressBar
 
21
 
 
22
# Module-level logger. Its dotted name is built from the basename of
# sys.argv[0] without its extension, the fixed parts 'manager' and
# 'filtergraph', and this module's __name__.
logger = logging.getLogger('.'.join([os.path.splitext(os.path.basename(sys.argv[0]))[0],'manager','filtergraph',__name__]))
 
23
 
 
24
class reader(object):
        '''Reader plugin for tab-delimited input files.

        run() reads each locked input file, splits every line on tabs and
        stores one column per language in the parent's output buffers. The
        remaining methods locate input files on disk, build the source queue
        and flag/unflag files through their permission bits.

        The parent object is reached through self.p; this class reads
        p.cfg, p.cfinput, p.cfoutput, p.languages, p.filelist,
        p.targetpatterns, p.toolchain and p.resetreadonly from it.
        '''

        # Default settings. open() reads 'stage', 'roottype', 'encoding',
        # 'maxinput' and 'reversedcolumns' from the cfg mapping it is given.
        cfg = {
                'encoding': 'utf8',
                'maxinput': 0,
                'reversedcolumns': False,
                'roottype': '',
                'stage': None,
                'version': '4.0.264',
                }
        encoding = 'utf8'
        maxinput = 0
        reversedcolumns = False
        roottype = ''
        seen = {}
        stage = None
        isopen = False
        p = object
        errors = []

        def __init__(self):
                '''Give every instance its own mutable state.

                The class-level dict/list above would otherwise be shared by
                all instances, because errors is mutated through append().
                '''
                self.seen = {}
                self.errors = []

        def _missing(self,message):
                '''Record a "missing setting" error and log it.'''
                self.errors.append([__name__,'missing',message])
                logger.error('%s\t%s',*self.errors[-1][1:])

        def open(self,parent,cfg):
                '''Validate cfg and copy its settings onto the reader.

                parent -- object owning this reader; bound to self.p only when
                          self.p is still the class-level placeholder.
                cfg    -- mapping providing 'stage', 'roottype', 'encoding',
                          'maxinput' and 'reversedcolumns'.

                Missing 'stage', 'roottype' or parent 'languagepairs' values
                are appended to self.errors and logged; they do not raise.
                '''
                if self.p is object:
                        # The placeholder is the builtin ``object`` type, which
                        # accepts no attribute assignment, so every access
                        # below would fail without a real parent.
                        self.p = parent
                # NOTE(review): this splits on ',' so it normally yields the
                # whole module name; '.' may have been intended - confirm.
                section = __name__.split(',')[-1]
                if cfg['stage']:
                        self.stage = cfg['stage']
                        self.p.stage = cfg['stage']
                else:
                        self._missing('[%s] \"stage=<missing>\"'%(section))
                if cfg['roottype']:
                        # only the first comma-separated entry is used
                        roottype = cfg['roottype'].strip(',').split(',')[0]
                        self.roottype = roottype
                        self.p.roottype = roottype
                else:
                        self._missing('[%s] \"roottype=<missing>\"'%(section))
                if not self.p.cfg['languagepairs']:
                        self._missing('[%s] requires \"languagepairs=<missing>\"'%(section))
                # any spelling of utf8 is read as 'utf-8-sig'
                if 'utf8' in cfg['encoding'].lower().replace('-',''):
                        self.encoding = 'utf-8-sig'
                else:
                        self.encoding = cfg['encoding']
                self.maxinput = max(0,int(cfg['maxinput']))
                self.reversedcolumns = cfg['reversedcolumns']
                self.seen = {}

        def run(self):
                '''Read each input file and split its columns into output buffers.

                Returns -2 after emptying the parent's output map when the
                number of output entries differs from the number of languages;
                returns None otherwise.
                '''
                # Bound before the loop: the size check at the end needs it
                # even when no input file reaches the splitting step.
                langs = list(self.p.languages)
                srcidx = 0
                # NOTE(review): the original assigned -1 to a misspelled
                # ``srcindex`` and reversed a list that is only used for its
                # length, so self.reversedcolumns has no effect on the output.
                # That behaviour is kept; confirm the intended column mapping.
                for k in list(self.p.cfinput.keys()):
                        # 1) test and lock output file
                        if not self.p.cfoutput[k]['lock']:
                                del(self.p.cfoutput[k])
                                continue
                        # 2) open input files, read to buffers
                        try:
                                with codecs.open(self.p.cfinput[k]['tempname'],'r',self.encoding) as fin:
                                        # newline and tab characters are stripped from both ends
                                        self.p.cfinput[k]['tempbuff'] = [line.strip('\x0A\x09') for line in fin.readlines()]
                        except Exception as e:
                                logger.exception('%s\t%s - %s',*['failed',self.p.cfinput[k]['tempname'],str(e)])
                                if k in self.p.cfinput: del self.p.cfinput[k]
                                continue
                        filelist = self.p.cfinput[k]['filelist']
                        for i in range(len(langs)):
                                kk = tuple(filelist[cf.stage+1:cf.kind]+['tm']+[self.p.languages[srcidx]]+[self.p.languages[i]]+[filelist[cf.basename]])
                                self.p.cfoutput[kk]['tempbuff'] = [line.split('\t')[i] for line in self.p.cfinput[k]['tempbuff']]
                        self.p.cfinput[k]['tempbuff'] = []
                        logger.debug('%s\t%s',*['file',self.p.cfinput[k]['tempname']])
                # 3) test & cleanup if fail
                if not len(self.p.cfoutput) == len(langs):
                        for k in list(self.p.cfoutput.keys()):
                                del(self.p.cfoutput[k])
                        logger.warning('%s\tinput/output %s',*['mismatch',os.sep.join(self.p.filelist)])
                        return -2

        def flush(self):
                '''Delete every entry of the parent's input map, one key at a time.'''
                for k in list(self.p.cfinput.keys()):
                        del(self.p.cfinput[k])

        def close(self):
                '''Nothing to release; present for the plugin interface.'''
                return

        def getrecord(self):
                '''Copy the current source file to its local temp name.

                Returns -2 when the file was already seen, when its input
                entry has a false 'lock' value, or when _getfile reports a
                failure; returns None on success.
                '''
                filelist = self.p.filelist
                if tuple(filelist) in self.seen: return -2
                # use wildcard patterns to update self.p.languages
                kind = filelist[cf.kind].lower()
                if kind.startswith('tm') or kind.startswith('lm'):
                        # update self.p.languages and self.p.toolchain based on filelist
                        if any('?' in pattern or '*' in pattern for pattern in self.p.targetpatterns):
                                listdirs = os.listdir(os.sep.join(filelist[:cf.rdrlang]))
                                for pattern in self.p.targetpatterns:
                                        for lang in listdirs:
                                                if lang.startswith('.') or not fnmatch(lang,pattern): continue
                                                if not lang in self.p.languages: self.p.languages.append(lang)
                                for lang in self.p.languages:
                                        for plugintype in ['aligner','filter','postfilter']:
                                                if not lang in self.p.toolchain[plugintype]: self.p.toolchain[plugintype][lang] = {}
                srckey = tuple(filelist[cf.stage+1:])
                src = os.sep.join(self.p.cfinput[srckey]['filelist'])
                # test and lock input file (usually srclang)
                if not self.p.cfinput[srckey]['lock']:
                        del(self.p.cfinput[srckey])
                        return -2
                # save local temp file
                if _getfile(src,self.p.cfinput[srckey]['tempname']):
                        del(self.p.cfinput[srckey])
                        return -2
                # add primary file to "seen" list
                self.seen[tuple(self.p.cfinput[srckey]['filelist'])] = None
                logger.debug('%s\t%s',*['file',src])

        def flagrecord(self,filelist):
                '''Make the file read-only for user, group and others, if it exists.'''
                fullname = os.sep.join(filelist)
                if os.path.exists(fullname):
                        os.chmod(fullname,stat.S_IRUSR|stat.S_IRGRP|stat.S_IROTH)

        def unflagrecord(self,filelist):
                '''Restore owner write permission on the file, if it exists.'''
                fullname = os.sep.join(filelist)
                if os.path.exists(fullname):
                        os.chmod(fullname,stat.S_IRUSR|stat.S_IWUSR|stat.S_IRGRP|stat.S_IROTH)

        @staticmethod
        def _stripprefix(text,prefix):
                '''Return text without a leading prefix; text unchanged if absent.'''
                if text.startswith(prefix):
                        return text[len(prefix):]
                return text

        def getcorpusattributes(self,rootfolder,roottype,stage):
                '''List the folder levels found below <root>/<roottype>/<stage>.

                Walks the stage folder without descending into folders whose
                name starts with '.', 'tm' or 'lm'. Returns a list of path
                component lists relative to the stage folder: one entry per
                pruned sub-folder of a directory (so a directory may appear
                more than once) plus one entry for each directory that holds
                files. The stage folder itself is [] in the first case and
                [''] in the second.
                '''
                fs,root = self.getroot(rootfolder)
                path = os.sep.join([fs,root,roottype,stage])
                result = []
                for (dirpath, dirnames, filenames) in os.walk(path):
                        relative = self._stripprefix(dirpath,path).strip(os.sep)
                        # walk backwards so deleting in place keeps lower indexes valid
                        for i in reversed(range(len(dirnames))):
                                name = dirnames[i]
                                if name.startswith('.') or name.startswith('tm') or name.startswith('lm'):
                                        del dirnames[i]
                                        result.append(relative.split(os.sep) if relative else [])
                        if filenames: result.append(relative.split(os.sep))
                return result

        def getallinputs(self,rootfolder,roottype,stage):
                '''Map folder path tuples to the file names they contain.

                For every entry from getcorpusattributes() the plain files of
                that folder are listed under the folder's path tuple (dropped
                when empty). Folders starting with 'tm' or 'lm' are walked and
                each sub-folder holding files gets its own key.
                '''
                fs,root = self.getroot(rootfolder)
                result = {}
                for corpusattribute in self.getcorpusattributes(rootfolder,roottype,stage):
                        pathlist = [fs,root,roottype,stage]+corpusattribute
                        prefix = os.sep.join(pathlist)
                        path = prefix.rstrip(os.sep)
                        key = tuple(pathlist)
                        result[key] = []
                        for entry in os.listdir(path):
                                if entry.startswith('.'): continue
                                fullname = os.sep.join([path,entry])
                                if os.path.isfile(fullname):
                                        result[key].append(entry)
                                elif entry.startswith('tm') or entry.startswith('lm'):
                                        for (dirpath, dirnames, filenames) in os.walk(fullname):
                                                if not filenames: continue
                                                levels = self._stripprefix(dirpath,prefix).lstrip(os.sep).split(os.sep)
                                                result[tuple(pathlist+levels)] = filenames
                        if not result[key]: del(result[key])
                return result

        def buildqueue(self,srcqueue,searchpaths):
                '''Search the file system and write the queue of input files.

                srcqueue    -- name of the queue file; one tab-joined path list
                               per line. A companion '<srcqueue>.count' file
                               receives the number of queued entries.
                searchpaths -- iterable of path component lists; each is joined
                               with os.sep and expanded with glob.

                Read-only files are skipped unless self.p.resetreadonly is
                set, in which case they are made writable first. Stops after
                self.maxinput entries when that is non-zero. Progress is
                printed on stdout. Returns the number of queued entries.
                '''
                progress = ProgressBar()
                sys.stdout.write("Paths:\n")
                for path in searchpaths:
                        sys.stdout.write("   %s\n"%(os.sep.join(path)))
                progress.reset(0,len(searchpaths) or 1,72,mode='fixed',char='#')
                sys.stdout.write('Searching...\n')
                sys.stdout.write(str(progress)+'\r')
                sys.stdout.flush()
                # search file system
                filenames = []
                for path in searchpaths:
                        logger.debug('%s\t%s',*['search',os.sep.join(path)])
                        filenames.extend([self.path2list(filelist,path[cf.root]) for filelist in glob.glob(os.sep.join(path))])
                        progress.increment_amount()
                        sys.stdout.write(str(progress)+'\r')
                # dedupe and check read-only
                filenames.sort()
                keep = set()
                with codecs.open(srcqueue,'w','utf8') as fq:
                        for filelist in filenames:
                                if not filelist: continue
                                key = tuple(filelist)
                                if key in keep: continue
                                fullname = os.sep.join(filelist)
                                if not os.access(fullname,os.W_OK):
                                        if self.p.resetreadonly:
                                                self.unflagrecord(filelist)
                                        else:
                                                logger.debug('%s\t%s',*['read-only',fullname])
                                                continue
                                keep.add(key)
                                # The codecs writer does the encoding itself; handing it
                                # already-encoded bytes fails for non-ASCII paths.
                                fq.write(u'%s\n'%(u'\t'.join(filelist)))
                                if self.maxinput and len(keep) == self.maxinput: break
                with open(srcqueue+os.extsep+'count','w') as fq:
                        fq.write('%s\n'%(len(keep)))
                sys.stdout.write('\n')
                sys.stdout.write("Found: %d file set(s)\n"%(len(keep)) if len(keep) else "Files not found\n")
                sys.stdout.flush()
                return len(keep)

        def path2list(self,path,root):
                '''Converts filename to list of levels.

                The components of path that follow its first one are matched
                against the root pattern (as many components as root has
                levels). On a match the result is
                [file system, root, level, level, ...]; otherwise [].
                '''
                depth = root.count(os.sep)+2
                rootpart = os.sep.join(path.split(os.sep)[1:depth])
                if not rootpart or not fnmatch(rootpart,root):
                        return []
                # partition cuts at the first occurrence only, so a root name
                # that reappears deeper in the path no longer breaks the split
                fs,found,workingpath = path.rstrip('\\/').partition(rootpart)
                if not found:
                        return []
                return [fs.rstrip(os.sep),]+[rootpart.strip(os.sep),]+workingpath.lstrip(os.sep).split(os.sep)

        def getroot(self,rootfolder):
                '''Splits rootfolder to file system and rootfolder

                Returns [file system, folder] as two unicode strings (Python 2
                ``unicode``); the folder has no leading path separators.
                '''
                folder = os.path.abspath(os.path.expanduser(self.normalizeroot(rootfolder)))
                if fnmatch(folder,'?:*') or fnmatch(folder,'*:*'):
                        # DOS drive letter : folder, or ssh hostname : folder.
                        # Split on the first colon only so later colons stay
                        # in the folder part.
                        fss, folder = folder.split(':',1)
                        fss = fss + ':'
                elif fnmatch(folder,'\\\\*\\*'):
                        # MS Windows UNC \ folder
                        fss = '\\\\'+folder.lstrip(os.sep).split(os.sep)[0]
                        folder = folder.replace(fss,'',1)
                else:
                        # NFS mount / folder
                        fss = ''
                        folder = os.path.normpath(folder)
                        if folder == '.': folder = '*'
                folder = folder.lstrip('\\/')
                return [unicode(fss), unicode(folder)]

        def normalizeroot(self,rootfolder):
                '''Replace both slash styles with the platform separator.'''
                return rootfolder.replace('\\',os.sep).replace('/',os.sep)
 
245
 
 
246
def _getfile(orig,dest):
 
247
        try:
 
248
                os.makedirs(os.path.dirname(dest))
 
249
        except OSError,e:
 
250
                if not e.errno == 17: return True
 
251
        shutil.copyfile(orig,dest)
 
252
 
 
253
def usage():
        '''Return the help text printed when the module is run directly.'''
        script = os.path.basename(sys.argv[0])
        # module name = script file name without its extension
        module = os.path.splitext(script)[0]
        return "\n%s\n\tUsage:\n\tfrom %s import reader\n"%(script,module)
 
259
 
 
260
# License notice of the package; a unicode literal because the text
# contains non-ASCII characters.
licensetxt=u'''CorpusFiltergraph™ v4.0
Copyright © 2010-2012 Precision Translation Tools Co., Ltd.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public License
along with this program.  If not, see http://www.gnu.org/licenses/.

For more information, please contact Precision Translation Tools Co., Ltd.
at: http://www.precisiontranslationtools.com'''
 
278
 
 
279
if __name__ == "__main__":
        # Run as a script: print the import hint only. os and sys are
        # already imported at the top of the module, so they are not
        # imported again here.
        sys.stdout.write(usage().encode('utf8')+'\n')