1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
|
# Copyright (c) 2002-2010 Infrae. All rights reserved.
# See also LICENSE.txt
# $Id: UnicodeSplitter.py 42064 2010-05-13 12:18:05Z sylvain $
from Products.ZCTextIndex.PipelineFactory import element_factory
import re
class Splitter(object):
    """Split strings into word tokens using Unicode-aware regular expressions.

    Both methods take an iterable of strings and return one flat list holding
    the tokens of every string, in the order they were found.
    """

    # A token is a run of one or more word characters (re.UNICODE semantics).
    rx = re.compile(r"\w+", re.UNICODE)
    # Glob variant: a token must start with at least one word character and
    # may then contain any mix of word characters, '*' and '?'.
    rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

    def process(self, lst):
        """Return the list of plain word tokens found in the strings of lst."""
        find_words = self.rx.findall
        return [word for text in lst for word in find_words(text)]

    def processGlob(self, lst):
        """Return the tokens found in the strings of lst, keeping the '*' and
        '?' characters that follow the first word character of a token.
        """
        find_globs = self.rxGlob.findall
        return [token for text in lst for token in find_globs(text)]
# Make the splitter available to ZCTextIndex pipelines at import time.
try:
    element_factory.registerFactory(
        'Word Splitter', 'Unicode Whitespace splitter', Splitter)
except ValueError:
    # ValueError is raised when the splitter is already registered;
    # in that case there is nothing left to do.
    pass
if __name__ == "__main__":
    # Command-line helper: for every path given as an argument, print the
    # path followed by the list of tokens Splitter.process() finds in the
    # raw content of that file.
    import sys

    splitter = Splitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        # try/finally guarantees the handle is released even when read()
        # fails; it is used instead of `with` to stay compatible with old
        # Python 2 releases.
        try:
            buf = f.read()
        finally:
            f.close()
        # Single-argument print written with parentheses: same output under
        # the Python 2 print statement, and valid syntax on Python 3 as well.
        print(path)
        print(splitter.process([buf]))
|