~vcs-imports/silva/trunk

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Copyright (c) 2002-2010 Infrae. All rights reserved.
# See also LICENSE.txt
# $Id: UnicodeSplitter.py 42064 2010-05-13 12:18:05Z sylvain $

from Products.ZCTextIndex.PipelineFactory import element_factory

import re

class Splitter(object):

    rx = re.compile(r"\w+", re.UNICODE)
    rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

    def process(self, lst):
        result = []
        for s in lst:
            result += self.rx.findall(s)
        return result

    def processGlob(self, lst):
        result = []
        for s in lst:
            result += self.rxGlob.findall(s)
        return result
try:
    element_factory.registerFactory('Word Splitter',
        'Unicode Whitespace splitter', Splitter)
except ValueError:
    # in case the splitter is already registred, ValueError is raised
    pass

if __name__ == "__main__":
    import sys
    splitter = Splitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print path
        print splitter.process([buf])