1
##############################################################################
3
# Copyright (c) 2002 Zope Corporation and Contributors.
6
# This software is subject to the provisions of the Zope Public License,
7
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
8
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11
# FOR A PARTICULAR PURPOSE.
13
##############################################################################
16
"""HTML-aware word splitter.

$Id: htmlsplitter.py 100821 2009-06-10 23:48:01Z tseaver $
"""
20
from zope.interface import implements
22
from zope.index.text.interfaces import ISplitter
24
# Matches one piece of markup: either a tag-like span ``<...>`` containing no
# nested angle brackets, or a named character entity such as ``&amp;``.
# Numeric entities (``&#123;``) are not matched by this pattern.
MARKUP = re.compile(r"(<[^<>]*>|&[A-Za-z]+;)")
25
# Matches a maximal run of word characters.  The inline ``(?L)`` flag makes
# ``\w`` depend on the current locale.
# NOTE(review): Python 3 rejects the LOCALE flag on str patterns -- confirm
# which interpreter versions this module has to support.
WORDS = re.compile(r"(?L)\w+")
26
# Like a word match, but after the leading word characters the glob
# wildcards ``*`` and ``?`` are also accepted as part of the token, so a
# pattern such as ``foo*bar?`` is kept as one piece.  A token can never start
# with a wildcard.  ``(?L)`` makes ``\w`` locale-dependent.
GLOBS = re.compile(r"(?L)\w+[\w*?]*")
28
class HTMLWordSplitter(object):
    """Split HTML text into lower-cased words, ignoring markup.

    Every span matched by the module-level ``MARKUP`` pattern is replaced by
    a single space before the words are extracted, so markup never ends up
    in the returned terms.

    NOTE(review): this module imports ``implements`` and ``ISplitter`` but no
    interface declaration is visible in this class -- confirm whether
    ``implements(ISplitter)`` is meant to appear here.
    """

    def process(self, text):
        """Return the plain words found in *text*.

        :param text: iterable of text chunks (presumably a list of strings;
            a bare string would be iterated character by character --
            verify against callers).
        :returns: list of lower-cased words, in order of appearance.
        """
        return self._apply(text, WORDS)

    def processGlob(self, text):
        """Return the terms found in *text*, keeping ``*`` and ``?``.

        Same as :meth:`process`, except that the ``GLOBS`` pattern is used so
        glob wildcards following a word character stay part of the term.
        """
        # see Lexicon.globToWordIds()
        return self._apply(text, GLOBS)

    def _apply(self, text, pattern):
        """Split every chunk of *text* with *pattern* into one flat list.

        :param text: iterable of text chunks.
        :param pattern: compiled regular expression used to pick out terms.
        :returns: a new list with the terms of all chunks, in order; empty
            when *text* yields no chunks.
        """
        # The accumulator has to exist before the loop and be handed back to
        # the caller; process() and processGlob() return it unchanged.
        result = []
        for chunk in text:
            result.extend(self._split(chunk, pattern))
        return result

    def _split(self, text, pattern):
        """Return the terms *pattern* finds in one chunk of text.

        The chunk is lower-cased first, then every ``MARKUP`` match is
        replaced by a space so that words on either side of a tag or entity
        are not glued together.
        """
        text = MARKUP.sub(' ', text.lower())
        return pattern.findall(text)