1
/*------------------------------------------------------------------------------
2
* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
4
* Distributable under the terms of either the Apache License (Version 2.0) or
5
* the GNU Lesser General Public License, as specified in the COPYING file.
6
------------------------------------------------------------------------------*/
7
#include "CLucene/_ApiHeader.h"
8
#include "CLucene/util/CLStreams.h"
9
#include "CLucene/analysis/Analyzers.h"
10
#include "CLucene/analysis/standard/StandardTokenizer.h"
11
#include "CLucene/analysis/standard/StandardFilter.h"
12
#include "CLucene/util/StringBuffer.h"
13
#include "GermanAnalyzer.h"
14
#include "GermanStemmer.h"
15
#include "GermanStemFilter.h"
18
CL_NS_USE2(analysis,de)
19
CL_NS_USE2(analysis,standard)
21
// "daß" spelled as explicit code points (d, a, U+00DF). The trailing 0x00 is
// required: this array is used as a const TCHAR* string entry in the stop word
// table, so it must be null-terminated like the _T("...") literals next to it.
const TCHAR GermanAnalyzer_DASZ[] = { 0x64, 0x61, 0xdf, 0x00 };
22
// "für" spelled as explicit code points (f, U+00FC, r). The trailing 0x00 is
// required: this array is used as a const TCHAR* string entry in the stop word
// table, so it must be null-terminated like the _T("...") literals next to it.
const TCHAR GermanAnalyzer_FUER[] = { 0x66, 0xfc, 0x72, 0x00 };
23
// Built-in German stop words, stored as TCHAR string pointers. The two words
// containing non-ASCII characters are supplied through the GermanAnalyzer_DASZ
// and GermanAnalyzer_FUER arrays instead of _T("...") literals.
// NOTE(review): "mit", "sein" and "wird" each appear twice in this list.
const TCHAR* GermanAnalyzer_GERMAN_STOP_WORDS[] = {
24
_T("einer"), _T("eine"), _T("eines"), _T("einem"), _T("einen"),
25
_T("der"), _T("die"), _T("das"), _T("dass"), GermanAnalyzer_DASZ,
26
_T("du"), _T("er"), _T("sie"), _T("es"),
27
_T("was"), _T("wer"), _T("wie"), _T("wir"),
28
_T("und"), _T("oder"), _T("ohne"), _T("mit"),
29
_T("am"), _T("im"),_T("in"), _T("aus"), _T("auf"),
30
_T("ist"), _T("sein"), _T("war"), _T("wird"),
31
_T("ihr"), _T("ihre"), _T("ihres"),
32
_T("als"), GermanAnalyzer_FUER, _T("von"), _T("mit"),
33
_T("dich"), _T("dir"), _T("mich"), _T("mir"),
34
_T("mein"), _T("sein"), _T("kein"),
35
_T("durch"), _T("wegen"), _T("wird")
38
// Definition of the static GERMAN_STOP_WORDS member: wraps the
// GermanAnalyzer_GERMAN_STOP_WORDS array together with an explicit length of 48.
// NOTE(review): the length is hard-coded and must be kept in step with the
// number of entries in the array's initializer list.
CL_NS(util)::ConstValueArray<const TCHAR*> GermanAnalyzer::GERMAN_STOP_WORDS( GermanAnalyzer_GERMAN_STOP_WORDS, 48 );
40
/**
 * Holder for the tokenizer and the filter chain that reusableTokenStream()
 * builds once and then hands out again on later calls.
 * It derives from TokenStream, but its next() never produces a token.
 */
class GermanAnalyzer::SavedStreams : public TokenStream {
42
// The tokenizer at the bottom of the chain; reset with a new reader on reuse.
StandardTokenizer* tokenStream;
43
// The outermost filter of the chain; this is what callers iterate over.
TokenStream* filteredTokenStream;
45
// Both pointers start out as NULL.
SavedStreams():tokenStream(NULL), filteredTokenStream(NULL)
50
// Always returns NULL: tokens are read from filteredTokenStream, not from this object.
Token* next(Token* token) {return NULL;}
53
/**
 * Builds an analyzer that uses the built-in GERMAN_STOP_WORDS table.
 */
GermanAnalyzer::GermanAnalyzer() {
55
stopSet = _CLNEW CLTCSetList;
56
// Copy the default stop words into the freshly allocated set.
StopFilter::fillStopTable(stopSet, GERMAN_STOP_WORDS.values);
59
/**
 * Builds an analyzer with a caller-supplied stop word list.
 *
 * @param stopwords array of stop word strings handed to StopFilter::fillStopTable
 *                  (presumably NULL-terminated - confirm against fillStopTable)
 */
GermanAnalyzer::GermanAnalyzer(const TCHAR** stopwords) {
61
stopSet = _CLNEW CLTCSetList;
62
StopFilter::fillStopTable(stopSet, stopwords);
65
GermanAnalyzer::GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords) {
70
/**
 * Builds an analyzer whose stop set is loaded by WordlistLoader::getWordSet.
 *
 * @param stopwordsFile path passed through to WordlistLoader::getWordSet
 * @param enc           encoding argument passed through to WordlistLoader::getWordSet
 */
GermanAnalyzer::GermanAnalyzer(const char* stopwordsFile, const char* enc) {
72
stopSet = WordlistLoader::getWordSet(stopwordsFile, enc);
75
/**
 * Builds an analyzer whose stop set is read from a Reader via
 * WordlistLoader::getWordSet.
 *
 * @param stopwordsReader reader supplying the stop words
 * @param deleteReader    forwarded to WordlistLoader::getWordSet
 *                        (presumably: delete the reader once it has been consumed)
 */
GermanAnalyzer::GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader) {
77
// NULL is passed as the set argument of getWordSet, so no existing set is supplied.
stopSet = WordlistLoader::getWordSet(stopwordsReader, NULL, deleteReader);
80
/**
 * Destructor: releases the stem exclusion set through _CLLDELETE.
 */
GermanAnalyzer::~GermanAnalyzer() {
82
_CLLDELETE(exclusionSet);
85
/**
 * Replaces the stem exclusion set with the words of the given array.
 *
 * @param exclusionlist array of words handed to StopFilter::fillStopTable
 *                      (presumably NULL-terminated - confirm against fillStopTable)
 */
void GermanAnalyzer::setStemExclusionTable(const TCHAR** exclusionlist) {
86
// An existing set is emptied so that it can be refilled below.
if (exclusionSet != NULL) {
87
exclusionSet->clear();
89
// NOTE(review): confirm this allocation only runs when no set existed yet
// (i.e. that it sits in an else branch); otherwise the cleared set would leak.
exclusionSet = _CLNEW CLTCSetList;
92
CL_NS(analysis)::StopFilter::fillStopTable(exclusionSet, exclusionlist);
95
/**
 * Installs a caller-supplied set as the stem exclusion set.
 * The analyzer stores the pointer itself and later deletes it (in the
 * destructor or on the next replacement), so the caller gives up ownership.
 *
 * @param exclusionlist the new exclusion set
 */
void GermanAnalyzer::setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist) {
96
// Guard against deleting the set that is about to be installed.
if (exclusionSet != exclusionlist) {
97
_CLLDELETE(exclusionSet);
98
exclusionSet = exclusionlist;
102
/**
 * Loads the stem exclusion words via WordlistLoader::getWordSet.
 * The current exclusionSet is passed in and the returned set is stored back
 * (presumably getWordSet fills/reuses the given set - confirm).
 *
 * @param exclusionlistFile path passed through to WordlistLoader::getWordSet
 * @param enc               encoding argument passed through to WordlistLoader::getWordSet
 */
void GermanAnalyzer::setStemExclusionTable(const char* exclusionlistFile, const char* enc) {
103
exclusionSet = WordlistLoader::getWordSet(exclusionlistFile, enc, exclusionSet);
106
/**
 * Loads the stem exclusion words from a Reader via WordlistLoader::getWordSet.
 * The current exclusionSet is passed in and the returned set is stored back
 * (presumably getWordSet fills/reuses the given set - confirm).
 *
 * @param exclusionlistReader reader supplying the exclusion words
 * @param deleteReader        forwarded to WordlistLoader::getWordSet
 *                            (presumably: delete the reader once it has been consumed)
 */
void GermanAnalyzer::setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader) {
107
exclusionSet = WordlistLoader::getWordSet(exclusionlistReader, exclusionSet, deleteReader);
110
/**
 * Builds a new analysis chain for the given reader:
 * StandardTokenizer -> StandardFilter -> LowerCaseFilter -> StopFilter (stopSet)
 * -> GermanStemFilter (exclusionSet).
 *
 * @param fieldName not used by the statements shown here
 * @param reader    source of the text to tokenize
 */
TokenStream* GermanAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
112
// Use the reader directly when it can act as a BufferedReader.
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
114
// Otherwise wrap it in a FilteredBufferedReader for the tokenizer.
if ( bufferedReader == NULL )
115
result = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
117
result = _CLNEW StandardTokenizer(bufferedReader);
119
// Each filter wraps the previous stage; the `true` argument is presumably the
// "delete the wrapped stream" flag - confirm against the filter constructors.
result = _CLNEW StandardFilter(result, true);
120
result = _CLNEW LowerCaseFilter(result, true);
121
result = _CLNEW StopFilter(result, true, stopSet);
122
result = _CLNEW GermanStemFilter(result, true, exclusionSet);
127
/**
 * Returns an analysis chain that is cached between calls. The chain is stored
 * in a SavedStreams object obtained through getPreviousTokenStream(); when
 * none exists yet, it is built (same stages as tokenStream()) and registered
 * with setPreviousTokenStream().
 *
 * @param fieldName not used by the statements shown here
 * @param reader    source of the text to tokenize
 * @return the outermost filter of the cached chain
 */
TokenStream* GermanAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)
129
// NOTE(review): SavedStreams derives from TokenStream, so static_cast would be
// the conventional downcast here instead of reinterpret_cast.
SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
131
// First use: build the tokenizer and the filter chain.
if (streams == NULL) {
132
streams = _CLNEW SavedStreams();
133
CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
135
// Wrap the reader in a FilteredBufferedReader when it is not a BufferedReader itself.
if ( bufferedReader == NULL )
136
streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
138
streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
140
// StandardFilter -> LowerCaseFilter -> StopFilter -> GermanStemFilter, each wrapping the previous stage.
streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
141
streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
142
streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
143
streams->filteredTokenStream = _CLNEW GermanStemFilter(streams->filteredTokenStream, true, exclusionSet);
144
// Remember the chain so that the next call can reuse it.
setPreviousTokenStream(streams);
146
// Point the cached tokenizer at the new reader (presumably only on the reuse path - confirm).
streams->tokenStream->reset(reader);
148
return streams->filteredTokenStream;