~ubuntu-branches/ubuntu/raring/clucene-core/raring-proposed

« back to all changes in this revision

Viewing changes to src/contribs-lib/CLucene/analysis/de/GermanAnalyzer.cpp

  • Committer: Package Import Robot
  • Author(s): Fathi Boudra
  • Date: 2012-08-11 09:33:38 UTC
  • mfrom: (1.1.5)
  • Revision ID: package-import@ubuntu.com-20120811093338-fgrx41ftqew3qt6a
Tags: 2.3.3.4-1
* New upstream release (Closes: #661703).
* Convert package to multiarch.
* Drop obsolete patches:
  - 01_add_missing_include_bug505667.diff
  - 02_posixness_fix_bug530308.diff
* Add patches:
  - Fixing_ZLIB_configuration_in_shared_CMakeLists.patch
  - Fix-pkgconfig-file-by-adding-clucene-shared-library.patch
  - Install-contribs-lib.patch
  - multiarch.patch
* Update debian/compat: bump to 8.
* Update debian/control:
  - update build dependencies (add cmake, libboost-dev and libz-dev).
  - bump Standards-Version to 3.9.3.
  - rename packages due to ABI bump: libclucene0ldbl -> libclucene-core1.
  - add libclucene-contribs1 package.
* Update debian/rules:
  - rewrite to use CMake.
  - add multiarch support.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*------------------------------------------------------------------------------
 
2
* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
 
3
 
4
* Distributable under the terms of either the Apache License (Version 2.0) or 
 
5
* the GNU Lesser General Public License, as specified in the COPYING file.
 
6
------------------------------------------------------------------------------*/
 
7
#include "CLucene/_ApiHeader.h"
 
8
#include "CLucene/util/CLStreams.h"
 
9
#include "CLucene/analysis/Analyzers.h"
 
10
#include "CLucene/analysis/standard/StandardTokenizer.h"
 
11
#include "CLucene/analysis/standard/StandardFilter.h"
 
12
#include "CLucene/util/StringBuffer.h"
 
13
#include "GermanAnalyzer.h"
 
14
#include "GermanStemmer.h"
 
15
#include "GermanStemFilter.h"
 
16
 
 
17
CL_NS_USE(analysis)
 
18
CL_NS_USE2(analysis,de)
 
19
CL_NS_USE2(analysis,standard)
 
20
 
 
21
  const TCHAR GermanAnalyzer_DASZ[] = { 0x64, 0x61, 0xdf };
 
22
  const TCHAR GermanAnalyzer_FUER[] = { 0x66, 0xfc, 0x72 };
 
23
  const TCHAR* GermanAnalyzer_GERMAN_STOP_WORDS[] = {
 
24
    _T("einer"), _T("eine"), _T("eines"), _T("einem"), _T("einen"),
 
25
    _T("der"), _T("die"), _T("das"), _T("dass"), GermanAnalyzer_DASZ,
 
26
    _T("du"), _T("er"), _T("sie"), _T("es"),
 
27
    _T("was"), _T("wer"), _T("wie"), _T("wir"),
 
28
    _T("und"), _T("oder"), _T("ohne"), _T("mit"),
 
29
    _T("am"), _T("im"),_T("in"), _T("aus"), _T("auf"),
 
30
    _T("ist"), _T("sein"), _T("war"), _T("wird"),
 
31
    _T("ihr"), _T("ihre"), _T("ihres"),
 
32
    _T("als"), GermanAnalyzer_FUER, _T("von"), _T("mit"),
 
33
    _T("dich"), _T("dir"), _T("mich"), _T("mir"),
 
34
    _T("mein"), _T("sein"), _T("kein"),
 
35
    _T("durch"), _T("wegen"), _T("wird")
 
36
  };
 
37
 
 
38
  CL_NS(util)::ConstValueArray<const TCHAR*> GermanAnalyzer::GERMAN_STOP_WORDS( GermanAnalyzer_GERMAN_STOP_WORDS, 48 );
 
39
 
 
40
  // Holder for the tokenizer/filter chain cached between calls to
  // reusableTokenStream(). Both pointers are non-owning from this class's
  // point of view: close() and next() are stubs because this object is only
  // a container stored via setPreviousTokenStream(), never used as a live
  // stream itself.
  class GermanAnalyzer::SavedStreams : public TokenStream {
  public:
      StandardTokenizer* tokenStream;        // head of the chain (tokenizer)
      TokenStream* filteredTokenStream;      // tail of the chain (outermost filter)

      SavedStreams():tokenStream(NULL), filteredTokenStream(NULL)
      {
      }

      // No-op: lifetime of the wrapped streams is managed elsewhere.
      void close(){}
      // Never called; tokens are produced through filteredTokenStream.
      Token* next(Token* token) {return NULL;}
  };
 
52
 
 
53
  GermanAnalyzer::GermanAnalyzer() {
 
54
    exclusionSet = NULL;
 
55
    stopSet = _CLNEW CLTCSetList;
 
56
    StopFilter::fillStopTable(stopSet, GERMAN_STOP_WORDS.values);
 
57
  }
 
58
 
 
59
  GermanAnalyzer::GermanAnalyzer(const TCHAR** stopwords) {
 
60
    exclusionSet = NULL;
 
61
    stopSet = _CLNEW CLTCSetList;
 
62
    StopFilter::fillStopTable(stopSet, stopwords);
 
63
  }
 
64
 
 
65
  GermanAnalyzer::GermanAnalyzer(CL_NS(analysis)::CLTCSetList* stopwords) {
 
66
    exclusionSet = NULL;
 
67
    stopSet = stopwords;
 
68
  }
 
69
 
 
70
  GermanAnalyzer::GermanAnalyzer(const char* stopwordsFile, const char* enc) {
 
71
    exclusionSet = NULL;
 
72
    stopSet = WordlistLoader::getWordSet(stopwordsFile, enc);
 
73
  }
 
74
 
 
75
  GermanAnalyzer::GermanAnalyzer(CL_NS(util)::Reader* stopwordsReader, const bool deleteReader) {
 
76
    exclusionSet = NULL;
 
77
    stopSet = WordlistLoader::getWordSet(stopwordsReader, NULL, deleteReader);
 
78
  }
 
79
 
 
80
  GermanAnalyzer::~GermanAnalyzer() {
 
81
    _CLLDELETE(stopSet);
 
82
    _CLLDELETE(exclusionSet);
 
83
  }
 
84
 
 
85
  void GermanAnalyzer::setStemExclusionTable(const TCHAR** exclusionlist) {
 
86
    if (exclusionSet != NULL) {
 
87
      exclusionSet->clear();
 
88
    } else {
 
89
      exclusionSet = _CLNEW CLTCSetList;
 
90
    }
 
91
 
 
92
    CL_NS(analysis)::StopFilter::fillStopTable(exclusionSet, exclusionlist);
 
93
  }
 
94
 
 
95
  void GermanAnalyzer::setStemExclusionTable(CL_NS(analysis)::CLTCSetList* exclusionlist) {
 
96
    if (exclusionSet != exclusionlist) {
 
97
      _CLLDELETE(exclusionSet);
 
98
      exclusionSet = exclusionlist;
 
99
    }
 
100
  }
 
101
 
 
102
  void GermanAnalyzer::setStemExclusionTable(const char* exclusionlistFile, const char* enc) {
 
103
    exclusionSet = WordlistLoader::getWordSet(exclusionlistFile, enc, exclusionSet);
 
104
  }
 
105
 
 
106
  void GermanAnalyzer::setStemExclusionTable(CL_NS(util)::Reader* exclusionlistReader, const bool deleteReader) {
 
107
    exclusionSet = WordlistLoader::getWordSet(exclusionlistReader, exclusionSet, deleteReader);
 
108
  }
 
109
 
 
110
  TokenStream* GermanAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
 
111
    TokenStream* result;
 
112
    CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
 
113
 
 
114
    if ( bufferedReader == NULL )
 
115
      result = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
 
116
    else
 
117
      result = _CLNEW StandardTokenizer(bufferedReader);
 
118
 
 
119
    result = _CLNEW StandardFilter(result, true);
 
120
    result = _CLNEW LowerCaseFilter(result, true);
 
121
    result = _CLNEW StopFilter(result, true, stopSet);
 
122
    result = _CLNEW GermanStemFilter(result, true, exclusionSet);
 
123
 
 
124
    return result;
 
125
  }
 
126
 
 
127
  // Reusing variant of tokenStream(): the filter chain is built once, cached
  // via setPreviousTokenStream(), and on later calls only the tokenizer is
  // re-pointed at the new reader. Saves re-allocating the whole chain per
  // document.
  TokenStream* GermanAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader)
  {
    // Cached chain from a previous call, if any (stored as the base
    // TokenStream*, hence the cast back).
    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());

    if (streams == NULL) {
      // First call: build the chain exactly as tokenStream() does and
      // remember both ends of it.
      streams = _CLNEW SavedStreams();
      CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();

      // Wrap non-buffered readers; deleteReader=false keeps `reader`
      // owned by the caller, while the tokenizer owns the wrapper (true).
      if ( bufferedReader == NULL )
        streams->tokenStream = _CLNEW StandardTokenizer( _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true );
      else
        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);

      // Each filter takes ownership (true) of the stage before it.
      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
      streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
      streams->filteredTokenStream = _CLNEW GermanStemFilter(streams->filteredTokenStream, true, exclusionSet);
      setPreviousTokenStream(streams);
    } else
      // Subsequent calls: rewire the cached tokenizer onto the new input.
      // NOTE(review): reset() receives the raw reader even when the first
      // call wrapped it in a FilteredBufferedReader — presumably
      // StandardTokenizer::reset handles unbuffered readers; confirm.
      streams->tokenStream->reset(reader);

    return streams->filteredTokenStream;
  }