75
76
*Log(nerLog) << "NER Tagger is already initialized!" << endl;
78
string tagset = conf.lookUp( "settings", "NER" );
79
if ( tagset.empty() ){
79
string val = conf.lookUp( "settings", "NER" );
80
81
*Log(nerLog) << "Unable to find settings for NER" << endl;
83
string init = "-s " + configuration.configDir() + tagset;
84
if ( Tagger::Version() == "3.2.6" )
84
string settings = val;
85
val = conf.lookUp( "version", "NER" );
91
val = conf.lookUp( "set", "NER" );
93
tagset = "http://ilk.uvt.nl/folia/sets/frog-ner-nl";
98
string init = "-s " + configuration.configDir() + settings + " -vcf";
88
99
tagger = new MbtAPI( init, *nerLog );
89
100
return tagger != 0;
92
bool NERTagger::splitOneWT( const string& inp, string& word,
93
string& tag, string& confidence ){
96
// split word and tag, and store num of slashes
98
*Log(nerLog) << "split Classify starting with " << in << endl;
99
string::size_type pos = in.rfind("/");
100
if ( pos == string::npos ) {
101
*Log(nerLog) << "no word/tag/confidence triple in this line: " << in << endl;
102
exit( EXIT_FAILURE );
105
confidence = in.substr( pos+1 );
108
pos = in.rfind("//");
109
if ( pos != string::npos ) {
110
// double slash: let's hope it is an unknown word
112
// but this is definitely something like //LET()
114
tag = in.substr(pos+2);
117
word = in.substr( 0, pos );
118
tag = in.substr( pos+2 );
124
if ( pos != string::npos ) {
125
word = in.substr( 0, pos );
126
tag = in.substr( pos+1 );
129
*Log(nerLog) << "no word/tag/confidence triple in this line: " << in << endl;
130
exit( EXIT_FAILURE );
135
*Log(nerLog) << "known";
137
*Log(nerLog) << "unknown";
138
*Log(nerLog) << " word: " << word << "\ttag: " << tag << "\tconfidence: " << confidence << endl;
143
int NERTagger::splitWT( const string& tagged,
144
vector<string>& tags,
145
vector<double>& conf ){
146
vector<string> words;
147
vector<string> tagwords;
151
size_t num_words = Timbl::split_at( tagged, tagwords, " " );
152
num_words--; // the last "word" is <utt> which gets added by the tagger
153
for( size_t i = 0; i < num_words; ++i ) {
154
string word, tag, confs;
155
bool isKnown = splitOneWT( tagwords[i], word, tag, confs );
157
if ( !stringTo<double>( confs, confidence ) ){
158
*Log(nerLog) << "tagger confused. Expected a double, got '" << confs << "'" << endl;
159
exit( EXIT_FAILURE );
161
words.push_back( word );
162
tags.push_back( tag );
163
known.push_back( isKnown );
164
conf.push_back( confidence );
167
*Log(nerLog) << "#tagged_words: " << num_words << endl;
168
for( size_t i = 0; i < num_words; i++)
169
*Log(nerLog) << "\ttagged word[" << i <<"]: " << words[i] << (known[i]?"/":"//")
170
<< tags[i] << " <" << conf[i] << ">" << endl;
175
103
static void addEntity( EntitiesLayer *entities,
104
const string& tagset,
176
105
const vector<Word*>& words,
177
106
const vector<double>& confs,
178
107
const string& NER ){
265
194
using folia::operator<<;
266
195
*Log(nerLog) << "spit out " << stack << endl;
268
addEntity( el, stack, dstack, curNER );
272
string NERTagger::Classify( Sentence *sent ){
274
vector<Word*> swords = sent->words();
197
addEntity( el, tagset, stack, dstack, curNER );
201
void NERTagger::addDeclaration( Document& doc ) const {
202
doc.declare( AnnotationType::ENTITY,
204
"annotator='frog-ner-" + version
205
+ "', annotatortype='auto'");
208
void NERTagger::Classify( Sentence *sent ){
209
vector<Word*> swords;
210
#pragma omp critical(foliaupdate)
212
swords = sent->words();
275
214
if ( !swords.empty() ) {
276
215
vector<string> words;
277
216
string sentence; // the tagger needs the whole sentence
285
224
*Log(nerLog) << "NER in: " << sentence << endl;
286
tagged = tagger->Tag(sentence);
288
*Log(nerLog) << "sentence: " << sentence << endl
289
<< "NER tagged: "<< tagged
225
vector<TagResult> tagv = tagger->TagLine(sentence);
226
if ( tagv.size() != swords.size() ){
227
throw runtime_error( "NER tagger is confused" );
230
*Log(nerLog) << "NER tagger out: " << endl;
231
for ( size_t i=0; i < tagv.size(); ++i ){
232
*Log(nerLog) << "[" << i << "] : word=" << tagv[i].word()
233
<< " tag=" << tagv[i].assignedTag()
234
<< " confidence=" << tagv[i].confidence() << endl;
292
237
vector<double> conf;
294
splitWT( tagged, tags, conf );
239
for ( size_t i=0; i < tagv.size(); ++i ){
240
tags.push_back( tagv[i].assignedTag() );
241
conf.push_back( tagv[i].confidence() );
295
243
addNERTags( sent, swords, tags, conf );