37
37
#include "base/base.h"
38
38
#include "base/config_file_stream.h"
39
#include "base/thread.h"
39
#include "base/singleton.h"
40
40
#include "base/util.h"
41
#include "converter/dictionary_data.h"
42
#include "converter/dictionary_preloader.h"
43
41
#include "converter/key_corrector.h"
42
#include "converter/lattice.h"
44
43
#include "converter/segments.h"
45
#include "converter/converter_data.h"
46
#include "converter/connector.h"
44
#include "converter/connector_interface.h"
47
45
#include "converter/nbest_generator.h"
48
#include "converter/pos.h"
46
#include "converter/pos_matcher.h"
49
47
#include "converter/segmenter.h"
48
#include "dictionary/dictionary_interface.h"
50
49
#include "session/config_handler.h"
51
50
#include "session/config.pb.h"
52
#include "dictionary/dictionary.h"
58
55
const size_t kMaxSegmentsSize = 256;
61
58
const int kDefaultNumberCost = 3000;
62
59
const int kEOSPenalty = 700;
64
// const char kLastNamePos[] = "姓";
65
// const char kFistNamePos[] = "名";
66
const char kLastNamePos[] = "\xE5\xA7\x93";
67
const char kFistNamePos[] = "\xE5\x90\x8D";
75
Node *InitBOSNode(ConverterData *data, uint16 length) {
76
Node *bos_node = data->NewNode();
79
bos_node->key.clear();
80
bos_node->value = "BOS";
81
bos_node->node_type = Node::BOS_NODE;
84
bos_node->begin_pos = length;
85
bos_node->end_pos = length;
89
// For EOS node, we use both pure EOS node
90
// and "サ変名詞". Since many users still input text via single-segment conversion,
91
// the right word of user input is not always end of sentence.
92
// If you see the side effect of this treatment,
93
// set some penalty to node->wcost.
94
Node *InitEOSNode(ConverterData *data, uint16 length) {
95
Node *eos_node = data->NewNode();
96
eos_node->rid = 0; // pure EOS
98
eos_node->key.clear();
99
eos_node->value = "EOS";
100
eos_node->node_type = Node::EOS_NODE;
103
eos_node->begin_pos = length;
104
eos_node->end_pos = length;
106
Node *eos_noun_node = data->NewNode();
108
// POS::unknown_id(), POS::unknown_id()
109
// returns IDs for "サ変名詞"
110
eos_noun_node->rid = POS::unknown_id();
111
eos_noun_node->lid = POS::unknown_id();
112
eos_noun_node->key.clear();
113
eos_noun_node->value = "EOS";
114
eos_noun_node->node_type = Node::EOS_NODE;
115
eos_noun_node->wcost = kEOSPenalty; // add a constant as penalty
116
eos_noun_node->cost = 0;
117
eos_noun_node->begin_pos = length;
118
eos_noun_node->end_pos = length;
121
eos_node->bnext = eos_noun_node;
126
void InsertNodes(size_t pos, Node *node, ConverterData *data) {
127
Node **begin_nodes_list = data->begin_nodes_list();
128
Node **end_nodes_list = data->end_nodes_list();
130
for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
131
rnode->begin_pos = static_cast<uint16>(pos);
132
rnode->end_pos = static_cast<uint16>(pos + rnode->key.size());
136
const size_t x = rnode->key.size() + pos;
137
rnode->enext = end_nodes_list[x];
138
end_nodes_list[x] = rnode;
141
if (begin_nodes_list[pos] == NULL) {
142
begin_nodes_list[pos] = node;
144
for (Node *rnode = node; rnode != NULL; rnode = rnode->bnext) {
145
if (rnode->bnext == NULL) {
146
rnode->bnext = begin_nodes_list[pos];
147
begin_nodes_list[pos] = node;
154
// TODO(taku): move it to KeyCorrector
155
int GetCorrectedCostPenalty(const string &key) {
156
// "んん" and "っっ" must be misspellings.
157
// if (key.find("んん") != string::npos ||
158
// key.find("っっ") != string::npos) {
159
if (key.find("\xE3\x82\x93\xE3\x82\x93") != string::npos ||
160
key.find("\xE3\x81\xA3\xE3\x81\xA3") != string::npos) {
163
// add 3000 to the original word cost
164
const int kCorrectedCostPenalty = 3000;
165
return kCorrectedCostPenalty;
168
67
void InsertCorrectedNodes(size_t pos, const string &key,
169
const KeyCorrector &key_corrector,
170
Dictionary *dictionary,
171
ConverterData *data) {
68
const KeyCorrector *key_corrector,
69
DictionaryInterface *dictionary,
71
if (key_corrector == NULL) {
172
74
size_t length = 0;
173
const char *str = key_corrector.GetCorrectedPrefix(pos, &length);
75
const char *str = key_corrector->GetCorrectedPrefix(pos, &length);
174
76
if (str == NULL || length == 0) {
178
Node *r_node = dictionary->LookupPrefix(str, length, data);
80
Node *rnode = dictionary->LookupPrefix(str, length,
81
lattice->node_allocator());
179
82
Node *prev = NULL;
180
for (Node *node = r_node; node != NULL; node = node->bnext) {
83
for (Node *node = rnode; node != NULL; node = node->bnext) {
181
84
const size_t offset =
182
key_corrector.GetOriginalOffset(pos, node->key.size());
85
key_corrector->GetOriginalOffset(pos, node->key.size());
183
86
if (KeyCorrector::IsValidPosition(offset) && offset > 0) {
185
88
node->key = key.substr(pos, offset);
186
node->wcost += GetCorrectedCostPenalty(node->key);
89
node->wcost += KeyCorrector::GetCorrectedCostPenalty(node->key);
189
92
if (prev == NULL) {
190
r_node = node; // drop the first node
93
rnode = node; // drop the first node
192
95
prev->bnext = node->bnext; // change the chain
197
if (r_node != NULL) {
198
InsertNodes(pos, r_node, data);
101
lattice->Insert(pos, rnode);
308
} // anonymous namespace
310
class ImmutableConverterImpl: public ImmutableConverter {
234
class ImmutableConverterImpl: public ImmutableConverterInterface {
312
236
virtual bool Convert(Segments *segments) const;
313
virtual Dictionary *GetDictionary() const;
314
237
ImmutableConverterImpl();
315
238
virtual ~ImmutableConverterImpl() {}
318
Node *Lookup(const char *begin,
320
ConverterData *data) const;
241
Node *Lookup(const char *begin, const char *end,
242
Lattice *lattice) const;
322
void Resegment(size_t pos, ConverterData *data) const;
244
void Resegment(size_t pos, Lattice *lattice) const;
324
246
// Each returns true if resegmentation happened.
325
bool ResegmentArabicNumberAndSuffix(size_t pos, ConverterData *data) const;
326
bool ResegmentPersonalName(size_t pos, ConverterData *data) const;
247
bool ResegmentArabicNumberAndSuffix(size_t pos, Lattice *lattice) const;
248
bool ResegmentPrefixAndArabicNumber(size_t pos, Lattice *lattice) const;
249
bool ResegmentPersonalName(size_t pos, Lattice *lattice) const;
328
251
bool MakeLattice(Segments *segments) const;
329
252
bool ModifyLattice(Segments *segments) const;
342
265
return connector_->GetTransitionCost(lnode->rid, rnode->lid) + rnode->wcost;
345
scoped_ptr<ConnectorInterface> connector_;
346
scoped_ptr<Dictionary> dictionary_;
268
ConnectorInterface *connector_;
269
DictionaryInterface *dictionary_;
348
271
int32 last_to_first_name_transition_cost_;
349
272
DISALLOW_COPY_AND_ASSIGN(ImmutableConverterImpl);
352
275
ImmutableConverterImpl::ImmutableConverterImpl()
353
: dictionary_(new Dictionary),
276
: connector_(ConnectorFactory::GetConnector()),
277
dictionary_(DictionaryFactory::GetDictionary()),
354
278
last_to_first_name_transition_cost_(0) {
355
size_t connection_size = 0;
356
const char *connection_data =
357
DictionaryData::GetConnectionData(&connection_size);
358
CHECK(connection_data);
359
CHECK_GT(connection_size, 0);
360
connector_.reset(ConnectorInterface::OpenFromArray(connection_data,
362
CHECK(connector_.get());
364
DictionaryInterface *sys_dic = dictionary_->Add(Dictionary::SYSTEM);
367
size_t dictionary_size = 0;
368
const char *dictionary_data =
369
DictionaryData::GetDictionaryData(&dictionary_size);
370
CHECK(dictionary_data);
371
CHECK_GT(dictionary_size, 0);
373
CHECK(sys_dic->OpenFromArray(dictionary_data,
376
DictionaryInterface *user_dic = dictionary_->Add(Dictionary::USER);
379
279
last_to_first_name_transition_cost_
380
280
= connector_->GetTransitionCost(
381
POS::last_name_id(), POS::first_name_id());
383
DictionaryPreloader::PreloadIfApplicable();
386
Dictionary *ImmutableConverterImpl::GetDictionary() const {
387
return dictionary_.get();
390
void ImmutableConverterImpl::Resegment(size_t pos, ConverterData *data) const {
391
if (ResegmentArabicNumberAndSuffix(pos, data)) {
392
VLOG(1) << "ResegmentArabicNumberAndSuffix returned true";
396
if (ResegmentPersonalName(pos, data)) {
281
POSMatcher::GetLastNameId(), POSMatcher::GetFirstNameId());
284
void ImmutableConverterImpl::Resegment(size_t pos, Lattice *lattice) const {
285
if (ResegmentArabicNumberAndSuffix(pos, lattice)) {
286
VLOG(1) << "ResegmentArabicNumberAndSuffix returned true";
290
if (ResegmentPrefixAndArabicNumber(pos, lattice)) {
291
VLOG(1) << "ResegmentArabicNumberAndSuffix returned true";
295
if (ResegmentPersonalName(pos, lattice)) {
397
296
VLOG(1) << "ResegmentPersonalName returned true";
415
313
for (const Node *compound_node = bnode;
416
314
compound_node != NULL; compound_node = compound_node->bnext) {
417
if (bnode->value.size() > 0 &&
418
bnode->value[0] >= '0' && bnode->value[0] <= '9' &&
419
bnode->key[0] >= '0' && bnode->key[0] <= '9' &&
420
POS::IsNumber(compound_node->lid) &&
421
!POS::IsNumber(compound_node->rid)) {
315
if (!compound_node->value.empty() && !compound_node->key.empty() &&
316
POSMatcher::IsNumber(compound_node->lid) &&
317
!POSMatcher::IsNumber(compound_node->rid) &&
318
IsNumber(compound_node->value[0]) && IsNumber(compound_node->key[0])) {
422
319
string number_value, number_key;
423
320
string suffix_value, suffix_key;
424
DecomposeNumber(compound_node->value, &number_value, &suffix_value);
425
DecomposeNumber(compound_node->key, &number_key, &suffix_key);
321
DecomposeNumberAndSuffix(compound_node->value,
322
&number_value, &suffix_value);
323
DecomposeNumberAndSuffix(compound_node->key,
324
&number_key, &suffix_key);
427
326
if (suffix_value.empty() || suffix_key.empty()) {
377
bool ImmutableConverterImpl::ResegmentPrefixAndArabicNumber(
378
size_t pos, Lattice *lattice) const {
379
const Node *bnode = lattice->begin_nodes(pos);
381
VLOG(1) << "bnode is NULL";
385
bool modified = false;
387
for (const Node *compound_node = bnode;
388
compound_node != NULL; compound_node = compound_node->bnext) {
389
// Unlike ResegmentArabicNumberAndSuffix, we don't
390
// check POS as words ending with Arabic numbers are pretty rare.
391
if (compound_node->value.size() > 1 && compound_node->key.size() > 1 &&
392
!IsNumber(compound_node->value[0]) &&
393
!IsNumber(compound_node->key[0]) &&
394
IsNumber(compound_node->value[compound_node->value.size() - 1]) &&
395
IsNumber(compound_node->key[compound_node->key.size() - 1])) {
396
string number_value, number_key;
397
string prefix_value, prefix_key;
398
DecomposePrefixAndNumber(compound_node->value,
399
&prefix_value, &number_value);
400
DecomposePrefixAndNumber(compound_node->key,
401
&prefix_key, &number_key);
403
if (prefix_value.empty() || prefix_key.empty()) {
408
if (number_value != number_key) {
409
LOG(WARNING) << "Incompatible key/value number pair";
413
// do -1 so that resegmented nodes are boosted
414
// over compound node.
415
const int32 wcost = max(compound_node->wcost / 2 - 1, 0);
417
Node *prefix_node = lattice->NewNode();
419
prefix_node->key = prefix_key;
420
prefix_node->value = prefix_value;
421
prefix_node->lid = compound_node->lid;
422
prefix_node->rid = 0; // 0 to 0 transition cost is 0
423
prefix_node->wcost = wcost;
424
prefix_node->node_type = Node::NOR_NODE;
425
prefix_node->bnext = NULL;
427
// insert prefix into the lattice
428
lattice->Insert(pos, prefix_node);
430
Node *number_node = lattice->NewNode();
432
number_node->key = number_key;
433
number_node->value = number_value;
434
number_node->lid = 0;
435
number_node->rid = compound_node->rid;
436
number_node->wcost = wcost;
437
number_node->node_type = Node::NOR_NODE;
438
number_node->bnext = NULL;
440
number_node->constrained_prev = prefix_node;
442
// insert number into the lattice
443
lattice->Insert(pos + prefix_node->key.size(), number_node);
444
VLOG(1) << "Resegmented: " << compound_node->value << " "
445
<< prefix_node->value << " " << number_node->value;
476
454
bool ImmutableConverterImpl::ResegmentPersonalName(
477
size_t pos, ConverterData *data) const {
478
Node **begin_nodes_list = data->begin_nodes_list();
479
const Node *bnode = begin_nodes_list[pos];
455
size_t pos, Lattice *lattice) const {
456
const Node *bnode = lattice->begin_nodes(pos);
480
457
if (bnode == NULL) {
481
458
VLOG(1) << "bnode is NULL";
574
551
const int32 wcost = (compound_node->wcost -
575
552
last_to_first_name_transition_cost_) / 2;
577
Node *last_name_node = data->NewNode();
554
Node *last_name_node = lattice->NewNode();
578
555
CHECK(last_name_node);
579
556
last_name_node->key = best_last_name_node->key;
580
557
last_name_node->value = best_last_name_node->value;
581
558
last_name_node->lid = compound_node->lid;
582
last_name_node->rid = POS::last_name_id();
559
last_name_node->rid = POSMatcher::GetLastNameId();
583
560
last_name_node->wcost = wcost;
584
561
last_name_node->node_type = Node::NOR_NODE;
585
562
last_name_node->bnext = NULL;
587
564
// insert last_name into the lattice
588
InsertNodes(pos, last_name_node, data);
565
lattice->Insert(pos, last_name_node);
590
Node *first_name_node = data->NewNode();
567
Node *first_name_node = lattice->NewNode();
591
568
CHECK(first_name_node);
592
569
first_name_node->key = best_first_name_node->key;
593
570
first_name_node->value = best_first_name_node->value;
594
first_name_node->lid = POS::first_name_id();
571
first_name_node->lid = POSMatcher::GetFirstNameId();
595
572
first_name_node->rid = compound_node->rid;
596
573
first_name_node->wcost = wcost;
597
574
first_name_node->node_type = Node::NOR_NODE;
689
668
bool ImmutableConverterImpl::Viterbi(Segments *segments,
690
669
const vector<uint16> &group) const {
691
ConverterData *data = segments->converter_data();
692
Node **begin_nodes_list = data->begin_nodes_list();
693
Node **end_nodes_list = data->end_nodes_list();
670
Lattice *lattice = segments->lattice();
695
const string &key = segments->converter_data()->key();
673
const string &key = lattice->key();
697
675
for (size_t pos = 0; pos <= key.size(); ++pos) {
698
for (Node *rnode = begin_nodes_list[pos];
676
for (Node *rnode = lattice->begin_nodes(pos);
699
677
rnode != NULL; rnode = rnode->bnext) {
700
int bestCost = INT_MAX;
701
Node* bestNode = NULL;
702
for (Node *lnode = end_nodes_list[pos];
678
int best_cost = INT_MAX;
679
Node* best_node = NULL;
680
for (Node *lnode = lattice->end_nodes(pos);
703
681
lnode != NULL; lnode = lnode->enext) {
705
683
switch (GetConnectionType(lnode, rnode, group, segments)) {
817
794
const string key = history_key + conversion_key;
795
lattice->SetKey(key);
819
ConverterData *data = segments->converter_data();
820
KeyCorrector::InputMode mode = KeyCorrector::ROMAN;
821
if (GET_CONFIG(preedit_method) != config::Config::ROMAN) {
822
mode = KeyCorrector::KANA;
797
for (Node *node = lattice->eos_nodes();
798
node != NULL; node = node->bnext) {
799
if (node->lid != 0 || node->rid != 0) {
800
node->wcost = kEOSPenalty;
824
data->set_key(key, mode);
826
Node *bosNode = InitBOSNode(data, 0);
827
Node *eosNode = InitEOSNode(data, static_cast<uint16>(key.size()));
829
data->set_bos_node(bosNode);
830
data->set_eos_node(eosNode);
832
Node **end_nodes_list = data->end_nodes_list();
833
Node **begin_nodes_list = data->begin_nodes_list();
834
end_nodes_list[0] = bosNode;
835
begin_nodes_list[key.size()] = eosNode;
837
804
size_t segments_pos = 0;
838
805
const char *key_end = key.data() + key.size();
960
930
last_rid = rnode->rid;
963
if (end_nodes_list[history_key.size()] == NULL) {
933
if (lattice->end_nodes(history_key.size()) == NULL) {
964
934
LOG(WARNING) << "cannot build lattice from input";
968
// Dictionary Lookup for conversion segment
969
const KeyCorrector &key_corrector = data->key_corrector();
938
// Do not use KeyCorrector if user changes the boundary.
939
// http://b/issue?id=2804996
940
scoped_ptr<KeyCorrector> key_corrector;
941
if (!segments->has_resized()) {
942
KeyCorrector::InputMode mode = KeyCorrector::ROMAN;
943
if (GET_CONFIG(preedit_method) != config::Config::ROMAN) {
944
mode = KeyCorrector::KANA;
946
key_corrector.reset(new KeyCorrector(key, mode));
971
949
for (size_t pos = history_key.size(); pos < key.size(); ++pos) {
972
if (end_nodes_list[pos] != NULL) {
973
Node *rnode = Lookup(key_begin + pos, key_end, data);
950
if (lattice->end_nodes(pos) != NULL) {
951
Node *rnode = Lookup(key_begin + pos, key_end, lattice);
974
952
CHECK(rnode != NULL);
975
InsertNodes(pos, rnode, data);
953
lattice->Insert(pos, rnode);
976
954
// Insert corrected nodes like みんあ -> みんな
977
955
InsertCorrectedNodes(pos, key,
979
dictionary_.get(), data);
957
dictionary_, lattice);
983
if (end_nodes_list[key.size()] == NULL) {
961
if (lattice->end_nodes(key.size()) == NULL) {
984
962
LOG(WARNING) << "cannot build lattice from input";
989
967
for (size_t pos = history_key.size(); pos < key.size(); ++pos) {
990
Resegment(pos, data);
968
Resegment(pos, lattice);
996
974
bool ImmutableConverterImpl::ModifyLattice(Segments *segments) const {
997
ConverterData *data = segments->converter_data();
998
Node **begin_nodes_list = data->begin_nodes_list();
999
const string &key = data->key();
975
Lattice *lattice = segments->lattice();
978
const string &key = lattice->key();
1001
980
// disable all CON_NODE
1002
981
for (size_t pos = 0; pos <= key.size(); ++pos) {
1003
for (Node *node = begin_nodes_list[pos];
982
for (Node *node = lattice->begin_nodes(pos);
1004
983
node != NULL; node = node->bnext) {
1005
984
node->cost = 0; // reset cost
1006
985
if (node->node_type == Node::CON_NODE) {
1053
1036
if (node->next->node_type != Node::EOS_NODE &&
1054
1037
old_seg.segment_type() == Segment::FIXED_BOUNDARY &&
1055
1038
group[node->begin_pos] == group[node->next->begin_pos]) {
1057
1040
// Condition 2: prev->next is a boundary. Very strong constraint
1058
1041
} else if (node->node_type == Node::CON_NODE ||
1059
1042
(node->next->node_type != Node::EOS_NODE &&
1060
1043
group[node->begin_pos] != group[node->next->begin_pos]) ||
1061
1044
Segmenter::IsBoundary(node, node->next)) {
1062
1045
Segment *segment = segments->add_segment();
1063
1047
NBestGenerator *nbest = segment->nbest_generator();
1064
nbest->Init(prev, node->next, connector_.get(),
1065
segments->converter_data());
1049
nbest->Init(prev, node->next, segments->lattice());
1066
1050
segment->set_key(key);
1067
1051
segment->Expand(max(static_cast<size_t>(1), old_seg.candidates_size()));
1068
1052
if (segment->candidates_size() == 0) {