1
// Copyright 2010, Google Inc.
2
// All rights reserved.
4
// Redistribution and use in source and binary forms, with or without
5
// modification, are permitted provided that the following conditions are
// met:
8
// * Redistributions of source code must retain the above copyright
9
// notice, this list of conditions and the following disclaimer.
10
// * Redistributions in binary form must reproduce the above
11
// copyright notice, this list of conditions and the following disclaimer
12
// in the documentation and/or other materials provided with the
// distribution.
14
// * Neither the name of Google Inc. nor the names of its
15
// contributors may be used to endorse or promote products derived from
16
// this software without specific prior written permission.
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
#include "base/base.h"
34
#include "base/util.h"
35
#include "base/file_stream.h"
36
#include "converter/pos_util.h"
40
void POSUtil::Open(const string &id_file) {
42
InputFileStream ifs(id_file.c_str());
45
vector<string> fields;
46
while (getline(ifs, line)) {
47
if (line.empty() || line[0] == '#') {
51
Util::SplitStringUsing(line, "\t ", &fields);
52
CHECK_GE(fields.size(), 2);
53
const int id = atoi32(fields[0].c_str());
54
ids_.push_back(make_pair(fields[1], static_cast<uint16>(id)));
57
// const char kNumberPOS[] = "名詞,数";
58
const char kNumberPOS[] = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0";
59
CHECK(ids(kNumberPOS, &number_ids_));
60
sort(number_ids_.begin(), number_ids_.end());
63
const char kParticlePOS[] = "\xe5\x8a\xa9\xe8\xa9\x9e";
65
const char kAuxVerbPOS[] = "\xe5\x8a\xa9\xe5\x8b\x95\xe8\xa9\x9e";
67
const char kSymbolPOS[] = "\xe8\xa8\x98\xe5\x8f\xb7";
69
const char kVerbDependentPOS[] =
70
"\xE5\x8B\x95\xE8\xA9\x9E,\xE9\x9D\x9E\xE8\x87\xAA\xE7\xAB\x8B";
72
const char kNounDependentPOS[] =
73
"\xE5\x90\x8D\xE8\xA9\x9E,\xE9\x9D\x9E\xE8\x87\xAA\xE7\xAB\x8B";
75
const char kAdjectiveDependentPOS[] =
76
"\xE5\xBD\xA2\xE5\xAE\xB9\xE8\xA9\x9E,\xE9\x9D\x9E\xE8\x87\xAA\xE7\xAB\x8B";
78
const char kVerbSuffixPOS[] =
79
"\xE5\x8B\x95\xE8\xA9\x9E,\xE6\x8E\xA5\xE5\xB0\xBE";
81
const char kNounSuffixPOS[] =
82
"\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x8E\xA5\xE5\xB0\xBE";
84
const char kAdjectiveSuffixPOS[] =
85
"\xE5\xBD\xA2\xE5\xAE\xB9\xE8\xA9\x9E,\xE6\x8E\xA5\xE5\xB0\xBE";
87
CHECK(ids(kParticlePOS, &functional_word_ids_));
88
CHECK(ids(kAuxVerbPOS, &functional_word_ids_));
89
CHECK(ids(kSymbolPOS, &functional_word_ids_));
90
CHECK(ids(kVerbDependentPOS, &functional_word_ids_));
91
CHECK(ids(kNounDependentPOS, &functional_word_ids_));
92
CHECK(ids(kAdjectiveDependentPOS, &functional_word_ids_));
93
CHECK(ids(kVerbSuffixPOS, &functional_word_ids_));
94
CHECK(ids(kNounSuffixPOS, &functional_word_ids_));
95
CHECK(ids(kAdjectiveSuffixPOS, &functional_word_ids_));
97
sort(functional_word_ids_.begin(), functional_word_ids_.end());
100
// Returns the id of the first entry in |ids_| whose POS feature string
// starts with |feature|.
//
// |feature| must not be empty (enforced with CHECK).  Entries are examined
// in the order they are stored in |ids_|, so when several features share
// the prefix the earliest one wins.  If nothing matches, an error is logged
// and 0 is returned.
// NOTE(review): a 0 returned on failure cannot be told apart from a genuine
// id of 0 -- confirm that callers tolerate this.
uint16 POSUtil::id(const string &feature) const {
  CHECK(!feature.empty());
  for (size_t i = 0; i < ids_.size(); ++i) {
    // Anchored prefix test: same result as find(feature) == 0, but it does
    // not scan the rest of the string when the prefix does not match.
    if (ids_[i].first.compare(0, feature.size(), feature) == 0) {
      return ids_[i].second;
    }
  }
  LOG(ERROR) << "Cannot find the POS for: " << feature;
  // Always return a value: flowing off the end of a non-void function is
  // undefined behaviour.
  return 0;
}
111
bool POSUtil::ids(const string &feature,
112
vector<uint16> *ids) const {
114
CHECK(!feature.empty());
116
for (size_t i = 0; i < ids_.size(); ++i) {
117
if (ids_[i].first.find(feature) == 0) {
118
ids->push_back(ids_[i].second);
122
LOG_IF(ERROR, !found) << "Cannot find the POS for: "