1
// Copyright 2010, Google Inc.
2
// All rights reserved.
4
// Redistribution and use in source and binary forms, with or without
5
// modification, are permitted provided that the following conditions are
8
// * Redistributions of source code must retain the above copyright
9
// notice, this list of conditions and the following disclaimer.
10
// * Redistributions in binary form must reproduce the above
11
// copyright notice, this list of conditions and the following disclaimer
12
// in the documentation and/or other materials provided with the
14
// * Neither the name of Google Inc. nor the names of its
15
// contributors may be used to endorse or promote products derived from
16
// this software without specific prior written permission.
18
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
#include "base/base.h"
34
#include "base/file_stream.h"
35
#include "base/util.h"
37
// Input: id.def, user-pos.def, cforms.def
//
// Command-line flags. Every string flag defaults to "" and has an empty
// help text. main() may copy argv[1..3] into the three input flags; the
// full condition guarding that fallback is only partly visible here.
39
// Path of the id definition file; handed to Open().
DEFINE_string(id_file, "", "");
40
// Path of the user POS definition file; read line by line by the converter.
DEFINE_string(user_pos_file, "", "");
41
// Path of the conjugation-form file; handed to LoadConjugation().
DEFINE_string(cforms_file, "", "");
42
// Output file path. When empty, the generated code is written to cout.
DEFINE_string(output, "", "");
43
// Declared so that main() can assign FLAGS_logtostderr.
DECLARE_bool(logtostderr);
50
// Loads the id definition file (e.g. data/dictionary/id.def).
//
// Reads |id_file| line by line and appends one (name, id) pair to ids_ for
// each data line. A data line is split on tabs and spaces and must yield at
// least two fields: fields[0] is the numeric id, fields[1] is the name.
51
void Open(const string &id_file) {
53
InputFileStream ifs(id_file.c_str());
56
vector<string> fields;
57
while (getline(ifs, line)) {
58
// Empty lines and lines starting with '#' take a separate branch whose
// body is not visible in this view -- presumably they are skipped; confirm.
if (line.empty() || line[0] == '#') {
62
// NOTE(review): |fields| is declared outside the loop; whether it is
// cleared before each split is not visible here -- confirm.
Util::SplitStringUsing(line, "\t ", &fields);
63
CHECK_GE(fields.size(), 2);
64
const int id = atoi32(fields[0].c_str());
65
// The parsed id is narrowed to uint16 before being stored.
ids_.push_back(make_pair(fields[1], static_cast<uint16>(id)));
69
// Returns the id of |feature| as defined in id.def.
//
// |feature| must be non-empty (enforced with CHECK). Matching is by prefix:
// the first entry of ids_, in insertion order, whose name starts with
// |feature| wins. If nothing matches, an error is logged; the value returned
// in that case is not visible in this view.
70
uint16 id(const string &feature) const {
71
CHECK(!feature.empty());
72
for (size_t i = 0; i < ids_.size(); ++i) {
73
// find() == 0 means |feature| occurs at offset 0, i.e. it is a prefix of
// the stored name.
if (ids_[i].first.find(feature) == 0) {
74
return ids_[i].second;
77
LOG(ERROR) << "Cannot find the POS for: " << feature;
82
// (name, id) pairs appended by Open() in file order and scanned linearly by
// id().
vector<pair<string, uint16> > ids_;
85
// Escapes |str| via Util::Escape so that it can be emitted inside a string
// literal in the generated code (the callers wrap the result in double
// quotes). The declaration of |output| and the return statement are not
// visible in this view -- presumably |output| is returned; confirm.
string Escape(const string &str) {
87
Util::Escape(str, &output);
91
// One conjugation entry as loaded by LoadConjugation(). The member
// declarations are not visible in this view; LoadConjugation() assigns the
// members |form|, |value_suffix| and |key_suffix|.
struct ConjugationType {
97
// Loads the conjugation-form file |filename| into |*output|.
//
// Each data line is split on tabs and spaces and must yield at least four
// fields: fields[0] is the map key, fields[1] the form name, fields[2] the
// value suffix and fields[3] the key suffix. A suffix written as "*" is
// stored as the empty string. Entries with the same key are appended to that
// key's vector in file order.
97
void LoadConjugation(const string &filename,
98
map<string, vector<ConjugationType> > *output) {
99
InputFileStream ifs(filename.c_str());
103
vector<string> fields;
104
while (getline(ifs, line)) {
105
// Empty lines and lines starting with '#' take a separate branch whose
// body is not visible in this view -- presumably they are skipped; confirm.
if (line.empty() || line[0] == '#') {
109
Util::SplitStringUsing(line, "\t ", &fields);
110
CHECK_GE(fields.size(), 4);
113
tmp.form = fields[1];
114
// "*" denotes "no suffix".
tmp.value_suffix = fields[2] == "*" ? "" : fields[2];
115
tmp.key_suffix = fields[3] == "*" ? "" : fields[3];
116
// operator[] creates the vector for fields[0] on first use.
(*output)[fields[0]].push_back(tmp); // insert
122
// Setup for the conversion routine (its function header and the declaration
// of |util| are not visible in this view): load the id table and the
// conjugation map, open the user POS input, and choose the output stream.
util.Open(FLAGS_id_file);
124
// Conjugation entries loaded from the cforms file, grouped by the first
// field of each line.
map<string, vector<ConjugationType> > inflection_map;
125
LoadConjugation(FLAGS_cforms_file, &inflection_map);
127
InputFileStream ifs(FLAGS_user_pos_file.c_str());
128
// Output goes to cout unless --output names a file.
ostream *ofs = &cout;
129
if (!FLAGS_output.empty()) {
130
// NOTE(review): allocated with raw new; whether it is deleted later is not
// visible in this view -- confirm.
ofs = new OutputFileStream(FLAGS_output.c_str());
135
vector<string> fields, pos_fields;
136
// One (user POS name, entry count) pair per emitted kConjugation<N> array;
// the index in this vector is the <N> used in the array name.
vector<pair<string, size_t> > pos_tokens;
138
// Processes the user POS file line by line. Each data line must yield at
// least three fields: fields[0] is the user POS name, fields[1] is the
// conjugation-type key and fields[2] is the feature string. One
// kConjugation<N> array is emitted per line, where <N> is the current size
// of |pos_tokens|, and a matching entry is then appended to |pos_tokens|.
while (getline(ifs, line)) {
139
// Empty lines and lines starting with '#' take a separate branch whose
// body is not visible in this view -- presumably they are skipped; confirm.
if (line.empty() || line[0] == '#') {
143
Util::SplitStringUsing(line, "\t ", &fields);
144
CHECK_GE(fields.size(), 3);
145
const string &user_pos = fields[0];
146
// Taken by value, unlike |user_pos| and |feature|, which are references.
const string ctype = fields[1];
147
const string &feature = fields[2];
150
// The statements below emit a one-entry array with NULL suffixes and the
// id looked up directly from the feature string. The condition selecting
// this branch is not visible in this view -- presumably it covers POS
// without conjugation; confirm.
const uint16 id = util.id(fields[2]);
152
// NOTE(review): this array is emitted as "static const" while the
// conjugating case below emits plain "const" -- confirm this is intended.
*ofs << "static const ConjugationType kConjugation" << pos_tokens.size()
154
*ofs << " { NULL, NULL, " << id << "}" << endl;
155
*ofs << "};" << endl;
156
pos_tokens.push_back(make_pair(user_pos, static_cast<size_t>(1)));
158
// Conjugating case: emit entries for the forms registered under |ctype|.
// operator[] inserts an empty vector for an unknown key, which the CHECK
// below then rejects.
vector<ConjugationType> &forms = inflection_map[ctype];
159
CHECK(!forms.empty());
160
*ofs << "const ConjugationType kConjugation"
161
<< pos_tokens.size() << "[] = {" << endl;
162
bool is_first = true;;
164
for (size_t i = 0; i < forms.size(); ++i) {
165
// replace <cform> with actual cform
167
Util::StringReplace(feature, "<cform>", forms[i].form, true, &output);
168
const uint16 id = util.id(output);
170
// The condition guarding this log statement is not visible in this view.
LOG(ERROR) << "Cannot find id for:" << output;
177
// Suffixes are escaped and wrapped in double quotes in the generated code.
*ofs << "\"" << Escape(forms[i].value_suffix) << "\"" << ", ";
178
*ofs << "\"" << Escape(forms[i].key_suffix) << "\"" << ", ";
184
*ofs << "};" << endl;
185
// |added| is maintained in lines not visible here -- presumably the number
// of entries written for this POS; confirm.
pos_tokens.push_back(make_pair(user_pos, added));
189
// Emits the kPOSToken table: one row per entry of |pos_tokens| holding the
// escaped POS name, the stored entry count and a reference to the
// kConjugation<i> array emitted earlier for the same index, followed by a
// { NULL, 0, NULL } terminator row.
*ofs << "const POSToken kPOSToken[] = {" << endl;
190
for (size_t i = 0; i < pos_tokens.size(); ++i) {
191
*ofs << " { \"" << Escape(pos_tokens[i].first) << "\", "
192
<< pos_tokens[i].second << ", kConjugation" << i << " }," << endl;
194
// Sentinel row marking the end of the table.
*ofs << " { NULL, 0, NULL }" << endl;
195
*ofs << "};" << endl;
204
// Program entry point. Sets FLAGS_logtostderr, initializes flags through
// InitGoogle, optionally takes the three input paths from positional
// arguments, and logs the resolved input paths. The remainder of the body is
// not visible in this view.
int main(int argc, char **argv) {
205
FLAGS_logtostderr = true;
206
InitGoogle(argv[0], &argc, &argv, false);
208
// Positional fallback when none of the three input flags was given. The
// final clause of this condition is not visible in this view.
// NOTE(review): confirm it checks that argc covers argv[1..3].
if (FLAGS_id_file.empty() &&
209
FLAGS_user_pos_file.empty() &&
210
FLAGS_cforms_file.empty() &&
212
FLAGS_id_file = argv[1];
213
FLAGS_user_pos_file = argv[2];
214
FLAGS_cforms_file = argv[3];
217
LOG(INFO) << FLAGS_id_file;
218
LOG(INFO) << FLAGS_user_pos_file;
219
LOG(INFO) << FLAGS_cforms_file;