2
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
4
* Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
6
* The contents of this file are subject to the terms of either the GNU Lesser
7
* General Public License Version 2.1 only ("LGPL") or the Common Development and
8
* Distribution License ("CDDL")(collectively, the "License"). You may not use this
9
* file except in compliance with the License. You can obtain a copy of the CDDL at
10
* http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
11
* http://www.opensource.org/licenses/lgpl-license.php. See the License for the
12
* specific language governing permissions and limitations under the License. When
13
* distributing the software, include this License Header Notice in each file and
14
* include the full text of the License in the License file as well as the
17
* NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
19
* For Covered Software in this distribution, this License shall be governed by the
20
* laws of the State of California (excluding conflict-of-law provisions).
21
* Any litigation relating to this License shall be subject to the jurisdiction of
22
* the Federal Courts of the Northern District of California and the state courts
23
* of the State of California, with venue lying in Santa Clara County, California.
27
* If you wish your version of this file to be governed by only the CDDL or only
28
* the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
29
* include this software in this distribution under the [CDDL or LGPL Version 2.1]
30
* license." If you don't indicate a single choice of license, a recipient has the
31
* option to distribute your version of this file under either the CDDL or the LGPL
32
* Version 2.1, or to extend the choice of license to its licensees as provided
33
* above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
34
* Version 2 license, then the option applies only if the new code is made subject
35
* to such option by the copyright holder.
59
cerr << "getWordFreq [-s corpus_size] [-v] [-e] -m slm_file -l lexicon\n";
60
cerr << " default corpus_size is 300000000 if not given\n";
61
cerr << " -v means output other information after word and freq for each line\n";
62
cerr << " -e give format for ervin\n";
66
// Language-model file name; copied from an option argument with strdup(),
// required to be non-NULL by getParameters(), and passed to slm.load().
static char* slm_file = NULL;
67
// Lexicon file name; copied from an option argument with strdup(),
// required to be non-NULL by getParameters(), and opened for reading with fopen().
static char* lexicon_file = NULL;
68
// Corpus size used to scale exp(-neglogpr) into an integer frequency.
// Defaults to 300000000; getParameters() only accepts values greater than 10.
static int corpus_size = 300000000;
69
// When true, output lines carry the id string in addition to word and frequency.
static bool verbose = false;
70
// Presumably selects the "-e" output format mentioned in the usage text;
// the place where it is set is not visible here -- TODO confirm.
static bool ervin = false;
73
getParameters(int argc, char* argv[])
76
while ((ch = getopt(argc, argv, "m:l:s:ve")) != -1) {
79
slm_file = strdup(optarg); break;
81
lexicon_file = strdup(optarg); break;
83
corpus_size = atoi(optarg); break;
85
verbose = true; break;
92
return (slm_file && lexicon_file && corpus_size > 10);
95
// Shared line buffer: tagFile() reads one line at a time into it with fgets()
// and tokenises it in place with strtok(), so its contents are overwritten per line.
static char buf[8192];
98
tagFile(FILE *fp, CThreadSlm& slm)
101
while (fgets(buf, sizeof(buf), fp) != NULL) {
103
char* wrd = strtok(buf, "\n\r \t");
104
char* idstr = strtok(NULL, "\n\r \t");
105
char* info = strtok(NULL, "\n\r");
107
int id = atoi(idstr);
109
CThreadSlm::TState st;
110
double neglogpr = slm.transfer(st, (unsigned int)id, st);
111
if (st.getLevel() == 1) {
112
freq = int(exp(-neglogpr) * corpus_size);
120
// Split the trailing info field on whitespace. Both strtok() calls must use
// the same delimiter set; the continuation call previously listed '\t' twice
// and omitted '\r'.
for (char *p = strtok(info, " \t\n\r"); p != NULL; p = strtok(NULL, " \t\n\r"))
123
for (int i=0, sz=pyv.size(); i < sz; ++i) {
124
cout << wrd << " " << pyv[i] << " " << freq << "\n";
126
} else if (idstr && verbose) {
127
cout << wrd << " " << idstr << " " << freq;
132
cout << wrd << " " << freq << "\n";
139
main(int argc, char*argv[])
141
if (!getParameters(argc, argv))
146
if (slm.load(slm_file, true) && (fp = fopen(lexicon_file, "r")) != NULL) {