2
* Copyright 2008 Novamente LLC
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
* you may not use this file except in compliance with the License.
6
* You may obtain a copy of the License at
8
* http://www.apache.org/licenses/LICENSE-2.0
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
19
import java.io.Serializable;
20
import java.util.ArrayList;
22
import relex.feature.Atom;
23
import relex.feature.FeatureForeach;
24
import relex.feature.FeatureNode;
25
import relex.feature.FeatureNodeCallback;
26
import relex.feature.LinkableView;
27
import relex.feature.RelationCallback;
28
import relex.feature.RelationForeach;
29
import relex.stats.SimpleTruthValue;
30
import relex.tree.PhraseTree;
33
* A ParsedSentence object stores all of the syntactic and semantic
34
* information about a sentence parse. The data in the Object is
35
* gradually built up by RelationExtractor.
37
* ParsedSentence contains:
38
* 1. A FeatureNode with metaData about the parse (i.e., the number
40
* 2. An ArrayList of FeatureNodes (leafConstituents) representing each
41
* word in the sentence. -- the parse data can be found by checking
42
* the links in these words.
43
* 3. Strings representing the original sentence, and representations
45
* 4. Sets of relations, with the semantic data from the sentence.
46
* 5. A TruthValue (inherited from Atom) that ranks the relative
47
* likelihood of this parse being a correct (meaningful) parse
50
public class ParsedSentence extends Atom implements Serializable
52
// Version stamp used by Java serialization (the class implements Serializable).
private static final long serialVersionUID = -5518792541801263127L;
54
// Unique ID string identifying this parse.
55
private String idString;
57
// Back-pointer to collection of other parses for this sentence
58
private Sentence sentence;
60
// String containing the original sentence
61
private String original;
63
// String containing the ascii-art tree output by the link grammar parser.
64
private String linkString;
66
// A string containing the Penn tree-bank style markup,
67
// aka "phrase structure" markup, for example
68
// (S (NP I) (VP am (NP a big robot)) .)
69
private String phraseString;
71
// Error text attached to this parse; written by setErrorString().
// NOTE(review): presumably describes a parser failure -- confirm with callers.
private String errorString;
73
// Metadata about the sentence; primarily, this consists of diagnostic
74
// info returned by the link grammar parser.
75
private FeatureNode metaData;
77
// An ArrayList of FeatureNodes, each one representing a word in the
78
// sentence. If there are no "link islands", each can be reached by
79
// following arcs from the others.
80
private ArrayList<FeatureNode> leafConstituents;
82
/* -------------------------------------------------------------------- */
83
/* Constructors, and setters/getters for private members. */
85
// Creates a parse record for the given sentence text: remembers the
// original string and starts out with an empty list of word nodes.
// NOTE(review): some statements of this constructor are not visible in
// this view; other fields may be initialized here as well.
public ParsedSentence(String originalString)
87
original = originalString;
91
leafConstituents = new ArrayList<FeatureNode>();
94
// Setter for the parse metadata node.
// NOTE(review): body not visible in this view; presumably assigns f to metaData -- confirm.
public void setMetaData(FeatureNode f) {
98
// Getter for the parse metadata node.
// NOTE(review): body not visible in this view; presumably returns metaData -- confirm.
public FeatureNode getMetaData() {
102
// Getter for the original sentence text.
// NOTE(review): body not visible in this view; presumably returns the field "original" -- confirm.
public String getOriginalSentence() {
106
// Getter for the ID string of this parse.
// NOTE(review): body not visible in this view; presumably returns idString -- confirm.
public String getIDString() {
110
// Getter for the owning Sentence.
// NOTE(review): body not visible in this view; presumably returns the field "sentence" -- confirm.
public Sentence getSentence() {
114
// Setter for the owning Sentence.
// NOTE(review): body not visible in this view; presumably assigns s to the field "sentence" -- confirm.
public void setSentence(Sentence s) {
118
// Setter for the ID string of this parse.
// NOTE(review): body not visible in this view; presumably assigns str to idString -- confirm.
public void setIDString(String str) {
122
// Getter for the link-grammar diagram string.
// NOTE(review): body not visible in this view; presumably returns linkString -- confirm.
public String getLinkString() {
126
// Setter for the link-grammar diagram string.
// NOTE(review): body not visible in this view; presumably assigns str to linkString -- confirm.
public void setLinkString(String str) {
130
// Getter for the phrase-structure markup string.
// NOTE(review): body not visible in this view; presumably returns phraseString -- confirm.
public String getPhraseString() {
134
// Setter for the phrase-structure markup string.
// NOTE(review): body not visible in this view; presumably assigns str to phraseString -- confirm.
public void setPhraseString(String str) {
138
public void setErrorString(String eString) {
139
errorString = eString;
142
// Getter for the error text of this parse.
// NOTE(review): body not visible in this view; presumably returns errorString -- confirm.
public String getErrorString() {
146
/* -------------------------------------------------------------------- */
147
public int getNumWords()
149
return leafConstituents.size();
153
* Return the i'th word in the sentence, as a feature node
155
public FeatureNode getWordAsNode(int i)
157
return leafConstituents.get(i);
161
* Return the i'th lemmatized word in the sentence, as a string.
162
* This is the "root form" of the word, and not the original word.
164
public String getWord(int i)
166
return LinkableView.getWordString(getWordAsNode(i));
170
* Return the i'th word in the sentence, as a string
171
* This is the original form of the word, and not its lemma.
173
public String getOrigWord(int i)
175
return LinkableView.getOrigWordString(getWordAsNode(i));
179
* Return the part-of-speech of the i'th word in the sentence
181
public String getPOS(int i)
183
return LinkableView.getPOS(getWordAsNode(i));
187
* Return the offset, in the original sentence, to the first
188
* character of the i'th word in the sentence.
190
public int getStartChar(int i)
192
return LinkableView.getStartChar(getWordAsNode(i));
195
public void addWord(FeatureNode w)
197
leafConstituents.add(w);
201
* Return feature node for the indicated word. Return null
202
* if the word cannot be found in the sentence. The input
203
* word may be either the word as it appears in the sentence,
204
* or its morphological root.
206
* If there are multiple occurrences of a word in a sentence,
207
* this will return only the left-most such occurrence.
209
// Searches the words of this parse for one whose string matches `word`.
// NOTE(review): several lines of this method (the word_cb fields and
// constructor, the branch bodies and the return statements) are not
// visible in this view; the comments below cover only what is shown.
public FeatureNode findWord(String word)
211
// Local callback handed to FeatureForeach.foreachWord() below.
class word_cb implements FeatureNodeCallback
221
// Compares the string value of fstr against match_word.
Boolean test(FeatureNode fn, FeatureNode fstr)
223
// The requested feature is absent on this node: no match.
if (null == fstr) return false;
224
String w = fstr.getValue();
225
if (match_word.equals(w))
232
public Boolean FNCallback(FeatureNode fn)
234
// Try the "orig_str" feature first ...
Boolean rc = test(fn, fn.get("orig_str"));
236
// ... then the "str" feature.
// NOTE(review): presumably "str" holds the morphological root -- confirm.
rc = test(fn, fn.get("str"));
241
word_cb cb = new word_cb(word);
242
// Run the callback over the words, starting from the left-most one.
FeatureForeach.foreachWord(getLeft(), cb);
246
/* -------------------------------------------------------------------- */
247
/* Various different views of the parsed sentence */
250
* Shows the full feature structure of the parse as it can be found by
251
* tracing links from the left-most word. Islands will be missed.
253
// Returns getLeft().toString() when a left-most word node is available.
// NOTE(review): the fallback executed otherwise is not visible in this view.
public String fullParseString()
255
if (getLeft() != null)
256
return getLeft().toString();
261
* Returns a list of the words in the sentence, marked up according to
262
* which "part of speech" they are. Thus, for example:
263
* "The big red balloon floated away." becomes
264
* LEFT-WALL The.det big.adj red.adj balloon.noun float.verb away.prep .
266
// Builds one string listing every word of the parse with its
// part-of-speech appended.
// NOTE(review): the statements guarded by the tense check and by the
// "not the last word" check are not visible in this view.
public String printPartsOfSpeech()
268
StringBuffer sb = new StringBuffer();
269
for (int i = 0; i < leafConstituents.size(); i++) {
270
// Start with the lemmatized form of the word ...
sb.append(getWord(i));
271
LinkableView w = new LinkableView(getWordAsNode(i));
272
String pos = w.getPOS();
273
// ... then add ".pos", unless the tag is missing or is the literal "WORD".
if (pos != null && !pos.equals("WORD"))
274
sb.append("." + pos);
275
String tense = w.getTenseVal(); // ??? tense is not working ...
276
if (tense != null && tense.length() > 0)
278
// True for every word except the last one.
if (i < leafConstituents.size() - 1)
283
return sb.toString();
286
// NOTE(review): the body of this override is not visible in this view.
public String toString()
291
/* ---------------------------------------------------------------- */
293
* Call the callback on each relation in the sentence
295
public Boolean foreach(RelationCallback cb)
297
return RelationForeach.foreach(getLeft(), cb);
300
public Boolean foreach(FeatureNodeCallback cb)
302
return RelationForeach.foreach(getLeft(), cb);
305
/* ---------------------------------------------------------------- */
307
* @return the FeatureNode representing the left-most word in the sentence.
309
public FeatureNode getLeft()
311
return leafConstituents.get(0);
315
* @return the phrase tree associated with this parse
317
public PhraseTree getPhraseTree()
319
return new PhraseTree(getLeft());
322
/* ---------------------------------------------------------------- */
323
/* Return unpacked meta information about parse, and ranking too */
325
public int getAndCost()
327
return getMeta("and_cost");
330
public int getDisjunctCost()
332
return getMeta("disjunct_cost");
335
public int getLinkCost()
337
return getMeta("link_cost");
340
public int getNumSkippedWords()
342
return getMeta("num_skipped_words");
345
private int getMeta(String str)
347
FeatureNode fn = metaData.get(str);
348
if (fn == null) return -1;
349
String val = fn.getValue();
350
return Integer.parseInt(val);
354
* Perform a crude parse-ranking based on Link-grammar output.
355
* The ranking will be stored as the "confidence" of the
356
* TruthValue associated with this parse.
358
* @return the score that was assigned.
360
* A classic example of competing parses for a sentence is:
361
* (S (NP I) (VP saw (NP the man) (PP with (NP the binoculars))) .)
362
* (S (NP I) (VP saw (NP (NP the man) (PP with (NP the binoculars)))) .)
363
* The ranker below gives both about equal scores.
366
// Computes a confidence score for this parse from the link-grammar
// cost figures held in the metadata.
// NOTE(review): some statements of this method (including its return
// statement) are not visible in this view.
public double simpleRankParse()
368
SimpleTruthValue stv = new SimpleTruthValue();
370
stv.setMean(1.0); // 1.0 == true -- this is a parse.
372
// The weights used here are rather ad-hoc; but the
373
// basic idea is that we want to penalize skipped words
374
// strongly, but disjunct costs not as much. Low link
375
// costs are the tiebreaker.
376
double weight = 0.4 * getNumSkippedWords();
377
weight += 0.2 * getDisjunctCost();
378
weight += 0.06 * getAndCost();
379
weight += 0.012 * getLinkCost();
381
// Turn the accumulated penalty into a score: a penalty of zero gives
// exp(0) == 1.0, and the score shrinks as the penalty grows.
weight = Math.exp(-weight);
383
stv.setConfidence(weight);
388
* Take the current parse confidence, and rescale it by the
389
* indicated amount. The method simpleRankParse() must have
390
* been previously called to perform the initial ranking.
392
public void rescaleRank(double weight)
394
SimpleTruthValue stv = (SimpleTruthValue) truth_value;
395
double confidence = stv.getConfidence();
396
confidence *= weight;
397
stv.setConfidence(confidence);
400
public double getRank()
402
SimpleTruthValue stv = (SimpleTruthValue) truth_value;
403
return stv.getConfidence();
406
// Hash code derived from the original sentence text and the word count.
// NOTE(review): the statement executed when original is null is not
// visible in this view.
public int hashCode()
408
if (original == null)
410
// Combine the text hash with the number of words using bitwise OR.
return original.hashCode() | leafConstituents.size();
413
// Two parses compare equal when their original sentence strings and
// their word lists are equal.
// NOTE(review): the statement executed when x is not a ParsedSentence
// is not visible in this view.
public boolean equals(Object x)
415
if (! (x instanceof ParsedSentence))
417
ParsedSentence p = (ParsedSentence)x;
418
// With no original text on this side, only the other side's text is
// checked; the word lists are not compared on this path.
if (original == null)
419
return p.original == null;
421
return original.equals(p.original) && this.leafConstituents.equals(p.leafConstituents);
424
} // end ParsedSentence