~wintermute-devel/wintermute/linguistics-devel

« back to all changes in this revision

Viewing changes to wintermute-linguistics-en/src/EnglishParser.cpp

  • Committer: Jacky Alciné
  • Date: 2011-04-09 22:18:29 UTC
  • Revision ID: jackyalcine@gmail.com-20110409221829-igczcog3k673ici7
Hmm.. candy :)

Show diffs side-by-side

added added

removed removed

Lines of Context:
21
21
#include <iostream>
22
22
#include <vector>
23
23
#include <string>
24
 
#include <wintermute-linguistics.hpp>
25
 
#include <wintermute-database.hpp>
 
24
#include <wintermute/linguistics/wintermute-linguistics.hpp>
 
25
#include <wintermute/database/wintermute-database.hpp>
26
26
#include "EnglishParser.hpp"
27
27
 
28
28
using namespace std;
35
35
using Wintermute::Linguistics::Parser;
36
36
using Wintermute::Linguistics::English::EnglishParser;
37
37
using Wintermute::Linguistics::English::Phrase;
38
 
using Wintermute::Linguistics::English::NounPhrase;
39
 
using Wintermute::Linguistics::English::VerbPhrase;
40
38
using Wintermute::Data::Linguistics::Configuration;
41
39
using Wintermute::Data::Ontology::Instance;
42
40
 
43
 
/// Default-constructs the Parser base; no English-specific state is set up here.
EnglishParser::EnglishParser ( )
        : Parser ( )
{
}
44
 
 
45
 
const string EnglishParser::getLocale ( ) const {
46
 
        return "en";
47
 
}
48
 
 
49
 
char EnglishParser::getTextSeparator ( ) const {
50
 
        return ' ';
51
 
}
52
 
 
53
 
SyntacticBranch EnglishParser::generateSyntactics ( const StringVector& tokens ) {
54
 
        SyntacticBranch theTree;
55
 
 
56
 
        for ( StringVector::const_iterator itr = tokens.begin ( ); itr != tokens.end ( ); itr ++ ) {
57
 
                const string theRawWord = * itr;
58
 
                bool isPseudo;
59
 
 
60
 
                // Do word santiziation.
61
 
                string theWord;
62
 
                for ( int i = 0; i < theRawWord.length ( ); i ++ ) {
63
 
                        char theChar = theRawWord.at ( i );
64
 
                        if ( isalpha ( theChar ) || isdigit ( theChar ) )
65
 
                                theWord += tolower ( theChar );
66
 
                }
67
 
 
68
 
                NodeVector theBranches;
69
 
                const string theWordID = LexicalNode::toID ( theWord );
70
 
                LexicalNode* theLexicalNode = LexicalNode::fromID ( theWordID, this->getLocale ( ) );
71
 
                isPseudo = ( theLexicalNode == NULL );
72
 
                if ( isPseudo ) {
73
 
                        SyntacticNode* theNode = SyntacticNode::buildPsuedo ( theRawWord , "B",this->getLocale ( ) );
74
 
                        theBranches.push_back ( theNode );
75
 
                } else {
76
 
                        int paths = 1;
77
 
                        StringVector allPossibleMeanings = LexicalLink::obtainFlagsFor ( theLexicalNode,NULL );
78
 
                        paths *= allPossibleMeanings.size ( );
79
 
                        for ( StringVector::const_iterator stringItr = allPossibleMeanings.begin ( ); stringItr != allPossibleMeanings.end ( ); stringItr ++ ) {
80
 
                                const string theFlag = * stringItr;
81
 
                                SemanticNode* theMeaning = LexicalLink::getExactMeaningOf ( theLexicalNode,theFlag );
82
 
                                //cout << theFlag << " " << theMeaning->getID () << " => " << theLexicalNode->getID () << endl;
83
 
                                SyntacticNode* synNode = SyntacticNode::build ( theMeaning->getID ( ),theFlag,theRawWord,this->getLocale ( ) );
84
 
                                theBranches.push_back ( synNode );
85
 
                        }
86
 
                        //cout << "(linguistics) [EnglishParser] Discovered " << paths << " linkages for  symbol '" << theRawWord << "'." << endl;
87
 
                }
88
 
                theTree.push_back ( theBranches );
89
 
        }
90
 
        return theTree;
91
 
}
92
 
 
93
 
Phrase* EnglishParser::formPhrase ( const NodeVector::iterator& itr, NodeVector& list ) {
94
 
        SyntacticNode* curNode = dynamic_cast < SyntacticNode* > ( *itr );
95
 
        Phrase* thePhrase = NULL;
96
 
        //cout << "Word class: " << curNode->getType ( ) << ", " << curNode->getText() << endl;
97
 
        switch ( curNode->getType ( ) ) {
98
 
                case 'A': // pronoun
99
 
                {
100
 
                        /**
101
 
                         * @todo How to determine the last noun used that refers to this?
102
 
                         */
103
 
                        if ( curNode->hasFlag ( 'x' ) ) {
104
 
                                /* found a determiner, now snag it up! */
105
 
                                NodeVector theWords;
106
 
                                NodeVector::iterator nextWord = itr + 1;
107
 
                                theWords.push_back ( curNode );
108
 
                                for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
109
 
                                        SyntacticNode* aNode = dynamic_cast < SyntacticNode* > ( *nextWord );
110
 
                                        if ( aNode->getType ( ) == 'A' ||
111
 
                                                aNode->getType ( ) == 'B' ||
112
 
                                                aNode->getType ( ) == 'E' ||
113
 
                                                aNode->getType ( ) == 'F' ) {
114
 
                                                theWords.push_back ( curNode );
115
 
                                                skipWords ( 1 );
116
 
                                        } else
117
 
                                                break;
118
 
                                }
119
 
                                thePhrase = new Phrase ( theWords,curNode );
120
 
                                break;
121
 
                        } else if ( curNode->hasFlag ( "rideg" ) ) {
122
 
                                //
123
 
                        }
124
 
                }
125
 
                        break;
126
 
                case 'B': // noun
127
 
                {
128
 
                        /**
129
 
                         * Nouns can be anything of the following:
130
 
                         *  - pseudo
131
 
                         *  - concepts in the ontology
132
 
                         */
133
 
                        if ( curNode->isPsuedo ( ) ) {
134
 
                                /// Got a pseudo noun; link up the rest and make it a noun.
135
 
                                NodeVector theWords;
136
 
                                NodeVector::iterator nextWord = itr + 1;
137
 
                                theWords.push_back ( curNode );
138
 
                                for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
139
 
                                        SyntacticNode* aNode = dynamic_cast < SyntacticNode* > ( *nextWord );
140
 
                                        if ( aNode->getType ( ) == 'B' && aNode->isPsuedo ( ) ) {
141
 
                                                theWords.push_back ( curNode );
142
 
                                                skipWords ( 1 );
143
 
                                        } else
144
 
                                                break;
145
 
                                }
146
 
                                thePhrase = new Phrase ( theWords,curNode );
147
 
                        } else {
148
 
                                if ( curNode->hasFlag ( "coalp" ) ) {
149
 
 
150
 
                                }
151
 
                        }
152
 
                }
153
 
                        break;
154
 
                case 'C': // verb
155
 
                {
156
 
                        NodeVector theWords;
157
 
                        if ( itr + 1 == list.end ( ) ) {
158
 
                                // Might be a reflexive subject; like "He flew." or "We ate."
159
 
                        } else {
160
 
                                NodeVector::iterator nextWord = itr + 1;
161
 
                                theWords.push_back ( curNode );
162
 
                                for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
163
 
                                        SyntacticNode* aNode = dynamic_cast < SyntacticNode* > ( *nextWord );
164
 
                                        if ( aNode->getType ( ) == 'A' ||
165
 
                                                aNode->getType ( ) == 'B' ) {
166
 
                                                theWords.push_back ( curNode );
167
 
                                                skipWords ( 1 );
168
 
                                        }
169
 
                                }
170
 
                        }
171
 
                        thePhrase = new Phrase ( theWords,curNode );
172
 
                }
173
 
                        break;
174
 
                default: // WTF?
175
 
                {
176
 
                }
177
 
                        break;
178
 
        }
179
 
 
180
 
        return thePhrase;
181
 
}
182
 
 
183
 
void EnglishParser::doPreparsingWork ( PhraseVector& phrase ) {
184
 
        //cout << phrase.size ( ) << " phrases found." << endl;
185
 
}
186
 
 
187
 
SyntacticLink* EnglishParser::formSyntacticLink ( const PhraseVector::iterator& itr, PhraseVector& list ) {
188
 
        Phrase* thePhrase = * itr;
189
 
        const SyntacticNode* headNode = thePhrase->getHeadNode ( );
190
 
        SyntacticLink* theLink = NULL;
191
 
        switch ( headNode->getType ( ) ) {
192
 
                case 'A': // pronoun
193
 
                {
194
 
                }
195
 
                        break;
196
 
                case 'B': // noun
197
 
                {
198
 
                        if (headNode->isPsuedo ()){
199
 
                                /// This is a pseudo phrase like "John Mary", "Microsoft Kinect".
200
 
                                /// This handler right here does a bit of magic. ;)
201
 
                                /// @consider Pushing this up into the standard parser, so _all_ parsers benefit for?
202
 
 
203
 
                                /// Query the locale database for an entry associated with this term.
204
 
                                string rawText = thePhrase->toString();
205
 
                                Instance::QueryMap* results = Instance::query(rawText);
206
 
                                if (results){
207
 
                                        /// Wintermute's familiar with this concept.
208
 
                                        cout << "I know about" << rawText << endl;
209
 
                                } else {
210
 
                                        /// Wintermute has no idea what this is.
211
 
                                        cout << "What's a " << rawText << endl;
212
 
                                }
213
 
                        }
214
 
                }
215
 
                        break;
216
 
                case 'C':
217
 
                {
218
 
                }
219
 
                        break;
220
 
                default: // shouldn't even happen.
221
 
                {
222
 
                        cout << "Not sure.." << endl;
223
 
                }
224
 
                        break;
225
 
        }
226
 
 
227
 
        return theLink;
228
 
}
 
 
b'\\ No newline at end of file'
 
41
namespace Wintermute {
 
42
        namespace Linguistics {
 
43
                namespace English {
 
44
 
 
45
                        EnglishParser::EnglishParser ( ) : Parser ( ) { }
 
46
 
 
47
                        const string EnglishParser::getLocale ( ) const {
 
48
                                return "en";
 
49
                        }
 
50
 
 
51
                        char EnglishParser::getTextSeparator ( ) const {
 
52
                                return ' ';
 
53
                        }
 
54
 
 
55
                        SyntacticBranch EnglishParser::generateSyntactics ( const StringVector& tokens ) {
 
56
                                SyntacticBranch theTree;
 
57
 
 
58
                                for ( StringVector::const_iterator itr = tokens.begin ( ); itr != tokens.end ( ); itr ++ ) {
 
59
                                        const string theRawWord = * itr;
 
60
                                        bool isPseudo;
 
61
 
 
62
                                        // Do word santiziation.
 
63
                                        string theWord;
 
64
                                        for ( int i = 0; i < theRawWord.length ( ); i ++ ) {
 
65
                                                char theChar = theRawWord.at ( i );
 
66
                                                if ( isalpha ( theChar ) || isdigit ( theChar ) )
 
67
                                                        theWord += tolower ( theChar );
 
68
                                        }
 
69
 
 
70
                                        NodeVector theBranches;
 
71
                                        const string theWordID = LexicalNode::toID ( theWord );
 
72
                                        const LexicalNode* theLexicalNode = LexicalNode::fromID ( theWordID, this->getLocale ( ) );
 
73
                                        isPseudo = ( theLexicalNode == NULL );
 
74
                                        if ( isPseudo ) {
 
75
                                                SyntacticNode* theNode = SyntacticNode::buildPsuedo ( theRawWord , "B",this->getLocale ( ) );
 
76
                                                theBranches.push_back ( theNode );
 
77
                                        } else {
 
78
                                                int paths = 1;
 
79
                                                StringVector allPossibleMeanings = LexicalLink::obtainFlagsFor ( theLexicalNode,NULL );
 
80
                                                paths *= allPossibleMeanings.size ( );
 
81
                                                for ( StringVector::const_iterator stringItr = allPossibleMeanings.begin ( ); stringItr != allPossibleMeanings.end ( ); stringItr ++ ) {
 
82
                                                        const string theFlag = * stringItr;
 
83
                                                        const SemanticNode* theMeaning = LexicalLink::getExactMeaningOf ( theLexicalNode,theFlag );
 
84
                                                        // cout << theFlag << " " << theMeaning->getID () << " => " << theLexicalNode->getID () << endl;
 
85
                                                        SyntacticNode* synNode = SyntacticNode::build ( theMeaning->getID ( ),theFlag,theRawWord,this->getLocale ( ) );
 
86
                                                        theBranches.push_back ( synNode );
 
87
                                                }
 
88
                                                // cout << "(linguistics) [EnglishParser] Discovered " << paths << " linkages for  symbol '" << theRawWord << "'." << endl;
 
89
                                        }
 
90
                                        theTree.push_back ( theBranches );
 
91
                                }
 
92
                                return theTree;
 
93
                        }
 
94
 
 
95
                        Phrase* EnglishParser::formPhrase ( const NodeVector::iterator& itr, NodeVector& list ) {
 
96
                                const SyntacticNode* curNode = dynamic_cast < const SyntacticNode* > ( *itr );
 
97
                                Phrase* thePhrase = NULL;
 
98
                                //cout << "Word class: " << curNode->getType ( ) << ", " << curNode->getText() << endl;
 
99
                                switch ( curNode->getType ( ) ) {
 
100
                                        case 'A': // pronoun
 
101
                                        {
 
102
                                                /**
 
103
                                                 * @todo How to determine the last noun used that refers to this?
 
104
                                                 */
 
105
                                                if ( curNode->hasFlag ( 'x' ) ) {
 
106
                                                        /* found a determiner, now snag it up! */
 
107
                                                        NodeVector theWords;
 
108
                                                        NodeVector::iterator nextWord = itr + 1;
 
109
                                                        theWords.push_back ( curNode );
 
110
                                                        for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
 
111
                                                                const SyntacticNode* aNode = dynamic_cast < const SyntacticNode* > ( *nextWord );
 
112
                                                                if ( aNode->getType ( ) == 'A' ||
 
113
                                                                        aNode->getType ( ) == 'B' ||
 
114
                                                                        aNode->getType ( ) == 'E' ||
 
115
                                                                        aNode->getType ( ) == 'F' ) {
 
116
                                                                        theWords.push_back ( curNode );
 
117
                                                                        skipWords ( 1 );
 
118
                                                                } else
 
119
                                                                        break;
 
120
                                                        }
 
121
                                                        thePhrase = new Phrase ( theWords,curNode );
 
122
                                                        break;
 
123
                                                } else if ( curNode->hasFlag ( "rideg" ) ) {
 
124
                                                        //
 
125
                                                }
 
126
                                        }
 
127
                                                break;
 
128
                                        case 'B': // noun
 
129
                                        {
 
130
                                                /**
 
131
                                                 * Nouns can be anything of the following:
 
132
                                                 *  - pseudo
 
133
                                                 *  - concepts in the ontology
 
134
                                                 */
 
135
                                                if ( curNode->isPsuedo ( ) ) {
 
136
                                                        /// Got a pseudo noun; link up the rest and make it a noun.
 
137
                                                        NodeVector theWords;
 
138
                                                        NodeVector::iterator nextWord = itr + 1;
 
139
                                                        theWords.push_back ( curNode );
 
140
                                                        for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
 
141
                                                                const SyntacticNode* aNode = dynamic_cast < const SyntacticNode* > ( *nextWord );
 
142
                                                                if ( aNode->getType ( ) == 'B' && aNode->isPsuedo ( ) ) {
 
143
                                                                        theWords.push_back ( curNode );
 
144
                                                                        skipWords ( 1 );
 
145
                                                                } else
 
146
                                                                        break;
 
147
                                                        }
 
148
                                                        thePhrase = new Phrase ( theWords,curNode );
 
149
                                                } else {
 
150
                                                        if ( curNode->hasFlag ( "coalp" ) ) {
 
151
 
 
152
                                                        }
 
153
                                                }
 
154
                                        }
 
155
                                                break;
 
156
                                        case 'C': // verb
 
157
                                        {
 
158
                                                NodeVector theWords;
 
159
                                                if ( itr + 1 == list.end ( ) ) {
 
160
                                                        // Might be a reflexive subject; like "He flew.", "We quickly ate."
 
161
                                                } else {
 
162
                                                        NodeVector::iterator nextWord = itr + 1;
 
163
                                                        theWords.push_back ( curNode );
 
164
                                                        for ( ;nextWord != list.end ( ) ;nextWord ++ ) {
 
165
                                                                const SyntacticNode* aNode = dynamic_cast < const SyntacticNode* > ( *nextWord );
 
166
                                                                if ( aNode->getType ( ) == 'A' || aNode->getType ( ) == 'B' || aNode->getType ( ) == 'D' || aNode->getType ( ) == 'E' ) {
 
167
                                                                        theWords.push_back ( curNode );
 
168
                                                                        skipWords ( 1 );
 
169
                                                                }
 
170
                                                        }
 
171
                                                }
 
172
                                                thePhrase = new Phrase ( theWords,curNode );
 
173
                                        }
 
174
                                                break;
 
175
                                        default: // WTF?
 
176
                                        {
 
177
                                        }
 
178
                                                break;
 
179
                                }
 
180
 
 
181
                                return thePhrase;
 
182
                        }
 
183
 
 
184
                        void EnglishParser::doPreparsingWork ( PhraseVector& phrase ) {
 
185
                                //cout << phrase.size ( ) << " phrases found." << endl;
 
186
                        }
 
187
 
 
188
                        SyntacticLink* EnglishParser::formSyntacticLink ( const PhraseVector::iterator& itr, PhraseVector& list ) {
 
189
                                Phrase* thePhrase = * itr;
 
190
                                const SyntacticNode* headNode = thePhrase->getHeadNode ( );
 
191
                                SyntacticLink* theLink = NULL;
 
192
                                switch ( headNode->getType ( ) ) {
 
193
                                        case 'A': // pronoun
 
194
                                        {
 
195
                                        }
 
196
                                                break;
 
197
                                        case 'B': // noun
 
198
                                        {
 
199
                                                if ( headNode->isPsuedo ( ) ) {
 
200
                                                        /// This is a pseudo phrase like "John Mary", "Microsoft Kinect".
 
201
                                                        /// This handler right here does a bit of magic. ;)
 
202
                                                        /// @consider Pushing this up into the standard parser, so _all_ parsers benefit for?
 
203
 
 
204
                                                        /// Query the locale database for an entry associated with this term.
 
205
                                                        string rawText = thePhrase->toString ( );
 
206
                                                        Instance::QueryMap* results = Instance::query ( rawText );
 
207
                                                        if ( results ) {
 
208
                                                                /// Wintermute's familiar with this concept.
 
209
                                                                cout << "I know about" << rawText << endl;
 
210
                                                        } else {
 
211
                                                                /// Wintermute has no idea what this is.
 
212
                                                                cout << "What's a " << rawText << endl;
 
213
                                                        }
 
214
                                                }
 
215
                                        }
 
216
                                                break;
 
217
                                        case 'C': // verb
 
218
                                        {
 
219
                                                /// When it comes to linking, verbs need a subject & an object.
 
220
                                                /// We only look for the object, to save effort.
 
221
                                                PhraseVector::iterator aItr = itr + 1;
 
222
                                                Phrase* phrObj = NULL;
 
223
                                                NodeVector modifiers;
 
224
                                                for ( ; aItr != list.end ( ); aItr ++ ) {
 
225
                                                        Phrase* aPhrase = * aItr;
 
226
                                                        const char theType = aPhrase->getHeadNode ( )->getType ( );
 
227
                                                        if ( theType == 'A' || theType == 'B' ) {
 
228
                                                                /// It's our object!
 
229
                                                                phrObj = aPhrase;
 
230
                                                                break;
 
231
                                                        } else if ( theType == 'D' || theType == 'E' ) {
 
232
                                                                /// Grrr, it's an adverb or an adjective, hmm.
 
233
                                                                /// Shouldn't be bumping into this.
 
234
                                                                modifiers.push_back ( aPhrase );
 
235
                                                        } else {
 
236
                                                                phrObj = NULL;
 
237
                                                                break;
 
238
                                                        }
 
239
                                                }
 
240
                                                theLink = SyntacticLink::build ( thePhrase,( ( phrObj == NULL )?NULL:phrObj ),"C",&modifiers );
 
241
                                        }
 
242
                                                break;
 
243
                                        default: // Shouldn't even happen; but people say crazy things :)
 
244
                                        {
 
245
                                                cout << "Not sure.." << endl;
 
246
                                        }
 
247
                                                break;
 
248
                                }
 
249
 
 
250
                                return theLink;
 
251
                        }
 
252
 
 
253
                        void EnglishParser::interpretLinks ( const LinkVector& theLinks, const PhraseVector& thePhrase ) {
 
254
                                for ( LinkVector::const_iterator currentLinkItr = theLinks.begin ( ); currentLinkItr != theLinks.end ( ); currentLinkItr ++ ) {
 
255
                                        const SyntacticLink* currentLink = dynamic_cast < const SyntacticLink* > ( *currentLinkItr );
 
256
                                        const Phrase* srcPhrase = dynamic_cast < const Phrase* > ( currentLink->getSource ( ) );
 
257
                                        const Phrase* dstPhrase = dynamic_cast < const Phrase* > ( currentLink->getDestination ( ) );
 
258
 
 
259
                                        const char theFlag = currentLink->getFlags ( ).at ( 0 );
 
260
                                        switch ( theFlag ) {
 
261
                                                case 'C': // Verb
 
262
                                                {
 
263
                                                        if ( dstPhrase == NULL ) {
 
264
                                                                PhraseVector::const_iterator itr = find ( thePhrase.begin ( ),thePhrase.end ( ),srcPhrase );
 
265
                                                                cout << "No subject specified for '" << srcPhrase->getHeadNode ( )->getText ( ) << "'." << endl;
 
266
                                                        }
 
267
                                                }
 
268
                                                        break;
 
269
                                                default:
 
270
                                                {
 
271
                                                        cout << "I'm confused.." << endl;
 
272
                                                }
 
273
                                                        break;
 
274
                                        }
 
275
                                }
 
276
                        }
 
277
                }
 
278
        }
 
279
}