1
/* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc.
2
Use is subject to license terms.
4
This program is free software; you can redistribute it and/or modify
5
it under the terms of the GNU General Public License as published by
6
the Free Software Foundation; version 2 of the License.
8
This program is distributed in the hope that it will be useful,
9
but WITHOUT ANY WARRANTY; without even the implied warranty of
10
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
GNU General Public License for more details.
13
You should have received a copy of the GNU General Public License
14
along with this program; if not, write to the Free Software
15
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
17
#ifndef _my_plugin_ftparser_h
18
#define _my_plugin_ftparser_h
21
/*************************************************************************
  API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
*/

/*
  Version of the full-text parser plugin interface. A parser descriptor
  reports it through its interface_version member.
*/
#define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100
27
/* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
28
enum enum_ftparser_mode
{
/*
  Fast and simple mode.  This mode is used for indexing, and natural
  language queries.

  The parser is expected to return only those words that go into the
  index. Stopwords or too short/long words should not be returned. The
  'boolean_info' argument of mysql_add_word() does not have to be set.
*/
  MYSQL_FTPARSER_SIMPLE_MODE= 0,

/*
  Parse with stopwords mode.  This mode is used in boolean searches for
  "phrase matching."

  The parser is not allowed to ignore words in this mode.  Every word
  should be returned, including stopwords and words that are too short
  or long.  The 'boolean_info' argument of mysql_add_word() does not
  have to be set.
*/
  MYSQL_FTPARSER_WITH_STOPWORDS= 1,

/*
  Parse in boolean mode.  This mode is used to parse a boolean query string.

  The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
  structure in the 'boolean_info' argument to mysql_add_word().
  Usually that means that the parser should recognize boolean operators
  in the parsing stream and set appropriate fields in
  MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly.  As for
  MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
  Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
*/
  MYSQL_FTPARSER_FULL_BOOLEAN_INFO= 2
};
66
/*
  Token types for boolean mode searching (used for the type member of
  MYSQL_FTPARSER_BOOLEAN_INFO struct)

  FT_TOKEN_EOF: End of data.
  FT_TOKEN_WORD: Regular word.
  FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
  FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
  FT_TOKEN_STOPWORD: Stopword.
*/

enum enum_ft_token_type
{
  FT_TOKEN_EOF= 0,
  FT_TOKEN_WORD= 1,
  FT_TOKEN_LEFT_PAREN= 2,
  FT_TOKEN_RIGHT_PAREN= 3,
  FT_TOKEN_STOPWORD= 4
};
86
/*
  This structure is used in boolean search mode only. It conveys
  boolean-mode metadata to the MySQL search engine for every word in
  the search query. A valid instance of this structure must be filled
  in by the plugin parser and passed as an argument in the call to
  mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
  structure) when a query is parsed in boolean mode.

  type: The token type.  Should be one of the enum_ft_token_type values.

  yesno: Whether the word must be present for a match to occur:
    >0 Must be present
    <0 Must not be present
    0  Neither; the word is optional but its presence increases the relevance
  With the default settings of the ft_boolean_syntax system variable,
  >0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
  and 0 means neither operator was used.

  weight_adjust: A weighting factor that determines how much a match
  for the word counts.  Positive values increase, negative - decrease the
  relative word's importance in the query.

  wasign: The sign of the word's weight in the query. If it's non-negative
  the match for the word will increase document relevance, if it's
  negative - decrease (the word becomes a "noise word", the less of it the
  better).

  trunc: Corresponds to the '*' operator in the default setting of the
  ft_boolean_syntax system variable.
*/
116
/*
  Boolean-mode information attached to a single token; passed to the
  mysql_add_word() callback through its 'boolean_info' argument.

  NOTE(review): only the 'type' member is declared in the text as seen here.
  The accompanying documentation also describes yesno, weight_adjust, wasign
  and trunc, and the comment below announces parser-state members that are
  not visible either. The opening brace is absent and bare numeric lines are
  interleaved with the declarations, so lines appear to have been lost --
  confirm the full member list and types against the upstream header before
  relying on this layout.
*/
typedef struct st_mysql_ftparser_boolean_info
118
enum enum_ft_token_type type;
123
/* These are parser state and must be removed. */
126
} MYSQL_FTPARSER_BOOLEAN_INFO;
129
/*
  The following flag means that buffer with a string (document, word)
  may be overwritten by the caller before the end of the parsing (that is
  before st_mysql_ftparser::deinit() call). If the string needs to
  survive between two successive calls of the parsing function, the
  callee must save a copy of it. The flag may be set by MySQL before calling
  st_mysql_ftparser::parse(), or it may be set by a plugin before calling
  st_mysql_ftparser_param::mysql_parse() or
  st_mysql_ftparser_param::mysql_add_word().
*/
#define MYSQL_FTFLAGS_NEED_COPY 1
141
/*
  An argument of the full-text parser plugin. This structure is
  filled in by MySQL server and passed to the parsing function of the
  plugin as an in/out parameter.

  mysql_parse: A pointer to the built-in parser implementation of the
  server. It's set by the server and can be used by the parser plugin
  to invoke the MySQL default parser.  If the plugin's role is to extract
  textual data from .doc, .pdf or .xml content, it might extract
  plaintext from the content, and then pass the text to the default
  MySQL parser to be parsed.

  mysql_add_word: A server callback to add a new word.  When parsing
  a document, the server sets this to point at a function that adds
  the word to MySQL full-text index.  When parsing a search query,
  this function will add the new word to the list of words to search
  for.  The boolean_info argument can be NULL for all cases except
  when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.

  ftparser_state: A generic pointer. The plugin can set it to point
  to information to be used internally for its own purposes.

  mysql_ftparam: This is set by the server.  It is used by MySQL functions
  called via mysql_parse() and mysql_add_word() callback.  The plugin
  should not modify it.

  cs: Information about the character set of the document or query string.

  doc: A pointer to the document or query string to be parsed.

  length: Length of the document or query string, in bytes.

  flags: See MYSQL_FTFLAGS_* constants above.

  mode: The parsing mode.  With boolean operators, with stopwords, or
  nothing.  See enum_ftparser_mode above.
*/
178
/*
  Argument block that the server fills in and hands to the plugin's parsing
  function as an in/out parameter.

  NOTE(review): the accompanying documentation also describes members named
  mysql_ftparam, doc, length and flags, none of which are declared in the
  text as seen here. The opening brace is absent and bare numeric lines are
  interleaved with the declarations, so lines appear to have been lost --
  confirm the full member list, order and types against the upstream header
  before relying on this layout.
*/
typedef struct st_mysql_ftparser_param
180
/* Server-provided entry point to the built-in parser. */
int (*mysql_parse)(struct st_mysql_ftparser_param *,
181
char *doc, int doc_len);
182
/*
  Server callback that receives each word; boolean_info may be NULL except
  when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
*/
int (*mysql_add_word)(struct st_mysql_ftparser_param *,
183
char *word, int word_len,
184
MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info);
185
/* Generic pointer the plugin may use for its own internal purposes. */
void *ftparser_state;
187
/* Character set information for the document or query string. */
struct charset_info_st *cs;
191
/* Parsing mode; one of the enum_ftparser_mode values. */
enum enum_ftparser_mode mode;
192
} MYSQL_FTPARSER_PARAM;
195
/*
  Full-text parser descriptor.

  interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
  The parsing, initialization, and deinitialization functions are
  invoked per SQL statement for which the parser is used.
*/
202
struct st_mysql_ftparser
204
int interface_version;
205
int (*parse)(MYSQL_FTPARSER_PARAM *param);
206
int (*init)(MYSQL_FTPARSER_PARAM *param);
207
int (*deinit)(MYSQL_FTPARSER_PARAM *param);