4
static VALUE cQueryParser;
5
VALUE cQueryParseException;
7
extern VALUE sym_analyzer;
8
static VALUE sym_wild_card_downcase;
9
static VALUE sym_fields;
10
static VALUE sym_all_fields;
11
static VALUE sym_tkz_fields;
12
static VALUE sym_default_field;
13
static VALUE sym_validate_fields;
14
static VALUE sym_or_default;
15
static VALUE sym_default_slop;
16
static VALUE sym_handle_parse_errors;
17
static VALUE sym_clean_string;
18
static VALUE sym_max_clauses;
19
static VALUE sym_use_keywords;
21
extern VALUE frt_get_analyzer(Analyzer *a);
22
extern VALUE frt_get_q(Query *q);
23
extern Analyzer *frt_get_cwrapped_analyzer(VALUE ranalyzer);
25
/****************************************************************************
29
****************************************************************************/
35
qp_destroy((QParser *)p);
41
frt_gc_mark(((QParser *)p)->analyzer);
45
frt_get_fields(VALUE rfields)
51
if (rfields == Qnil) return NULL;
53
fields = hs_new_str(&free);
54
if (TYPE(rfields) == T_ARRAY) {
56
for (i = 0; i < RARRAY(rfields)->len; i++) {
57
rval = rb_obj_as_string(RARRAY(rfields)->ptr[i]);
58
hs_add(fields, nstrdup(rval));
61
rval = rb_obj_as_string(rfields);
62
if (strcmp("*", rs2s(rval)) == 0) {
66
s = str = nstrdup(rval);
67
/* strchr() yields a pointer: compare against NULL, not the char constant '\0'. */
while ((p = strchr(s, '|')) != NULL) {
69
hs_add(fields, estrdup(s));
72
hs_add(fields, estrdup(s));
81
* QueryParser.new(options = {}) -> QueryParser
83
* Create a new QueryParser. The QueryParser is used to convert string
84
* queries into Query objects. The options are;
88
* :default_field:: Default: "*" (all fields). The default field to
89
* search when no field is specified in the search
90
* string. It can also be an array of fields.
91
* :analyzer:: Default: StandardAnalyzer. Analyzer used by the
92
* query parser to parse query terms
93
* :wild_card_downcase:: Default: true. Specifies whether wild-card queries
94
* and range queries should be downcased or not since
95
* they are not passed through the parser
96
* :fields:: Default: []. Lets the query parser know what
97
* fields are available for searching, particularly
98
* when the "*" is specified as the search field
99
* :tokenized_fields:: Default: :fields. Lets the query parser know which
100
* fields are tokenized so it knows which fields to
101
* run the analyzer over.
102
* :validate_fields:: Default: false. Set to true if you want an
103
* exception to be raised if there is an attempt to
104
* search a non-existent field
105
* :or_default:: Default: true. Use "OR" as the default boolean
107
* :default_slop:: Default: 0. Default slop to use in PhraseQuery
108
* :handle_parse_errors:: Default: true. QueryParser will quietly handle all
109
* parsing errors internally. If you'd like to handle
110
* them yourself, set this parameter to false.
111
* :clean_string:: Default: true. QueryParser will do a quick
112
* once-over the query string to make sure that quotes
113
* and brackets match up and special characters are
115
* :max_clauses:: Default: 512. the maximum number of clauses
116
* allowed in boolean queries and the maximum number
117
* of terms allowed in multi, prefix, wild-card or
118
* fuzzy queries when those queries are generated by
119
* rewriting other queries
120
* :use_keywords:: Default: true. By default AND, OR, NOT and REQ are
121
* keywords used by the query parser. Sometimes this
122
* is undesirable. For example, if your application
123
* allows searching for US states by their
124
* abbreviation, then OR will be a common query
125
* string. By setting :use_keywords to false, OR will
126
* no longer be a keyword allowing searches for the
127
* state of Oregon. You will still be able to use
128
* boolean queries by using the + and - characters.
131
frt_qp_init(int argc, VALUE *argv, VALUE self)
133
VALUE roptions = Qnil;
135
Analyzer *analyzer = NULL;
136
bool has_options = false;
138
HashSet *all_fields = NULL;
139
HashSet *tkz_fields = NULL;
140
HashSet *def_fields = NULL;
143
if (rb_scan_args(argc, argv, "01", &roptions) > 0) {
144
if (TYPE(roptions) == T_HASH) {
146
if (Qnil != (rval = rb_hash_aref(roptions, sym_default_field))) {
147
def_fields = frt_get_fields(rval);
149
if (Qnil != (rval = rb_hash_aref(roptions, sym_analyzer))) {
150
analyzer = frt_get_cwrapped_analyzer(rval);
152
if (Qnil != (rval = rb_hash_aref(roptions, sym_all_fields))) {
153
all_fields = frt_get_fields(rval);
155
if (Qnil != (rval = rb_hash_aref(roptions, sym_fields))) {
156
all_fields = frt_get_fields(rval);
158
if (Qnil != (rval = rb_hash_aref(roptions, sym_tkz_fields))) {
159
tkz_fields = frt_get_fields(rval);
162
def_fields = frt_get_fields(roptions);
166
if (all_fields == NULL) {
167
all_fields = hs_new_str(&free);
171
analyzer = mb_standard_analyzer_new(true);
174
qp = qp_new(all_fields, def_fields, tkz_fields, analyzer);
175
qp->allow_any_fields = true;
176
qp->clean_str = true;
177
qp->handle_parse_errors = true;
179
if (roptions != Qnil) {
180
if (Qnil != (rval = rb_hash_aref(roptions, sym_handle_parse_errors))) {
181
qp->handle_parse_errors = RTEST(rval);
183
if (Qnil != (rval = rb_hash_aref(roptions, sym_validate_fields))) {
184
qp->allow_any_fields = !RTEST(rval);
186
if (Qnil != (rval = rb_hash_aref(roptions, sym_wild_card_downcase))) {
187
qp->wild_lower = RTEST(rval);
189
if (Qnil != (rval = rb_hash_aref(roptions, sym_or_default))) {
190
qp->or_default = RTEST(rval);
192
if (Qnil != (rval = rb_hash_aref(roptions, sym_default_slop))) {
193
qp->def_slop = FIX2INT(rval);
195
if (Qnil != (rval = rb_hash_aref(roptions, sym_clean_string))) {
196
qp->clean_str = RTEST(rval);
198
if (Qnil != (rval = rb_hash_aref(roptions, sym_max_clauses))) {
199
qp->max_clauses = FIX2INT(rval);
201
if (Qnil != (rval = rb_hash_aref(roptions, sym_use_keywords))) {
202
qp->use_keywords = RTEST(rval);
205
Frt_Wrap_Struct(self, frt_qp_mark, frt_qp_free, qp);
206
object_add(qp, self);
210
#define GET_QP QParser *qp = (QParser *)DATA_PTR(self)
213
* query_parser.parse(query_string) -> Query
215
* Parse a query string returning a Query object if parsing was successful.
216
* Will raise a QueryParseException if unsuccessful.
219
frt_qp_parse(VALUE self, VALUE rstr)
221
const char *msg = NULL;
224
rstr = rb_obj_as_string(rstr);
226
rq = frt_get_q(qp_parse(qp, rs2s(rstr)));
234
rb_raise(cQueryParseException, msg);
242
* query_parser.fields -> Array of Symbols
244
* Returns the list of all fields that the QueryParser knows about.
247
frt_qp_get_fields(VALUE self)
251
HashSet *fields = qp->all_fields;
252
VALUE rfields = rb_ary_new();
254
for (i = 0; i < fields->size; i++) {
255
rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
263
* query_parser.fields = fields -> self
265
* Set the list of fields. These fields are expanded for searches on "*".
268
frt_qp_set_fields(VALUE self, VALUE rfields)
271
HashSet *fields = frt_get_fields(rfields);
273
if (qp->def_fields == qp->all_fields) {
274
qp->def_fields = NULL;
276
if (fields == NULL) {
277
fields = hs_new_str(&free);
279
hs_destroy(qp->all_fields);
280
qp->all_fields = fields;
281
if (qp->def_fields == NULL) {
282
qp->def_fields = fields;
290
* query_parser.tokenized_fields -> Array of Symbols
292
* Returns the list of all tokenized_fields that the QueryParser knows about.
295
frt_qp_get_tkz_fields(VALUE self)
299
HashSet *fields = qp->tokenized_fields;
301
VALUE rfields = rb_ary_new();
303
for (i = 0; i < fields->size; i++) {
304
rb_ary_push(rfields, ID2SYM(rb_intern((char *)fields->elems[i])));
316
* query_parser.tokenized_fields = fields -> self
318
* Set the list of tokenized_fields. These tokenized_fields are tokenized in
319
* the queries. If this is set to nil then all fields will be tokenized.
322
frt_qp_set_tkz_fields(VALUE self, VALUE rfields)
325
if (qp->tokenized_fields) hs_destroy(qp->tokenized_fields);
326
qp->tokenized_fields = frt_get_fields(rfields);
330
/****************************************************************************
334
****************************************************************************/
337
extern VALUE mFerret = rb_define_module("Ferret");
338
extern VALUE cQueryParser = rb_define_module_under(mFerret, "QueryParser");
342
* Document-class: Ferret::QueryParser::QueryParseException
346
* Exception raised when there is an error parsing the query string passed to
350
Init_QueryParseException(void)
352
cQueryParseException = rb_define_class_under(cQueryParser,
353
"QueryParseException",
358
* Document-class: Ferret::QueryParser
362
* The QueryParser is used to transform user submitted query strings into
363
* QueryObjects. Ferret uses its own Query Language, known from now on as
364
* Ferret Query Language or FQL.
366
* == Ferret Query Language
370
* The following characters are special characters in FQL;
372
* :, (, ), [, ], {, }, !, +, ", ~, ^, -, |, <, >, =, *, ?, \
374
* If you want to use one of these characters in one of your terms you need
375
* to escape it with a \ character. \ escapes itself. The exception to this
376
* rule is within Phrases which are strings surrounded by double quotes (and
377
* will be explained further below in the section on PhraseQueries). In
378
* Phrases, only ", | and <> have special meaning and need to be escaped if
379
* you want the literal value. <> is escaped \<\>.
381
* In the following examples I have only written the query string. This would
384
* query = query_parser.parse("pet:(dog AND cat)")
385
* puts query # => "+pet:dog +pet:cat"
389
* A term query is the most basic query of all and is what most of the other
390
* queries are built upon. The term consists of a single word. eg;
394
* Note that the analyzer will be run on the term and if it splits the term
395
* in two then it will be turned into a phrase query. For example, with the
396
* plain Ferret::Analysis::Analyzer, the following;
404
* Which we will explain now...
408
* A phrase query is a string of terms surrounded by double quotes. For
409
* example you could write;
411
* '"quick brown fox"'
413
* But if a "fast" fox is just as good as a quick one you could use the |
414
* character to specify alternate terms.
416
* '"quick|speedy|fast brown fox"'
418
* What if we don't care what colour the fox is. We can use the <> to specify
419
* a place setter. eg;
421
* '"quick|speedy|fast <> fox"'
423
* This will match any word in between quick and fox. Alternatively we could
424
* set the "slop" for the phrase which allows a certain variation in the
425
* match of the phrase. The slop for a phrase is an integer indicating how
426
* many positions you are allowed to move the terms to get a match. Read more
427
* about the slop factor in Ferret::Search::PhraseQuery. To set the slop
428
* factor for a phrase you can type;
432
* This would match "big house", "big red house", "big red brick house" and
433
* even "house big". That's right, you don't need to have the terms in order
434
* if you allow some slop in your phrases. (See Ferret::Search::Spans if you
435
* need a phrase type query with ordered terms.)
437
* These basic queries will be run on the default field which is set when you
438
* create the query_parser. But what if you want to search a different field.
439
* You'll be needing a ...
443
* A field query is any field prefixed by <fieldname>:. For example, to
444
* search for all instances of the term "ski" in field "sport", you'd write;
447
* Or we can apply a field to phrase;
449
* 'sport:"skiing is fun"'
451
* Now we have a few types of queries, we'll be needing to glue them together
456
* There are a couple of ways of writing boolean queries. Firstly you can
457
* specify which terms are required, optional or required not to exist (not).
459
* * '+' or "REQ" can be used to indicate a required query. "REQ" must be
460
* surrounded by white space.
461
* * '-', '!' or "NOT" are used to indicate a query that is required to be
462
* false. "NOT" must be surrounded by white space.
463
* * all other queries are optional if the above symbols are used.
467
* '+sport:ski -sport:snowboard sport:toboggan'
468
* '+ingredient:chocolate +ingredient:strawberries -ingredient:wheat'
470
* You may also use the boolean operators "AND", "&&", "OR" and "||". eg;
472
* 'sport:ski AND NOT sport:snowboard OR sport:toboggan'
473
* 'ingredient:chocolate AND ingredient:strawberries AND NOT ingredient:wheat'
475
* You can set the default operator when you create the query parser.
479
* A range query finds all documents with terms between the two query terms.
480
* This can be very useful in particular for dates. eg;
482
* 'date:[20050725 20050905]' # all dates >= 20050725 and <= 20050905
483
* 'date:[20050725 20050905}' # all dates >= 20050725 and < 20050905
484
* 'date:{20050725 20050905]' # all dates > 20050725 and <= 20050905
485
* 'date:{20050725 20050905}' # all dates > 20050725 and < 20050905
487
* You can also do open ended queries like this;
489
* 'date:[20050725>' # all dates >= 20050725
490
* 'date:{20050725>' # all dates > 20050725
491
* 'date:<20050905]' # all dates <= 20050905
492
* 'date:<20050905}' # all dates < 20050905
496
* 'date: >= 20050725'
498
* 'date: <= 20050905'
501
* If you prefer the above style you could also use a boolean query, like this;
503
* 'date:( >= 20050725 AND <= 20050905)'
505
* But the range-query-only solution shown first will be faster.
509
* A wild query is a query using the pattern matching characters * and ?. *
510
* matches 0 or more characters while ? matches a single character. This type
511
* of query can be really useful for matching hierarchical categories for
512
* example. Let's say we had this structure;
521
* If you wanted all categories with programming languages you could use the
524
* 'category:/coding?/?*'
526
* Note that this query can be quite expensive if not used carefully. In the
527
* example above there would be no problem but you should be careful not to use
528
* the wild characters at the beginning of the query as it'll have to iterate
529
* through every term in that field. Having said that, some fields like the
530
* category field above will only have a small number of distinct terms so
531
* this could be okay.
535
* This is like the sloppy phrase query above, except you are now adding slop
536
* to a term. Basically it measures the Levenshtein distance between two
537
* terms and if the value is below the slop threshold the term is a match.
538
* This time though the slop must be a float between 0 and 1.0, 1.0 being a
539
* perfect match and 0 being far from a match. The default is set to 0.5 so
540
* you don't need to give a slop value if you don't want to. You can set the
541
* default in the Ferret::Search::FuzzyQuery class. Here are a couple of
545
* 'content:Ostralya~0.4'
547
* Note that this query can be quite expensive. If you'd like to use this
548
* query, you may want to set a minimum prefix length in the FuzzyQuery
549
* class. This can substantially reduce the number of terms that the query
554
Init_QueryParser(void)
557
sym_wild_card_downcase = ID2SYM(rb_intern("wild_card_downcase"));
558
sym_fields = ID2SYM(rb_intern("fields"));
559
sym_all_fields = ID2SYM(rb_intern("all_fields"));
560
sym_tkz_fields = ID2SYM(rb_intern("tokenized_fields"));
561
sym_default_field = ID2SYM(rb_intern("default_field"));
562
sym_validate_fields = ID2SYM(rb_intern("validate_fields"));
563
sym_or_default = ID2SYM(rb_intern("or_default"));
564
sym_default_slop = ID2SYM(rb_intern("default_slop"));
565
sym_handle_parse_errors = ID2SYM(rb_intern("handle_parse_errors"));
566
sym_clean_string = ID2SYM(rb_intern("clean_string"));
567
sym_max_clauses = ID2SYM(rb_intern("max_clauses"));
568
sym_use_keywords = ID2SYM(rb_intern("use_keywords"));
571
cQueryParser = rb_define_class_under(mFerret, "QueryParser", rb_cObject);
572
rb_define_alloc_func(cQueryParser, frt_data_alloc);
574
rb_define_method(cQueryParser, "initialize", frt_qp_init, -1);
575
rb_define_method(cQueryParser, "parse", frt_qp_parse, 1);
576
rb_define_method(cQueryParser, "fields", frt_qp_get_fields, 0);
577
rb_define_method(cQueryParser, "fields=", frt_qp_set_fields, 1);
578
rb_define_method(cQueryParser, "tokenized_fields",
579
frt_qp_get_tkz_fields, 0);
580
rb_define_method(cQueryParser, "tokenized_fields=",
581
frt_qp_set_tkz_fields, 1);
583
Init_QueryParseException();