1
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
3
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
4
<title>SQLite FTS3 and FTS4 Extensions</title>
5
<style type="text/css">
8
font-family: Verdana, sans-serif;
13
a:visited { color: #734559 }
15
.logo { position:absolute; margin:3px; }
31
.toolbar a { color: white; text-decoration: none; padding: 6px 12px; }
32
.toolbar a:visited { color: white; }
33
.toolbar a:hover { color: #044a64; background: white; }
35
.content { margin: 5%; }
36
.content dt { font-weight:bold; }
37
.content dd { margin-bottom: 25px; margin-left:20%; }
38
.content ul { padding:0px; padding-left: 15px; margin:0px; }
41
.se { background: url(images/se.gif) 100% 100% no-repeat #044a64}
42
.sw { background: url(images/sw.gif) 0% 100% no-repeat }
43
.ne { background: url(images/ne.gif) 100% 0% no-repeat }
44
.nw { background: url(images/nw.gif) 0% 0% no-repeat }
46
/* Things for "fancyformat" documents start here. */
47
.fancy img+p {font-style:italic}
48
.fancy .codeblock i { color: darkblue; }
49
.fancy h1,.fancy h2,.fancy h3,.fancy h4 {font-weight:normal;color:#044a64}
50
.fancy h2 { margin-left: 10px }
51
.fancy h3 { margin-left: 20px }
52
.fancy h4 { margin-left: 30px }
53
.fancy th {white-space:nowrap;text-align:left;border-bottom:solid 1px #444}
54
.fancy th, .fancy td {padding: 0.2em 1ex; vertical-align:top}
55
.fancy #toc a { color: darkblue ; text-decoration: none }
56
.fancy .todo { color: #AA3333 ; font-style : italic }
57
.fancy .todo:before { content: 'TODO:' }
58
.fancy p.todo { border: solid #AA3333 1px; padding: 1ex }
59
.fancy img { display:block; }
60
.fancy :link:hover, .fancy :visited:hover { background: wheat }
61
.fancy p,.fancy ul,.fancy ol { margin: 1em 5ex }
62
.fancy li p { margin: 1em 0 }
63
/* End of "fancyformat" specific rules. */
69
<div><!-- container div to satisfy validator -->
72
<img class="logo" src="images/sqlite370_banner.gif" alt="SQLite Logo"
74
<div><!-- IE hack to prevent disappearing logo--></div>
75
<div class="tagline">Small. Fast. Reliable.<br>Choose any three.</div>
77
<table width=100% style="clear:both"><tr><td>
78
<div class="se"><div class="sw"><div class="ne"><div class="nw">
79
<table width=100% style="padding:0;margin:0;cell-spacing:0"><tr>
82
<a href="about.html">About</a>
83
<a href="sitemap.html">Sitemap</a>
84
<a href="docs.html">Documentation</a>
85
<a href="download.html">Download</a>
86
<a href="copyright.html">License</a>
87
<a href="news.html">News</a>
88
<a href="support.html">Support</a>
91
gMsg = "Search SQLite Docs..."
92
function entersearch() {
93
var q = document.getElementById("q");
94
if( q.value == gMsg ) { q.value = "" }
95
q.style.color = "black"
96
q.style.fontStyle = "normal"
98
function leavesearch() {
99
var q = document.getElementById("q");
100
if( q.value == "" ) {
102
q.style.color = "#044a64"
103
q.style.fontStyle = "italic"
108
<div style="padding:0 1em 0px 0;white-space:nowrap">
109
<form name=f method="GET" action="http://www.sqlite.org/search">
110
<input id=q name=q type=text
111
onfocus="entersearch()" onblur="leavesearch()" style="width:24ex;padding:1px 1ex; border:solid white 1px; font-size:0.9em ; font-style:italic;color:#044a64;" value="Search SQLite Docs...">
112
<input type=submit value="Go" style="border:solid white 1px;background-color:#044a64;color:white;font-size:0.9em;padding:0 1ex">
116
</div></div></div></div>
118
<div class=startsearch></div>
126
<div style="font-size:2em;text-align:center;color:#044a64">
127
SQLite FTS3 and FTS4 Extensions
129
<div style="font-size:1.5em;margin:1em;color:#044a64">
130
Table Of Contents</div>
131
<div id=toc> <div style="margin-left:6ex"><a href="#section_1">1. Introduction to FTS3 and FTS4</a></div><div style="margin-left:12ex"><a href="#section_1_1">1.1. Differences between FTS3 and FTS4</a></div><div style="margin-left:12ex"><a href="#section_1_2">1.2. Creating and Destroying FTS Tables</a></div><div style="margin-left:12ex"><a href="#section_1_3">1.3. Populating FTS Tables</a></div><div style="margin-left:12ex"><a href="#section_1_4">1.4. Simple FTS Queries</a></div><div style="margin-left:12ex"><a href="#section_1_5">1.5. Summary</a></div><div style="margin-left:6ex"><a href="#section_2">2. Compiling and Enabling FTS3 and FTS4</a></div><div style="margin-left:6ex"><a href="#section_3">3. Full-text Index Queries</a></div><div style="margin-left:12ex"><a href="#section_3_1">3.1.
132
Set Operations Using The Enhanced Query Syntax</a></div><div style="margin-left:12ex"><a href="#section_3_2">3.2. Set Operations Using The Standard Query Syntax</a></div><div style="margin-left:6ex"><a href="#section_4">4. Auxiliary Functions - Snippet, Offsets and Matchinfo</a></div><div style="margin-left:12ex"><a href="#section_4_1">4.1. The Offsets Function</a></div><div style="margin-left:12ex"><a href="#section_4_2">4.2. The Snippet Function</a></div><div style="margin-left:12ex"><a href="#matchinfo">4.3. The Matchinfo Function</a></div><div style="margin-left:6ex"><a href="#fts4aux">5. Fts4aux - Direct Access to the Full-Text Index</a></div><div style="margin-left:6ex"><a href="#fts4_options">6. FTS4 Options</a></div><div style="margin-left:12ex"><a href="#section_6_1">6.1. The compress= and uncompress= options</a></div><div style="margin-left:12ex"><a href="#section_6_2">6.2. The content= option </a></div><div style="margin-left:18ex"><a href="#section_6_2_1">6.2.1. Contentless FTS4 Tables </a></div><div style="margin-left:18ex"><a href="#section_6_2_2">6.2.2. External Content FTS4 Tables </a></div><div style="margin-left:12ex"><a href="#section_6_3">6.3. The languageid= option</a></div><div style="margin-left:12ex"><a href="#section_6_4">6.4. The matchinfo= option</a></div><div style="margin-left:12ex"><a href="#section_6_5">6.5. The prefix= option</a></div><div style="margin-left:6ex"><a href="#commands">7. Special Commands For FTS3 and FTS4</a></div><div style="margin-left:12ex"><a href="#optimize">7.1. The "optimize" command</a></div><div style="margin-left:12ex"><a href="#rebuild">7.2. The "rebuild" command</a></div><div style="margin-left:12ex"><a href="#integcheck">7.3. The "integrity-check" command</a></div><div style="margin-left:12ex"><a href="#mergecmd">7.4. The "merge=X,Y" command</a></div><div style="margin-left:12ex"><a href="#automerge"">7.5. The "automerge=B" command</a></div><div style="margin-left:6ex"><a href="#tokenizer">8. Tokenizers</a></div><div style="margin-left:12ex"><a href="#section_8_1">8.1. Custom (User Implemented) Tokenizers</a></div><div style="margin-left:6ex"><a href="#section_9">9. Data Structures</a></div><div style="margin-left:12ex"><a href="#section_9_1">9.1. Shadow Tables</a></div><div style="margin-left:12ex"><a href="#section_9_2">9.2. Variable Length Integer (varint) Format</a></div><div style="margin-left:12ex"><a href="#section_9_3">9.3. Segment B-Tree Format</a></div><div style="margin-left:18ex"><a href="#section_9_3_1">9.3.1. Segment B-Tree Leaf Nodes</a></div><div style="margin-left:18ex"><a href="#section_9_3_2">9.3.2. Segment B-Tree Interior Nodes</a></div><div style="margin-left:12ex"><a href="#section_9_4">9.4. Doclist Format</a></div><div style="margin-left:6ex"><a href="#appendix_a">
133
Appendix A: Search Application Tips
135
<div class=startsearch></div>
138
<h2 style="margin-left:1.0em"> Overview</h2>
141
FTS3 and FTS4 are SQLite virtual table modules that allows users to perform
142
full-text searches on a set of documents. The most common (and effective)
143
way to describe full-text searches is "what Google, Yahoo, and Bing do
144
with documents placed on the World Wide Web". Users input a term, or series
145
of terms, perhaps connected by a binary operator or grouped together into a
146
phrase, and the full-text query system finds the set of documents that best
147
matches those terms considering the operators and groupings the user has
148
specified. This article describes the deployment and usage of FTS3 and FTS4.
151
FTS1 and FTS2 are obsolete full-text search modules for SQLite. There are known
152
issues with these older modules and their use should be avoided.
153
Portions of the original FTS3 code were contributed to the SQLite project
154
by Scott Hess of <a href="http://www.google.com">Google</a>. It is now
155
developed and maintained as part of SQLite.
157
<h1 id="section_1">1. Introduction to FTS3 and FTS4</h1>
160
The FTS3 and FTS4 extension modules allows users to create special tables with a
161
built-in full-text index (hereafter "FTS tables"). The full-text index
162
allows the user to efficiently query the database for all rows that contain
163
one or more words (hereafter "tokens"), even if the table
164
contains many large documents.
167
For example, if each of the 517430 documents in the
168
"<a href="http://www.cs.cmu.edu/~enron/">Enron E-Mail Dataset</a>"
169
is inserted into both an FTS table and an ordinary SQLite table
170
created using the following SQL script:
172
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE VIRTUAL TABLE enrondata1 USING fts3(content TEXT); /* FTS3 table */
173
CREATE TABLE enrondata2(content TEXT); /* Ordinary table */
177
Then either of the two queries below may be executed to find the number of
178
documents in the database that contain the word "linux" (351). Using one
179
desktop PC hardware configuration, the query on the FTS3 table returns in
180
approximately 0.03 seconds, versus 22.5 for querying the ordinary table.
182
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT count(*) FROM enrondata1 WHERE content MATCH 'linux'; /* 0.03 seconds */
183
SELECT count(*) FROM enrondata2 WHERE content LIKE '%linux%'; /* 22.5 seconds */
187
Of course, the two queries above are not entirely equivalent. For example
188
the LIKE query matches rows that contain terms such as "linuxophobe"
189
or "EnterpriseLinux" (as it happens, the Enron E-Mail Dataset does not
190
actually contain any such terms), whereas the MATCH query on the FTS3 table
191
selects only those rows that contain "linux" as a discrete token. Both
192
searches are case-insensitive. The FTS3 table consumes around 2006 MB on
193
disk compared to just 1453 MB for the ordinary table. Using the same
194
hardware configuration used to perform the SELECT queries above, the FTS3
195
table took just under 31 minutes to populate, versus 25 for the ordinary
198
<h2 id="section_1_1">1.1. Differences between FTS3 and FTS4</h2>
203
FTS3 and FTS4 are nearly identical. They share most of their code in common,
204
and their interfaces are the same. The differences are:
207
<li> <p>FTS4 contains query performance optimizations that may significantly
208
improve the performance of full-text queries that contain terms that are
209
very common (present in a large percentage of table rows).
211
<li> <p>FTS4 supports some additional options that may used with the <a href="fts3.html#matchinfo">matchinfo()</a>
214
<li> <p>Because it stores extra information on disk in two new
215
<a href="fts3.html#*shadowtab">shadow tables</a> in order to support the performance
216
optimizations and extra matchinfo() options, FTS4 tables may consume more
217
disk space than the equivalent table created using FTS3. Usually the overhead
218
is 1-2% or less, but may be as high as 10% if the documents stored in the
219
FTS table are very small. The overhead may be reduced by specifying the
220
directive <a href="fts3.html#fts4matchinfo">"matchinfo=fts3"</a> as part of the FTS4 table
221
declaration, but this comes at the expense of sacrificing some of the
222
extra supported matchinfo() options.
224
<li> <p>FTS4 provides hooks (the compress and uncompress
225
<a href="fts3.html#fts4_options">options</a>) allowing data to be stored in a compressed
226
form, reducing disk usage and IO.
230
FTS4 is an enhancement to FTS3. FTS3 has been available since SQLite <a href="releaselog/3_5_0.html">version 3.5.0</a> in
231
2007-09-04. The enhancements for FTS4 were added with SQLite <a href="releaselog/3_7_4.html">version 3.7.4</a> on 2010-12-08.
234
Which module, FTS3 or FTS4, should you use in your application? FTS4 is
235
sometimes significantly faster than FTS3, even orders of magnitude faster
236
depending on the query, though in the common case the performance of the two
237
modules is similar. FTS4 also offers the enhanced <a href="fts3.html#matchinfo">matchinfo()</a> outputs which
238
can be useful in ranking the results of a <a href="fts3.html#section_3">MATCH</a> operation. On the
239
other hand, in the absence of a <a href="fts3.html#fts4matchinfo">matchinfo=fts3</a> directive FTS4 requires a little
240
more disk space than FTS3, though only a percent of two in most cases.
243
For newer applications, FTS4 is recommended; though if compatibility with older
244
versions of SQLite is important, then FTS3 will usually serve just as well.
246
<h2 id="section_1_2">1.2. Creating and Destroying FTS Tables</h2>
249
Like other virtual table types, new FTS tables are created using a
250
<a href="lang_createvtab.html">CREATE VIRTUAL TABLE</a> statement. The module name, which follows
251
the USING keyword, is either "fts3" or "fts4". The virtual table module arguments may
252
be left empty, in which case an FTS table with a single user-defined
253
column named "content" is created. Alternatively, the module arguments
254
may be passed a list of comma separated column names.
257
If column names are explicitly provided for the FTS table as part of
258
the CREATE VIRTUAL TABLE statement, then a datatype name may be optionally
259
specified for each column. This is pure syntactic sugar, the
260
supplied typenames are not used by FTS or the SQLite core for any
261
purpose. The same applies to any constraints specified along with an
262
FTS column name - they are parsed but not used or recorded by the system
265
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS table named "data" with one column - "content":</i>
266
CREATE VIRTUAL TABLE data USING fts3();
268
<i>-- Create an FTS table named "pages" with three columns:</i>
269
CREATE VIRTUAL TABLE pages USING fts4(title, keywords, body);
271
<i>-- Create an FTS table named "mail" with two columns. Datatypes
272
-- and column constraints are specified along with each column. These
273
-- are completely ignored by FTS and SQLite. </i>
274
CREATE VIRTUAL TABLE mail USING fts3(
275
subject VARCHAR(256) NOT NULL,
276
body TEXT CHECK(length(body)<10240)
281
As well as a list of columns, the module arguments passed to a CREATE
282
VIRTUAL TABLE statement used to create an FTS table may be used to specify
283
a <a href="fts3.html#tokenizer">tokenizer</a>. This is done by specifying a string of the form
284
"tokenize=<tokenizer name> <tokenizer args>" in place of a column
285
name, where <tokenizer name> is the name of the tokenizer to use and
286
<tokenizer args> is an optional list of whitespace separated qualifiers
287
to pass to the tokenizer implementation. A tokenizer specification may be
288
placed anywhere in the column list, but at most one tokenizer declaration is
289
allowed for each CREATE VIRTUAL TABLE statement. <a href="fts3.html#tokenizer">See below</a> for a
290
detailed description of using (and, if necessary, implementing) a tokenizer.
292
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS table named "papers" with two columns that uses</i>
293
<i>-- the tokenizer "porter".</i>
294
CREATE VIRTUAL TABLE papers USING fts3(author, document, tokenize=porter);
296
<i>-- Create an FTS table with a single column - "content" - that uses</i>
297
<i>-- the "simple" tokenizer.</i>
298
CREATE VIRTUAL TABLE data USING fts4(tokenize=simple);
300
<i>-- Create an FTS table with two columns that uses the "icu" tokenizer.</i>
301
<i>-- The qualifier "en_AU" is passed to the tokenizer implementation</i>
302
CREATE VIRTUAL TABLE names USING fts3(a, b, tokenize=icu en_AU);
306
FTS tables may be dropped from the database using an ordinary <a href="lang_droptable.html">DROP TABLE</a>
307
statement. For example:
309
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create, then immediately drop, an FTS4 table.</i>
310
CREATE VIRTUAL TABLE data USING fts4();
314
<h2 id="section_1_3">1.3. Populating FTS Tables</h2>
317
FTS tables are populated using <a href="lang_insert.html">INSERT</a>, <a href="lang_update.html">UPDATE</a> and <a href="lang_delete.html">DELETE</a>
318
statements in the same way as ordinary SQLite tables are.
321
As well as the columns named by the user (or the "content" column if no
322
module arguments were specified as part of the <a href="lang_createvtab.html">CREATE VIRTUAL TABLE</a>
323
statement), each FTS table has a "rowid" column. The rowid of an FTS
324
table behaves in the same way as the rowid column of an ordinary SQLite
325
table, except that the values stored in the rowid column of an FTS table
326
remain unchanged if the database is rebuilt using the <a href="lang_vacuum.html">VACUUM</a> command.
327
For FTS tables, "docid" is allowed as an alias along with the usual "rowid",
328
"oid" and "_oid_" identifiers. Attempting to insert or update a row with a
329
docid value that already exists in the table is an error, just as it would
330
be with an ordinary SQLite table.
333
There is one other subtle difference between "docid" and the normal SQLite
334
aliases for the rowid column. Normally, if an INSERT or UPDATE statement
335
assigns discrete values to two or more aliases of the rowid column, SQLite
336
writes the rightmost of such values specified in the INSERT or UPDATE
337
statement to the database. However, assigning a non-NULL value to both
338
the "docid" and one or more of the SQLite rowid aliases when inserting or
339
updating an FTS table is considered an error. See below for an example.
341
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS table</i>
342
CREATE VIRTUAL TABLE pages USING fts4(title, body);
344
<i>-- Insert a row with a specific docid value.</i>
345
INSERT INTO pages(docid, title, body) VALUES(53, 'Home Page', 'SQLite is a software...');
347
<i>-- Insert a row and allow FTS to assign a docid value using the same algorithm as</i>
348
<i>-- SQLite uses for ordinary tables. In this case the new docid will be 54,</i>
349
<i>-- one greater than the largest docid currently present in the table.</i>
350
INSERT INTO pages(title, body) VALUES('Download', 'All SQLite source code...');
352
<i>-- Change the title of the row just inserted.</i>
353
UPDATE pages SET title = 'Download SQLite' WHERE rowid = 54;
355
<i>-- Delete the entire table contents.</i>
358
<i>-- The following is an error. It is not possible to assign non-NULL values to both</i>
359
<i>-- the rowid and docid columns of an FTS table.</i>
360
INSERT INTO pages(rowid, docid, title, body) VALUES(1, 2, 'A title', 'A document body');
364
To support full-text queries, FTS maintains an inverted index that maps
365
from each unique term or word that appears in the dataset to the locations
366
in which it appears within the table contents. For the curious, a
367
complete description of the <a href="fts3.html#section_9">data structure</a> used to store
368
this index within the database file appears below. A feature of
369
this data structure is that at any time the database may contain not
370
one index b-tree, but several different b-trees that are incrementally
371
merged as rows are inserted, updated and deleted. This technique improves
372
performance when writing to an FTS table, but causes some overhead for
373
full-text queries that use the index. Evaluating the special <a href="fts3.html#*fts4optcmd">"optimize" command</a>,
374
an SQL statement of the
375
form "INSERT INTO <fts-table>(<fts-table>) VALUES('optimize')",
376
causes FTS to merge all existing index b-trees into a single large
377
b-tree containing the entire index. This can be an expensive operation,
378
but may speed up future queries.
381
For example, to optimize the full-text index for an FTS table named
384
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Optimize the internal structure of FTS table "docs".</i>
385
INSERT INTO docs(docs) VALUES('optimize');
389
The statement above may appear syntactically incorrect to some. Refer to
390
the section describing the <a href="fts3.html#section_1_4">simple fts queries</a> for an explanation.
393
There is another, deprecated, method for invoking the optimize
394
operation using a SELECT statement. New code should use statements
395
similar to the INSERT above to optimize FTS structures.
397
<h2 id="section_1_4">1.4. Simple FTS Queries</h2>
400
As for all other SQLite tables, virtual or otherwise, data is retrieved
401
from FTS tables using a <a href="lang_select.html">SELECT</a> statement.
404
FTS tables can be queried efficiently using SELECT statements of two
409
<b>Query by rowid</b>. If the WHERE clause of the SELECT statement
410
contains a sub-clause of the form "rowid = ?", where ? is an SQL expression,
411
FTS is able to retrieve the requested row directly using the equivalent
412
of an SQLite <a href="lang_createtable.html#rowid">INTEGER PRIMARY KEY</a> index.
415
<b>Full-text query</b>. If the WHERE clause of the SELECT statement contains
416
a sub-clause of the form "<column> MATCH ?", FTS is able to use
417
the built-in full-text index to restrict the search to those documents
418
that match the full-text query string specified as the right-hand operand
423
If neither of these two query strategies can be used, all
424
queries on FTS tables are implemented using a linear scan of the entire
425
table. If the table contains large amounts of data, this may be an
426
impractical approach (the first example on this page shows that a linear
427
scan of 1.5 GB of data takes around 30 seconds using a modern PC).
429
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- The examples in this block assume the following FTS table:</i>
430
CREATE VIRTUAL TABLE mail USING fts3(subject, body);
432
SELECT * FROM mail WHERE rowid = 15; <i>-- Fast. Rowid lookup.</i>
433
SELECT * FROM mail WHERE body MATCH 'sqlite'; <i>-- Fast. Full-text query.</i>
434
SELECT * FROM mail WHERE mail MATCH 'search'; <i>-- Fast. Full-text query.</i>
435
SELECT * FROM mail WHERE rowid BETWEEN 15 AND 20; <i>-- Slow. Linear scan.</i>
436
SELECT * FROM mail WHERE subject = 'database'; <i>-- Slow. Linear scan.</i>
437
SELECT * FROM mail WHERE subject MATCH 'database'; <i>-- Fast. Full-text query.</i>
441
In all of the full-text queries above, the right-hand operand of the MATCH
442
operator is a string consisting of a single term. In this case, the MATCH
443
expression evaluates to true for all documents that contain one or more
444
instances of the specified word ("sqlite", "search" or "database", depending
445
on which example you look at). Specifying a single term as the right-hand
446
operand of the MATCH operator results in the simplest and most common type
447
of full-text query possible. However more complicated queries are possible,
448
including phrase searches, term-prefix searches and searches for documents
449
containing combinations of terms occurring within a defined proximity of each
450
other. The various ways in which the full-text index may be queried are
451
<a href="fts3.html#section_3">described below</a>.
454
Normally, full-text queries are case-insensitive. However, this
455
is dependent on the specific <a href="fts3.html#tokenizer">tokenizer</a> used by the FTS table
456
being queried. Refer to the section on <a href="fts3.html#tokenizer">tokenizers</a> for details.
459
The paragraph above notes that a MATCH operator with a simple term as the
460
right-hand operand evaluates to true for all documents that contain the
461
specified term. In this context, the "document" may refer to either the
462
data stored in a single column of a row of an FTS table, or to the contents
463
of all columns in a single row, depending on the identifier used as the
464
left-hand operand to the MATCH operator. If the identifier specified as
465
the left-hand operand of the MATCH operator is an FTS table column name,
466
then the document that the search term must be contained in is the value
467
stored in the specified column. However, if the identifier is the name
468
of the FTS <i>table</i> itself, then the MATCH operator evaluates to true
469
for each row of the FTS table for which any column contains the search
470
term. The following example demonstrates this:
472
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Example schema</i>
473
CREATE VIRTUAL TABLE mail USING fts3(subject, body);
475
<i>-- Example table population</i>
476
INSERT INTO mail(docid, subject, body) VALUES(1, 'software feedback', 'found it too slow');
477
INSERT INTO mail(docid, subject, body) VALUES(2, 'software feedback', 'no feedback');
478
INSERT INTO mail(docid, subject, body) VALUES(3, 'slow lunch order', 'was a software problem');
480
<i>-- Example queries</i>
481
SELECT * FROM mail WHERE subject MATCH 'software'; <i>-- Selects rows 1 and 2</i>
482
SELECT * FROM mail WHERE body MATCH 'feedback'; <i>-- Selects row 2</i>
483
SELECT * FROM mail WHERE mail MATCH 'software'; <i>-- Selects rows 1, 2 and 3</i>
484
SELECT * FROM mail WHERE mail MATCH 'slow'; <i>-- Selects rows 1 and 3</i>
488
At first glance, the final two full-text queries in the example above seem
489
to be syntactically incorrect, as there is a table name ("mail") used as
490
an SQL expression. The reason this is acceptable is that each FTS table
491
actually has a <a href="c3ref/declare_vtab.html">HIDDEN</a> column with the same name
492
as the table itself (in this case, "mail"). The value stored in this
493
column is not meaningful to the application, but can be used as the
494
left-hand operand to a MATCH operator. This special column may also be
495
passed as an argument to the <a href="fts3.html#snippet">FTS auxiliary functions</a>.
498
The following example illustrates the above. The expressions "docs",
499
"docs.docs" and "main.docs.docs" all refer to column "docs". However, the
500
expression "main.docs" does not refer to any column. It could be used to
501
refer to a table, but a table name is not allowed in the context in which
504
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Example schema</i>
505
CREATE VIRTUAL TABLE docs USING fts4(content);
507
<i>-- Example queries</i>
508
SELECT * FROM docs WHERE docs MATCH 'sqlite'; <i>-- OK.</i>
509
SELECT * FROM docs WHERE docs.docs MATCH 'sqlite'; <i>-- OK.</i>
510
SELECT * FROM docs WHERE main.docs.docs MATCH 'sqlite'; <i>-- OK.</i>
511
SELECT * FROM docs WHERE main.docs MATCH 'sqlite'; <i>-- Error.</i>
514
<h2 id="section_1_5">1.5. Summary</h2>
517
From the users point of view, FTS tables are similar to ordinary SQLite
518
tables in many ways. Data may be added to, modified within and removed
519
from FTS tables using the INSERT, UPDATE and DELETE commands just as
520
it may be with ordinary tables. Similarly, the SELECT command may be used
521
to query data. The following list summarizes the differences between FTS
526
As with all virtual table types, it is not possible to create indices or
527
triggers attached to FTS tables. Nor is it possible to use the ALTER TABLE
528
command to add extra columns to FTS tables (although it is possible to use
529
ALTER TABLE to rename an FTS table).
532
Data-types specified as part of the "CREATE VIRTUAL TABLE" statement
533
used to create an FTS table are ignored completely. Instead of the
534
normal rules for applying type <a href="datatype3.html#affinity">affinity</a> to inserted values, all
535
values inserted into FTS table columns (except the special rowid
536
column) are converted to type TEXT before being stored.
539
FTS tables permit the special alias "docid" to be used to refer to the
540
rowid column supported by all <a href="vtab.html">virtual tables</a>.
543
The <a href="fts3.html#section_3">FTS MATCH</a> operator is supported for queries based on the built-in
547
The <a href="fts3.html#snippet">FTS auxiliary functions</a>, <a href="fts3.html#snippet">snippet()</a>, <a href="fts3.html#offsets">offsets()</a>, and <a href="fts3.html#matchinfo">matchinfo()</a> are
548
available to support full-text queries.
551
<a name="hiddencol"></a>
553
Every FTS table has a <a href="vtab.html#hiddencol">hidden column</a> with the
554
same name as the table itself. The value contained in each row for the
555
hidden column is a blob that is only useful as the left operand of a
556
<a href="fts3.html#section_3">MATCH</a> operator, or as the left-most argument to one
557
of the <a href="fts3.html#snippet">FTS auxiliary functions</a>.
563
<h1 id="section_2">2. Compiling and Enabling FTS3 and FTS4</h1>
566
Although FTS3 and FTS4 are included with the SQLite core source code, they are not
567
enabled by default. To build SQLite with FTS functionality enabled, define
568
the preprocessor macro <a href="compile.html#enable_fts3">SQLITE_ENABLE_FTS3</a> when compiling. New applications
569
should also define the <a href="compile.html#enable_fts3_parenthesis">SQLITE_ENABLE_FTS3_PARENTHESIS</a> macro to enable the
570
<a href="fts3.html#section_3_1">enhanced query syntax</a> (see below). Usually, this is done by adding the
571
following two switches to the compiler command line:
573
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">-DSQLITE_ENABLE_FTS3
574
-DSQLITE_ENABLE_FTS3_PARENTHESIS
578
Note that enabling FTS3 also makes FTS4 available. There is not a separate
579
SQLITE_ENABLE_FTS4 compile-time option. A build of SQLite either supports
580
both FTS3 and FTS4 or it supports neither.
583
If using the amalgamation autoconf based build system, setting the CPPFLAGS
584
environment variable while running the 'configure' script is an easy
585
way to set these macros. For example, the following command:
587
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CPPFLAGS="-DSQLITE_ENABLE_FTS3 -DSQLITE_ENABLE_FTS3_PARENTHESIS" ./configure <configure options>
591
where <i><configure options></i> are those options normally passed to
592
the configure script, if any.
595
Because FTS3 and FTS4 are virtual tables, The <a href="compile.html#enable_fts3">SQLITE_ENABLE_FTS3</a> compile-time option
596
is incompatible with the <a href="compile.html#omit_virtualtable">SQLITE_OMIT_VIRTUALTABLE</a> option.
599
If a build of SQLite does not include the FTS modules, then any attempt to prepare an
600
SQL statement to create an FTS3 or FTS4 table or to drop or access an existing
601
FTS table in any way will fail. The error message returned will be similar
602
to "no such module: ftsN" (where N is either 3 or 4).
605
If the C version of the <a href="http://site.icu-project.org/">ICU library</a>
606
is available, then FTS may also be compiled with the SQLITE_ENABLE_ICU
607
pre-processor macro defined. Compiling with this macro enables an FTS
608
<a href="fts3.html#tokenizer">tokenizer</a> that uses the ICU library to split a document into terms
609
(words) using the conventions for a specified language and locale.
611
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">-DSQLITE_ENABLE_ICU
615
<h1 id="section_3">3. Full-text Index Queries</h1>
618
The most useful thing about FTS tables is the queries that may be
619
performed using the built-in full-text index. Full-text queries are
620
performed by specifying a clause of the form
621
"<column> MATCH <full-text query expression>" as part of the WHERE
622
clause of a SELECT statement that reads data from an FTS table.
623
<a href="fts3.html#section_1_4">Simple FTS queries</a> that return all documents that
624
contain a given term are described above. In that discussion the right-hand
625
operand of the MATCH operator was assumed to be a string consisting of a
626
single term. This section describes the more complex query types supported
627
by FTS tables, and how they may be utilized by specifying a more
628
complex query expression as the right-hand operand of a MATCH operator.
631
FTS tables support three basic query types:
634
<a name="termprefix"></a>
636
<li><p><b>Token or token prefix queries</b>.
637
An FTS table may be queried for all documents that contain a specified
638
term (the <a href="fts3.html#section_1_4">simple case</a> described above), or for
639
all documents that contain a term with a specified prefix. As we have
640
seen, the query expression for a specific term is simply the term itself.
641
The query expression used to search for a term prefix is the prefix
642
itself with a '*' character appended to it. For example:
645
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Virtual table declaration</i>
646
CREATE VIRTUAL TABLE docs USING fts3(title, body);
648
<i>-- Query for all documents containing the term "linux":</i>
649
SELECT * FROM docs WHERE docs MATCH 'linux';
651
<i>-- Query for all documents containing a term with the prefix "lin". This will match</i>
652
<i>-- all documents that contain "linux", but also those that contain terms "linear",</i>
653
<i>--"linker", "linguistic" and so on.</i>
654
SELECT * FROM docs WHERE docs MATCH 'lin*';
658
<li style="list-style:none"><p>
659
Normally, a token or token prefix query is matched against the FTS table
660
column specified as the right-hand side of the MATCH operator. Or, if the
661
special column with the same name as the FTS table itself is specified,
662
against all columns. This may be overridden by specifying a column-name
663
followed by a ":" character before a basic term query. There may be space
664
between the ":" and the term to query for, but not between the column-name
665
and the ":" character. For example:
668
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Query the database for documents for which the term "linux" appears in</i>
669
<i>-- the document title, and the term "problems" appears in either the title</i>
670
<i>-- or body of the document.</i>
671
SELECT * FROM docs WHERE docs MATCH 'title:linux problems';
673
<i>-- Query the database for documents for which the term "linux" appears in</i>
674
<i>-- the document title, and the term "driver" appears in the body of the document</i>
675
<i>-- ("driver" may also appear in the title, but this alone will not satisfy the</i>.
676
<i>-- query criteria).</i>
677
SELECT * FROM docs WHERE body MATCH 'title:linux driver';
681
<li><p><b>Phrase queries</b>.
682
A phrase query is a query that retrieves all documents that contain a
683
nominated set of terms or term prefixes in a specified order with no
684
intervening tokens. Phrase queries are specified by enclosing a space
685
separated sequence of terms or term prefixes in double quotes (").
689
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Query for all documents that contain the phrase "linux applications".</i>
690
SELECT * FROM docs WHERE docs MATCH '"linux applications"';
692
<i>-- Query for all documents that contain a phrase that matches "lin* app*". As well as</i>
693
<i>-- "linux applications", this will match common phrases such as "linoleum appliances"</i>
694
<i>-- or "link apprentice".</i>
695
SELECT * FROM docs WHERE docs MATCH '"lin* app*"';
699
<li><p><b>NEAR queries</b>.
700
A NEAR query is a query that returns documents that contain a two or
701
more nominated terms or phrases within a specified proximity of each
702
other (by default with 10 or less intervening terms). A NEAR query is
703
specified by putting the keyword "NEAR" between two phrase, term or
704
<a href="fts3.html#termprefix">prefix queries</a>. To specify a proximity other than the default,
705
an operator of the form "NEAR/<i><N></i>" may be used, where
706
<i><N></i> is the maximum number of intervening terms allowed.
710
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Virtual table declaration.</i>
711
CREATE VIRTUAL TABLE docs USING fts4();
713
<i>-- Virtual table data.</i>
714
INSERT INTO docs VALUES('SQLite is an ACID compliant embedded relational database management system');
716
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
717
<i>-- not more than 10 intervening terms. This matches the only document in</i>
718
<i>-- table docs (since there are only six terms between "SQLite" and "database"</i>
719
<i>-- in the document)</i>.
720
SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR database';
722
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
723
<i>-- not more than 6 intervening terms. This also matches the only document in</i>
724
<i>-- table docs. Note that the order in which the terms appear in the document</i>
725
<i>-- does not have to be the same as the order in which they appear in the query.</i>
726
SELECT * FROM docs WHERE docs MATCH 'database NEAR/6 sqlite';
728
<i>-- Search for a document that contains the terms "sqlite" and "database" with</i>
729
<i>-- not more than 5 intervening terms. This query matches no documents.</i>
730
SELECT * FROM docs WHERE docs MATCH 'database NEAR/5 sqlite';
732
<i>-- Search for a document that contains the phrase "ACID compliant" and the term</i>
733
<i>-- "database" with not more than 2 terms separating the two. This matches the</i>
734
<i>-- document stored in table docs.</i>
735
SELECT * FROM docs WHERE docs MATCH 'database NEAR/2 "ACID compliant"';
737
<i>-- Search for a document that contains the phrase "ACID compliant" and the term</i>
738
<i>-- "sqlite" with not more than 2 terms separating the two. This also matches</i>
739
<i>-- the only document stored in table docs.</i>
740
SELECT * FROM docs WHERE docs MATCH '"ACID compliant" NEAR/2 sqlite';
744
<li style="list-style: none"><p>
745
More than one NEAR operator may appear in a single query. In this case each
746
pair of terms or phrases separated by a NEAR operator must appear within the
747
specified proximity of each other in the document. Using the same table and
748
data as in the block of examples above:
751
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">
752
<i>-- The following query selects documents that contains an instance of the term </i>
753
<i>-- "sqlite" separated by two or fewer terms from an instance of the term "acid",</i>
754
<i>-- which is in turn separated by two or fewer terms from an instance of the term</i>
755
<i>-- "relational". As it happens, the only document in table docs satisfies this criteria.</i>
756
SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR/2 acid NEAR/2 relational';
758
<i>-- This query matches no documents. There is an instance of the term "sqlite" with</i>
759
<i>-- sufficient proximity to an instance of "acid" but it is not sufficiently close</i>
760
<i>-- to an instance of the term "relational".</i>
761
SELECT * FROM docs WHERE docs MATCH 'acid NEAR/2 sqlite NEAR/2 relational';
765
Phrase and NEAR queries may not span multiple columns within a row.
768
The three basic query types described above may be used to query the full-text
769
index for the set of documents that match the specified criteria. Using the
770
FTS query expression language it is possible to perform various set
771
operations on the results of basic queries. There are currently three
772
supported operations:
775
<li> The AND operator determines the <b>intersection</b> of two sets of documents.
777
<li> The OR operator calculates the <b>union</b> of two sets of documents.
779
<li> The NOT operator (or, if using the standard syntax, a unary "-" operator)
780
may be used to compute the <b>relative complement</b> of one set of
781
documents with respect to another.
785
The FTS modules may be compiled to use one of two slightly different versions
786
of the full-text query syntax, the "standard" query syntax and the "enhanced"
787
query syntax. The basic term, term-prefix, phrase and NEAR queries described
788
above are the same in both versions of the syntax. The way in which set
789
operations are specified is slightly different. The following two sub-sections
790
describe the part of the two query syntaxes that pertains to set operations.
791
Refer to the description of how to <a href="fts3.html#section_2">compile fts</a> for compilation notes.
793
<h2 id="section_3_1">3.1.
794
Set Operations Using The Enhanced Query Syntax</h2>
797
The enhanced query syntax supports the AND, OR and NOT binary set operators.
798
Each of the two operands to an operator may be a basic FTS query, or the
799
result of another AND, OR or NOT set operation. Operators must be entered
800
using capital letters. Otherwise, they are interpreted as basic term queries
801
instead of set operators.
804
The AND operator may be implicitly specified. If two basic queries appear
805
with no operator separating them in an FTS query string, the results are
806
the same as if the two basic queries were separated by an AND operator.
807
For example, the query expression "implicit operator" is a more succinct
808
version of "implicit AND operator".
810
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Virtual table declaration</i>
811
CREATE VIRTUAL TABLE docs USING fts3();
813
<i>-- Virtual table data</i>
814
INSERT INTO docs(docid, content) VALUES(1, 'a database is a software system');
815
INSERT INTO docs(docid, content) VALUES(2, 'sqlite is a software system');
816
INSERT INTO docs(docid, content) VALUES(3, 'sqlite is a database');
818
<i>-- Return the set of documents that contain the term "sqlite", and the</i>
819
<i>-- term "database". This query will return the document with docid 3 only.</i>
820
SELECT * FROM docs WHERE docs MATCH 'sqlite AND database';
822
<i>-- Again, return the set of documents that contain both "sqlite" and</i>
823
<i>-- "database". This time, use an implicit AND operator. Again, document</i>
824
<i>-- 3 is the only document matched by this query. </i>
825
SELECT * FROM docs WHERE docs MATCH 'database sqlite';
827
<i>-- Query for the set of documents that contains either "sqlite" or "database".</i>
828
<i>-- All three documents in the database are matched by this query.</i>
829
SELECT * FROM docs WHERE docs MATCH 'sqlite OR database';
831
<i>-- Query for all documents that contain the term "database", but do not contain</i>
832
<i>-- the term "sqlite". Document 1 is the only document that matches this criteria.</i>
833
SELECT * FROM docs WHERE docs MATCH 'database NOT sqlite';
835
<i>-- The following query matches no documents. Because "and" is in lowercase letters,</i>
836
<i>-- it is interpreted as a basic term query instead of an operator. Operators must</i>
837
<i>-- be specified using capital letters. In practice, this query will match any documents</i>
838
<i>-- that contain each of the three terms "database", "and" and "sqlite" at least once.</i>
839
<i>-- No documents in the example data above match this criteria.</i>
840
SELECT * FROM docs WHERE docs MATCH 'database and sqlite';
844
The examples above all use basic full-text term queries as both operands of
845
the set operations demonstrated. Phrase and NEAR queries may also be used,
846
as may the results of other set operations. When more than one set operation
847
is present in an FTS query, the precedence of operators is as follows:
849
<table style="margin:1em auto; width:80%; border-spacing:0">
850
<tr style="text-align:left"><th>Operator<th>Enhanced Query Syntax Precedence
851
<tr style="text-align:left;background-color:#DDDDDD"><td>NOT <td> Highest precedence (tightest grouping).
852
<tr style="text-align:left"><td>AND <td>
853
<tr style="text-align:left;background-color:#DDDDDD"><td>OR <td> Lowest precedence (loosest grouping).
857
When using the enhanced query syntax, parenthesis may be used to override
858
the default precedence of the various operators. For example:
860
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Return the docid values associated with all documents that contain the</i>
861
<i>-- two terms "sqlite" and "database", and/or contain the term "library".</i>
862
SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database OR library';
864
<i>-- This query is equivalent to the above.</i>
865
SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database'
867
SELECT docid FROM docs WHERE docs MATCH 'library';
869
<i>-- Query for the set of documents that contains the term "linux", and at least</i>
870
<i>-- one of the phrases "sqlite database" and "sqlite library".</i>
871
SELECT docid FROM docs WHERE docs MATCH '("sqlite database" OR "sqlite library") AND linux';
873
<i>-- This query is equivalent to the above.</i>
874
SELECT docid FROM docs WHERE docs MATCH 'linux'
877
SELECT docid FROM docs WHERE docs MATCH '"sqlite library"'
879
SELECT docid FROM docs WHERE docs MATCH '"sqlite database"'
884
<h2 id="section_3_2">3.2. Set Operations Using The Standard Query Syntax</h2>
887
FTS query set operations using the standard query syntax are similar, but
888
not identical, to set operations with the enhanced query syntax. There
889
are four differences, as follows:
892
<li value="1"><p> Only the implicit version of the AND operator is supported.
893
Specifying the string "AND" as part of a standard query syntax query is
894
interpreted as a term query for the set of documents containing the term
899
<li value="2"><p> Parenthesis are not supported.
903
<li value="3"><p> The NOT operator is not supported. Instead of the NOT
904
operator, the standard query syntax supports a unary "-" operator that
905
may be applied to basic term and term-prefix queries (but not to phrase
906
or NEAR queries). A term or term-prefix that has a unary "-" operator
907
attached to it may not appear as an operand to an OR operator. An FTS
908
query may not consist entirely of terms or term-prefix queries with unary
909
"-" operators attached to them.
912
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Search for the set of documents that contain the term "sqlite" but do</i>
913
<i>-- not contain the term "database".</i>
914
SELECT * FROM docs WHERE docs MATCH 'sqlite -database';
918
<li value="4"><p> The relative precedence of the set operations is different.
919
In particular, using the standard query syntax the "OR" operator has a
920
higher precedence than "AND". The precedence of operators when using the
921
standard query syntax is:
924
<table style="margin:1em auto; width:80%; border-spacing:0">
925
<tr style="text-align:left"><th>Operator<th>Standard Query Syntax Precedence
926
<tr style="text-align:left;background-color:#DDDDDD"><td>Unary "-" <td> Highest precedence (tightest grouping).
927
<tr style="text-align:left"><td>OR <td>
928
<tr style="text-align:left;background-color:#DDDDDD"><td>AND <td> Lowest precedence (loosest grouping).
931
<ol><li style="list-style:none">
932
The following example illustrates precedence of operators using the standard
936
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Search for documents that contain at least one of the terms "database"</i>
937
<i>-- and "sqlite", and also contain the term "library". Because of the differences</i>
938
<i>-- in operator precedences, this query would have a different interpretation using</i>
939
<i>-- the enhanced query syntax.</i>
940
SELECT * FROM docs WHERE docs MATCH 'sqlite OR database library';
943
<a name="snippet"></a>
945
<h1 id="section_4">4. Auxiliary Functions - Snippet, Offsets and Matchinfo</h1>
948
The FTS3 and FTS4 modules provide three special SQL scalar functions that may be useful
949
to the developers of full-text query systems: "snippet", "offsets" and
950
"matchinfo". The purpose of the "snippet" and "offsets" functions is to allow
951
the user to identify the location of queried terms in the returned documents.
952
The "matchinfo" function provides the user with metrics that may be useful
953
for filtering or sorting query results according to relevance.
956
The first argument to all three special SQL scalar functions
957
must be the <a href="fts3.html#hiddencol">FTS hidden column</a> of the FTS table that the function is
958
applied to. The <a href="fts3.html#hiddencol">FTS hidden column</a> is an automatically-generated column found on
959
all FTS tables that has the same name as the FTS table itself.
960
For example, given an FTS table named "mail":
962
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT offsets(mail) FROM mail WHERE mail MATCH <full-text query expression>;
963
SELECT snippet(mail) FROM mail WHERE mail MATCH <full-text query expression>;
964
SELECT matchinfo(mail) FROM mail WHERE mail MATCH <full-text query expression>;
968
The three auxiliary functions are only useful within a SELECT statement that
969
uses the FTS table's full-text index. If used within a SELECT that uses
970
the "query by rowid" or "linear scan" strategies, then the snippet and
971
offsets both return an empty string, and the matchinfo function returns
972
a blob value zero bytes in size.
975
All three auxiliary functions extract a set of "matchable phrases" from
976
the FTS query expression to work with. The set of matchable phrases for
977
a given query consists of all phrases (including unquoted tokens and
978
token prefixes) in the expression except those that are prefixed with
979
a unary "-" operator (standard syntax) or are part of a sub-expression
980
that is used as the right-hand operand of a NOT operator.
983
With the following provisos, each series of tokens in the FTS table that
984
matches one of the matchable phrases in the query expression is known as a
988
<li> If a matchable phrase is part of a series of phrases connected by
989
NEAR operators in the FTS query expression, then each phrase match
990
must be sufficiently close to other phrase matches of the relevant
991
types to satisfy the NEAR condition.
993
<li> If the matchable phrase in the FTS query is restricted to matching
994
data in a specified FTS table column, then only phrase matches that
995
occur within that column are considered.
998
<a name="offsets"></a>
1000
<h2 id="section_4_1">4.1. The Offsets Function</h2>
1003
For a SELECT query that uses the full-text index, the offsets() function
1004
returns a text value containing a series of space-separated integers. For
1005
each term in each <a href="#matchable">phrase match</a> of the current row,
1006
there are four integers in the returned list. Each set of four integers is
1007
interpreted as follows:
1009
<table style="margin:1em auto; width:80%; border-spacing:0">
1010
<tr style="text-align:left"><th>Integer <th>Interpretation
1011
<tr style="text-align:left;background-color:#DDDDDD"><td>0
1012
<td>The column number that the term instance occurs in (0 for the
1013
leftmost column of the FTS table, 1 for the next leftmost, etc.).
1014
<tr style="text-align:left"><td>1
1015
<td>The term number of the matching term within the full-text query
1016
expression. Terms within a query expression are numbered starting
1017
from 0 in the order that they occur.
1018
<tr style="text-align:left;background-color:#DDDDDD"><td>2
1019
<td>The byte offset of the matching term within the column.
1020
<tr style="text-align:left"><td>3
1021
<td>The size of the matching term in bytes.
1025
The following block contains examples that use the offsets function.
1027
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE VIRTUAL TABLE mail USING fts3(subject, body);
1028
INSERT INTO mail VALUES('hello world', 'This message is a hello world message.');
1029
INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail');
1031
<i>-- The following query returns a single row (as it matches only the first</i>
1032
<i>-- entry in table "mail". The text returned by the offsets function is</i>
1033
<i>-- "0 0 6 5 1 0 24 5".</i>
1035
<i>-- The first set of four integers in the result indicate that column 0</i>
1036
<i>-- contains an instance of term 0 ("world") at byte offset 6. The term instance</i>
1037
<i>-- is 5 bytes in size. The second set of four integers shows that column 1</i>
1038
<i>-- of the matched row contains an instance of term 0 ("world") at byte offset</i>
1039
<i>-- 24. Again, the term instance is 5 bytes in size.</i>
1040
SELECT offsets(mail) FROM mail WHERE mail MATCH 'world';
1042
<i>-- The following query returns also matches only the first row in table "mail".</i>
1043
<i>-- In this case the returned text is "1 0 5 7 1 0 30 7".</i>
1044
SELECT offsets(mail) FROM mail WHERE mail MATCH 'message';
1046
<i>-- The following query matches the second row in table "mail". It returns the</i>
1047
<i>-- text "1 0 28 7 1 1 36 4". Only those occurrences of terms "serious" and "mail"</i>
1048
<i>-- that are part of an instance of the phrase "serious mail" are identified; the</i>
1049
<i>-- other occurrences of "serious" and "mail" are ignored.</i>
1050
SELECT offsets(mail) FROM mail WHERE mail MATCH '"serious mail"';
1053
<a name="snippet"></a>
1055
<h2 id="section_4_2">4.2. The Snippet Function</h2>
1058
The snippet function is used to create formatted fragments of document text
1059
for display as part of a full-text query results report. The snippet function
1060
may be passed between one and six arguments, as follows:
1062
<table style="margin:1em auto; width:80%; border-spacing:0">
1063
<tr style="text-align:left"><th>Argument <th>Default Value <th>Description
1064
<tr style="text-align:left;background-color:#DDDDDD"><td>0 <td>N/A
1065
<td> The first argument to the snippet function must always be the <a href="fts3.html#hiddencol">FTS hidden column</a>
1066
of the FTS table being queried and from which the snippet is to be taken. The
1067
<a href="fts3.html#hiddencol">FTS hidden column</a> is an automatically generated column with the same name as the
1069
<tr style="text-align:left"><td>1 <td>"<b>"
1070
<td> The "start match" text.
1071
<tr style="text-align:left;background-color:#DDDDDD"><td>2 <td>"</b>"
1072
<td> The "end match" text.
1073
<tr style="text-align:left"><td>3 <td>"<b>...</b>"
1074
<td> The "ellipses" text.
1075
<tr style="text-align:left;background-color:#DDDDDD"><td>4 <td>-1
1076
<td> The FTS table column number to extract the returned fragments of
1077
text from. Columns are numbered from left to right starting with
1078
zero. A negative value indicates that the text may be extracted
1080
<tr style="text-align:left"><td>5 <td>-15
1081
<td> The absolute value of this integer argument is used as the
1082
(approximate) number of tokens to include in the returned text
1083
value. The maximum allowable absolute value is 64. The value of
1084
this argument is referred to as <i>N</i> in the discussion below.
1088
The snippet function first attempts to find a fragment of text consisting
1089
of <i>|N|</i> tokens within the current row that contains at least one phrase
1090
match for each matchable phrase matched somewhere in the current row,
1091
where <i>|N|</i> is the absolute value of the sixth argument passed to the
1092
snippet function. If the text stored in a single column contains less than
1093
<i>|N|</i> tokens, then the entire column value is considered. Text fragments
1094
may not span multiple columns.
1097
If such a text fragment can be found, it is returned with the following
1101
<li> If the text fragment does not begin at the start of a column value,
1102
the "ellipses" text is prepended to it.
1103
<li> If the text fragment does not finish at the end of a column value,
1104
the "ellipses" text is appended to it.
1105
<li> For each token in the text fragment that is part of a phrase match,
1106
the "start match" text is inserted into the fragment before the token,
1107
and the "end match" text is inserted immediately after it.
1111
If more than one such fragment can be found, then fragments that contain
1112
a larger number of "extra" phrase matches are favored. The start of
1113
the selected text fragment may be moved a few tokens forward or backward
1114
to attempt to concentrate the phrase matches toward the center of the
1118
Assuming <i>N</i> is a positive value, if no fragments can be found that
1119
contain a phrase match corresponding to each matchable phrase, the snippet
1120
function attempts to find two fragments of approximately <i>N</i>/2 tokens
1121
that between them contain at least one phrase match for each matchable phrase
1122
matched by the current row. If this fails, attempts are made to find three
1123
fragments of <i>N</i>/3 tokens each and finally four <i>N</i>/4 token
1124
fragments. If a set of four fragments cannot be found that encompasses the
1125
required phrase matches, the four fragments of <i>N</i>/4 tokens that provide
1126
the best coverage are selected.
1129
If <i>N</i> is a negative value, and no single fragment can be found
1130
containing the required phrase matches, the snippet function searches
1131
for two fragments of <i>|N|</i> tokens each, then three, then four. In
1132
other words, if the specified value of <i>N</i> is negative, the sizes
1133
of the fragments is not decreased if more than one fragment is required
1134
to provide the desired phrase match coverage.
1137
After the <i>M</i> fragments have been located, where <i>M</i> is between
1138
two and four as described in the paragraphs above, they are joined together
1139
in sorted order with the "ellipses" text separating them. The three
1140
modifications enumerated earlier are performed on the text before it is
1143
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><b>Note: In this block of examples, newlines and whitespace characters have
1144
been inserted into the document inserted into the FTS table, and the expected
1145
results described in SQL comments. This is done to enhance readability only,
1146
they would not be present in actual SQLite commands or output.</b>
1148
<i>-- Create and populate an FTS table.</i>
1149
CREATE VIRTUAL TABLE text USING fts4();
1150
INSERT INTO text VALUES('
1151
During 30 Nov-1 Dec, 2-3oC drops. Cool in the upper portion, minimum temperature 14-16oC
1152
and cool elsewhere, minimum temperature 17-20oC. Cold to very cold on mountaintops,
1153
minimum temperature 6-12oC. Northeasterly winds 15-30 km/hr. After that, temperature
1154
increases. Northeasterly winds 15-30 km/hr.
1157
<i>-- The following query returns the text value:</i>
1159
<i>-- "<b>...</b>cool elsewhere, minimum temperature 17-20oC. <b>Cold</b> to very </i>
1160
<i>-- <b>cold</b> on mountaintops, minimum temperature 6<b>...</b>".</i>
1162
SELECT snippet(text) FROM text WHERE text MATCH 'cold';
1164
<i>-- The following query returns the text value:</i>
1166
<i>-- "...the upper portion, [minimum] [temperature] 14-16oC and cool elsewhere,</i>
1167
<i>-- [minimum] [temperature] 17-20oC. Cold..."</i>
1169
SELECT snippet(text, '[ ']', '...') FROM text WHERE text MATCH '"min* tem*"'
1172
<h2 id="matchinfo">4.3. The Matchinfo Function</h2>
1175
The matchinfo function returns a blob value. If it is used within a query
1176
that does not use the full-text index (a "query by rowid" or "linear scan"),
1177
then the blob is zero bytes in size. Otherwise, the blob consists of zero
1178
or more 32-bit unsigned integers in machine byte-order. The exact number
1179
of integers in the returned array depends on both the query and the value
1180
of the second argument (if any) passed to the matchinfo function.
1183
The matchinfo function is called with either one or two arguments. As for
1184
all auxiliary functions, the first argument must be the special
1185
<a href="fts3.html#hiddencol">FTS hidden column</a>. The second argument, if it is specified, must be a text value
1186
comprised only of the characters 'p', 'c', 'n', 'a', 'l', 's' and 'x'.
1187
If no second argument is explicitly supplied, it defaults to "pcx". The
1188
second argument is referred to as the "format string" below.
1191
Characters in the matchinfo format string are processed from left to right.
1192
Each character in the format string causes one or more 32-bit unsigned
1193
integer values to be added to the returned array. The "values" column in
1194
the following table contains the number of integer values appended to the
1195
output buffer for each supported format string character. In the formula
1196
given, <i>cols</i> is the number of columns in the FTS table, and
1197
<i>phrases</i> is the number of <a href="#matchable">matchable phrases</a> in
1200
<table style="margin:1em auto; width:80%; border-spacing:0">
1201
<tr style="text-align:left"><th>Character<th>Values<th>Description
1202
<tr style="text-align:left;background-color:#DDDDDD"><td>p <td>1 <td>The number of matchable phrases in the query.
1203
<tr style="text-align:left"><td>c <td>1 <td>The number of user defined columns in the FTS
1204
table (i.e. not including the docid or the <a href="fts3.html#hiddencol">FTS hidden column</a>).
1205
<tr style="text-align:left;background-color:#DDDDDD"><td>x <td style="white-space:nowrap">3 * <i>cols</i> * <i>phrases</i>
1207
For each distinct combination of a phrase and table column, the
1208
following three values:
1210
<li> In the current row, the number of times the phrase appears in
1212
<li> The total number of times the phrase appears in the column in
1213
all rows in the FTS table.
1214
<li> The total number of rows in the FTS table for which the
1215
column contains at least one instance of the phrase.
1217
The first set of three values corresponds to the left-most column
1218
of the table (column 0) and the left-most matchable phrase in the
1219
query (phrase 0). If the table has more than one column, the second
1220
set of three values in the output array correspond to phrase 0 and
1221
column 1. Followed by phrase 0, column 2 and so on for all columns of
1222
the table. And so on for phrase 1, column 0, then phrase 1, column 1
1223
etc. In other words, the data for occurrences of phrase <i>p</i> in
1224
column <i>c</i> may be found using the following formula:
1226
hits_this_row = array[3 * (c + p*cols) + 0]
1227
hits_all_rows = array[3 * (c + p*cols) + 1]
1228
docs_with_hits = array[3 * (c + p*cols) + 2]
1231
<tr style="text-align:left"><td>n <td>1 <td>The number of rows in the FTS4 table. This value is
1232
only available when querying FTS4 tables, not FTS3.
1233
<tr style="text-align:left;background-color:#DDDDDD"><td>a <td><i>cols</i> <td>For each column, the average number of
1234
tokens in the text values stored in the column (considering all rows in
1235
the FTS4 table). This value is only available when querying FTS4 tables,
1237
<tr style="text-align:left"><td>l <td><i>cols</i> <td>
1238
For each column, the length of the value stored in the current row of the
1239
FTS4 table, in tokens. This value is only available when querying
1240
FTS4 tables, not FTS3. And only if the "matchinfo=fts3" directive was not
1241
specified as part of the "CREATE VIRTUAL TABLE" statement used to create
1243
<tr style="text-align:left;background-color:#DDDDDD"><td>s <td><i>cols</i> <td>For each column, the length of the longest
1244
subsequence of phrase matches that the column value has in common
1245
with the query text. For example, if a table column contains the text
1246
'a b c d e' and the query is 'a c "d e"', then the length of the longest
1247
common subsequence is 2 (phrase "c" followed by phrase "d e").
1255
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create and populate an FTS4 table with two columns:</i>
1256
CREATE VIRTUAL TABLE t1 USING fts4(a, b);
1257
INSERT INTO t1 VALUES('transaction default models default', 'Non transaction reads');
1258
INSERT INTO t1 VALUES('the default transaction', 'these semantics present');
1259
INSERT INTO t1 VALUES('single request', 'default data');
1261
<i>-- In the following query, no format string is specified and so it defaults</i>
1262
<i>-- to "pcx". It therefore returns a single row consisting of a single blob</i>
1263
<i>-- value 80 bytes in size (20 32-bit integers - 1 for "p", 1 for "c" and</i>
1264
<i>-- 3*2*3 for "x"). If each block of 4 bytes in</i> the blob is interpreted</i>
1265
<i>-- as an unsigned integer in machine byte-order, the values will be:</i>
1267
<i>-- 3 2 1 3 2 0 1 1 1 2 2 0 1 1 0 0 0 1 1 1</i>
1269
<i>-- The row returned corresponds to the second entry inserted into table t1.</i>
1270
<i>-- The first two integers in the blob show that the query contained three</i>
1271
<i>-- phrases and the table being queried has two columns. The next block of</i>
1272
<i>-- three integers describes column 0 (in this case column "a") and phrase</i>
1273
<i>-- 0 (in this case "default"). The current row contains 1 hit for "default"</i>
1274
<i>-- in column 0, of a total of 3 hits for "default" that occur in column</i>
1275
<i>-- 0 of any table row. The 3 hits are spread across 2 different rows.</i>
1277
<i>-- The next set of three integers (0 1 1) pertain to the hits for "default"</i>
1278
<i>-- in column 1 of the table (0 in this row, 1 in all rows, spread across </i>
1281
SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semantics"';
1283
<i>-- The format string for this query is "ns". The output array will therefore</i>
1284
<i>-- contain 3 integer values - 1 for "n" and 2 for "s". The query returns</i>
1285
<i>-- two rows (the first two rows in the table match). The values returned are:</i>
1290
<i>-- The first value in the matchinfo array returned for both rows is 3 (the </i>
1291
<i>-- number of rows in the table). The following two values are the lengths </i>
1292
<i>-- of the longest common subsequence of phrase matches in each column.</i>
1293
SELECT matchinfo(t1, 'ns') FROM t1 WHERE t1 MATCH 'default transaction';
1297
The matchinfo function is much faster than either the snippet or offsets
1298
functions. This is because the implementation of both snippet and offsets
1299
is required to retrieve the documents being analyzed from disk, whereas
1300
all data required by matchinfo is available as part of the same portions
1301
of the full-text index that are required to implement the full-text query
1302
itself. This means that of the following two queries, the first may be
1303
an order of magnitude faster than the second:
1305
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT docid, matchinfo(tbl) FROM tbl WHERE tbl MATCH <query expression>;
1306
SELECT docid, offsets(tbl) FROM tbl WHERE tbl MATCH <query expression>;
1310
The matchinfo function provides all the information required to calculate
1311
probabilistic "bag-of-words" relevancy scores such as
1312
<a href="http://en.wikipedia.org/wiki/Okapi_BM25">Okapi BM25/BM25F</a> that may
1313
be used to order results in a full-text search application. Appendix A of this
1314
document, "<a href="fts3.html#appendix_a">search application tips</a>", contains an example of using the
1315
matchinfo() function efficiently.
1317
<h1 id="fts4aux">5. Fts4aux - Direct Access to the Full-Text Index</h1>
1320
As of version 3.7.6, SQLite includes a new virtual table module called
1321
"fts4aux", which can be used to inspect the full-text index of an existing
1322
FTS table directly. Despite its name, fts4aux works just as well with FTS3
1323
tables as it does with FTS4 tables. Fts4aux tables are read-only. The only
1324
way to modify the contents of an fts4aux table is by modifying the
1325
contents of the associated FTS table. The fts4aux module is automatically
1326
included in all <a href="fts3.html#section_2">builds that include FTS</a>.
1329
An fts4aux virtual table is constructed with a single argument - the
1330
unqualified name of the FTS table that it will be used to access.
1333
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS4 table</i>
1334
CREATE VIRTUAL TABLE ft USING fts4(x, y);
1336
<i>-- Create an fts4aux table to access the full-text index for table "ft"</i>
1337
CREATE VIRTUAL TABLE ft_terms USING fts4aux(ft);
1341
For each term present in the FTS table, there are between 2 and N+1 rows
1342
in the fts4aux table, where N is the number of user-defined columns in
1343
the associated FTS table. An fts4aux table always has the same four columns,
1344
as follows, from left to right:
1346
<table style="margin:1em auto; width:80%; border-spacing:0">
1347
<tr style="text-align:left"><th>Column Name<th>Column Contents
1348
<tr style="text-align:left;background-color:#DDDDDD"><td>term<td>
1349
Contains the text of the term for this row.
1350
<tr style="text-align:left"><td>col<td>
1351
This column may contain either the text value '*' (i.e. a single
1352
character, U+002a) or an integer between 0 and N-1, where N is
1353
again the number of user-defined columns in the corresponding FTS table.
1355
<tr style="text-align:left;background-color:#DDDDDD"><td>documents<td>
1356
This column always contains an integer value greater than zero.
1358
If the "col" column contains the value '*', then this column
1359
contains the number of rows of the FTS table that contain at least one
1360
instance of the term (in any column). If col contains an integer
1361
value, then this column contains the number of rows of the FTS table that
1362
contain at least one instance of the term in the column identified by
1363
the col value. As usual, the columns of the FTS table are numbered
1364
from left to right, starting with zero.
1366
<tr style="text-align:left"><td>occurrences<td>
1367
This column also always contains an integer value greater than zero.
1369
If the "col" column contains the value '*', then this column
1370
contains the total number of instances of the term in all rows of the
1371
FTS table (in any column). Otherwise, if col contains an integer
1372
value, then this column contains the total number of instances of the
1373
term that appear in the FTS table column identified by the col
1378
For example, using the tables created above:
1380
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">INSERT INTO ft(x, y) VALUES('Apple banana', 'Cherry');
1381
INSERT INTO ft(x, y) VALUES('Banana Date Date', 'cherry');
1382
INSERT INTO ft(x, y) VALUES('Cherry Elderberry', 'Elderberry');
1384
<i>-- The following query returns this data:</i>
1386
<i>-- apple | * | 1 | 1</i>
1387
<i>-- apple | 0 | 1 | 1</i>
1388
<i>-- banana | * | 2 | 2</i>
1389
<i>-- banana | 0 | 2 | 2</i>
1390
<i>-- cherry | * | 3 | 3</i>
1391
<i>-- cherry | 0 | 1 | 1</i>
1392
<i>-- cherry | 1 | 2 | 2</i>
1393
<i>-- date | * | 1 | 2</i>
1394
<i>-- date | 0 | 1 | 2</i>
1395
<i>-- elderberry | * | 1 | 2</i>
1396
<i>-- elderberry | 1 | 1 | 1</i>
1397
<i>-- elderberry | 1 | 1 | 1</i>
1399
SELECT term, col, documents, occurrences FROM ft_terms;
1403
In the example, the values in the "term" column are all lower case,
1404
even though they were inserted into table "ft" in mixed case. This is because
1405
an fts4aux table contains the terms as extracted from the document text
1406
by the <a href="fts3.html#tokenizer">tokenizer</a>. In this case, since table "ft" uses the
1407
<a href="fts3.html#tokenizer">simple tokenizer</a>, this means all terms have been folded to
1408
lower case. Also, there is (for example) no row with column "term"
1409
set to "apple" and column "col" set to 1. Since there are no instances
1410
of the term "apple" in column 1, no row is present in the fts4aux table.
1413
During a transaction, some of the data written to an FTS table may be
1414
cached in memory and written to the database only when the transaction is
1415
committed. However the implementation of the fts4aux module is only able
1416
to read data from the database. In practice this means that if an fts4aux
1417
table is queried from within a transaction in which the associated
1418
FTS table has been modified, the results of the query are likely to reflect
1419
only a (possibly empty) subset of the changes made.
1421
<h1 id="fts4_options">6. FTS4 Options</h1>
1424
If the "CREATE VIRTUAL TABLE" statement specifies module FTS4 (not FTS3),
1425
then special directives - FTS4 options - similar to the "tokenize=*" option
1426
may also appear in place of column names. An FTS4 option consists of the
1427
option name, followed by an "=" character, followed by the option value.
1428
The option value may optionally be enclosed in single or double quotes, with
1429
embedded quote characters escaped in the same way as for SQL literals. There
1430
may not be whitespace on either side of the "=" character. For example,
1431
to create an FTS4 table with the value of option "matchinfo" set to "fts3":
1433
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create a reduced-footprint FTS4 table.</i>
1434
CREATE VIRTUAL TABLE papers USING fts4(author, document, matchinfo=fts3);
1438
FTS4 currently supports the following options:
1440
<table style="margin:1em auto; width:80%; border-spacing:0">
1441
<tr style="text-align:left"><th>Option<th>Interpretation
1442
<tr style="text-align:left;background-color:#DDDDDD"><td>compress<td>
1443
The compress option is used to specify the compress function. It is an error to
1444
specify a compress function without also specifying an uncompress
1445
function. <a href="fts3.html#section_6_1">See below</a> for details.
1447
<tr style="text-align:left"><td>content<td>
1448
The content allows the text being indexed to be
1449
stored in a separate table distinct from the FTS4 table,
1450
or even outside of SQLite.
1452
<tr style="text-align:left;background-color:#DDDDDD"><td>languageid<td>
1453
The languageid option causes the FTS4 table to have an additional hidden
1454
integer column that identifies the language of the text contained in
1455
each row. The use of the languageid option allows the same FTS4 table
1456
to hold text in multiple languages or scripts, each with different tokenizer
1457
rules, and to query each language independently of the others.
1459
<tr style="text-align:left"><td>matchinfo<td>
1460
When set to the value "fts3", the matchinfo option reduces the amount of
1461
information stored by FTS4 with the consequence that the "l" option of
1462
<a href="fts3.html#matchinfo">matchinfo()</a> is no longer available.
1464
<tr style="text-align:left;background-color:#DDDDDD"><td>order<td>
1465
<a name="fts4order"></a>
1467
The "order" option may be set to either "DESC" or "ASC" (in upper or
1468
lower case). If it is set to "DESC", then FTS4 stores its data in such
1469
a way as to optimize returning results in descending order by docid.
1470
If it is set to "ASC" (the default), then the data structures are
1471
optimized for returning results in ascending order by docid. In other
1472
words, if many of the queries run against the FTS4 table use "ORDER BY
1473
docid DESC", then it may improve performance to add the "order=desc"
1474
option to the CREATE VIRTUAL TABLE statement.
1476
<tr style="text-align:left"><td>prefix<td>
1477
This option may be set to a comma-separated list of positive non-zero
1478
integers. For each integer N in the list, a separate index is created
1479
in the database file to optimize <a href="fts3.html#termprefix">prefix queries</a> where
1480
the query term is N bytes in length, not including the '*' character,
1481
when encoded using UTF-8. <a href="fts3.html#section_6_5">See below</a> for details.
1483
<tr style="text-align:left;background-color:#DDDDDD"><td>uncompress<td>
1484
This option is used to specify the uncompress function. It is an error to
1485
specify an uncompress function without also specifying a compress
1486
function. <a href="fts3.html#section_6_1">See below</a> for details.
1490
When using FTS4, specifying a column name that contains an "=" character
1491
and is not either a "tokenize=*" specification or a recognized FTS4 option
1492
is an error. With FTS3, the first token in the unrecognized directive is
1493
interpreted as a column name. Similarly, specifying multiple "tokenize=*"
1494
directives in a single table declaration is an error when using FTS4, whereas
1495
the second and subsequent "tokenize=*" directives are interpreted as column
1496
names by FTS3. For example:
1498
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- An error. FTS4 does not recognize the directive "xyz=abc".</i>
1499
CREATE VIRTUAL TABLE papers USING fts4(author, document, xyz=abc);
1501
<i>-- Create an FTS3 table with three columns - "author", "document"</i>
1502
<i>-- and "xyz".</i>
1503
CREATE VIRTUAL TABLE papers USING fts3(author, document, xyz=abc);
1505
<i>-- An error. FTS4 does not allow multiple tokenize=* directives</i>
1506
CREATE VIRTUAL TABLE papers USING fts4(tokenize=porter, tokenize=simple);
1508
<i>-- Create an FTS3 table with a single column named "tokenize". The</i>
1509
<i>-- table uses the "porter" tokenizer.</i>
1510
CREATE VIRTUAL TABLE papers USING fts3(tokenize=porter, tokenize=simple);
1512
<i>-- An error. Cannot create a table with two columns named "tokenize".</i>
1513
CREATE VIRTUAL TABLE papers USING fts3(tokenize=porter, tokenize=simple, tokenize=icu);
1516
<a name="*fts4compression"></a>
1518
<h2 id="section_6_1">6.1. The compress= and uncompress= options</h2>
1521
The compress and uncompress options allow FTS4 content to be stored in
1522
the database in a compressed form. Both options should be set to the name
1523
of an SQL scalar function registered using <a href="c3ref/create_function.html">sqlite3_create_function()</a>
1524
that accepts a single argument.
1527
The compress function should return a compressed version of the value
1528
passed to it as an argument. Each time data is written to the FTS4 table,
1529
each column value is passed to the compress function and the result value
1530
stored in the database. The compress function may return any type of SQLite
1531
value (blob, text, real, integer or null).
1534
The uncompress function should uncompress data previously compressed by
1535
the compress function. In other words, for all SQLite values X, it should
1536
be true that uncompress(compress(X)) equals X. When data that has been
1537
compressed by the compress function is read from the database by FTS4, it
1538
is passed to the uncompress function before it is used.
1541
If the specified compress or uncompress functions do not exist, the table
1542
may still be created. An error is not returned until the FTS4 table is
1543
read (if the uncompress function does not exist) or written (if it is the
1544
compress function that does not exist).
1546
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS4 table that stores data in compressed form. This</i>
1547
<i>-- assumes that the scalar functions zip() and unzip() have been (or</i>
1548
<i>-- will be) added to the database handle.</i>
1549
CREATE VIRTUAL TABLE papers USING fts4(author, document, compress=zip, uncompress=unzip);
1553
When implementing the compress and uncompress functions it is important to
1554
pay attention to data types. Specifically, when a user reads a value from
1555
a compressed FTS table, the value returned by FTS is exactly the same
1556
as the value returned by the uncompress function, including the data type.
1557
If that data type is not the same as the data type of the original value as
1558
passed to the compress function (for example if the uncompress function is
1559
returning BLOB when compress was originally passed TEXT), then the users
1560
query may not function as expected.
1562
<a name="*fts4content"></a>
1564
<h2 id="section_6_2">6.2. The content= option </h2>
1567
The content option allows FTS4 to forego storing the text being indexed.
1568
The content option can be used in two ways:
1571
<li><p> The indexed documents are not stored within the SQLite database
1572
at all (a "contentless" FTS4 table), or
1574
<li><p> The indexed documents are stored in a database table created and
1575
managed by the user (an "external content" FTS4 table).
1579
Because the indexed documents themselves are usually much larger than
1580
the full-text index, the content option can be used to achieve
1581
significant space savings.
1583
<h3 id="section_6_2_1">6.2.1. Contentless FTS4 Tables </h3>
1586
In order to create an FTS4 table that does not store a copy of the indexed
1587
documents at all, the content option should be set to an empty string.
1588
For example, the following SQL creates such an FTS4 table with three
1589
columns - "a", "b", and "c":
1591
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE VIRTUAL TABLE t1 USING fts4(content="", a, b, c);
1595
Data can be inserted into such an FTS4 table using an INSERT statements.
1596
However, unlike ordinary FTS4 tables, the user must supply an explicit
1597
integer docid value. For example:
1599
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- This statement is Ok:</i>
1600
INSERT INTO t1(docid, a, b, c) VALUES(1, 'a b c', 'd e f', 'g h i');
1602
<i>-- This statement causes an error, as no docid value has been provided:</i>
1603
INSERT INTO t1(a, b, c) VALUES('j k l', 'm n o', 'p q r');
1607
It is not possible to UPDATE or DELETE a row stored in a contentless FTS4
1608
table. Attempting to do so is an error.
1611
Contentless FTS4 tables also support SELECT statements. However, it is
1612
an error to attempt to retrieve the value of any table column other than
1613
the docid column. The auxiliary function matchinfo() may be used, but
1614
snippet() and offsets() may not. For example:
1616
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- The following statements are Ok:</i>
1617
SELECT docid FROM t1 WHERE t1 MATCH 'xxx';
1618
SELECT docid FROM t1 WHERE a MATCH 'xxx';
1619
SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'xxx';
1621
<i>-- The following statements all cause errors, as the value of columns</i>
1622
<i>-- other than docid are required to evaluate them.</i>
1624
SELECT a, b FROM t1 WHERE t1 MATCH 'xxx';
1625
SELECT docid FROM t1 WHERE a LIKE 'xxx%';
1626
SELECT snippet(t1) FROM t1 WHERE t1 MATCH 'xxx';
1630
Errors related to attempting to retrieve column values other than docid
1631
are runtime errors that occur within sqlite3_step(). In some cases, for
1632
example if the MATCH expression in a SELECT query matches zero rows, there
1633
may be no error at all even if a statement does refer to column values
1636
<h3 id="section_6_2_2">6.2.2. External Content FTS4 Tables </h3>
1639
An "external content" FTS4 table is similar to a contentless table, except
1640
that if evaluation of a query requires the value of a column other than
1641
docid, FTS4 attempts to retrieve that value from a table (or view, or
1642
virtual table) nominated by the user (hereafter referred to as the "content
1643
table"). The FTS4 module never writes to the content table, and writing
1644
to the content table does not affect the full-text index. It is the
1645
responsibility of the user to ensure that the content table and the
1646
full-text index are consistent.
1649
An external content FTS4 table is created by setting the content option
1650
to the name of a table (or view, or virtual table) that may be queried by
1651
FTS4 to retrieve column values when required. If the nominated table does
1652
not exist, then an external content table behaves in the same way as
1653
a contentless table. For example:
1655
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE TABLE t2(id INTEGER PRIMARY KEY, a, b, c);
1656
CREATE VIRTUAL TABLE t3 USING fts4(content="t2", a, c);
1660
Assuming the nominated table does exist, then its columns must be the same
1661
as or a superset of those defined for the FTS table.
1664
When a users query on the FTS table requires a column value other than
1665
docid, FTS attempts to read this value from the corresponding column of
1666
the row in the content table with a rowid value equal to the current FTS
1667
docid. Or, if such a row cannot be found in the content table, a NULL
1668
value is used instead. For example:
1670
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE TABLE t2(id INTEGER PRIMARY KEY, a, b, c, d);
1671
CREATE VIRTUAL TABLE t3 USING fts4(content="t2", b, c);
1673
INSERT INTO t2 VALUES(2, 'a b', 'c d', 'e f');
1674
INSERT INTO t2 VALUES(3, 'g h', 'i j', 'k l');
1675
INSERT INTO t3(docid, b, c) SELECT id, b, c FROM t2;
1676
<i>-- The following query returns a single row with two columns containing
1677
-- the text values "i j" and "k l".
1679
-- The query uses the full-text index to discover that the MATCH
1680
-- term matches the row with docid=3. It then retrieves the values
1681
-- of columns b and c from the row with rowid=3 in the content table
1684
SELECT * FROM t3 WHERE t3 MATCH 'k';
1686
<i>-- Following the UPDATE, the query still returns a single row, this
1687
-- time containing the text values "xxx" and "yyy". This is because the
1688
-- full-text index still indicates that the row with docid=3 matches
1689
-- the FTS4 query 'k', even though the documents stored in the content
1690
-- table have been modified.
1692
UPDATE t2 SET b = 'xxx', c = 'yyy' WHERE rowid = 3;
1693
SELECT * FROM t3 WHERE t3 MATCH 'k';
1695
<i>-- Following the DELETE below, the query returns one row containing two
1696
-- NULL values. NULL values are returned because FTS is unable to find
1697
-- a row with rowid=3 within the content table.
1700
SELECT * FROM t3 WHERE t3 MATCH 'k';
1704
When a row is deleted from an external content FTS4 table, FTS4 needs to
1705
retrieve the column values of the row being deleted from the content table.
1706
This is so that FTS4 can update the full-text index entries for each token
1707
that occurs within the deleted row to indicate that that row has been
1708
deleted. If the content table row cannot be found, or if it contains values
1709
inconsistent with the contents of the FTS index, the results can be difficult
1710
to predict. The FTS index may be left containing entries corresponding to the
1711
deleted row, which can lead to seemingly nonsensical results being returned
1712
by subsequent SELECT queries. The same applies when a row is updated, as
1713
internally an UPDATE is the same as a DELETE followed by an INSERT.
1716
Instead of writing separately to the full-text index and the content table,
1717
some users may wish to use database triggers to keep the full-text index
1718
up to date with respect to the set of documents stored in the content table.
1719
For example, using the tables from earlier examples:
1721
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE TRIGGER t2_bu BEFORE UPDATE ON t2 BEGIN
1722
DELETE FROM t3 WHERE docid=old.rowid;
1724
CREATE TRIGGER t2_bd BEFORE DELETE ON t2 BEGIN
1725
DELETE FROM t3 WHERE docid=old.rowid;
1728
CREATE TRIGGER t2_bu AFTER UPDATE ON t2 BEGIN
1729
INSERT INTO t3(docid, b, c) VALUES(new.rowid, new.b, new.c);
1731
CREATE TRIGGER t2_bd AFTER INSERT ON t2 BEGIN
1732
INSERT INTO t3(docid, b, c) VALUES(new.rowid, new.b, new.c);
1737
The DELETE trigger must be fired before the actual delete takes place
1738
on the content table. This is so that FTS4 can still retrieve the original
1739
values in order to update the full-text index. And the INSERT trigger must
1740
be fired after the new row is inserted, so as to handle the case where the
1741
rowid is assigned automatically within the system. The UPDATE trigger must
1742
be split into two parts, one fired before and one after the update of the
1743
content table, for the same reasons.
1746
The <a href="fts3.html#*fts4rebuidcmd">FTS4 "rebuild" command</a>
1747
deletes the entire full-text index and rebuilds it based on the current
1748
set of documents in the content table. Assuming again that "t3" is the
1749
name of the external content FTS4 table, the rebuild command looks like this:
1751
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">INSERT INTO t3(t3) VALUES('rebuild');
1755
This command may also be used with ordinary FTS4 tables, for example if
1756
the implementation of the tokenizer changes. It is an
1757
error to attempt to rebuild the full-text index maintained by a contentless
1758
FTS4 table, since no content will be available to do the rebuilding.
1761
<a name="*fts4languageid"></a>
1763
<h2 id="section_6_3">6.3. The languageid= option</h2>
1766
When the languageid option is present, it specifies the name of
1767
another <a href="vtab.html#hiddencol">hidden column</a> that is added to the FTS4
1768
table and which is used to specify the language stored in each row
1769
of the FTS4 table. The name of the languageid hidden column must
1770
be distinct from all other column names in the FTS4 table. Example:
1772
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE VIRTUAL TABLE t1 USING fts4(x, y, languageid="lid")
1776
The default value of a languageid column is 0. Any value inserted
1777
into a languageid column is converted to a 32-bit (not 64) signed
1781
By default, FTS queries (those that use the MATCH operator)
1782
consider only those rows with the languageid column set to 0. To
1783
query for rows with other languageid values, a constraint of the
1784
form "<language-id> = <integer>" must be added to the queries
1785
WHERE clause. For example:
1787
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT * FROM t1 WHERE t1 MATCH 'abc' AND lid=5;
1791
It is not possible for a single FTS query to return rows with
1792
different languageid values. The results of adding WHERE clauses
1793
that use other operators (e.g. lid!=5, or lid<=5) are undefined.
1796
If the content option is used along with the languageid option,
1797
then the named languageid column must exist in the content= table
1798
(subject to the usual rules - if a query never needs to read the
1799
content table then this restriction does not apply).
1802
When the languageid option is used, SQLite invokes the xLanguageid()
1803
on the sqlite3_tokenizer_module object immediately after the object
1804
is created in order to pass in the language id that particular
1805
tokenizer should use. The xLanguageid() method will never be called
1806
more than once for any single tokenizer object.
1810
<a name="fts4matchinfo"></a>
1812
<h2 id="section_6_4">6.4. The matchinfo= option</h2>
1815
The matchinfo option may only be set to the value "fts3".
1816
Attempting to set matchinfo to anything other than "fts3" is an error.
1817
If this option is specified, then some of the extra information stored by
1818
FTS4 is omitted. This reduces the amount of disk space consumed by
1819
an FTS4 table until it is almost the same as the amount that would
1820
be used by the equivalent FTS3 table, but also means that the data
1821
accessed by passing the 'l' flag to the <a href="fts3.html#matchinfo">matchinfo()</a> function is
1824
<a name="fts4prefix"></a>
1826
<h2 id="section_6_5">6.5. The prefix= option</h2>
1829
The FTS4 prefix option causes FTS to index term prefixes of specified lengths
1830
in the same way that it always indexes complete terms. The prefix option
1831
must be set to a comma separated list of positive non-zero integers.
1832
For each value N in the list, prefixes of length N bytes (when encoded
1833
using UTF-8) are indexed. FTS4 uses term prefix indexes to speed up
1834
<a href="fts3.html#termprefix">prefix queries</a>. The cost, of course, is that indexing term prefixes as
1835
well as complete terms increases the database size and slows down write
1836
operations on the FTS4 table.
1839
Prefix indexes may be used to optimize <a href="fts3.html#termprefix">prefix queries</a> in two cases.
1840
If the query is for a prefix of N bytes, then a prefix index created
1841
with "prefix=N" provides the best optimization. Or, if no "prefix=N"
1842
index is available, a "prefix=N+1" index may be used instead.
1843
Using a "prefix=N+1" index is less
1844
efficient than a "prefix=N" index, but is better than no prefix index at all.
1846
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create an FTS4 table with indexes to optimize 2 and 4 byte prefix queries.</i>
1847
CREATE VIRTUAL TABLE t1 USING fts4(c1, c2, prefix="2,4");
1849
<i>-- The following two queries are both optimized using the prefix indexes.</i>
1850
SELECT * FROM t1 WHERE t1 MATCH 'ab*';
1851
SELECT * FROM t1 WHERE t1 MATCH 'abcd*';
1853
<i>-- The following two queries are both partially optimized using the prefix</i>
1854
<i>-- indexes. The optimization is not as pronounced as it is for the queries</i>
1855
<i>-- above, but still an improvement over no prefix indexes at all.</i>
1856
SELECT * FROM t1 WHERE t1 MATCH 'a*';
1857
SELECT * FROM t1 WHERE t1 MATCH 'abc*';
1860
<a name="*cmds"></a>
1862
<h1 id="commands">7. Special Commands For FTS3 and FTS4</h1>
1865
Special INSERT operates can be used to issue commands to FTS3 and FTS4 tables.
1866
Every FTS3 and FTS4 has a hidden, read-only column which is the same name as
1867
the table itself. INSERTs into this hidden column are interpreted as commands
1868
to the FTS3/4 table. For a table with the name "xyz" the following commands
1872
<li><p>INSERT INTO xyz(xyz) VALUES('optimize');</p>
1873
<li><p>INSERT INTO xyz(xyz) VALUES('rebuild');</p>
1874
<li><p>INSERT INTO xyz(xyz) VALUES('integrity-check');</p>
1875
<li><p>INSERT INTO xyz(xyz) VALUES('merge=X,Y');</p>
1876
<li><p>INSERT INTO xyz(xyz) VALUES('automerge=B');</p>
1879
<a name="*fts4optcmd"></a>
1881
<h2 id="optimize">7.1. The "optimize" command</h2>
1884
The "optimize" command causes FTS3/4 to merge together all of its
1885
inverted index b-trees into one large and complete b-tree. Doing
1886
an optimize will make subsequent queries run faster since there are
1887
fewer b-trees to search, and it may reduce disk usage by coalescing
1888
redundant entries. However, for a large FTS table, running optimize
1889
can be as expensive as running <a href="lang_vacuum.html">VACUUM</a>. The optimize command
1890
essentially has to read and write the entire FTS table, resulting
1891
in a large transaction.
1894
In batch-mode operation, where an FTS table is initially built up
1895
using a large number of INSERT operations, then queried repeatedly
1896
without further changes, it is often a good idea
1897
to run "optimize" after the last INSERT and before the first query.
1899
<a name="*fts4rebuidcmd"></a>
1901
<h2 id="rebuild">7.2. The "rebuild" command</h2>
1904
The "rebuild" command causes SQLite to discard the entire FTS3/4
1905
table and then rebuild it again from original text. The concept
1906
is similar to <a href="lang_reindex.html">REINDEX</a>, only that it applies to an
1907
FTS3/4 table instead of an ordinary index.
1910
The "rebuild" command should be run whenever the implementation
1911
of a custom tokenizer changes, so that all content can be retokenized.
1912
The "rebuild" command is also useful when using the
1913
<a href="fts3.html#*fts4content">FTS4 content option</a> after changes have been made to the original
1916
<a name="*fts4ickcmd"></a>
1918
<h2 id="integcheck">7.3. The "integrity-check" command</h2>
1921
The "integrity-check" command causes SQLite to read and verify
1922
the accuracy of all inverted indices in an FTS3/4 table by comparing
1923
those inverted indices against the original content. The
1924
"integrity-check" command silently succeeds if the inverted
1925
indices are all ok, but will fail with an SQLITE_CORRUPT error
1926
if any problems are found.
1929
The "integrity-check" command is similar in concept to
1930
<a href="pragma.html#pragma_integrity_check">PRAGMA integrity_check</a>. In a working system, the "integrity-command"
1931
should aways be successful. Possible causes of integrity-check
1934
<li> The application has made changes to the <a href="fts3.html#*shadowtab">FTS shadow tables</a>
1935
directly, without using the FTS3/4 virtual table, causing
1936
the shadow tables to become out of sync with each other.
1937
<li> Using the <a href="fts3.html#*fts4content">FTS4 content option</a> and failing to manually keep
1938
the content in sync with the FTS4 inverted indices.
1939
<li> Bugs in the FTS3/4 virtual table. (The "integrity-check"
1940
command was original conceived as part of the test suite
1942
<li> Corruption to the underlying SQLite database file. (See
1943
documentation on <a href="howtocorrupt.html">how to corrupt</a> and SQLite database for
1944
additional information.)
1947
<a name="*fts4mergecmd"></a>
1949
<h2 id="mergecmd">7.4. The "merge=X,Y" command</h2>
1952
The "merge=X,Y" command (where X and Y are integers) causes SQLite
1953
to do a limited amount of work toward merging the various inverted
1954
index b-trees of an FTS3/4 table together into one large b-tree.
1955
The X value is the target number of "blocks" to be merged, and Y is
1956
the minimum number of b-tree segments on a level required before
1957
merging will be applied to that level. The value of Y should
1958
be between 2 and 16 with a recommended value of 8. The value of X
1959
can be any positive integer but values on the order of 100 to 300
1963
When an FTS table accumulates 16 b-tree segments at the same level,
1964
the next INSERT into that table will cause all 16 segments to be
1965
merged into a single b-tree segment at the next higher level. The
1966
effect of these level merges is that most INSERTs into an FTS table
1967
are very fast and take minimal memory, but an occasional INSERT is
1968
slow and generates a large transaction because of the need to
1969
do merging. This results in "spiky" performance of INSERTs.
1972
To avoid spiky INSERT performance, an application can run the
1973
"merge=X,Y" command periodically, possibly in an idle thread or
1974
idle process, to ensure that the FTS table never accumulates
1975
too many b-tree segments at the same level. INSERT performance
1976
spikes can generally be avoided, and performance of FTS3/4 can be
1977
maximized, by running "merge=X,Y" after every few thousand
1978
document inserts. Each "merge=X,Y" command will run in a separate
1979
transaction (unless they are grouped together using <a href="lang_transaction.html">BEGIN</a>...<a href="lang_transaction.html">COMMIT</a>,
1980
of course). The transactions can be kept small by choosing a value
1981
for X in the range of 100 to 300. The idle thread that is running
1982
the merge commands can know when it is done by checking the difference
1983
in <a href="c3ref/total_changes.html">sqlite3_total_changes()</a> before and after each "merge=X,Y"
1984
command and stopping the loop when the difference drops below two.
1986
<a name="*fts4automergecmd"></a>
1988
<h2 id="automerge"">7.5. The "automerge=B" command</h2>
1991
The "automerge=B" command (where B is either "1" or "0") disables
1992
or enables automatic incremental inverted index merging for an
1993
FTS3/4 table. The default for new tables is for automatic incremental
1994
merging to be disabled. The "automerge=B" command changes this
1995
setting. The change is persistent and continues to be in effect
1996
for all subsequent database connections to the same database.
1999
Enabling automatic incremental merge causes SQLite to do a small
2000
amount of inverted index merging after every INSERT operation.
2001
The amount of merging performed is designed so that the FTS3/4
2002
table never reaches a point where it has 16 segments at the same
2003
level and hence has to do a large merge in order to complete an
2004
insert. In other words, automatic incremental merging is designed
2005
to prevent spiky INSERT performance.
2008
The downside of automatic incremental merging is that it makes
2009
every INSERT, UPDATE, and DELETE operation on an FTS3/4 table run
2010
a little slower, since extra time must be used to do the incremental
2011
merge. For maximum performance, it is recommended that applications
2012
disable automatic incremental merge and instead use the
2013
<a href="fts3.html#*fts4mergecmd">"merge" command</a> in an idle process to keep the inverted indices
2014
well merged. But if the structure of an application does not easily
2015
allow for idle processes, the use of automatic incremental merge is
2016
a very reasonable fallback solution.
2019
<h1 id="tokenizer">8. Tokenizers</h1>
2022
An FTS tokenizer is a set of rules for extracting terms from a document
2023
or basic FTS full-text query.
2026
Unless a specific tokenizer is specified as part of the CREATE
2027
VIRTUAL TABLE statement used to create the FTS table, the default
2028
tokenizer, "simple", is used. The simple tokenizer extracts tokens from
2029
a document or basic FTS full-text query according to the following
2033
<li><p> A term is a contiguous sequence of eligible characters, where
2034
eligible characters are all alphanumeric characters and all characters with
2035
Unicode codepoint values greater than or equal to 128.
2036
All other characters are
2037
discarded when splitting a document into terms. Their only contribution is
2038
to separate adjacent terms.
2040
<li><p> All uppercase characters within the ASCII range (Unicode codepoints
2041
less than 128), are transformed to their lowercase equivalents as part
2042
of the tokenization process. Thus, full-text queries are
2043
case-insensitive when using the simple tokenizer.
2047
For example, when a document containing the text "Right now, they're very
2048
frustrated.", the terms extracted from the document and added to the
2049
full-text index are, in order, "right now they re very frustrated". Such
2050
a document would match a full-text query such as "MATCH 'Frustrated'",
2051
as the simple tokenizer transforms the term in the query to lowercase
2052
before searching the full-text index.
2055
As well as the "simple" tokenizer, the FTS source code features a tokenizer
2056
that uses the <a href="http://tartarus.org/~martin/PorterStemmer/">Porter
2057
Stemming algorithm</a>. This tokenizer uses the same rules to separate
2058
the input document into terms, but as well as folding all terms to lower
2059
case it uses the Porter Stemming algorithm to reduce related English language
2060
words to a common root. For example, using the same input document as in the
2061
paragraph above, the porter tokenizer extracts the following tokens:
2062
"right now thei veri frustrat". Even though some of these terms are not even
2063
English words, in some cases using them to build the full-text index is more
2064
useful than the more intelligible output produced by the simple tokenizer.
2065
Using the porter tokenizer, the document not only matches full-text queries
2066
such as "MATCH 'Frustrated'", but also queries such as "MATCH 'Frustration'",
2067
as the term "Frustration" is reduced by the Porter stemmer algorithm to
2068
"frustrat" - just as "Frustrated" is. So, when using the porter tokenizer,
2069
FTS is able to find not just exact matches for queried terms, but matches
2070
against similar English language terms. For more information on the
2071
Porter Stemmer algorithm, please refer to the page linked above.
2074
Example illustrating the difference between the "simple" and "porter"
2077
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Create a table using the simple tokenizer. Insert a document into it.</i>
2078
CREATE VIRTUAL TABLE simple USING fts3(tokenize=simple);
2079
INSERT INTO simple VALUES('Right now they''re very frustrated');
2081
<i>-- The first of the following two queries matches the document stored in</i>
2082
<i>-- table "simple". The second does not.</i>
2083
SELECT * FROM simple WHERE simple MATCH 'Frustrated';
2084
SELECT * FROM simple WHERE simple MATCH 'Frustration';
2086
<i>-- Create a table using the porter tokenizer. Insert the same document into it</i>
2087
CREATE VIRTUAL TABLE porter USING fts3(tokenize=porter);
2088
INSERT INTO porter VALUES('Right now they''re very frustrated');
2090
<i>-- Both of the following queries match the document stored in table "porter".</i>
2091
SELECT * FROM porter WHERE porter MATCH 'Frustrated';
2092
SELECT * FROM porter WHERE porter MATCH 'Frustration';
2096
If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
2097
symbol defined, then there exists a built-in tokenizer named "icu"
2098
implemented using the ICU library. The first argument passed to the
2099
xCreate() method (see fts3_tokenizer.h) of this tokenizer may be
2100
an ICU locale identifier. For example "tr_TR" for Turkish as used
2101
in Turkey, or "en_AU" for English as used in Australia. For example:
2103
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
2107
The ICU tokenizer implementation is very simple. It splits the input
2108
text according to the ICU rules for finding word boundaries and discards
2109
any tokens that consist entirely of white-space. This may be suitable
2110
for some applications in some locales, but not all. If more complex
2111
processing is required, for example to implement stemming or
2112
discard punctuation, this can be done by creating a tokenizer
2113
implementation that uses the ICU tokenizer as part of its implementation.
2115
<a name="unicode61"></a>
2118
The "unicode61" tokenizer is available beginning with SQLite <a href="releaselog/3_7_13.html">version 3.7.13</a>.
2119
Unicode61 works very much like "simple" except that it does full unicode
2120
case folding according to rules in Unicode Version 6.1 and it recognizes
2121
unicode space and punctuation characters and uses those to separate tokens.
2122
The simple tokenizer only does case folding of ASCII characters and only
2123
recognizes ASCII space and punctuation characters as token separators.
2125
<h2 id="section_8_1">8.1. Custom (User Implemented) Tokenizers</h2>
2128
As well as the built-in "simple", "porter" and (possibly) "icu" tokenizers,
2129
FTS exports an interface that allows users to implement custom tokenizers
2130
using C. The interface used to create a new tokenizer is defined and
2131
described in the fts3_tokenizer.h source file.
2134
Registering a new FTS tokenizer is similar to registering a new
2135
virtual table module with SQLite. The user passes a pointer to a
2136
structure containing pointers to various callback functions that
2137
make up the implementation of the new tokenizer type. For tokenizers,
2138
the structure (defined in fts3_tokenizer.h) is called
2139
"sqlite3_tokenizer_module".
2142
FTS does not expose a C-function that users call to register new
2143
tokenizer types with a database handle. Instead, the pointer must
2144
be encoded as an SQL blob value and passed to FTS through the SQL
2145
engine by evaluating a special scalar function, "fts3_tokenizer()".
2146
The fts3_tokenizer() function may be called with one or two arguments,
2149
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT fts3_tokenizer(<tokenizer-name>);
2150
SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
2154
Where <tokenizer-name> is a string identifying the tokenizer and
2155
<sqlite3_tokenizer_module ptr="1"> is a pointer to an sqlite3_tokenizer_module
2156
structure encoded as an SQL blob. If the second argument is present,
2157
it is registered as tokenizer <tokenizer-name> and a copy of it
2158
returned. If only one argument is passed, a pointer to the tokenizer
2159
implementation currently registered as <tokenizer-name> is returned,
2160
encoded as a blob. Or, if no such tokenizer exists, an SQL exception
2164
<b>SECURITY WARNING</b>: If the fts3/4 extension is used in an environment
2165
where potentially malicious users may execute arbitrary SQL, they should
2166
be prevented from invoking the fts3_tokenizer() function, possibly using
2167
the <a href="c3ref/set_authorizer.html">authorization callback</a>.
2170
The following block contains an example of calling the fts3_tokenizer()
2171
function from C code:
2173
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>/*
2174
** Register a tokenizer implementation with FTS3 or FTS4.
2176
int registerTokenizer(
2179
const sqlite3_tokenizer_module *p
2182
sqlite3_stmt *pStmt;
2183
const char *zSql = "SELECT fts3_tokenizer(?, ?)";
2185
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
2186
if( rc!=SQLITE_OK ){
2190
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
2191
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
2192
sqlite3_step(pStmt);
2194
return sqlite3_finalize(pStmt);
2198
** Query FTS for the tokenizer implementation named zName.
2203
const sqlite3_tokenizer_module **pp
2206
sqlite3_stmt *pStmt;
2207
const char *zSql = "SELECT fts3_tokenizer(?)";
2210
rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
2211
if( rc!=SQLITE_OK ){
2215
sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
2216
if( SQLITE_ROW==sqlite3_step(pStmt) ){
2217
if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
2218
memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
2222
return sqlite3_finalize(pStmt);
2227
<h1 id="section_9">9. Data Structures</h1>
2230
This section describes at a high-level the way the FTS module stores its
2231
index and content in the database. It is <b>not necessary to read or
2232
understand the material in this section in order to use FTS</b> in an
2233
application. However, it may be useful to application developers attempting
2234
to analyze and understand FTS performance characteristics, or to developers
2235
contemplating enhancements to the existing FTS feature set.
2237
<a name="*shadowtab"></a>
2239
<h2 id="section_9_1">9.1. Shadow Tables</h2>
2241
For each FTS virtual table in a database, three to five real (non-virtual) tables
2242
are created to store the underlying data. These real tables are called "shadow tables".
2243
The real tables are named "%_content",
2244
"%_segdir", "%_segments", "%_stat", and "%_docsize", where "%" is replaced by the name
2245
of the FTS virtual table.
2248
The leftmost column of the "%_content" table is an INTEGER PRIMARY KEY field
2249
named "docid". Following this is one column for each column of the FTS
2250
virtual table as declared by the user, named by prepending the column name
2251
supplied by the user with "c<i>N</i>", where <i>N</i> is the index of the
2252
column within the table, numbered from left to right starting with 0. Data
2253
types supplied as part of the virtual table declaration are not used as
2254
part of the %_content table declaration. For example:
2256
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- Virtual table declaration</i>
2257
CREATE VIRTUAL TABLE abc USING fts4(a NUMBER, b TEXT, c);
2259
<i>-- Corresponding %_content table declaration</i>
2260
CREATE TABLE abc_content(docid INTEGER PRIMARY KEY, c0a, c1b, c2c);
2264
The %_content table contains the unadulterated data inserted by the user
2265
into the FTS virtual table by the user. If the user does not explicitly
2266
supply a "docid" value when inserting records, one is selected automatically
2270
The %_stat and %_docsize tables are only created if the FTS table uses the
2271
FTS4 module, not FTS3. Furthermore, the %_docsize table is omitted if the
2272
FTS4 table is created with the <a href="fts3.html#fts4matchinfo">"matchinfo=fts3"</a> directive
2273
specified as part of the CREATE VIRTUAL TABLE statement. If they are created,
2274
the schema of the two tables is as follows:
2275
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE TABLE %_stat(
2276
id INTEGER PRIMARY KEY,
2280
CREATE TABLE %_docsize(
2281
docid INTEGER PRIMARY KEY,
2287
For each row in the FTS table, the %_docsize table contains a corresponding
2288
row with the same "docid" value. The "size" field contains a blob consisting
2289
of <i>N</i> FTS varints, where <i>N</i> is the number of user-defined columns
2290
in the table. Each varint in the "size" blob is the number of tokens in the
2291
corresponding column of the associated row in the FTS table. The %_stat table
2292
always contains a single row with the "id" column set to 0. The "value"
2293
column contains a blob consisting of <i>N+1</i> FTS varints, where <i>N</i>
2294
is again the number of user-defined columns in the FTS table. The first
2295
varint in the blob is set to the total number of rows in the FTS table. The
2296
second and subsequent varints contain the total number of tokens stored in
2297
the corresponding column for all rows of the FTS table.
2300
The two remaining tables, %_segments and %_segdir, are used to store the
2301
full-text index. Conceptually, this index is a lookup table that maps each
2302
term (word) to the set of docid values corresponding to records in the
2303
%_content table that contain one or more occurrences of the term. To
2304
retrieve all documents that contain a specified term, the FTS module
2305
queries this index to determine the set of docid values for records that
2306
contain the term, then retrieves the required documents from the %_content
2307
table. Regardless of the schema of the FTS virtual table, the %_segments
2308
and %_segdir tables are always created as follows:
2310
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">CREATE TABLE %_segments(
2311
blockid INTEGER PRIMARY KEY, <i>-- B-tree node id</i>
2312
block blob <i>-- B-tree node data</i>
2315
CREATE TABLE %_segdir(
2318
start_block INTEGER, <i>-- Blockid of first node in %_segments</i>
2319
leaves_end_block INTEGER, <i>-- Blockid of last leaf node in %_segments</i>
2320
end_block INTEGER, <i>-- Blockid of last node in %_segments</i>
2321
root BLOB, <i>-- B-tree root node</i>
2322
PRIMARY KEY(level, idx)
2327
The schema depicted above is not designed to store the full-text index
2328
directly. Instead, it is used to one or more b-tree structures. There
2329
is one b-tree for each row in the %_segdir table. The %_segdir table
2330
row contains the root node and various meta-data associated with the
2331
b-tree structure, and the %_segments table contains all other (non-root)
2332
b-tree nodes. Each b-tree is referred to as a "segment". Once it has
2333
been created, a segment b-tree is never updated (although it may be
2334
deleted altogether).
2337
The keys used by each segment b-tree are terms (words). As well as the
2338
key, each segment b-tree entry has an associated "doclist" (document list).
2339
A doclist consists of zero or more entries, where each entry consists of:
2342
<li> A docid (document id), and
2343
<li> A list of term offsets, one for each occurrence of the term within
2344
the document. A term offset indicates the number of tokens (words)
2345
that occur before the term in question, not the number of characters
2346
or bytes. For example, the term offset of the term "war" in the
2347
phrase "Ancestral voices prophesying war!" is 3.
2351
Entries within a doclist are sorted by docid. Positions within a doclist
2352
entry are stored in ascending order.
2355
The contents of the logical full-text index is found by merging the
2356
contents of all segment b-trees. If a term is present in more than one
2357
segment b-tree, then it maps to the union of each individual doclist. If,
2358
for a single term, the same docid occurs in more than one doclist, then only
2359
the doclist that is part of the most recently created segment b-tree is
2363
Multiple b-tree structures are used instead of a single b-tree to reduce
2364
the cost of inserting records into FTS tables. When a new record is
2365
inserted into an FTS table that already contains a lot of data, it is
2366
likely that many of the terms in the new record are already present in
2367
a large number of existing records. If a single b-tree were used, then
2368
large doclist structures would have to be loaded from the database,
2369
amended to include the new docid and term-offset list, then written back
2370
to the database. Using multiple b-tree tables allows this to be avoided
2371
by creating a new b-tree which can be merged with the existing b-tree
2372
(or b-trees) later on. Merging of b-tree structures can be performed as
2373
a background task, or once a certain number of separate b-tree structures
2374
have been accumulated. Of course, this scheme makes queries more expensive
2375
(as the FTS code may have to look up individual terms in more than one
2376
b-tree and merge the results), but it has been found that in practice this
2377
overhead is often negligible.
2379
<h2 id="section_9_2">9.2. Variable Length Integer (varint) Format</h2>
2382
Integer values stored as part of segment b-tree nodes are encoded using the
2383
FTS varint format. This encoding is similar, but <b>not identical</b>, to
2384
the <a href="fileformat.html#varint_format">SQLite varint format</a>.
2387
An encoded FTS varint consumes between one and ten bytes of space. The
2388
number of bytes required is determined by the sign and magnitude of the
2389
integer value encoded. More accurately, the number of bytes used to store
2390
the encoded integer depends on the position of the most significant set bit
2391
in the 64-bit twos-complement representation of the integer value. Negative
2392
values always have the most significant bit set (the sign bit), and so are
2393
always stored using the full ten bytes. Positive integer values may be
2394
stored using less space.
2397
The final byte of an encoded FTS varint has its most significant bit
2398
cleared. All preceding bytes have the most significant bit set. Data
2399
is stored in the remaining seven least significant bits of each byte.
2400
The first byte of the encoded representation contains the least significant
2401
seven bits of the encoded integer value. The second byte of the encoded
2402
representation, if it is present, contains the seven next least significant
2403
bits of the integer value, and so on. The following table contains examples
2404
of encoded integer values:
2406
<table style="margin:1em auto; width:80%; border-spacing:0">
2407
<tr style="text-align:left"><th>Decimal<th>Hexadecimal<th width="100%">Encoded Representation
2408
<tr style="text-align:left;background-color:#DDDDDD"><td>43<td>0x000000000000002B<td>0x2B
2409
<tr style="text-align:left"><td>200815<td>0x000000000003106F<td>0x9C 0xA0 0x0C
2410
<tr style="text-align:left;background-color:#DDDDDD"><td>-1<td>0xFFFFFFFFFFFFFFFF<td>0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0x01
2414
<h2 id="section_9_3">9.3. Segment B-Tree Format</h2>
2417
Segment b-trees are prefix-compressed b+-trees. There is one segment b-tree
2418
for each row in the %_segdir table (see above). The root node of the segment
2419
b-tree is stored as a blob in the "root" field of the corresponding row
2420
of the %_segdir table. All other nodes (if any exist) are stored in the
2421
"blob" column of the %_segments table. Nodes within the %_segments table are
2422
identified by the integer value in the blockid field of the corresponding
2423
row. The following table describes the fields of the %_segdir table:
2425
<table style="margin:1em auto; width:80%; border-spacing:0">
2426
<tr style="text-align:left"><th>Column <th width="100%">Interpretation
2427
<tr style="text-align:left;background-color:#DDDDDD"><td>level <td>
2428
Between them, the contents of the "level" and "idx" fields define the
2429
relative age of the segment b-tree. The smaller the value stored in the
2430
"level" field, the more recently the segment b-tree was created. If two
2431
segment b-trees are of the same "level", the segment with the larger
2432
value stored in the "idx" column is more recent. The PRIMARY KEY constraint
2433
on the %_segdir table prevents any two segments from having the same value
2434
for both the "level" and "idx" fields.
2435
<tr style="text-align:left"><td>idx <td> See above.
2436
<tr style="text-align:left;background-color:#DDDDDD"><td>start_block <td>
2437
The blockid that corresponds to the node with the smallest blockid that
2438
belongs to this segment b-tree. Or zero if the entire segment b-tree
2439
fits on the root node. If it exists, this node is always a leaf node.
2440
<tr style="text-align:left"><td>leaves_end_block <td>
2441
The blockid that corresponds to the leaf node with the largest blockid
2442
that belongs to this segment b-tree. Or zero if the entire segment b-tree
2443
fits on the root node.
2444
<tr style="text-align:left;background-color:#DDDDDD"><td>end_block <td>
2445
The blockid that corresponds to the interior node with the largest
2446
blockid that belongs to this segment b-tree. Or zero if the entire segment
2447
b-tree fits on the root node. If it exists, this node is always an
2449
<tr style="text-align:left"><td>root <td>
2450
Blob containing the root node of the segment b-tree.
2454
Apart from the root node, the nodes that make up a single segment b-tree are
2455
always stored using a contiguous sequence of blockids. Furthermore, the
2456
nodes that make up a single level of the b-tree are themselves stored as
2457
a contiguous block, in b-tree order. The contiguous sequence of blockids
2458
used to store the b-tree leaves are allocated starting with the blockid
2459
value stored in the "start_block" column of the corresponding %_segdir row,
2460
and finishing at the blockid value stored in the "leaves_end_block"
2461
field of the same row. It is therefore possible to iterate through all the
2462
leaves of a segment b-tree, in key order, by traversing the %_segments
2463
table in blockid order from "start_block" to "leaves_end_block".
2465
<h3 id="section_9_3_1">9.3.1. Segment B-Tree Leaf Nodes</h3>
2468
The following diagram depicts the format of a segment b-tree leaf node.
2471
<img src="images/fts3_leaf_node.png">
2472
<p> Segment B-Tree Leaf Node Format
2476
The first term stored on each node ("Term 1" in the figure above) is
2477
stored verbatim. Each subsequent term is prefix-compressed with respect
2478
to its predecessor. Terms are stored within a page in sorted (memcmp)
2481
<h3 id="section_9_3_2">9.3.2. Segment B-Tree Interior Nodes</h3>
2484
The following diagram depicts the format of a segment b-tree interior
2488
<img src="images/fts3_interior_node.png">
2489
<p> Segment B-Tree Interior Node Format
2493
<h2 id="section_9_4">9.4. Doclist Format</h2>
2496
A doclist consists of an array of 64-bit signed integers, serialized using
2497
the FTS varint format. Each doclist entry is made up of a series of two
2498
or more integers, as follows:
2501
<li> The docid value. The first entry in a doclist contains the literal docid
2502
value. The first field of each subsequent doclist entry contains the
2503
difference between the new docid and the previous one (always a positive
2505
<li> Zero or more term-offset lists. A term-offset list is present for each
2506
column of the FTS virtual table that contains the term. A term-offset
2507
list consists of the following:
2509
<li> Constant value 1. This field is omitted for any term-offset list
2510
associated with column 0.
2511
<li> The column number (1 for the second leftmost column, etc.). This
2512
field is omitted for any term-offset list associated with column 0.
2513
<li> A list of term-offsets, sorted from smallest to largest. Instead
2514
of storing the term-offset value literally, each integer stored
2515
is the difference between the current term-offset and the previous
2516
one (or zero if the current term-offset is the first), plus 2.
2518
<li> Constant value 0.
2522
<img src="images/fts3_doclist2.png">
2523
<p> FTS3 Doclist Format
2527
<img src="images/fts3_doclist.png">
2528
<p> FTS Doclist Entry Format
2532
For doclists for which the term appears in more than one column of the FTS
2533
virtual table, term-offset lists within the doclist are stored in column
2534
number order. This ensures that the term-offset list associated with
2535
column 0 (if any) is always first, allowing the first two fields of the
2536
term-offset list to be omitted in this case.
2538
<h1 id="appendix_a">
2539
Appendix A: Search Application Tips
2543
FTS is primarily designed to support Boolean full-text queries - queries
2544
to find the set of documents that match a specified criteria. However, many
2545
(most?) search applications require that results are somehow ranked in order
2546
of "relevance", where "relevance" is defined as the likelihood that the user
2547
who performed the search is interested in a specific element of the returned
2548
set of documents. When using a search engine to find documents on the world
2549
wide web, the user expects that the most useful, or "relevant", documents
2550
will be returned as the first page of results, and that each subsequent page
2551
contains progressively less relevant results. Exactly how a machine can
2552
determine document relevance based on a users query is a complicated problem
2553
and the subject of much ongoing research.
2556
One very simple scheme might be to count the number of instances of the
2557
users search terms in each result document. Those documents that contain
2558
many instances of the terms are considered more relevant than those with
2559
a small number of instances of each term. In an FTS application, the
2560
number of term instances in each result could be determined by counting
2561
the number of integers in the return value of the <a href="fts3.html#offsets">offsets</a> function.
2562
The following example shows a query that could be used to obtain the
2563
ten most relevant results for a query entered by the user:
2565
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- This example (and all others in this section) assumes the following schema</i>
2566
CREATE VIRTUAL TABLE documents USING fts3(title, content);
2568
<i>-- Assuming the application has supplied an SQLite user function named "countintegers"</i>
2569
<i>-- that returns the number of space-separated integers contained in its only argument,</i>
2570
<i>-- the following query could be used to return the titles of the 10 documents that contain</i>
2571
<i>-- the greatest number of instances of the users query terms. Hopefully, these 10</i>
2572
<i>-- documents will be those that the users considers more or less the most "relevant".</i>
2573
SELECT title FROM documents
2574
WHERE documents MATCH <query>
2575
ORDER BY countintegers(offsets(documents)) DESC
2580
The query above could be made to run faster by using the FTS <a href="fts3.html#matchinfo">matchinfo</a>
2581
function to determine the number of query term instances that appear in each
2582
result. The matchinfo function is much more efficient than the offsets
2583
function. Furthermore, the matchinfo function provides extra information
2584
regarding the overall number of occurrences of each query term in the entire
2585
document set (not just the current row) and the number of documents in which
2586
each query term appears. This may be used (for example) to attach a higher
2587
weight to less common terms which may increase the overall computed relevancy
2588
of those results the user considers more interesting.
2590
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- If the application supplies an SQLite user function called "rank" that</i>
2591
<i>-- interprets the blob of data returned by matchinfo and returns a numeric</i>
2592
<i>-- relevancy based on it, then the following SQL may be used to return the</i>
2593
<i>-- titles of the 10 most relevant documents in the dataset for a users query.</i>
2594
SELECT title FROM documents
2595
WHERE documents MATCH <query>
2596
ORDER BY rank(matchinfo(documents)) DESC
2601
The SQL query in the example above uses less CPU than the first example
2602
in this section, but still has a non-obvious performance problem. SQLite
2603
satisfies this query by retrieving the value of the "title" column and
2604
matchinfo data from the FTS module for every row matched by the users
2605
query before it sorts and limits the results. Because of the way SQLite's
2606
virtual table interface works, retrieving the value of the "title" column
2607
requires loading the entire row from disk (including the "content" field,
2608
which may be quite large). This means that if the users query matches
2609
several thousand documents, many megabytes of "title" and "content" data
2610
may be loaded from disk into memory even though they will never be used
2614
The SQL query in the following example block is one solution to this
2615
problem. In SQLite, when a <a href="optoverview.html#flattening">sub-query
2616
used in a join contains a LIMIT clause</a>, the results of the sub-query are
2617
calculated and stored in temporary table before the main query is executed.
2618
This means that SQLite will load only the docid and matchinfo data for each
2619
row matching the users query into memory, determine the docid values
2620
corresponding to the ten most relevant documents, then load only the title
2621
and content information for those 10 documents only. Because both the matchinfo
2622
and docid values are gleaned entirely from the full-text index, this results
2623
in dramatically less data being loaded from the database into memory.
2625
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em">SELECT title FROM documents JOIN (
2626
SELECT docid, rank(matchinfo(documents)) AS rank
2628
WHERE documents MATCH <query>
2631
) AS ranktable USING(docid)
2632
ORDER BY ranktable.rank DESC
2636
The next block of SQL enhances the query with solutions to two other problems
2637
that may arise in developing search applications using FTS:
2641
The <a href="fts3.html#snippet">snippet</a> function cannot be used with the above query. Because
2642
the outer query does not include a "WHERE ... MATCH" clause, the snippet
2643
function may not be used with it. One solution is to duplicate the WHERE
2644
clause used by the sub-query in the outer query. The overhead associated
2645
with this is usually negligible.
2647
The relevancy of a document may depend on something other than just
2648
the data available in the return value of matchinfo. For example
2649
each document in the database may be assigned a static weight based
2650
on factors unrelated to its content (origin, author, age, number
2651
of references etc.). These values can be stored by the application
2652
in a separate table that can be joined against the documents table
2653
in the sub-query so that the rank function may access them.
2657
This version of the query is very similar to that used by the
2658
<a href="http://www.sqlite.org/search?q=fts3">sqlite.org documentation search</a>
2661
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>-- This table stores the static weight assigned to each document in FTS table</i>
2662
<i>-- "documents". For each row in the documents table there is a corresponding row</i>
2663
<i>-- with the same docid value in this table.</i>
2664
CREATE TABLE documents_data(docid INTEGER PRIMARY KEY, weight);
2666
<i>-- This query is similar to the one in the block above, except that:</i>
2668
<i>-- 1. It returns a "snippet" of text along with the document title for display. So</i>
2669
<i>-- that the snippet function may be used, the "WHERE ... MATCH ..." clause from</i>
2670
<i>-- the sub-query is duplicated in the outer query.</i>
2672
<i>-- 2. The sub-query joins the documents table with the document_data table, so that</i>
2673
<i>-- implementation of the rank function has access to the static weight assigned</i>
2674
<i>-- to each document.</i>
2675
SELECT title, snippet(documents) FROM documents JOIN (
2676
SELECT docid, rank(matchinfo(documents), documents_data.weight) AS rank
2677
FROM documents JOIN documents_data USING(docid)
2678
WHERE documents MATCH <query>
2681
) AS ranktable USING(docid)
2682
WHERE documents MATCH <query>
2683
ORDER BY ranktable.rank DESC
2687
All the example queries above return the ten most relevant query results.
2688
By modifying the values used with the OFFSET and LIMIT clauses, a query
2689
to return (say) the next ten most relevant results is easy to construct.
2690
This may be used to obtain the data required for a search applications second
2691
and subsequent pages of results.
2694
The next block contains an example rank function that uses matchinfo data
2695
implemented in C. Instead of a single weight, it allows a weight to be
2696
externally assigned to each column of each document. It may be registered
2697
with SQLite like any other user function using <a href="c3ref/create_function.html">sqlite3_create_function</a>.
2699
<div class=codeblock style="margin:0 15ex"><table width=100% style="border:1px solid #80a796;padding:0 1ex;background-color:#EEEEEE"><tr><td><pre style="font-size:1.1em"><i>/*</i>
2700
<i>** SQLite user defined function to use with matchinfo() to calculate the</i>
2701
<i>** relevancy of an FTS match. The value returned is the relevancy score</i>
2702
<i>** (a real value greater than or equal to zero). A larger value indicates </i>
2703
<i>** a more relevant document.</i>
2705
<i>** The overall relevancy returned is the sum of the relevancies of each </i>
2706
<i>** column value in the FTS table. The relevancy of a column value is the</i>
2707
<i>** sum of the following for each reportable phrase in the FTS query:</i>
2709
<i>** (<hit count> / <global hit count>) * <column weight></i>
2711
<i>** where <hit count> is the number of instances of the phrase in the</i>
2712
<i>** column value of the current row and <global hit count> is the number</i>
2713
<i>** of instances of the phrase in the same column of all rows in the FTS</i>
2714
<i>** table. The <column weight> is a weighting factor assigned to each</i>
2715
<i>** column by the caller (see below).</i>
2717
<i>** The first argument to this function must be the return value of the FTS </i>
2718
<i>** matchinfo() function. Following this must be one argument for each column </i>
2719
<i>** of the FTS table containing a numeric weight factor for the corresponding </i>
2720
<i>** column. Example:</i>
2722
<i>** CREATE VIRTUAL TABLE documents USING fts3(title, content)</i>
2724
<i>** The following query returns the docids of documents that match the full-text</i>
2725
<i>** query <query> sorted from most to least relevant. When calculating</i>
2726
<i>** relevance, query term instances in the 'title' column are given twice the</i>
2727
<i>** weighting of those in the 'content' column.</i>
2729
<i>** SELECT docid FROM documents </i>
2730
<i>** WHERE documents MATCH <query> </i>
2731
<i>** ORDER BY rank(matchinfo(documents), 1.0, 0.5) DESC</i>
2733
static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
2734
int *aMatchinfo; <i>/* Return value of matchinfo() */</i>
2735
int nCol; <i>/* Number of columns in the table */</i>
2736
int nPhrase; <i>/* Number of phrases in the query */</i>
2737
int iPhrase; <i>/* Current phrase */</i>
2738
double score = 0.0; <i>/* Value to return */</i>
2740
assert( sizeof(int)==4 );
2742
<i> /* Check that the number of arguments passed to this function is correct.</i>
2743
<i> ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array</i>
2744
<i> ** of unsigned integer values returned by FTS function matchinfo. Set</i>
2745
<i> ** nPhrase to contain the number of reportable phrases in the users full-text</i>
2746
<i> ** query, and nCol to the number of columns in the table.</i>
2748
if( nVal<1 ) goto wrong_number_args;
2749
aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]);
2750
nPhrase = aMatchinfo[0];
2751
nCol = aMatchinfo[1];
2752
if( nVal!=(1+nCol) ) goto wrong_number_args;
2754
<i> /* Iterate through each phrase in the users query. */</i>
2755
for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
2756
int iCol; <i>/* Current column */</i>
2758
<i> /* Now iterate through each column in the users query. For each column,</i>
2759
<i> ** increment the relevancy score by:</i>
2761
<i> ** (<hit count> / <global hit count>) * <column weight></i>
2763
<i> ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So</i>
2764
<i> ** the hit count and global hit counts for each column are found in </i>
2765
<i> ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively.</i>
2767
int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3];
2768
for(iCol=0; iCol<nCol; iCol++){
2769
int nHitCount = aPhraseinfo[3*iCol];
2770
int nGlobalHitCount = aPhraseinfo[3*iCol+1];
2771
double weight = sqlite3_value_double(apVal[iCol+1]);
2773
score += ((double)nHitCount / (double)nGlobalHitCount) * weight;
2778
sqlite3_result_double(pCtx, score);
2781
<i> /* Jump here if the wrong number of arguments are passed to this function */</i>
2783
sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1);