~ubuntu-branches/ubuntu/wily/dovecot/wily

« back to all changes in this revision

Viewing changes to src/lib-fts/fts-tokenizer.c

  • Committer: Package Import Robot
  • Author(s): Jelmer Vernooij
  • Date: 2015-05-24 15:01:19 UTC
  • mto: (4.1.53 sid)
  • mto: This revision was merged to the branch mainline in revision 102.
  • Revision ID: package-import@ubuntu.com-20150524150119-hsh6cbr1fqseapga
Tags: upstream-2.2.18
Import upstream version 2.2.18

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
 
2
 
 
3
#include "lib.h"
 
4
#include "array.h"
 
5
#include "istream.h"
 
6
#include "str.h"
 
7
#include "strfuncs.h"
 
8
#include "fts-tokenizer.h"
 
9
#include "fts-tokenizer-private.h"
 
10
 
 
11
/* List of registered tokenizer classes. It is created by the first
   fts_tokenizer_register() call and freed either by fts_tokenizers_deinit()
   or when fts_tokenizer_unregister() removes the last entry, so it may be
   in the "not created" state at any time. */
static ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
 
12
 
 
13
void fts_tokenizers_init(void)
 
14
{
 
15
        if (!array_is_created(&fts_tokenizer_classes)) {
 
16
                fts_tokenizer_register(fts_tokenizer_generic);
 
17
                fts_tokenizer_register(fts_tokenizer_email_address);
 
18
        }
 
19
}
 
20
 
 
21
void fts_tokenizers_deinit(void)
 
22
{
 
23
        if (array_is_created(&fts_tokenizer_classes))
 
24
                array_free(&fts_tokenizer_classes);
 
25
}
 
26
 
 
27
/* private */
 
28
void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
 
29
{
 
30
        if (!array_is_created(&fts_tokenizer_classes))
 
31
                i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
 
32
        array_append(&fts_tokenizer_classes, &tok_class, 1);
 
33
}
 
34
 
 
35
/* private */
 
36
void fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
 
37
{
 
38
        const struct fts_tokenizer *const *tp;
 
39
        unsigned int idx;
 
40
 
 
41
        array_foreach(&fts_tokenizer_classes, tp) {
 
42
                if (strcmp((*tp)->name, tok_class->name) == 0) {
 
43
                        idx = array_foreach_idx(&fts_tokenizer_classes, tp);
 
44
                        array_delete(&fts_tokenizer_classes, idx, 1);
 
45
                        if (array_count(&fts_tokenizer_classes) == 0)
 
46
                                array_free(&fts_tokenizer_classes);
 
47
                        return;
 
48
                }
 
49
        }
 
50
        i_unreached();
 
51
}
 
52
 
 
53
const struct fts_tokenizer *fts_tokenizer_find(const char *name)
 
54
{
 
55
        const struct fts_tokenizer *const *tp;
 
56
 
 
57
        array_foreach(&fts_tokenizer_classes, tp) {
 
58
                if (strcmp((*tp)->name, name) == 0)
 
59
                        return *tp;
 
60
        }
 
61
        return NULL;
 
62
}
 
63
 
 
64
const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
 
65
{
 
66
        return tok->name;
 
67
}
 
68
 
 
69
int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
 
70
                         struct fts_tokenizer *parent,
 
71
                         const char *const *settings,
 
72
                         struct fts_tokenizer **tokenizer_r,
 
73
                         const char **error_r)
 
74
{
 
75
        struct fts_tokenizer *tok;
 
76
        const char *empty_settings = NULL;
 
77
 
 
78
        i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
 
79
 
 
80
        if (settings == NULL)
 
81
                settings = &empty_settings;
 
82
 
 
83
        if (tok_class->v->create(settings, &tok, error_r) < 0) {
 
84
                *tokenizer_r = 0;
 
85
                return -1;
 
86
        }
 
87
        tok->refcount = 1;
 
88
        tok->prev_reply_finished = TRUE;
 
89
        if (parent != NULL) {
 
90
                fts_tokenizer_ref(parent);
 
91
                tok->parent = parent;
 
92
                tok->parent_input = buffer_create_dynamic(default_pool, 128);
 
93
        }
 
94
 
 
95
        *tokenizer_r = tok;
 
96
        return 0;
 
97
}
 
98
 
 
99
void fts_tokenizer_ref(struct fts_tokenizer *tok)
 
100
{
 
101
        i_assert(tok->refcount > 0);
 
102
 
 
103
        tok->refcount++;
 
104
}
 
105
 
 
106
void fts_tokenizer_unref(struct fts_tokenizer **_tok)
 
107
{
 
108
        struct fts_tokenizer *tok = *_tok;
 
109
 
 
110
        i_assert(tok->refcount > 0);
 
111
        *_tok = NULL;
 
112
 
 
113
        if (--tok->refcount > 0)
 
114
                return;
 
115
 
 
116
        if (tok->parent_input != NULL)
 
117
                buffer_free(&tok->parent_input);
 
118
        if (tok->parent != NULL)
 
119
                fts_tokenizer_unref(&tok->parent);
 
120
        tok->v->destroy(tok);
 
121
}
 
122
 
 
123
static int
 
124
fts_tokenizer_next_self(struct fts_tokenizer *tok,
 
125
                        const unsigned char *data, size_t size,
 
126
                        const char **token_r, const char **error_r)
 
127
{
 
128
        int ret = 0;
 
129
        size_t skip = 0;
 
130
 
 
131
        i_assert(tok->prev_reply_finished ||
 
132
                 (data == tok->prev_data && size == tok->prev_size));
 
133
 
 
134
        if (tok->prev_reply_finished) {
 
135
                /* whole new data */
 
136
                ret = tok->v->next(tok, data, size, &skip, token_r, error_r);
 
137
        } else {
 
138
                /* continuing previous data */
 
139
                i_assert(tok->prev_skip <= size);
 
140
                ret = tok->v->next(tok, data + tok->prev_skip,
 
141
                                   size - tok->prev_skip, &skip,
 
142
                                   token_r, error_r);
 
143
        }
 
144
 
 
145
        if (ret > 0) {
 
146
                i_assert(skip <= size - tok->prev_skip);
 
147
                tok->prev_data = data;
 
148
                tok->prev_size = size;
 
149
                tok->prev_skip = tok->prev_skip + skip;
 
150
                tok->prev_reply_finished = FALSE;
 
151
        } else if (ret == 0) {
 
152
                /* we need a new data block */
 
153
                tok->prev_data = NULL;
 
154
                tok->prev_size = 0;
 
155
                tok->prev_skip = 0;
 
156
                tok->prev_reply_finished = TRUE;
 
157
        }
 
158
        return ret;
 
159
}
 
160
 
 
161
void fts_tokenizer_reset(struct fts_tokenizer *tok)
 
162
{
 
163
        tok->v->reset(tok);
 
164
}
 
165
 
 
166
int fts_tokenizer_next(struct fts_tokenizer *tok,
 
167
                       const unsigned char *data, size_t size,
 
168
                       const char **token_r, const char **error_r)
 
169
{
 
170
        int ret;
 
171
 
 
172
        switch (tok->parent_state) {
 
173
        case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
 
174
                ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
 
175
                if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
 
176
                        break;
 
177
                buffer_set_used_size(tok->parent_input, 0);
 
178
                buffer_append(tok->parent_input, *token_r, strlen(*token_r));
 
179
                tok->parent_state++;
 
180
                /* fall through */
 
181
        case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
 
182
                ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
 
183
                                         tok->parent_input->used, token_r, error_r);
 
184
                if (ret != 0)
 
185
                        break;
 
186
                tok->parent_state++;
 
187
                /* fall through */
 
188
        case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
 
189
                ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
 
190
                if (ret != 0)
 
191
                        break;
 
192
                /* we're finished sending this token to parent tokenizer.
 
193
                   see if our own tokenizer has more tokens available */
 
194
                tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
 
195
                return fts_tokenizer_next(tok, data, size, token_r, error_r);
 
196
        default:
 
197
                i_unreached();
 
198
        }
 
199
        /* we must not be returning empty tokens */
 
200
        i_assert(ret <= 0 || (*token_r)[0] != '\0');
 
201
        return ret;
 
202
}
 
203
 
 
204
/* Fetch a token without supplying any new input: equivalent to calling
   fts_tokenizer_next() with a NULL data pointer and zero size. */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
			const char **error_r)
{
	int ret;

	ret = fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
	return ret;
}