/*************************************************************************************************
 * Common modules related to estwaver
 * Copyright (C) 2004-2006 Mikio Hirabayashi
 * This file is part of Hyper Estraier.
 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
 * the GNU Lesser General Public License as published by the Free Software Foundation; either
 * version 2.1 of the License or any later version.  Hyper Estraier is distributed in the hope
 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
 * License for more details.
 * You should have received a copy of the GNU Lesser General Public License along with Hyper
 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307 USA.
 *************************************************************************************************/
17
#ifndef _WAVERMOD_H /* duplication check */
/*************************************************************************************************
 *************************************************************************************************/
/* Buffer sizes and limits. */
#define NUMBUFSIZ      32                /* size of a buffer for a number */
#define URIBUFSIZ      8192              /* size of a buffer for a URI */
#define IOBUFSIZ       8192              /* size of a buffer for I/O */
#define MINIBNUM       31                /* bucket number of a small map */
#define RESLIMSIZE     (1048576*32)      /* limitation of the entity body of response */

/* Names of files and directories, and the magic record of the meta database. */
#define CONFFILE       "_conf"           /* name of the configuration file */
#define METAFILE       "_meta"           /* name of the meta database */
#define LOGFILE        "_log"            /* name of the log file */
#define INDEXDIR       "_index"          /* name of the index directory */
#define MYTMPDIR       "_tmp"            /* name of the temporary directory */
#define MMKMAGIC       "magic"           /* meta key of the magic number of meta DB */
#define MMKMAGVAL      "[ESTWAVER]"      /* value of the magic number of meta DB */

/* Parameters of the queue file. */
#define QUEUEFILE      "_queue"          /* name of the queue file */
#define QUEUELRM       77                /* records in a leaf node of the queue */
#define QUEUENIM       192               /* records in a non-leaf node of the queue */
#define QUEUELCN       2048              /* number of leaf cache of the queue */
#define QUEUENCN       512               /* number of non-leaf cache of the queue */

/* Parameters of the trace database. */
#define TRACEFILE      "_trace"          /* name of the trace file */
#define TRACEBNUM      425977            /* bucket number of the trace database */
#define TRACEDNUM      3                 /* division number of the trace database */

/* Attribute name and built-in filter commands. */
#define DATTRDEPTH     "_depth"          /* name of the attribute of the depth */
#define DRAFTCMD       "[DRAFT]"         /* built-in command for document draft */
#define TEXTCMD        "[TEXT]"          /* built-in command for plain text */
#define HTMLCMD        "[HTML]"          /* built-in command for HTML */
#define MIMECMD        "[MIME]"          /* built-in command for MIME */
/* Options accepted by `waver_init'.  Each constant occupies its own bit. */
enum {                                   /* enumeration for initializing options */
  WI_SMALL = 1 << 0,                     /* small scale */
  WI_LARGE = 1 << 1,                     /* large scale */
  WI_HUGE = 1 << 2                       /* huge scale */
};
/* Logging levels, in ascending order of value.
   NOTE(review): presumably these are the values taken by the `level' parameter of `log_open'
   and `log_print' -- confirm against the implementation. */
enum {                                   /* enumeration for logging levels */
  LL_DEBUG = 1,                          /* debug */
  LL_INFO = 2,                           /* information */
  LL_WARN = 3,                           /* warning */
  LL_ERROR = 4,                          /* error */
  LL_NONE = 5,                           /* none */
  LL_CHECK = 6                           /* check to open */
};
/* Crawling strategies.  The values are implicit, counting up from 0. */
enum {                                   /* enumeration for crawling strategy */
  CS_BALANCED,                           /* balance of depth, width, and similarity */
  CS_SIMILARITY,                         /* similarity first */
  CS_DEPTH,                              /* depth first */
  CS_WIDTH,                              /* width first */
  CS_RANDOM                              /* at random */
};
86
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here.  Presumably this
   is the `QUEUE' type taken by the queue_* functions -- confirm. */
typedef struct { /* type of structure for a priority queue */
87
VILLA *db; /* internal database */
88
double max; /* maximum point */
91
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here, so the type
   name cannot be confirmed. */
typedef struct { /* type of structure for a keyword */
92
const char *word; /* face of the keyword */
93
int wsiz; /* size of the keyword */
94
int pt; /* score tuned by TF-IDF */
97
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here, so the type
   name cannot be confirmed. */
typedef struct { /* type of structure for a URL normalization rule */
98
void *regex; /* regular expressions */
99
char *before; /* before substring */
100
char *after; /* after substring */
103
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here, so the type
   name cannot be confirmed. */
typedef struct { /* type of structure for a permission rule */
104
void *regex; /* regular expressions */
105
int visit; /* to be visited */
106
int index; /* to be indexed */
109
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here, so the type
   name cannot be confirmed. */
typedef struct { /* type of structure for a URL rule */
110
void *regex; /* regular expressions */
111
char *type; /* media type */
114
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here, so the type
   name cannot be confirmed. */
typedef struct { /* type of structure for a media type rule */
115
void *regex; /* regular expressions */
116
char *filter; /* filter command */
119
/* NOTE(review): the closing `} NAME;' line of this typedef is not visible here.  Presumably this
   is the `WAVER' type returned by `waver_open' -- confirm. */
typedef struct { /* type of structure for a waver handle */
120
char *rootdir; /* root directory */
121
DEPOT *metadb; /* meta database */
122
QUEUE *queue; /* priority queue */
123
CURIA *trace; /* tracking records */
124
ESTMTDB *index; /* document index */
125
CBMAP *seeds; /* seed map */
126
CBMAP *kwords; /* keyword map */
127
CBMAP *sites; /* site map */
128
char *pxhost; /* host name of the proxy */
129
int pxport; /* port number of the proxy */
130
int interval; /* interval time */
131
int timeout; /* timeout of each request */
132
int strategy; /* crawling strategy (presumably one of the `CS_*' constants -- confirm) */
133
double inherit; /* inheritance ratio of similarity */
134
int seeddepth; /* maximum depth of seed documents */
135
int maxdepth; /* maximum depth of recursion */
136
int masscheck; /* standard value for checking mass sites */
137
int queuesize; /* size of the priority queue */
138
CBLIST *unrules; /* rules of URL normalization */
139
CBLIST *pmrules; /* rules of permission */
140
CBLIST *urlrules; /* rules of URL */
141
CBLIST *mtrules; /* rules of media types */
142
int language; /* preferred language */
143
int textlimit; /* text size limitation */
144
int seedkeynum; /* total number of keywords for seed documents */
145
int savekeynum; /* number of keywords saved for each document */
146
int thnum; /* number of threads */
147
int docnum; /* total number of documents */
148
int period; /* running time period */
149
int revisit; /* revisit span */
150
size_t cachesize; /* maximum size of the index cache */
151
CBMAP *nodes; /* remote nodes for alternative indexes */
152
char *draftdir; /* path of the draft directory */
153
char *entitydir; /* path of the entity directory */
154
char *postproc; /* postprocessor for retrieved files */
155
time_t stime; /* start time */
156
int curnum; /* current number of documents */
157
int curnode; /* node currently in use */
158
double minload; /* minimum load of nodes */
162
/* The handles of the log file. */
/* The level of logging.  This is a declaration only; the variable is defined elsewhere. */
extern int log_level;
/* Open the log file.
   `rootdir' specifies the path of the root directory.
   `path' specifies the path of the log file.
   `level' specifies the level of logging.
   `trunc' specifies whether to truncate the log file.
   The return value is true if success, else it is false. */
int log_open(const char *rootdir, const char *path, int level, int trunc);
/* Print formatted string into the log file.
   `level' specifies the level of the message.
   `format' specifies the format string, followed by its arguments. */
void log_print(int level, const char *format, ...);
/* Initialize the root directory.
   `rootdir' specifies the path of the root directory.
   `options' specifies the options: `WI_SMALL', `WI_LARGE', or `WI_HUGE'.
   The return value is true if success, else it is false. */
int waver_init(const char *rootdir, int options);
190
/* Open a waver handle.
191
`rootdir' specifies the path of the root directory.
192
The return value is the waver handle or `NULL' on failure. */
193
WAVER *waver_open(const char *rootdir);
196
/* Close a waver handle.
197
`waver' specifies a waver handle.
198
The return value is true if success, else it is false. */
199
int waver_close(WAVER *waver);
202
/* Set the current node.
203
`waver' specifies a waver handle. */
204
void waver_set_current_node(WAVER *waver);
207
/* Get the load of the current node.
208
`waver' specifies a waver handle.
209
the return value is the load of the current node. */
210
double waver_current_node_load(WAVER *waver);
213
/* Add a document to a node.
214
`waver' specifies a waver handle.
215
`doc' specifies a document object.
216
`codep' specifies the pointer to a variable to which the status code of respnese is
217
assigned. If it is `NULL', it is not used.
218
The return value is true if success, else it is false. */
219
int waver_node_put_doc(WAVER *waver, ESTDOC *doc, int *codep);
222
/* Remove a document from a node.
223
`waver' specifies a waver handle.
224
`url' specifies the URL of a document.
225
`codep' specifies the pointer to a variable to which the status code of respnese is
226
assigned. If it is `NULL', it is not used.
227
The return value is true if success, else it is false. */
228
int waver_node_out_doc(WAVER *waver, const char *url, int *codep);
231
/* Open a priority queue.
232
`name' specifies the name of a database file.
233
The return value is the queue handle or `NULL' on failure. */
234
QUEUE *queue_open(const char *name);
237
/* Close a priority queue.
238
`queue' specifies a queue handle.
239
The return value is true if successful, or false on failure. */
240
int queue_close(QUEUE *queue);
243
/* Set the range of the priority space of a priority queue.
244
`queue' specifies a queue handle.
245
`range' specifies the range of the priority space. */
246
void queue_set_range(QUEUE *queue, double range);
249
/* Enqueue a record into a priority queue.
250
`queue' specifies a queue handle.
251
`str' specifies a string.
252
`priority' specifies the priority between 0.0 and 1.0.
253
The return value is true if successful, or false on failure. */
254
int queue_enqueue(QUEUE *queue, const char *str, double priority);
257
/* Dequeue a record from a priority queue.
258
`queue' specifies a queue handle.
259
The return value is the pointer to a record or `NULL' if no record exists.
260
Because the region of the return value is allocated with the `malloc' call, it should be
261
released with the `free' call if it is no longer in use. */
262
char *queue_dequeue(QUEUE *queue);
265
/* Get the number of records in a priority queue.
266
`queue' specifies a queue handle.
267
The return value is the number of records. */
268
int queue_rnum(QUEUE *queue);
271
/* Discard inferior records in a priority queue.
272
`queue' specifies a queue handle.
273
`num' specifies the number of records to be kept.
274
The return value is true if successful, or false on failure. */
275
int queue_slim(QUEUE *queue, int num);
278
/* Add a word to a keyword map.
279
`kwords' specifies a keyword map handle.
280
`word' specifies the string of a word.
281
`frequency' specifies the frequency of the word. */
282
void kwords_add(CBMAP *kwords, const char *word, int frequency);
285
/* Reduce elements of a keyword map.
286
`kwords' specifies a keyword map handle.
287
`num' specifies the number of elements after reduction of the keyword map.
288
`fadeout' specifies whether scores do fade-out. */
289
void kwords_reduce(CBMAP *kwords, int num, int fadeout);
292
/* Fetch a document of a URL.
293
`url' specifies the URL of a document.
294
`pxhost' specifies the host name of a proxy. If it is `NULL', it is not used.
295
`pxport' specifies the port number of the proxy.
296
`outsec' specifies timeout in seconds. If it is negative, it is not used.
297
`mdate' specifies the last-modified date. If it is not more than 0, it is not used.
298
`urlrules' specifies a list object conteining type rules of URLs. If it is `NULL', the
299
default rule is applied.
300
`mtrules' specifies a list object conteining filter rules of media types. If it is `NULL',
301
the default rule is applied.
302
`codep' specifies the pointer to a variable to which the status code of respnese is
303
assigned. If it is `NULL', it is not used.
304
`raw' specifies a datum handle to store raw data. If it is `NULL', it is not used.
305
`heads' specifies a map handle to store HTTP headers. If it is `NULL', it is not used.
306
`links' specifies a list handle to store links. If it is `NULL', it is not used.
307
`unrules' specifies a list object conteining URL normalization rules. If it is `NULL', it is
309
`doc' specifies a document handle to store attributes and texts. If it is `NULL', it is not
311
`lang' specifies the code of preferred language.
312
The return value is true if success, else it is false. */
313
int fetch_document(const char *url, const char *pxhost, int pxport, int outsec, time_t mdate,
314
const CBLIST *urlrules, const CBLIST *mtrules,
315
int *codep, CBDATUM *raw, CBMAP *heads,
316
CBLIST *links, const CBLIST *unrules, ESTDOC *doc, int lang);
320
#endif /* duplication check */