~ubuntu-branches/ubuntu/edgy/lurker/edgy

« back to all changes in this revision

Viewing changes to common/Keys.cpp

Committer: Bazaar Package Importer
Author(s): Jonas Meurer
Date: 2004-09-26 16:27:51 UTC
Revision ID: james.westby@ubuntu.com-20040926162751-z1ohcjltv7ojtg6z

Tags: upstream-1.2

Import upstream version 1.2

files added:

AUTHORS

COPYING

ChangeLog

INSTALL

Makefile.am

Makefile.in

NEWS

README

acinclude.m4

aclocal.m4

common

common/CharsetEscape.cpp

common/CharsetEscape.h

common/ConfigFile.cpp

common/ConfigFile.h

common/Keys.cpp

common/Keys.h

common/Makefile.am

common/Makefile.in

common/MessageId.cpp

common/MessageId.h

common/Search.cpp

common/Search.h

common/Summary.cpp

common/Summary.h

common/Thread.cpp

common/XmlEscape.cpp

common/XmlEscape.h

common/md5.cpp

common/md5.h

config.h.in

configure

configure.ac

imgs

imgs/Makefile.am

imgs/Makefile.in

imgs/a.png

imgs/b.png

imgs/bar.png

imgs/c.png

imgs/d.png

imgs/e.png

imgs/f.png

imgs/g.png

imgs/h.png

imgs/i.png

imgs/j.png

imgs/k.png

imgs/next.png

imgs/paperclip.png

imgs/prev.png

imgs/root.png

index

index/Index.cpp

index/Index.h

index/Makefile.am

index/Makefile.in

index/getdate.cpp

index/getdate.h

index/list.cpp

index/lurker-drop-rlimit.cpp

index/lurker-index.1

index/lurker-index.sgml

index/lurker-list.1

index/lurker-list.sgml

index/lurker-params.1

index/lurker-params.sgml

index/lurker-regenerate

index/lurker-regenerate.1

index/lurker-regenerate.sgml

index/lurker-search.1

index/lurker-search.sgml

index/main.cpp

index/params.cpp

index/search.cpp

libesort

libesort/DbMan.cpp

libesort/DbMan.h

libesort/Failer.cpp

libesort/Failer.h

libesort/File.cpp

libesort/File.h

libesort/Makefile.am

libesort/Makefile.in

libesort/Master.cpp

libesort/Master.h

libesort/Memory.cpp

libesort/Memory.h

libesort/Merger.cpp

libesort/Merger.h

libesort/Parameters.cpp

libesort/Source.cpp

libesort/Source.h

libesort/Transaction.cpp

libesort/Transaction.h

libesort/View.cpp

libesort/View.h

libesort/dump.cpp

libesort/esort.h

libesort/io.h

lurker.conf.in

prune

prune/Makefile.am

prune/Makefile.in

prune/PTable.cpp

prune/PTable.h

prune/attach.cpp

prune/list.cpp

prune/lurker-prune.1

prune/lurker-prune.sgml

prune/mbox.cpp

prune/message.cpp

prune/mindex.cpp

prune/prune.cpp

prune/search.cpp

prune/splash.cpp

prune/thread.cpp

render

render/Cache.cpp

render/Cache.h

render/Makefile.am

render/Makefile.in

render/Threading.cpp

render/Threading.h

render/art.cpp

render/attach.cpp

render/bounce.cpp

render/commands.h

render/error.h

render/jump.cpp

render/keyword.cpp

render/list.cpp

render/mailto.cpp

render/main.cpp

render/mbox.cpp

render/message.cpp

render/mindex.cpp

render/parse.cpp

render/parse.h

render/quote.cpp

render/search.cpp

render/splash.cpp

render/thread.cpp

render/url.cpp

tools

tools/depcomp

tools/install-sh

tools/missing

tools/mkinstalldirs

ui/Makefile.am

ui/Makefile.in

ui/ca.xml

ui/common.js

ui/common.xsl

ui/da.xml

ui/de.xml

ui/default.css

ui/el.xml

ui/en.xml

ui/es.xml

ui/fi.xml

ui/hu.xml

ui/index.html

ui/it.xml

ui/ja.xml

ui/lang.xml

ui/lang.xsl

ui/list.xsl

ui/message.xsl

ui/mindex.xsl

ui/nl.xml

ui/pl.xml

ui/pt-BR.xml

ui/pt.xml

ui/search.xsl

ui/splash.xsl

ui/thread.xsl

Show diffs side-by-side

added added

removed removed

common/Keys.cpp

/* $Id: Keys.cpp,v 1.4 2003/06/23 14:38:41 terpstra Exp $

* Keys.cpp - Digest a hunk of string into keywords.

* License: GPL

* Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>

* This program is free software; you can redistribute it and/or modify

* it under the terms of the GNU General Public License as published by

* the Free Software Foundation; version 2.

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

* You should have received a copy of the GNU General Public License

* along with this program; if not, write to the Free Software

 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define _XOPEN_SOURCE 500

#define _FILE_OFFSET_BITS 64

/* #define DEBUG 1 */

#include "Keys.h"

#include <cstring>

/*------------------------------------------------ Private global vars */

/* These are characters which should be interpreted as both part of the word
 * and as a word separator. eg: 'maul.sith.vpn' should be indexed as 'maul',
 * 'sith', 'vpn', and 'maul.sith.vpn' because '.' is listed here.
 */

static const char my_keyword_word_splits[] = "$@./:\\-_~&=%?#+";

static char my_keyword_is_split[256];

/* These are characters which should be interpreted as word breaks.
 * No known language should use these as letters in a word.
 * All chars 000-037 fall in this category too.
 */

static const char my_keyword_word_divs[] = " !\"'()*,;<>[]^`{|}";

static char my_keyword_is_div[256];

/* These tables are the conversion for characters being written to keywords. */

static const char my_keyword_orig[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

static const char my_keyword_dest[] = "abcdefghijklmnopqrstuvwxyz";

static char my_keyword_conv[256];

/* We need to be initd */

static int my_keyword_initd = 1;

/*------------------------------------------------ Private helper methods */

/* Combine the prefix with the substring [buf, eos) and pass the resulting
 * keyword to writefn.
 *
 * The keyword is assembled in a fixed buffer holding LU_KEYWORD_LEN
 * characters plus a terminating NUL.  Every byte taken from [buf, eos) is
 * mapped through my_keyword_conv before it is stored.  Whatever does not
 * fit is truncated; this also applies to the prefix, which is copied with
 * a bound so an overlong prefix can no longer overrun the buffer.
 *
 * Returns 0 when there is nothing to index, otherwise the value returned
 * by writefn.
 */
static int my_keyword_index_hunk(
	const unsigned char* buf,
	const unsigned char* eos,
	const char* prefix,
	int (*writefn)(const char* keyword, void* arg),
	void* arg)
{
	char out[LU_KEYWORD_LEN+1];
	char* w;
	char* e;
	const char* p;

	if (buf == eos)
	{	/* Don't index nothing */
		return 0;
	}

	/* e marks the slot reserved for the terminating NUL */
	w = &out[0];
	e = &out[sizeof(out) - 1];

	/* Copy the prefix, stopping early if it would not fit */
	for (p = prefix; w != e && *p; p++)
	{
		*w++ = *p;
	}

	/* Copy the range into the buffer while converting it */
	while (w != e && buf != eos)
	{
		*w++ = my_keyword_conv[*buf++];
	}

	*w = 0;

	if (!out[0])
	{
		/* Ignore this keyword */
		return 0;
	}

	return writefn(&out[0], arg);
}

/* Look at a section of non-whitespace chars and decide what to do with it. */

108

static int my_keyword_digest_hunk(

109

const unsigned char* buf,

110

const unsigned char* eos,

111

const char* prefix,

112

int (*writefn)(const char* keyword, void* arg),

113

void* arg,

114

int do_div)

115

{

116

const unsigned char* start;

117

const unsigned char* scan;

118

119

/*!!! Make me work with non-romanian languages (eg. japanese) */

120

/* Japanese has no spaces to delineate words */

121

122

/* Don't index vapour.

123

124

if (buf == eos)

125

return 0;

126

127

/* Firstly, index the entire chunk, with leading and trailing chars.

128

129

130

/* Index the entire hunk. */

131

if (my_keyword_index_hunk(buf, eos, prefix, writefn, arg) != 0)

132

return -1;

133

134

if (!do_div) return 0;

135

136

/* Now, divide the chunk into bits which we will keyword index */

137

start = 0;

138

for (scan = buf; scan != eos; scan++)

139

{

140

if (my_keyword_is_split[*scan])

141

{

142

if (start)

143

{

144

if (my_keyword_index_hunk(start, scan,

145

prefix, writefn, arg) != 0)

146

return -1;

147

start = 0;

148

}

149

}

150

else

151

{

152

if (!start)

153

{

154

start = scan;

155

}

156

}

157

}

158

159

if (start)

160

{

161

if (my_keyword_index_hunk(start, eos, prefix, writefn, arg) != 0)

162

return -1;

163

}

164

165

return 0;

166

}

167

168

static void my_keyword_init(void)

169

{

170

unsigned int i;

171

172

/* Clear the lookup tables */

173

memset(&my_keyword_is_split[0], 0, sizeof(my_keyword_is_split));

174

memset(&my_keyword_is_div [0], 0, sizeof(my_keyword_is_div));

175

176

/* Bootstrap the lookup tables */

177

for (i = 0; i < sizeof(my_keyword_word_splits)-1; i++)

178

my_keyword_is_split[((int)my_keyword_word_splits[i])] = 1;

179

for (i = 0; i < sizeof(my_keyword_word_divs)-1; i++)

180

my_keyword_is_div[((int)my_keyword_word_divs[i])] = 1;

181

182

/* All control characters divide words */

183

for (i = 0; i < 040; i++)

184

my_keyword_is_div[i] = 1;

185

186

/* Initialize conversion table */

187

for (i = 0; i < 256; i++)

188

my_keyword_conv[i] = i;

189

190

/* Fill the conversion entries */

191

for (i = 0; i < sizeof(my_keyword_orig)-1; i++)

192

my_keyword_conv[((int)my_keyword_orig[i])] =

193

my_keyword_dest[i];

194

195

my_keyword_initd = 0;

196

}

197

198

/*------------------------------------------------- Public component methods */

199

200

/* Run through a buffer looking for segments of non-divide characters.

201

202

int my_keyword_digest_string(

203

const char* buf,

204

int len,

205

const char* prefix,

206

int (*writefn)(const char* keyword, void* arg),

207

void* arg,

208

int do_div)

209

{

210

const unsigned char* start;

211

const unsigned char* scan;

212

const unsigned char* eos = (const unsigned char*)buf + len;

213

214

if (my_keyword_initd)

215

my_keyword_init();

216

217

start = 0;

218

for (scan = (const unsigned char*)buf; scan != eos; scan++)

219

{

220

if (my_keyword_is_div[*scan])

221

{

222

if (start)

223

{

224

my_keyword_digest_hunk(start, scan,

225

prefix, writefn, arg, do_div);

226

start = 0;

227

}

228

}

229

else

230

{

231

if (!start)

232

{

233

start = scan;

234

}

235

}

236

}

237

238

if (start)

239

{

240

my_keyword_digest_hunk(start, eos, prefix, writefn, arg, do_div);

241

}

242

243

return 0;

244

}

Older »