~ubuntu-branches/debian/stretch/mudlet/stretch

« back to all changes in this revision

Viewing changes to src/hunspell/affentry.cxx

Committer: Bazaar Package Importer
Author(s): Craig Small
Date: 2011-05-14 20:12:49 UTC
mfrom: (1.1.2 upstream)
Revision ID: james.westby@ubuntu.com-20110514201249-184gqx5jjqam02lg

Tags: 2.0-rc5-1

New upstream release

files removed:
.pc/about_ui_version

.pc/about_ui_version/src

.pc/about_ui_version/src/ui

.pc/about_ui_version/src/ui/about_dialog.ui

.pc/hunspell_dict

.pc/hunspell_dict/src

.pc/hunspell_dict/src/TCommandLine.cpp

src/hunspell

src/hunspell/.deps

src/hunspell/.deps/affentry.Plo

src/hunspell/.deps/affixmgr.Plo

src/hunspell/.deps/csutil.Plo

src/hunspell/.deps/dictmgr.Plo

src/hunspell/.deps/filemgr.Plo

src/hunspell/.deps/hashmgr.Plo

src/hunspell/.deps/hunspell.Plo

src/hunspell/.deps/hunzip.Plo

src/hunspell/.deps/phonet.Plo

src/hunspell/.deps/replist.Plo

src/hunspell/.deps/suggestmgr.Plo

src/hunspell/.libs

src/hunspell/.libs/libhunspell-1.2.a

src/hunspell/.libs/libhunspell-1.2.la

src/hunspell/.libs/libhunspell-1.2.lai

src/hunspell/.libs/libhunspell-1.2.so

src/hunspell/.libs/libhunspell-1.2.so.0

src/hunspell/.libs/libhunspell-1.2.so.0.0.0

src/hunspell/Makefile.am

src/hunspell/Makefile.in

src/hunspell/README

src/hunspell/affentry.cxx

src/hunspell/affentry.hxx

src/hunspell/affentry.lo

src/hunspell/affixmgr.cxx

src/hunspell/affixmgr.hxx

src/hunspell/affixmgr.lo

src/hunspell/atypes.hxx

src/hunspell/baseaffix.hxx

src/hunspell/csutil.cxx

src/hunspell/csutil.hxx

src/hunspell/csutil.lo

src/hunspell/dictmgr.cxx

src/hunspell/dictmgr.hxx

src/hunspell/dictmgr.lo

src/hunspell/filemgr.cxx

src/hunspell/filemgr.hxx

src/hunspell/filemgr.lo

src/hunspell/hashmgr.cxx

src/hunspell/hashmgr.hxx

src/hunspell/hashmgr.lo

src/hunspell/htypes.hxx

src/hunspell/hunspell.cxx

src/hunspell/hunspell.dsp

src/hunspell/hunspell.h

src/hunspell/hunspell.hxx

src/hunspell/hunspell.lo

src/hunspell/hunvisapi.h

src/hunspell/hunvisapi.h.in

src/hunspell/hunzip.cxx

src/hunspell/hunzip.hxx

src/hunspell/hunzip.lo

src/hunspell/langnum.hxx

src/hunspell/libhunspell-1.2.la

src/hunspell/license.hunspell

src/hunspell/license.myspell

src/hunspell/makefile.mk

src/hunspell/phonet.cxx

src/hunspell/phonet.hxx

src/hunspell/phonet.lo

src/hunspell/replist.cxx

src/hunspell/replist.hxx

src/hunspell/replist.lo

src/hunspell/suggestmgr.cxx

src/hunspell/suggestmgr.hxx

src/hunspell/suggestmgr.lo

src/hunspell/utf_info.cxx

src/hunspell/w_char.hxx

files modified:
.pc/applied-patches

.pc/hunspell_syslibs/src/TCommandLine.cpp

.pc/lua_syslibs/src/TLuaInterpreter.cpp

.pc/luaglobal_path/src/TLuaInterpreter.cpp

.pc/luaglobal_path/src/mudlet-lua/lua/LuaGlobal.lua

.pc/project_file/src/src.pro

debian/changelog

debian/patches/luaglobal_path

debian/patches/project_file

debian/patches/series

src/Host.h

src/TCommandLine.cpp

src/TLuaInterpreter.cpp

src/dlgIRC.cpp

src/dlgProfilePreferences.cpp

src/main.cpp

src/mudlet-lua/lua/LuaGlobal.lua

src/mudlet-lua/lua/TableUtils.lua

src/mudlet-lua/lua/geyser/GeyserUtil.lua

src/mudlet.cpp

src/mudlet.h

src/src.pro

src/ui/about_dialog.ui

src/ui/profile_preferences.ui

Show diffs side-by-side

added added

removed removed

src/hunspell/affentry.cxx

#include "license.hunspell"

#include "license.myspell"

#include <stdlib.h>

#include <string.h>

#include <stdio.h>

#include <ctype.h>

#include "affentry.hxx"

#include "csutil.hxx"

// Builds a prefix entry from the parsed affix record `dp` and remembers
// `pmgr` as the affix manager it belongs to. Pointer fields (strip, appnd,
// the tail of a long condition, morphcode, contclass) are taken over as
// pointers, not duplicated; ~PfxEntry() is what releases them.
PfxEntry::PfxEntry(AffixMgr* pmgr, affentry* dp)
{
    // affix manager this entry is registered with
    pmyMgr = pmgr;

    // list links start out empty
    next = NULL;
    nextne = NULL;
    nexteq = NULL;

    // flag and option bits (opts is needed below to pick the condition layout)
    aflag = dp->aflag;
    opts = dp->opts;
    numconds = dp->numconds;    // length of the condition

    // strip / append strings and their lengths
    strip = dp->strip;
    stripl = dp->stripl;
    appnd = dp->appnd;
    appndl = dp->appndl;

    // condition pattern: a long condition keeps MAXCONDLEN_1 bytes inline and
    // the remainder behind c.l.conds2; a short one is MAXCONDLEN inline bytes
    if (opts & aeLONGCOND) {
        memcpy(c.conds, dp->c.l.conds1, MAXCONDLEN_1);
        c.l.conds2 = dp->c.l.conds2;
    } else {
        memcpy(c.conds, dp->c.conds, MAXCONDLEN);
    }

    // morphological description and continuation classes
    morphcode = dp->morphcode;
    contclass = dp->contclass;
    contclasslen = dp->contclasslen;
}

// Releases the strings owned by this prefix entry. morphcode and contclass
// are freed only when they are not marked as aliases (aeALIASM / aeALIASF).
PfxEntry::~PfxEntry()
{
    aflag = 0;
    pmyMgr = NULL;

    // free(NULL) is a no-op, so no null test is needed before freeing
    free(appnd);
    appnd = NULL;
    free(strip);
    strip = NULL;

    // second half of a long condition
    if (opts & aeLONGCOND) {
        free(c.l.conds2);
    }
    if (!(opts & aeALIASM) && morphcode) {
        free(morphcode);
    }
    if (!(opts & aeALIASF) && contclass) {
        free(contclass);
    }
}

// add prefix to this word assuming conditions hold

char * PfxEntry::add(const char * word, int len)

{

char tword[MAXWORDUTF8LEN + 4];

if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&

(len >= numconds) && test_condition(word) &&

(!stripl || (strncmp(word, strip, stripl) == 0)) &&

((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

/* we have a match so add prefix */

char * pp = tword;

if (appndl) {

strcpy(tword,appnd);

pp += appndl;

}

strcpy(pp, (word + stripl));

return mystrdup(tword);

}

return NULL;

}

// Steps to the next byte of the condition pattern.
// Returns NULL when `p` is NULL, when the end of a short (MAXCONDLEN)
// condition is reached, or when the next byte is the terminating NUL.
// For a long condition, stepping off the inline part continues at c.l.conds2.
inline char * PfxEntry::nextchar(char * p) {
    if (!p) return NULL;

    ++p;
    if (opts & aeLONGCOND) {
        // jump to the 2nd part of the condition
        if (p == c.conds + MAXCONDLEN_1) return c.l.conds2;
    } else if (p == c.conds + MAXCONDLEN) {
        // end of the MAXCONDLEN length condition
        return NULL;
    }
    return *p ? p : NULL;
}

inline int PfxEntry::test_condition(const char * st)

{

const char * pos = NULL; // group with pos input position

bool neg = false; // complementer

bool ingroup = false; // character in the group

if (numconds == 0) return 1;

char * p = c.conds;

while (1) {

switch (*p) {

case '\0': return 1;

case '[': {

neg = false;

ingroup = false;

100

p = nextchar(p);

101

pos = st; break;

102

}

103

case '^': { p = nextchar(p); neg = true; break; }

104

case ']': {

105

if ((neg && ingroup) || (!neg && !ingroup)) return 0;

106

pos = NULL;

107

p = nextchar(p);

108

// skip the next character

109

if (!ingroup) for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);

110

if (*st == '\0' && p) return 0; // word <= condition

111

break;

112

}

113

case '.': if (!pos) { // dots are not metacharacters in groups: [.]

114

p = nextchar(p);

115

// skip the next character

116

for (st++; (opts & aeUTF8) && (*st & 0xc0) == 0x80; st++);

117

if (*st == '\0' && p) return 0; // word <= condition

118

break;

119

}

120

default: {

121

if (*st == *p) {

122

st++;

123

p = nextchar(p);

124

if ((opts & aeUTF8) && (*(st - 1) & 0x80)) { // multibyte

125

while (p && (*p & 0xc0) == 0x80) { // character

126

if (*p != *st) {

127

if (!pos) return 0;

128

st = pos;

129

break;

130

}

131

p = nextchar(p);

132

st++;

133

}

134

if (pos && st != pos) {

135

ingroup = true;

136

while (p && *p != ']' && (p = nextchar(p)));

137

}

138

} else if (pos) {

139

ingroup = true;

140

while (p && *p != ']' && (p = nextchar(p)));

141

}

142

} else if (pos) { // group

143

p = nextchar(p);

144

} else return 0;

145

}

146

}

147

if (!p) return 1;

148

}

149

}

150

151

// check if this prefix entry matches

152

struct hentry * PfxEntry::checkword(const char * word, int len, char in_compound, const FLAG needflag)

153

{

154

int tmpl; // length of tmpword

155

struct hentry * he; // hash entry of root word or NULL

156

char tmpword[MAXWORDUTF8LEN + 4];

157

158

// on entry prefix is 0 length or already matches the beginning of the word.

159

// So if the remaining root word has positive length

160

// and if there are enough chars in root word and added back strip chars

161

// to meet the number of characters conditions, then test it

162

163

tmpl = len - appndl;

164

165

if (tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) {

166

167

// generate new root word by removing prefix and adding

168

// back any characters that would have been stripped

169

170

if (stripl) strcpy (tmpword, strip);

171

strcpy ((tmpword + stripl), (word + appndl));

172

173

// now make sure all of the conditions on characters

174

// are met. Please see the appendix at the end of

175

// this file for more info on exactly what is being

176

// tested

177

178

// if all conditions are met then check if resulting

179

// root word in the dictionary

180

181

if (test_condition(tmpword)) {

182

tmpl += stripl;

183

if ((he = pmyMgr->lookup(tmpword)) != NULL) {

184

do {

185

if (TESTAFF(he->astr, aflag, he->alen) &&

186

// forbid single prefixes with needaffix flag

187

! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&

188

// needflag

189

((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||

190

(contclass && TESTAFF(contclass, needflag, contclasslen))))

191

return he;

192

he = he->next_homonym; // check homonyms

193

} while (he);

194

}

195

196

// prefix matched but no root word was found

197

// if aeXPRODUCT is allowed, try again but now

198

// ross checked combined with a suffix

199

200

//if ((opts & aeXPRODUCT) && in_compound) {

201

if ((opts & aeXPRODUCT)) {

202

he = pmyMgr->suffix_check(tmpword, tmpl, aeXPRODUCT, this, NULL,

203

0, NULL, FLAG_NULL, needflag, in_compound);

204

if (he) return he;

205

}

206

}

207

}

208

return NULL;

209

}

210

211

// check if this prefix entry matches

212

struct hentry * PfxEntry::check_twosfx(const char * word, int len,

213

char in_compound, const FLAG needflag)

214

{

215

int tmpl; // length of tmpword

216

struct hentry * he; // hash entry of root word or NULL

217

char tmpword[MAXWORDUTF8LEN + 4];

218

219

// on entry prefix is 0 length or already matches the beginning of the word.

220

// So if the remaining root word has positive length

221

// and if there are enough chars in root word and added back strip chars

222

// to meet the number of characters conditions, then test it

223

224

tmpl = len - appndl;

225

226

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

227

(tmpl + stripl >= numconds)) {

228

229

// generate new root word by removing prefix and adding

230

// back any characters that would have been stripped

231

232

if (stripl) strcpy (tmpword, strip);

233

strcpy ((tmpword + stripl), (word + appndl));

234

235

// now make sure all of the conditions on characters

236

// are met. Please see the appendix at the end of

237

// this file for more info on exactly what is being

238

// tested

239

240

// if all conditions are met then check if resulting

241

// root word in the dictionary

242

243

if (test_condition(tmpword)) {

244

tmpl += stripl;

245

246

// prefix matched but no root word was found

247

// if aeXPRODUCT is allowed, try again but now

248

// cross checked combined with a suffix

249

250

if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

251

he = pmyMgr->suffix_check_twosfx(tmpword, tmpl, aeXPRODUCT, this, needflag);

252

if (he) return he;

253

}

254

}

255

}

256

return NULL;

257

}

258

259

// check if this prefix entry matches

260

char * PfxEntry::check_twosfx_morph(const char * word, int len,

261

char in_compound, const FLAG needflag)

262

{

263

int tmpl; // length of tmpword

264

char tmpword[MAXWORDUTF8LEN + 4];

265

266

// on entry prefix is 0 length or already matches the beginning of the word.

267

// So if the remaining root word has positive length

268

// and if there are enough chars in root word and added back strip chars

269

// to meet the number of characters conditions, then test it

270

271

tmpl = len - appndl;

272

273

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

274

(tmpl + stripl >= numconds)) {

275

276

// generate new root word by removing prefix and adding

277

// back any characters that would have been stripped

278

279

if (stripl) strcpy (tmpword, strip);

280

strcpy ((tmpword + stripl), (word + appndl));

281

282

// now make sure all of the conditions on characters

283

// are met. Please see the appendix at the end of

284

// this file for more info on exactly what is being

285

// tested

286

287

// if all conditions are met then check if resulting

288

// root word in the dictionary

289

290

if (test_condition(tmpword)) {

291

tmpl += stripl;

292

293

// prefix matched but no root word was found

294

// if aeXPRODUCT is allowed, try again but now

295

// ross checked combined with a suffix

296

297

if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

298

return pmyMgr->suffix_check_twosfx_morph(tmpword, tmpl,

299

aeXPRODUCT, this, needflag);

300

}

301

}

302

}

303

return NULL;

304

}

305

306

// check if this prefix entry matches

307

char * PfxEntry::check_morph(const char * word, int len, char in_compound, const FLAG needflag)

308

{

309

int tmpl; // length of tmpword

310

struct hentry * he; // hash entry of root word or NULL

311

char tmpword[MAXWORDUTF8LEN + 4];

312

char result[MAXLNLEN];

313

char * st;

314

315

*result = '\0';

316

317

// on entry prefix is 0 length or already matches the beginning of the word.

318

// So if the remaining root word has positive length

319

// and if there are enough chars in root word and added back strip chars

320

// to meet the number of characters conditions, then test it

321

322

tmpl = len - appndl;

323

324

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

325

(tmpl + stripl >= numconds)) {

326

327

// generate new root word by removing prefix and adding

328

// back any characters that would have been stripped

329

330

if (stripl) strcpy (tmpword, strip);

331

strcpy ((tmpword + stripl), (word + appndl));

332

333

// now make sure all of the conditions on characters

334

// are met. Please see the appendix at the end of

335

// this file for more info on exactly what is being

336

// tested

337

338

// if all conditions are met then check if resulting

339

// root word in the dictionary

340

341

if (test_condition(tmpword)) {

342

tmpl += stripl;

343

if ((he = pmyMgr->lookup(tmpword)) != NULL) {

344

do {

345

if (TESTAFF(he->astr, aflag, he->alen) &&

346

// forbid single prefixes with needaffix flag

347

! TESTAFF(contclass, pmyMgr->get_needaffix(), contclasslen) &&

348

// needflag

349

((!needflag) || TESTAFF(he->astr, needflag, he->alen) ||

350

(contclass && TESTAFF(contclass, needflag, contclasslen)))) {

351

if (morphcode) {

352

mystrcat(result, " ", MAXLNLEN);

353

mystrcat(result, morphcode, MAXLNLEN);

354

} else mystrcat(result,getKey(), MAXLNLEN);

355

if (!HENTRY_FIND(he, MORPH_STEM)) {

356

mystrcat(result, " ", MAXLNLEN);

357

mystrcat(result, MORPH_STEM, MAXLNLEN);

358

mystrcat(result, HENTRY_WORD(he), MAXLNLEN);

359

}

360

// store the pointer of the hash entry

361

if (HENTRY_DATA(he)) {

362

mystrcat(result, " ", MAXLNLEN);

363

mystrcat(result, HENTRY_DATA2(he), MAXLNLEN);

364

} else {

365

// return with debug information

366

char * flag = pmyMgr->encode_flag(getFlag());

367

mystrcat(result, " ", MAXLNLEN);

368

mystrcat(result, MORPH_FLAG, MAXLNLEN);

369

mystrcat(result, flag, MAXLNLEN);

370

free(flag);

371

}

372

mystrcat(result, "\n", MAXLNLEN);

373

}

374

he = he->next_homonym;

375

} while (he);

376

}

377

378

// prefix matched but no root word was found

379

// if aeXPRODUCT is allowed, try again but now

380

// ross checked combined with a suffix

381

382

if ((opts & aeXPRODUCT) && (in_compound != IN_CPD_BEGIN)) {

383

st = pmyMgr->suffix_check_morph(tmpword, tmpl, aeXPRODUCT, this,

384

FLAG_NULL, needflag);

385

if (st) {

386

mystrcat(result, st, MAXLNLEN);

387

free(st);

388

}

389

}

390

}

391

}

392

393

if (*result) return mystrdup(result);

394

return NULL;

395

}

396

397

// Builds a suffix entry from the parsed affix record `dp` and remembers
// `pmgr` as the affix manager it belongs to. Pointer fields are taken over
// as pointers, not duplicated; rappnd is a separate string produced by
// myrevstrdup() from the append string. ~SfxEntry() releases all of them.
SfxEntry::SfxEntry(AffixMgr * pmgr, affentry* dp)
{
    // affix manager this entry is registered with
    pmyMgr = pmgr;

    // flag and option bits (opts is needed below to pick the condition layout)
    aflag = dp->aflag;
    opts = dp->opts;
    numconds = dp->numconds;    // length of the condition

    // strip / append strings and their lengths
    strip = dp->strip;
    stripl = dp->stripl;
    appnd = dp->appnd;
    appndl = dp->appndl;

    // condition pattern: a long condition keeps MAXCONDLEN_1 bytes inline and
    // the remainder behind c.l.conds2; a short one is MAXCONDLEN inline bytes
    if (opts & aeLONGCOND) {
        memcpy(c.l.conds1, dp->c.l.conds1, MAXCONDLEN_1);
        c.l.conds2 = dp->c.l.conds2;
    } else {
        memcpy(c.conds, dp->c.conds, MAXCONDLEN);
    }

    rappnd = myrevstrdup(appnd);

    // morphological description and continuation classes
    morphcode = dp->morphcode;
    contclass = dp->contclass;
    contclasslen = dp->contclasslen;
}

422

423

424

// Releases the strings owned by this suffix entry. morphcode and contclass
// are freed only when they are not marked as aliases (aeALIASM / aeALIASF).
SfxEntry::~SfxEntry()
{
    aflag = 0;
    pmyMgr = NULL;

    // free(NULL) is a no-op, so no null test is needed before freeing
    free(appnd);
    appnd = NULL;
    free(rappnd);
    free(strip);
    strip = NULL;

    // second half of a long condition
    if (opts & aeLONGCOND) {
        free(c.l.conds2);
    }
    if (!(opts & aeALIASM) && morphcode) {
        free(morphcode);
    }
    if (!(opts & aeALIASF) && contclass) {
        free(contclass);
    }
}

437

438

// add suffix to this word assuming conditions hold

439

char * SfxEntry::add(const char * word, int len)

440

{

441

char tword[MAXWORDUTF8LEN + 4];

442

443

/* make sure all conditions match */

444

if ((len > stripl || (len == 0 && pmyMgr->get_fullstrip())) &&

445

(len >= numconds) && test_condition(word + len, word) &&

446

(!stripl || (strcmp(word + len - stripl, strip) == 0)) &&

447

((MAXWORDUTF8LEN + 4) > (len + appndl - stripl))) {

448

/* we have a match so add suffix */

449

strcpy(tword,word);

450

if (appndl) {

451

strcpy(tword + len - stripl, appnd);

452

} else {

453

*(tword + len - stripl) = '\0';

454

}

455

return mystrdup(tword);

456

}

457

return NULL;

458

}

459

460

inline char * SfxEntry::nextchar(char * p) {

461

if (p) {

462

p++;

463

if (opts & aeLONGCOND) {

464

// jump to the 2nd part of the condition

465

if (p == c.l.conds1 + MAXCONDLEN_1) return c.l.conds2;

466

// end of the MAXCONDLEN length condition

467

} else if (p == c.conds + MAXCONDLEN) return NULL;

468

return *p ? p : NULL;

469

}

470

return NULL;

471

}

472

473

inline int SfxEntry::test_condition(const char * st, const char * beg)

474

{

475

const char * pos = NULL; // group with pos input position

476

bool neg = false; // complementer

477

bool ingroup = false; // character in the group

478

if (numconds == 0) return 1;

479

char * p = c.conds;

480

st--;

481

int i = 1;

482

while (1) {

483

switch (*p) {

484

case '\0': return 1;

485

case '[': { p = nextchar(p); pos = st; break; }

486

case '^': { p = nextchar(p); neg = true; break; }

487

case ']': { if (!neg && !ingroup) return 0;

488

i++;

489

// skip the next character

490

if (!ingroup) {

491

for (; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);

492

st--;

493

}

494

pos = NULL;

495

neg = false;

496

ingroup = false;

497

p = nextchar(p);

498

if (st < beg && p) return 0; // word <= condition

499

break;

500

}

501

case '.': if (!pos) { // dots are not metacharacters in groups: [.]

502

p = nextchar(p);

503

// skip the next character

504

for (st--; (opts & aeUTF8) && (st >= beg) && (*st & 0xc0) == 0x80; st--);

505

if (st < beg) { // word <= condition

506

if (p) return 0; else return 1;

507

}

508

if ((opts & aeUTF8) && (*st & 0x80)) { // head of the UTF-8 character

509

st--;

510

if (st < beg) { // word <= condition

511

if (p) return 0; else return 1;

512

}

513

}

514

break;

515

}

516

default: {

517

if (*st == *p) {

518

p = nextchar(p);

519

if ((opts & aeUTF8) && (*st & 0x80)) {

520

st--;

521

while (p && (st >= beg)) {

522

if (*p != *st) {

523

if (!pos) return 0;

524

st = pos;

525

break;

526

}

527

// first byte of the UTF-8 multibyte character

528

if ((*p & 0xc0) != 0x80) break;

529

p = nextchar(p);

530

st--;

531

}

532

if (pos && st != pos) {

533

if (neg) return 0;

534

else if (i == numconds) return 1;

535

ingroup = true;

536

while (p && *p != ']' && (p = nextchar(p)));

537

st--;

538

}

539

if (p && *p != ']') p = nextchar(p);

540

} else if (pos) {

541

if (neg) return 0;

542

else if (i == numconds) return 1;

543

ingroup = true;

544

while (p && *p != ']' && (p = nextchar(p)));

545

// if (p && *p != ']') p = nextchar(p);

546

st--;

547

}

548

if (!pos) {

549

i++;

550

st--;

551

}

552

if (st < beg && p && *p != ']') return 0; // word <= condition

553

} else if (pos) { // group

554

p = nextchar(p);

555

} else return 0;

556

}

557

}

558

if (!p) return 1;

559

}

560

}

561

562

// see if this suffix is present in the word

563

struct hentry * SfxEntry::checkword(const char * word, int len, int optflags,

564

PfxEntry* ppfx, char ** wlst, int maxSug, int * ns, const FLAG cclass, const FLAG needflag,

565

const FLAG badflag)

566

{

567

int tmpl; // length of tmpword

568

struct hentry * he; // hash entry pointer

569

unsigned char * cp;

570

char tmpword[MAXWORDUTF8LEN + 4];

571

PfxEntry* ep = ppfx;

572

573

// if this suffix is being cross checked with a prefix

574

// but it does not support cross products skip it

575

576

if (((optflags & aeXPRODUCT) != 0) && ((opts & aeXPRODUCT) == 0))

577

return NULL;

578

579

// upon entry suffix is 0 length or already matches the end of the word.

580

// So if the remaining root word has positive length

581

// and if there are enough chars in root word and added back strip chars

582

// to meet the number of characters conditions, then test it

583

584

tmpl = len - appndl;

585

// the second condition is not enough for UTF-8 strings

586

// it checked in test_condition()

587

588

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

589

(tmpl + stripl >= numconds)) {

590

591

// generate new root word by removing suffix and adding

592

// back any characters that would have been stripped or

593

// or null terminating the shorter string

594

595

strcpy (tmpword, word);

596

cp = (unsigned char *)(tmpword + tmpl);

597

if (stripl) {

598

strcpy ((char *)cp, strip);

599

tmpl += stripl;

600

cp = (unsigned char *)(tmpword + tmpl);

601

} else *cp = '\0';

602

603

// now make sure all of the conditions on characters

604

// are met. Please see the appendix at the end of

605

// this file for more info on exactly what is being

606

// tested

607

608

// if all conditions are met then check if resulting

609

// root word in the dictionary

610

611

if (test_condition((char *) cp, (char *) tmpword)) {

612

613

#ifdef SZOSZABLYA_POSSIBLE_ROOTS

614

fprintf(stdout,"%s %s %c\n", word, tmpword, aflag);

615

#endif

616

if ((he = pmyMgr->lookup(tmpword)) != NULL) {

617

do {

618

// check conditional suffix (enabled by prefix)

619

if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() &&

620

TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&

621

(((optflags & aeXPRODUCT) == 0) ||

622

(ep && TESTAFF(he->astr, ep->getFlag(), he->alen)) ||

623

// enabled by prefix

624

((contclass) && (ep && TESTAFF(contclass, ep->getFlag(), contclasslen)))

625

) &&

626

// handle cont. class

627

((!cclass) ||

628

((contclass) && TESTAFF(contclass, cclass, contclasslen))

629

) &&

630

// check only in compound homonyms (bad flags)

631

(!badflag || !TESTAFF(he->astr, badflag, he->alen)

632

) &&

633

// handle required flag

634

((!needflag) ||

635

(TESTAFF(he->astr, needflag, he->alen) ||

636

((contclass) && TESTAFF(contclass, needflag, contclasslen)))

637

)

638

) return he;

639

he = he->next_homonym; // check homonyms

640

} while (he);

641

642

// obsolote stemming code (used only by the

643

// experimental SuffixMgr:suggest_pos_stems)

644

// store resulting root in wlst

645

} else if (wlst && (*ns < maxSug)) {

646

int cwrd = 1;

647

for (int k=0; k < *ns; k++)

648

if (strcmp(tmpword, wlst[k]) == 0) cwrd = 0;

649

if (cwrd) {

650

wlst[*ns] = mystrdup(tmpword);

651

if (wlst[*ns] == NULL) {

652

for (int j=0; j<*ns; j++) free(wlst[j]);

653

*ns = -1;

654

return NULL;

655

}

656

(*ns)++;

657

}

658

}

659

}

660

}

661

return NULL;

662

}

663

664

// see if two-level suffix is present in the word

665

struct hentry * SfxEntry::check_twosfx(const char * word, int len, int optflags,

666

PfxEntry* ppfx, const FLAG needflag)

667

{

668

int tmpl; // length of tmpword

669

struct hentry * he; // hash entry pointer

670

unsigned char * cp;

671

char tmpword[MAXWORDUTF8LEN + 4];

672

PfxEntry* ep = ppfx;

673

674

675

// if this suffix is being cross checked with a prefix

676

// but it does not support cross products skip it

677

678

if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)

679

return NULL;

680

681

// upon entry suffix is 0 length or already matches the end of the word.

682

// So if the remaining root word has positive length

683

// and if there are enough chars in root word and added back strip chars

684

// to meet the number of characters conditions, then test it

685

686

tmpl = len - appndl;

687

688

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

689

(tmpl + stripl >= numconds)) {

690

691

// generate new root word by removing suffix and adding

692

// back any characters that would have been stripped or

693

// or null terminating the shorter string

694

695

strcpy (tmpword, word);

696

cp = (unsigned char *)(tmpword + tmpl);

697

if (stripl) {

698

strcpy ((char *)cp, strip);

699

tmpl += stripl;

700

cp = (unsigned char *)(tmpword + tmpl);

701

} else *cp = '\0';

702

703

// now make sure all of the conditions on characters

704

// are met. Please see the appendix at the end of

705

// this file for more info on exactly what is being

706

// tested

707

708

// if all conditions are met then recall suffix_check

709

710

if (test_condition((char *) cp, (char *) tmpword)) {

711

if (ppfx) {

712

// handle conditional suffix

713

if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen))

714

he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);

715

else

716

he = pmyMgr->suffix_check(tmpword, tmpl, optflags, ppfx, NULL, 0, NULL, (FLAG) aflag, needflag);

717

} else {

718

he = pmyMgr->suffix_check(tmpword, tmpl, 0, NULL, NULL, 0, NULL, (FLAG) aflag, needflag);

719

}

720

if (he) return he;

721

}

722

}

723

return NULL;

724

}

725

726

// see if two-level suffix is present in the word

727

char * SfxEntry::check_twosfx_morph(const char * word, int len, int optflags,

728

PfxEntry* ppfx, const FLAG needflag)

729

{

730

int tmpl; // length of tmpword

731

unsigned char * cp;

732

char tmpword[MAXWORDUTF8LEN + 4];

733

PfxEntry* ep = ppfx;

734

char * st;

735

736

char result[MAXLNLEN];

737

738

*result = '\0';

739

740

// if this suffix is being cross checked with a prefix

741

// but it does not support cross products skip it

742

743

if ((optflags & aeXPRODUCT) != 0 && (opts & aeXPRODUCT) == 0)

744

return NULL;

745

746

// upon entry suffix is 0 length or already matches the end of the word.

747

// So if the remaining root word has positive length

748

// and if there are enough chars in root word and added back strip chars

749

// to meet the number of characters conditions, then test it

750

751

tmpl = len - appndl;

752

753

if ((tmpl > 0 || (tmpl == 0 && pmyMgr->get_fullstrip())) &&

754

(tmpl + stripl >= numconds)) {

755

756

// generate new root word by removing suffix and adding

757

// back any characters that would have been stripped or

758

// or null terminating the shorter string

759

760

strcpy (tmpword, word);

761

cp = (unsigned char *)(tmpword + tmpl);

762

if (stripl) {

763

strcpy ((char *)cp, strip);

764

tmpl += stripl;

765

cp = (unsigned char *)(tmpword + tmpl);

766

} else *cp = '\0';

767

768

// now make sure all of the conditions on characters

769

// are met. Please see the appendix at the end of

770

// this file for more info on exactly what is being

771

// tested

772

773

// if all conditions are met then recall suffix_check

774

775

if (test_condition((char *) cp, (char *) tmpword)) {

776

if (ppfx) {

777

// handle conditional suffix

778

if ((contclass) && TESTAFF(contclass, ep->getFlag(), contclasslen)) {

779

st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

780

if (st) {

781

if (ppfx->getMorph()) {

782

mystrcat(result, ppfx->getMorph(), MAXLNLEN);

783

mystrcat(result, " ", MAXLNLEN);

784

}

785

mystrcat(result,st, MAXLNLEN);

786

free(st);

787

mychomp(result);

788

}

789

} else {

790

st = pmyMgr->suffix_check_morph(tmpword, tmpl, optflags, ppfx, aflag, needflag);

791

if (st) {

792

mystrcat(result, st, MAXLNLEN);

793

free(st);

794

mychomp(result);

795

}

796

}

797

} else {

798

st = pmyMgr->suffix_check_morph(tmpword, tmpl, 0, NULL, aflag, needflag);

799

if (st) {

800

mystrcat(result, st, MAXLNLEN);

801

free(st);

802

mychomp(result);

803

}

804

}

805

if (*result) return mystrdup(result);

806

}

807

}

808

return NULL;

809

}

810

811

// get next homonym with same affix

812

struct hentry * SfxEntry::get_next_homonym(struct hentry * he, int optflags, PfxEntry* ppfx,

813

const FLAG cclass, const FLAG needflag)

814

{

815

PfxEntry* ep = ppfx;

816

FLAG eFlag = ep ? ep->getFlag() : FLAG_NULL;

817

818

while (he->next_homonym) {

819

he = he->next_homonym;

820

if ((TESTAFF(he->astr, aflag, he->alen) || (ep && ep->getCont() && TESTAFF(ep->getCont(), aflag, ep->getContLen()))) &&

821

((optflags & aeXPRODUCT) == 0 ||

822

TESTAFF(he->astr, eFlag, he->alen) ||

823

// handle conditional suffix

824

((contclass) && TESTAFF(contclass, eFlag, contclasslen))

825

) &&

826

// handle cont. class

827

((!cclass) ||

828

((contclass) && TESTAFF(contclass, cclass, contclasslen))

829

) &&

830

// handle required flag

831

((!needflag) ||

832

(TESTAFF(he->astr, needflag, he->alen) ||

833

((contclass) && TESTAFF(contclass, needflag, contclasslen)))

834

)

835

) return he;

836

}

837

return NULL;

838

}

839

840

841

#if 0

842

843

Appendix: Understanding Affix Code

844

845

846

An affix is either a prefix or a suffix attached to root words to make

847

other words.

848

849

Basically a Prefix or a Suffix is set of AffEntry objects

850

which store information about the prefix or suffix along

851

with supporting routines to check if a word has a particular

852

prefix or suffix or a combination.

853

854

The structure affentry is defined as follows:

855

856

struct affentry

857

{

858

unsigned short aflag; // ID used to represent the affix

859

char * strip; // string to strip before adding affix

860

char * appnd; // the affix string to add

861

unsigned char stripl; // length of the strip string

862

unsigned char appndl; // length of the affix string

863

char numconds; // the number of conditions that must be met

864

char opts; // flag: aeXPRODUCT- combine both prefix and suffix

865

char conds[SETSIZE]; // array which encodes the conditions to be met

866

};

867

868

869

Here is a suffix borrowed from the en_US.aff file. This file

870

is whitespace delimited.

871

872

SFX D Y 4

873

SFX D 0 e d

874

SFX D y ied [^aeiou]y

875

SFX D 0 ed [^ey]

876

SFX D 0 ed [aeiou]y

877

878

This information can be interpreted as follows:

879

880

The first line has 4 fields

881

882

Field

883

-----

884

1 SFX - indicates this is a suffix

885

2 D - is the name of the character flag which represents this suffix

886

3 Y - indicates it can be combined with prefixes (cross product)

887

4 4 - indicates that sequence of 4 affentry structures are needed to

888

properly store the affix information

889

890

The remaining lines describe the unique information for the 4 SfxEntry

891

objects that make up this affix. Each line can be interpreted

892

as follows: (note fields 1 and 2 are as a check against line 1 info)

893

894

Field

895

-----

896

1 SFX - indicates this is a suffix

897

2 D - is the name of the character flag for this affix

898

3 y - the string of chars to strip off before adding affix

899

(a 0 here indicates the NULL string)

900

4 ied - the string of affix characters to add

901

5 [^aeiou]y - the conditions which must be met before the affix

902

can be applied

903

904

Field 5 is interesting. Since this is a suffix, field 5 tells us that

905

there are 2 conditions that must be met. The first condition is that

906

the next to the last character in the word must *NOT* be any of the

907

following "a", "e", "i", "o" or "u". The second condition is that

908

the last character of the word must end in "y".

909

910

So how can we encode this information concisely and be able to

911

test for both conditions in a fast manner? The answer is found

912

by studying the wonderful ispell code of Geoff Kuenning, et al.

913

(now available under a normal BSD license).

914

915

If we set up a conds array of 256 bytes indexed (0 to 255) and access it

916

using a character (cast to an unsigned char) of a string, we have 8 bits

917

of information we can store about that character. Specifically we

918

could use each bit to say if that character is allowed in any of the

919

last (or first for prefixes) 8 characters of the word.

920

921

Basically, each character at one end of the word (up to the number

922

of conditions) is used to index into the conds array and the resulting

923

value found there says whether that character is valid for a

924

specific character position in the word.

925

926

For prefixes, it does this by setting bit 0 if that char is valid

927

in the first position, bit 1 if valid in the second position, and so on.

928

929

If a bit is not set, then that char is not valid for that position in the

930

word.

931

932

If working with suffixes bit 0 is used for the character closest

933

to the front, bit 1 for the next character towards the end, ...,

934

with bit numconds-1 representing the last char at the end of the string.

935

936

Note: since entries in the conds[] are 8 bits, only 8 conditions

937

(read that only 8 character positions) can be examined at one

938

end of a word (the beginning for prefixes and the end for suffixes).

939

940

So to make this clearer, lets encode the conds array values for the

941

first two affentries for the suffix D described earlier.

942

943

944

For the first affentry:

945

numconds = 1 (only examine the last character)

946

947

conds['e'] = (1 << 0) (the word must end in an E)

948

all others are all 0

949

950

For the second affentry:

951

numconds = 2 (only examine the last two characters)

952

953

conds[X] = conds[X] | (1 << 0) (aeiou are not allowed)

954

where X is all characters *but* a, e, i, o, or u

955

956

957

conds['y'] = (1 << 1) (the last char must be a y)

958

all other bits for all other entries in the conds array are zero

959

960

961

#endif

962

Older »