~ubuntu-branches/ubuntu/utopic/bmf/utopic

Viewing changes to .pc/40-patch-lex.c/lex.c

Committer: Package Import Robot
Author(s): Jari Aalto
Date: 2012-02-13 02:48:03 UTC
Revision ID: package-import@ubuntu.com-20120213024803-w8pckhxogpxalgtm

Tags: 0.9.4-6

* debian/compat
  - Update to 9
* debian/control
  - (Build-Depends): Update to debhelper 9, dpkg-dev 1.16.1.
  - (Standards-Version): Update to 3.9.2.
  - Tweak description to address lintian warning.
* debian/copyright
  - (X-*): Add headers Vcs and Bugs.
* debian/patches
  - (40): New. Fix for folded headers.
* debian/rules
  - Remove unnecessary rules.
  - Use hardened CFLAGS (release goal).
    http://wiki.debian.org/ReleaseGoals/SecurityHardeningBuildFlags

files added:
.pc/40-patch-lex.c

.pc/40-patch-lex.c/lex.c

debian/patches/40-patch-lex.c

files modified:
.pc/applied-patches

debian/changelog

debian/compat

debian/control

debian/copyright

debian/patches/series

debian/rules

lex.c

Show diffs side-by-side

added added

removed removed

.pc/40-patch-lex.c/lex.c

/* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */

/*
 * This program is free software. It may be distributed under the terms
 * in the file LICENSE, found in the top level of the distribution.
 *
 * lex.c: generate token stream for bmf.
 */

#include "config.h"

#include "dbg.h"

#include "str.h"

#include "lex.h"

/*
 * Names that is_htmltag() recognises so the tokenizer can drop them.
 * The table is searched with a strncmp()-based binary search, so the
 * entries must remain in ascending byte order.
 */
static cpchar g_htmltags[] =
{
    "abbr", "above", "accesskey", "acronym", "align", "alink",
    "all", "alt", "applet", "archive", "axis",
    "basefont", "baseline", "below", "bgcolor", "big", "body",
    "border", "bottom", "box", "button",
    "cellpadding", "cellspacing", "center", "char", "charoff", "charset",
    "circle", "cite", "class", "classid", "clear", "codebase",
    "codetype", "color", "cols", "colspan", "compact", "content",
    "coords",
    "data", "datetime", "declare", "default", "defer", "dfn",
    "dir", "disabled",
    "face", "font", "frameborder",
    "groups",
    "head", "headers", "height", "href", "hreflang", "hsides",
    "hspace", "http-equiv",
    "iframe", "img", "input", "ismap",
    "justify",
    "kbd",
    "label", "lang", "language", "left", "lhs", "link",
    "longdesc",
    "map", "marginheight", "marginwidth", "media", "meta", "middle",
    "multiple",
    "name", "nohref", "none", "noresize", "noshade", "nowrap",
    "object", "onblur", "onchange", "onclick", "ondblclick", "onfocus",
    "onkeydown", "onkeypress", "onkeyup", "onload", "onmousedown",
    "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onselect",
    "onunload",
    "param", "poly", "profile", "prompt",
    "readonly", "rect", "rel", "rev", "rhs", "right",
    "rows", "rowspan", "rules",
    "samp", "scheme", "scope", "script", "scrolling", "select",
    "selected", "shape", "size", "small", "span", "src",
    "standby", "strike", "strong", "style", "sub", "summary",
    "sup",
    "tabindex", "table", "target", "textarea", "title", "top",
    "type",
    "usemap",
    "valign", "value", "valuetype", "var", "vlink", "void",
    "vsides", "vspace",
    "width"
};

/* Number of entries in g_htmltags. */
static const uint g_nhtmltags = sizeof(g_htmltags)/sizeof(g_htmltags[0]);

173

174

static cpchar g_ignoredheaders[] =

175

{

176

"Date:",

177

"Delivery-date:",

178

"Message-ID:",

179

"X-Sorted:",

180

"X-Spam-"

181

};

182

static const uint g_nignoredheaders = sizeof(g_ignoredheaders)/sizeof(cpchar);

183

184

static inline bool_t is_whitespace( int c )

185

{

186

return ( c == ' ' || c == '\t' || c == '\r' );

187

}

188

189

/*
 * Return true when c is an alphanumeric character, '/' or '+'.
 * The pad character '=' is not accepted.
 *
 * c is narrowed to unsigned char before reaching <ctype.h>: callers pass
 * plain char values taken from mail data, which may be negative, and a
 * negative argument other than EOF is undefined behaviour for isalnum().
 */
static inline bool_t is_base64char( int c )
{
    return ( isalnum((unsigned char)c) || c == '/' || c == '+' );
}

193

194

/*
 * Return true when c may appear inside a word: an alphanumeric character
 * or one of '$', '\'', '.', '-'.
 *
 * c is narrowed to unsigned char before reaching <ctype.h> because callers
 * pass plain char values that may be negative.
 */
static inline bool_t is_wordmidchar( int c )
{
    return ( isalnum((unsigned char)c) || c == '$' || c == '\'' || c == '.' || c == '-' );
}

198

199

/*
 * Return true when c may be the last character of a word: an
 * alphanumeric character or '$'.
 *
 * c is narrowed to unsigned char before reaching <ctype.h> because callers
 * pass plain char values that may be negative.
 */
static inline bool_t is_wordendchar( int c )
{
    return ( isalnum((unsigned char)c) || c == '$' );
}

203

204

/*
 * Test whether the text at p (len bytes remain on the line) starts with
 * an entry of g_htmltags.
 *
 * p        start of the candidate text
 * len      number of bytes available at p
 * ptoklen  set to the length of the matched entry on success, 0 otherwise
 *
 * Returns true only when an entry matches, at least one more byte follows
 * it on the line, and is_word() would not produce a longer token from the
 * same position.
 */
static inline bool_t is_htmltag( cpchar p, uint len, uint* ptoklen )
{
    int lo, hi, mid, minlen, cmp;

    *ptoklen = 0;

    /* binary search; hi ends up on the candidate entry */
    hi = g_nhtmltags-1;
    lo = -1;
    while( hi-lo > 1 )
    {
        mid = (hi+lo)/2;
        minlen = min( strlen(g_htmltags[mid]), len );
        cmp = strncmp( g_htmltags[mid], p, minlen );
        /*
         * An entry that is a prefix of the input only pulls hi down when
         * the input does not continue with a lowercase letter; otherwise
         * the search keeps moving up towards longer entries.
         */
        if( cmp > 0 || (cmp == 0 && minlen < len && !islower((unsigned char)p[minlen])) )
            hi = mid;
        else
            lo = mid;
    }
    minlen = min( strlen(g_htmltags[hi]), len );
    if( len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0 )
    {
        return false;
    }

    /* check if is_word() will have a longer match */
    if( is_wordendchar(p[minlen]) )
    {
        return false;
    }
    /*
     * Only look one byte further when that byte is still inside the line:
     * is_word() cannot extend past len either, and p[minlen+1] may lie
     * beyond the loaded data.
     */
    if( (uint)minlen + 1 < len &&
        is_wordmidchar(p[minlen]) && is_wordendchar(p[minlen+1]) )
    {
        return false;
    }

    *ptoklen = strlen(g_htmltags[hi]);

    return true;
}

242

243

/*
 * Recognise an HTML comment delimiter at p.
 *
 * *ptoklen receives 4 for "<!--", 3 for "-->", and 0 when neither
 * delimiter starts at p.  Returns true when a delimiter was found.
 */
static inline bool_t is_htmlcomment( cpchar p, uint len, uint* ptoklen )
{
    uint matched = 0;

    if( len >=4 && memcmp( p, "<!--", 4 ) == 0 )
    {
        matched = 4;
    }
    else if( len >= 3 && memcmp( p, "-->", 3 ) == 0 )
    {
        matched = 3;
    }

    *ptoklen = matched;
    if( matched == 0 )
    {
        return false;
    }
    return true;
}

260

261

/*
 * Return true when every one of the len bytes at p is a base64 character,
 * a line feed or a carriage return (an empty span also qualifies).
 *
 * *ptoklen counts the bytes accepted; on failure it holds the number of
 * bytes that were accepted before the offending one.
 */
static inline bool_t is_base64( cpchar p, uint len, uint* ptoklen )
{
    uint i;

    *ptoklen = 0;
    for( i = 0; i < len; i++ )
    {
        if( p[i] != '\n' && p[i] != '\r' && !is_base64char(p[i]) )
        {
            return false;
        }
        *ptoklen = i + 1;
    }
    return true;
}

276

277

/*
 * Test whether the line at p looks like a MIME boundary: it starts with
 * "--", is at least three bytes long, and contains no space or tab before
 * its line terminator.
 *
 * *ptoklen receives the number of bytes before the terminator on success;
 * on failure it holds the count reached when the line was rejected.
 *
 * The terminator test comes before the whitespace test on purpose:
 * is_whitespace() treats '\r' as blank, so testing whitespace first would
 * reject every boundary line that ends in CRLF.
 */
static inline bool_t is_mimeboundary( cpchar p, uint len, uint* ptoklen )
{
    *ptoklen = 0;

    if( len < 3 || p[0] != '-' || p[1] != '-' )
    {
        return false;
    }
    p += 2;
    len -= 2;
    *ptoklen += 2;
    while( len > 0 )
    {
        if( *p == '\n' || *p == '\r' )
        {
            break;
        }
        if( is_whitespace(*p) )
        {
            return false;
        }
        p++;
        len--;
        (*ptoklen)++;
    }
    return true;
}

304

305

/*
 * Test whether the text at p starts with a dotted quad: four groups of
 * one to three decimal digits separated by single dots.  The numeric
 * value of each group is not range-checked.
 *
 * p        start of the candidate text
 * len      number of bytes available at p
 * ptoklen  on success, the length of the dotted quad; on failure it
 *          holds however many bytes had been consumed
 *
 * Returns true when four groups were read.
 */
static inline bool_t is_ipaddr( cpchar p, uint len, uint* ptoklen )
{
    uint noctets, ndigits;

    *ptoklen = 0;

    noctets = 0;
    while( len > 0 && noctets < 4 )
    {
        ndigits = 0;
        /* cast: plain char may be negative, which isdigit() must not see */
        while( len > 0 && isdigit((unsigned char)*p) )
        {
            ndigits++;
            p++;
            len--;
            (*ptoklen)++;
        }
        if( ndigits == 0 || ndigits > 3 )
        {
            return false;
        }
        noctets++;
        if( noctets < 4 )
        {
            /* the digits may have used up the input; do not read past it */
            if( len == 0 || *p != '.' )
            {
                return false;
            }
            p++;
            len--;
            (*ptoklen)++;
        }
    }
    if( noctets < 4 )
    {
        return false;
    }
    return true;
}

344

345

/*
 * Test whether a word token starts at p.
 *
 * A word begins with a letter or '$', continues with characters accepted
 * by is_wordmidchar(), and has trailing characters rejected by
 * is_wordendchar() trimmed off.  It must be at least three bytes long
 * after trimming.
 *
 * p        start of the candidate text
 * len      number of bytes available at p
 * ptoklen  length of the word on success; 0 when rejected up front, or
 *          the (too short) trimmed length when rejected at the end
 *
 * Returns true when a word of three or more bytes was found.
 */
static inline bool_t is_word( cpchar p, uint len, uint* ptoklen )
{
    *ptoklen = 0;

    if( len < 3 )
    {
        return false;
    }
    /* cast: plain char may be negative, which isalpha() must not see */
    if( !(isalpha((unsigned char)*p) || *p == '$') )
    {
        return false;
    }
    *ptoklen = 1;
    p++;
    len--;
    while( len > 0 )
    {
        if( !is_wordmidchar(*p) )
        {
            break;
        }
        (*ptoklen)++;
        p++;
        len--;
    }
    /* back up over trailing characters that cannot end a word */
    while( *ptoklen >= 3 && !is_wordendchar(*(p-1)) )
    {
        (*ptoklen)--;
        p--;
    }
    if( *ptoklen < 3 )
    {
        return false;
    }

    return true;
}

381

382

/*
 * Test whether the line at p begins with one of the prefixes listed in
 * g_ignoredheaders (compared case-insensitively) and is longer than that
 * prefix.
 *
 * On success *ptoklen is set to len, i.e. the whole line is the token.
 * On failure *ptoklen is left untouched.
 */
static inline bool_t is_ignoredheader( cpchar p, uint len, uint* ptoklen )
{
    int below, at, probe, cmplen;

    /* binary search; 'at' converges on the candidate entry */
    below = -1;
    at = g_nignoredheaders-1;
    while( at-below > 1 )
    {
        probe = (at+below)/2;
        cmplen = min( strlen(g_ignoredheaders[probe]), len );
        if( strncasecmp( g_ignoredheaders[probe], p, cmplen ) >= 0 )
        {
            at = probe;
        }
        else
        {
            below = probe;
        }
    }

    cmplen = min( strlen(g_ignoredheaders[at]), len );
    if( len == cmplen || strncasecmp(g_ignoredheaders[at], p, cmplen) != 0 )
    {
        return false;
    }

    *ptoklen = len;
    return true;
}

406

407

/*
 * Return true when the text at p starts with "\tid " (tab, "id", space).
 * On success *ptoklen is set to len; otherwise it is left untouched.
 */
static inline bool_t is_mailerid( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 4 && strncmp( p, "\tid ", 4 ) == 0 )
    {
        *ptoklen = len;
        return true;
    }
    return false;
}

416

417

/*
 * Return true when the text at p starts with "SPAM:".
 * On success *ptoklen is set to len; otherwise it is left untouched.
 */
static inline bool_t is_spamtext( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 5 && strncmp( p, "SPAM:", 5 ) == 0 )
    {
        *ptoklen = len;
        return true;
    }
    return false;
}

426

427

/*
 * Return true when the text at p starts with "SMTP id ".
 * On success *ptoklen is set to len, so the rest of the line belongs to
 * the token; otherwise *ptoklen is left untouched.
 */
static inline bool_t is_smtpid( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 8 && strncmp( p, "SMTP id ", 8 ) == 0 )
    {
        *ptoklen = len;
        return true;
    }
    return false;
}

436

437

/*
 * Return true when the text at p starts with "boundary=".
 * On success *ptoklen is set to len, so the rest of the line belongs to
 * the token; otherwise *ptoklen is left untouched.
 */
static inline bool_t is_boundaryequal( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 9 && strncmp( p, "boundary=", 9 ) == 0 )
    {
        *ptoklen = len;
        return true;
    }
    return false;
}

446

447

/*
 * Return true when the text at p starts with name=" (including the
 * opening double quote).  On success *ptoklen is set to 6, the length of
 * that prefix; otherwise it is left untouched.
 */
static inline bool_t is_nameequal( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 6 && strncmp( p, "name=\"", 6 ) == 0 )
    {
        *ptoklen = 6;
        return true;
    }
    return false;
}

456

457

/*
 * Return true when the text at p starts with filename=" (including the
 * opening double quote).  On success *ptoklen is set to 10, the length of
 * that prefix; otherwise it is left untouched.
 */
static inline bool_t is_filenameequal( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 10 && strncmp( p, "filename=\"", 10 ) == 0 )
    {
        *ptoklen = 10;
        return true;
    }
    return false;
}

466

467

/*
 * Return true when the text at p starts with "From " (capital F, trailing
 * space).  On success *ptoklen is set to 5, the length of that prefix;
 * otherwise it is left untouched.
 */
static inline bool_t is_from( cpchar p, uint len, uint* ptoklen )
{
    if( len >= 5 && strncmp( p, "From ", 5 ) == 0 )
    {
        *ptoklen = 5;
        return true;
    }
    return false;
}

476

477

/*****************************************************************************/

478

479

/*
 * Initialise a lexer object.
 *
 * pthis     lexer to initialise
 * mboxtype  mailbox type stored in the lexer
 *
 * The section is set to envelope, every offset and length is zeroed, and
 * no buffer is attached yet.
 */
void lex_create( lex_t* pthis, mbox_t mboxtype )
{
    /* no input attached yet */
    pthis->pbuf = NULL;
    pthis->buflen = 0;

    /* every cursor starts at offset zero */
    pthis->pos = 0;
    pthis->lineend = 0;
    pthis->bom = 0;
    pthis->eom = 0;

    pthis->section = envelope;
    pthis->mboxtype = mboxtype;
}

490

491

/*
 * Release the buffer owned by the lexer.
 *
 * The pointer is cleared after the free so that destroying the same
 * lexer twice, or touching it afterwards, cannot reach freed memory
 * (free(NULL) is a no-op).
 */
void lex_destroy( lex_t* pthis )
{
    free( pthis->pbuf );
    pthis->pbuf = NULL;
}

495

496

/*
 * Read everything available on fd into a heap buffer owned by the lexer.
 *
 * pthis  lexer to fill; data is appended at offset pthis->buflen, which
 *        lex_create() sets to zero
 * fd     descriptor to read until end of file
 *
 * The buffer grows in IOBUFSIZE steps and always keeps at least one
 * spare byte beyond buflen.  When the lexer's mailbox type is 'detect',
 * it is resolved here: input that begins with "From " is treated as an
 * mbox, anything else as a maildir.
 *
 * Returns true on success.  On allocation failure, read error or size
 * overflow it returns false with pbuf set to NULL and buflen reset to 0.
 */
bool_t lex_load( lex_t* pthis, int fd )
{
    uint nalloc;
    uint ngrown;
    ssize_t nread;
    char* pnewbuf;

    nalloc = IOBUFSIZE;
    pthis->pbuf = malloc( nalloc );
    if( pthis->pbuf == NULL )
    {
        return false;
    }

    while( (nread = read( fd, pthis->pbuf + pthis->buflen, nalloc - pthis->buflen )) > 0 )
    {
        pthis->buflen += nread;
        if( pthis->buflen == nalloc )
        {
            /* buffer is full: grow by one more chunk, refusing to wrap */
            ngrown = nalloc + IOBUFSIZE;
            if( ngrown < nalloc )
            {
                goto fail;
            }
            pnewbuf = realloc( pthis->pbuf, ngrown );
            if( pnewbuf == NULL )
            {
                goto fail;
            }
            pthis->pbuf = pnewbuf;
            nalloc = ngrown;
        }
    }
    if( nread < 0 )
    {
        goto fail;
    }

    if( pthis->mboxtype == detect )
    {
        /* ">= 5": the probe compares exactly five bytes */
        if( pthis->buflen >= 5 && memcmp( pthis->pbuf, "From ", 5 ) == 0 )
        {
            verbose( 1, "Input looks like an mbox\n" );
            pthis->mboxtype = mbox;
        }
        else
        {
            verbose( 1, "Input looks like a maildir\n" );
            pthis->mboxtype = maildir;
        }
    }

    return true;

fail:
    /* leave the lexer without a buffer and without a stale length */
    free( pthis->pbuf );
    pthis->pbuf = NULL;
    pthis->buflen = 0;
    return false;
}

547

548

/*
 * Advance the lexer to the next line that should be tokenized.
 *
 * Starting at the current lineend, lines are delimited by '\n' (the LF is
 * part of the line).  Lines matching one of the beginning-of-line
 * patterns (base64 data, ignored header, mailer id, MIME boundary, spam
 * text) are skipped.
 *
 * Returns true with pos at the start of the accepted line and lineend
 * just past it, or false when the end of the buffer has been reached.
 */
static bool_t lex_nextline( lex_t* pthis )
{
    cpchar line;
    uint linelen;
    uint skiplen;

    for( ;; )
    {
        /* XXX: use and update pthis->section */
        pthis->pos = pthis->lineend;
        if( pthis->lineend == pthis->buflen )
        {
            return false;
        }

        /* measure the line, including its LF when there is one */
        line = pthis->pbuf + pthis->pos;
        for( linelen = 0;
             pthis->pos + linelen < pthis->buflen && line[linelen] != '\n';
             linelen++ )
        {
            /* nothing */
        }
        if( pthis->pos + linelen < pthis->buflen )
        {
            linelen++; /* bump past the LF */
        }

        pthis->lineend = pthis->pos + linelen;

        /* check beginning-of-line patterns */
        if( !( is_base64( line, linelen, &skiplen ) ||
               is_ignoredheader( line, linelen, &skiplen ) ||
               is_mailerid( line, linelen, &skiplen ) ||
               is_mimeboundary( line, linelen, &skiplen ) ||
               is_spamtext( line, linelen, &skiplen ) ) )
        {
            return true;
        }

        /* ignore line */
        pthis->pos += skiplen;
    }
}

589

590

/*
 * Produce the next token from the loaded buffer.
 *
 * pthis  lexer whose buffer has been loaded (pbuf must not be NULL)
 * ptok   receives the token:
 *          eof   no more input; eom is set to the final position
 *          from  a "From " line start, only when mboxtype is mbox; eom is
 *                set to the start of that line
 *          word  a dotted quad or a word of at most MAXWORDLEN bytes
 *
 * HTML tag names, HTML comment delimiters, "SMTP id ", "boundary=",
 * name=" and filename=" are skipped, as are over-long words and any byte
 * that starts no recognised token.
 */
void lex_nexttoken( lex_t* pthis, tok_t* ptok )
{
    cpchar cur;
    uint remaining;
    uint toklen;

    assert( pthis->pbuf != NULL );

    /* cursor sits on the recorded eom mark: record it as the new bom */
    if( pthis->pos == pthis->eom )
    {
        pthis->bom = pthis->pos;
    }

    for( ;; )
    {
        /* skip whitespace between tokens */
        while( pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf[pthis->pos]) )
        {
            pthis->pos++;
        }

        /* possibilities: end-of-line, html-comment, ipaddr, word, junk */

        if( pthis->pos == pthis->lineend )
        {
            if( !lex_nextline( pthis ) )
            {
                pthis->eom = pthis->pos;
                ptok->tt = eof;
                return;
            }

            if( pthis->mboxtype == mbox &&
                is_from( pthis->pbuf + pthis->pos, pthis->lineend - pthis->pos, &toklen ) )
            {
                pthis->eom = pthis->pos;
                ptok->tt = from;
                ptok->p = pthis->pbuf + pthis->pos;
                ptok->len = toklen;
                pthis->pos += toklen;
                return;
            }

            continue; /* skip lws */
        }

        cur = pthis->pbuf + pthis->pos;
        remaining = pthis->lineend - pthis->pos;

        if( is_htmltag( cur, remaining, &toklen ) ||
            is_htmlcomment( cur, remaining, &toklen ) ||
            is_smtpid( cur, remaining, &toklen ) ||
            is_boundaryequal( cur, remaining, &toklen ) ||
            is_nameequal( cur, remaining, &toklen ) ||
            is_filenameequal( cur, remaining, &toklen ) )
        {
            /* ignore it */
            pthis->pos += toklen;
            continue;
        }

        if( is_ipaddr( cur, remaining, &toklen ) )
        {
            ptok->tt = word;
            ptok->p = pthis->pbuf + pthis->pos;
            ptok->len = toklen;
            pthis->pos += toklen;
            return;
        }

        if( is_word( cur, remaining, &toklen ) )
        {
            ptok->tt = word;
            ptok->p = pthis->pbuf + pthis->pos;
            ptok->len = toklen;
            pthis->pos += toklen;
            if( toklen <= MAXWORDLEN )
            {
                return;
            }
            continue; /* too long: drop it and keep scanning */
        }

        /* junk */
        pthis->pos++;
    }
}

680

681

/* SpamAssassin style passthru */

682

void lex_passthru( lex_t* pthis, bool_t is_spam, double hits )

683

{

684

char szbuf[256];

685

bool_t in_headers = true;

686

687

assert( pthis->bom < pthis->buflen && pthis->eom <= pthis->buflen );

688

assert( pthis->bom <= pthis->eom );

689

690

pthis->pos = pthis->bom;

691

if( is_spam )

692

{

693

sprintf( szbuf, "X-Spam-Status: Yes, hits=%f required=%f, tests=bmf\n"

694

"X-Spam-Flag: YES\n",

695

hits, SPAM_CUTOFF );

696

}

697

else

698

{

699

sprintf( szbuf, "X-Spam-Status: No, hits=%f required=%f\n",

700

hits, SPAM_CUTOFF );

701

}

702

703

/* existing headers */

704

while( in_headers && pthis->pos < pthis->eom )

705

{

706

cpchar pbuf = pthis->pbuf + pthis->pos;

707

uint len = 0;

708

while( pthis->pos + len < pthis->buflen && pbuf[len] != '\n' )

709

{

710

len++;

711

}

712

if( pthis->pos + len < pthis->buflen )

713

{

714

len++; /* bump past the LF */

715

}

716

717

/* check for end of headers */

718

if( pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\n') )

719

{

720

/* end of headers */

721

break;

722

}

723

724

/* write header, ignoring existing spam headers */

725

if( strncasecmp( pbuf, "X-Spam-", 7 ) != 0 )

726

{

727

write( STDOUT_FILENO, pbuf, len );

728

}

729

730

pthis->pos += len;

731

}

732

733

/* new headers */

734

write( STDOUT_FILENO, szbuf, strlen(szbuf) );

735

736

/* remainder */

737

if( pthis->pos < pthis->eom )

738

{

739

write( STDOUT_FILENO, pthis->pbuf+pthis->pos, pthis->eom-pthis->pos );

740

}

741

pthis->bom = pthis->eom;

742

}

743

744

#ifdef UNIT_TEST

745

746

int main( int argc, char** argv )

747

{

748

int fd;

749

lex_t lex;

750

tok_t tok;

751

752

fd = STDIN_FILENO;

753

if( argc == 2 )

754

{

755

fd = open( argv[1], O_RDONLY );

756

}

757

758

lex_create( &lex );

759

if( ! lex_load( &lex, fd ) )

760

{

761

fprintf( stderr, "cannot load file\n" );

762

exit( 1 );

763

}

764

765

lex_nexttoken( &lex, &tok );

766

while( tok.tt != eof )

767

{

768

char sztok[64];

769

if( tok.len > MAXWORDLEN )

770

{

771

printf( "*** token too long! ***\n" );

772

exit( 1 );

773

}

774

775

memcpy( sztok, tok.p, tok.len );

776

strlwr( sztok );

777

sztok[tok.len] = '\0';

778

printf( "get_token: %d '%s'\n", tok.tt, sztok );

779

780

lex_nexttoken( &lex, &tok );

781

}

782

783

lex_destroy( &lex );

784

return 0;

785

}

786

787

#endif /* def UNIT_TEST */

Older »