/* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */

/*
 * Copyright (c) 2002 Tom Marshall <tommy@tig-grr.com>
 *
 * This program is free software. It may be distributed under the terms
 * in the file LICENSE, found in the top level of the distribution.
 *
 * lex.c: generate token stream for bmf.
 */
17
/*
 * Table of HTML tag strings consulted by is_htmltag().  The initializer is
 * not visible in this excerpt.
 * NOTE(review): is_htmltag() probes this table with lo/hi/mid indices and
 * strncmp(), which looks like a binary search; if so the entries must stay
 * in ascending strcmp() order -- confirm before editing the list.
 */
static cpchar g_htmltags[] =
172
/* Number of entries in g_htmltags, derived from the array size. */
static const uint g_nhtmltags = sizeof(g_htmltags)/sizeof(cpchar);
174
/*
 * Table of header prefixes consulted by is_ignoredheader().  The initializer
 * is not visible in this excerpt.
 * NOTE(review): is_ignoredheader() probes this table with lo/hi/mid indices
 * and strncasecmp(), which looks like a binary search; if so the entries
 * must stay sorted case-insensitively -- confirm before editing the list.
 */
static cpchar g_ignoredheaders[] =
182
/* Number of entries in g_ignoredheaders, derived from the array size. */
static const uint g_nignoredheaders = sizeof(g_ignoredheaders)/sizeof(cpchar);
184
/*
 * is_whitespace: true when c is a space, a horizontal tab or a carriage
 * return.  Line feed is not part of the set.
 */
static inline bool_t is_whitespace( int c )
186
return ( c == ' ' || c == '\t' || c == '\r' );
189
/*
 * is_base64char: true when c is an alphanumeric character, '/' or '+'.
 * The '=' padding character is not accepted.
 *
 * c is a character value.  The parameter is declared explicitly as int
 * (an untyped identifier-list parameter is not valid since C99).  Callers in
 * this file pass *p from a cpchar, which can be negative where char is
 * signed, so the value is converted to unsigned char before it reaches
 * <ctype.h>; passing a negative value other than EOF there is undefined.
 */
static inline bool_t is_base64char( int c )
{
    return ( isalnum( (unsigned char)c ) || (c == '/' || c == '+') );
}
194
/*
 * is_wordmidchar: true when c may appear in the middle of a word token:
 * an alphanumeric character, '$', an apostrophe, '.' or '-'.
 *
 * c is a character value.  The parameter is declared explicitly as int
 * (an untyped identifier-list parameter is not valid since C99), and the
 * value is converted to unsigned char before it reaches <ctype.h> because
 * a negative plain char would be undefined behaviour there.
 */
static inline bool_t is_wordmidchar( int c )
{
    return ( isalnum( (unsigned char)c ) || c == '$' || c == '\'' || c == '.' || c == '-' );
}
199
/*
 * is_wordendchar: true when c may be the last character of a word token:
 * an alphanumeric character or '$'.
 *
 * c is a character value.  The parameter is declared explicitly as int
 * (an untyped identifier-list parameter is not valid since C99), and the
 * value is converted to unsigned char before it reaches <ctype.h> because
 * a negative plain char would be undefined behaviour there.
 */
static inline bool_t is_wordendchar( int c )
{
    return ( isalnum( (unsigned char)c ) || c == '$' );
}
204
/*
 * is_htmltag: test whether the text at p (len bytes available) begins with
 * an entry of g_htmltags.  The visible code stores the length of the
 * selected table entry in *ptoklen.
 *
 * NOTE(review): only part of the body is visible in this excerpt.  The
 * lo/hi/mid locals and the strncmp() probe of g_htmltags[mid] look like a
 * binary search that leaves its candidate in g_htmltags[hi] -- confirm
 * against the full source.
 */
static inline bool_t is_htmltag( cpchar p, uint len, uint* ptoklen )
206
int lo, hi, mid, minlen, cmp;
215
/* compare no more bytes than both the table entry and the input provide */
minlen = min( strlen(g_htmltags[mid]), len );
216
cmp = strncmp( g_htmltags[mid], p, minlen );
217
/*
 * True when the table entry sorts after the input, or when the compared
 * prefix is equal and the next input byte is not a lowercase letter.
 * NOTE(review): p[minlen] is handed to islower() as-is; if cpchar points to
 * plain char a negative value is undefined for <ctype.h> -- consider an
 * unsigned char cast.
 */
if( cmp > 0 || (cmp == 0 && minlen < len && !islower(p[minlen])) )
222
minlen = min( strlen(g_htmltags[hi]), len );
223
/*
 * True when the input is no longer than the candidate entry or the prefix
 * differs; the action taken is not visible here (presumably "no match").
 */
if( len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0 )
228
/* check if is_word() will have a longer match */
229
if( is_wordendchar(p[minlen]) )
233
/*
 * NOTE(review): p[minlen+1] can index the byte at p[len], i.e. one past the
 * length the caller passed -- confirm that byte is always readable.
 */
if( is_wordmidchar(p[minlen]) && is_wordendchar(p[minlen+1]) )
238
*ptoklen = strlen(g_htmltags[hi]);
243
/*
 * is_htmlcomment: recognise an HTML comment at p.  The visible checks look
 * for the 4-byte opener "<!--" and the 3-byte closer "-->"; each memcmp() is
 * guarded by a length test so it never reads more than len bytes.
 * NOTE(review): how the scan moves between the two checks and what is
 * stored in *ptoklen are not visible in this excerpt.
 */
static inline bool_t is_htmlcomment( cpchar p, uint len, uint* ptoklen )
247
if( len >=4 && memcmp( p, "<!--", 4 ) == 0 )
252
if( len >= 3 && memcmp( p, "-->", 3 ) == 0 )
261
/*
 * is_base64: one of the beginning-of-line patterns tried by lex_nextline().
 * The visible test singles out any byte that is not LF, not CR and not
 * accepted by is_base64char().
 * NOTE(review): the surrounding loop and the result for such a byte are not
 * visible here; presumably it means the line is not base64 -- confirm.
 */
static inline bool_t is_base64( cpchar p, uint len, uint* ptoklen )
266
if( *p != '\n' && *p != '\r' && !is_base64char(*p) )
277
/*
 * is_mimeboundary: one of the beginning-of-line patterns tried by
 * lex_nextline().  The first visible test is true unless at least 3 bytes
 * are available and the line starts with "--".
 * NOTE(review): the scan that follows tests for whitespace and for LF/CR,
 * but the actions taken and the value stored in *ptoklen are not visible in
 * this excerpt.
 */
static inline bool_t is_mimeboundary( cpchar p, uint len, uint* ptoklen )
281
if( len < 3 || p[0] != '-' || p[1] != '-' )
290
if( is_whitespace(*p) )
294
if( *p == '\n' || *p == '\r' )
305
/*
 * is_ipaddr: recognise a dotted IP address at p.  The visible code loops
 * while input remains and fewer than 4 octets have been counted, consumes a
 * run of decimal digits per octet, and singles out runs of zero or more
 * than 3 digits.
 * NOTE(review): initialisation of the counters, separator handling and the
 * value stored in *ptoklen are not visible in this excerpt.
 */
static inline bool_t is_ipaddr( cpchar p, uint len, uint* ptoklen )
307
uint noctets, ndigits;
312
while( len > 0 && noctets < 4 )
315
/*
 * NOTE(review): *p is handed to isdigit() as-is; if cpchar points to plain
 * char a negative value is undefined for <ctype.h>.
 */
while( len > 0 && isdigit(*p) )
322
if( ndigits == 0 || ndigits > 3 )
345
/*
 * is_word: recognise a word token at p.  The visible tests require the
 * first character to be alphabetic or '$', test following characters with
 * is_wordmidchar(), and finally loop while the token is at least 3 bytes
 * long and the byte before p fails is_wordendchar().
 * NOTE(review): loop bodies are not visible in this excerpt; the last loop
 * presumably trims trailing characters that may not end a word -- confirm.
 */
static inline bool_t is_word( cpchar p, uint len, uint* ptoklen )
351
/*
 * NOTE(review): *p is handed to isalpha() as-is; if cpchar points to plain
 * char a negative value is undefined for <ctype.h>.
 */
if( !(isalpha(*p) || *p == '$') )
360
if( !is_wordmidchar(*p) )
368
while( *ptoklen >= 3 && !is_wordendchar(*(p-1)) )
382
/*
 * is_ignoredheader: one of the beginning-of-line patterns tried by
 * lex_nextline().  Compares the start of the line, ignoring case, against
 * entries of g_ignoredheaders.
 *
 * NOTE(review): only part of the body is visible in this excerpt.  The
 * lo/hi/mid locals, with hi starting at the last table index, look like a
 * binary search that leaves its candidate in g_ignoredheaders[hi]; the
 * value stored in *ptoklen is not visible -- confirm against the full
 * source.
 */
static inline bool_t is_ignoredheader( cpchar p, uint len, uint* ptoklen )
384
int lo, hi, mid, minlen, cmp;
386
hi = g_nignoredheaders-1;
391
/* compare no more bytes than both the table entry and the input provide */
minlen = min( strlen(g_ignoredheaders[mid]), len );
392
cmp = strncasecmp( g_ignoredheaders[mid], p, minlen );
398
minlen = min( strlen(g_ignoredheaders[hi]), len );
399
/*
 * True when the input is no longer than the candidate entry or the prefix
 * differs; the action taken is not visible here (presumably "no match").
 */
if( len == minlen || strncasecmp(g_ignoredheaders[hi], p, minlen) != 0 )
407
/*
 * is_mailerid: one of the beginning-of-line patterns tried by
 * lex_nextline().  The visible test is true unless the line has at least 4
 * bytes and starts with a tab followed by "id ".
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_mailerid( cpchar p, uint len, uint* ptoklen )
409
if( len < 4 || strncmp( p, "\tid ", 4 ) != 0 )
417
/*
 * is_spamtext: one of the beginning-of-line patterns tried by
 * lex_nextline().  The visible test is true unless the line has at least 5
 * bytes and starts with "SPAM:" (case-sensitive).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_spamtext( cpchar p, uint len, uint* ptoklen )
419
if( len < 5 || strncmp( p, "SPAM:", 5 ) != 0 )
427
/*
 * is_smtpid: one of the in-line patterns tried by lex_nexttoken().  The
 * visible test is true unless at least 8 bytes remain and the text starts
 * with "SMTP id " (case-sensitive).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_smtpid( cpchar p, uint len, uint* ptoklen )
429
if( len < 8 || strncmp( p, "SMTP id ", 8 ) != 0 )
437
/*
 * is_boundaryequal: one of the in-line patterns tried by lex_nexttoken().
 * The visible test is true unless at least 9 bytes remain and the text
 * starts with "boundary=" (case-sensitive).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_boundaryequal( cpchar p, uint len, uint* ptoklen )
439
if( len < 9 || strncmp( p, "boundary=", 9 ) != 0 )
447
/*
 * is_nameequal: one of the in-line patterns tried by lex_nexttoken().  The
 * visible test is true unless at least 6 bytes remain and the text starts
 * with name=" (including the opening double quote, case-sensitive).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_nameequal( cpchar p, uint len, uint* ptoklen )
449
if( len < 6 || strncmp( p, "name=\"", 6 ) != 0 )
457
/*
 * is_filenameequal: one of the in-line patterns tried by lex_nexttoken().
 * The visible test is true unless at least 10 bytes remain and the text
 * starts with filename=" (including the opening double quote,
 * case-sensitive).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_filenameequal( cpchar p, uint len, uint* ptoklen )
459
if( len < 10 || strncmp( p, "filename=\"", 10 ) != 0 )
467
/*
 * is_from: tried by lex_nexttoken() at the start of a line when the input
 * is an mbox.  The visible test is true unless the line has at least 5
 * bytes and starts with "From " (case-sensitive, trailing space included).
 * NOTE(review): the value stored in *ptoklen is not visible in this excerpt.
 */
static inline bool_t is_from( cpchar p, uint len, uint* ptoklen )
469
if( len < 5 || strncmp( p, "From ", 5 ) != 0 )
477
/*****************************************************************************/
479
/*
 * lex_create: initialise the lexer object *pthis.  The visible code records
 * the requested mailbox type and sets the current section to envelope.
 * NOTE(review): initialisation of the remaining fields is not visible in
 * this excerpt.
 */
void lex_create( lex_t* pthis, mbox_t mboxtype )
481
pthis->mboxtype = mboxtype;
482
pthis->section = envelope;
491
/*
 * lex_destroy: tear down the lexer object *pthis.
 * NOTE(review): the body is not visible in this excerpt; presumably it
 * releases the buffer allocated by lex_load() -- confirm.
 */
void lex_destroy( lex_t* pthis )
496
/*
 * lex_load: read input from file descriptor fd into a heap buffer owned by
 * *pthis (pbuf / buflen), growing the buffer as it fills.  When the mailbox
 * type is still "detect", it is decided from the first bytes of the input.
 *
 * NOTE(review): the return statements and the computation of nalloc are not
 * visible in this excerpt.
 */
bool_t lex_load( lex_t* pthis, int fd )
502
pthis->pbuf = (char*)malloc( IOBUFSIZE );
503
if( pthis->pbuf == NULL )
508
/*
 * Append each chunk at the current end of the data.  The loop stops when
 * read() returns 0 or a negative value.
 * NOTE(review): whether a read error is told apart from end of input is not
 * visible here.
 */
while( (nread = read( fd, pthis->pbuf + pthis->buflen, nalloc - pthis->buflen )) > 0 )
510
pthis->buflen += nread;
511
/* buffer is full: grow it before the next read */
if( pthis->buflen == nalloc )
515
/* realloc into a temporary so the old buffer is not lost on failure */
pnewbuf = (char*)realloc( pthis->pbuf, nalloc );
516
if( pnewbuf == NULL )
522
pthis->pbuf = pnewbuf;
531
if( pthis->mboxtype == detect )
533
/*
 * Input that starts with "From " is treated as an mbox.
 * NOTE(review): the test is buflen > 5, so an input of exactly 5 bytes is
 * never classified as mbox.
 */
if( pthis->buflen > 5 && memcmp( pthis->pbuf, "From ", 5 ) == 0 )
535
verbose( 1, "Input looks like an mbox\n" );
536
pthis->mboxtype = mbox;
540
verbose( 1, "Input looks like a maildir\n" );
541
pthis->mboxtype = maildir;
548
/*
 * lex_nextline: advance *pthis to the next line of the buffer.  The new
 * line starts where the previous one ended; lineend is moved to just past
 * the next LF, or to the end of the buffer when no LF remains.  Lines that
 * match one of the beginning-of-line patterns have their matched length
 * skipped.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * lex_nexttoken() treats a false result as end of input.
 */
static bool_t lex_nextline( lex_t* pthis )
555
/* XXX: use and update pthis->section */
556
pthis->pos = pthis->lineend;
557
/* previous line ended at the end of the buffer: nothing left to scan */
if( pthis->lineend == pthis->buflen )
562
pbuf = pthis->pbuf + pthis->pos;
564
/* measure the line up to, but not including, the LF (bounded by buflen) */
while( pthis->pos + len < pthis->buflen && pbuf[len] != '\n' )
568
if( pthis->pos + len < pthis->buflen )
570
len++; /* bump past the LF */
573
pthis->lineend = pthis->pos + len;
575
/* check beginning-of-line patterns */
576
if( is_base64( pbuf, len, &toklen ) ||
577
is_ignoredheader( pbuf, len, &toklen ) ||
578
is_mailerid( pbuf, len, &toklen ) ||
579
is_mimeboundary( pbuf, len, &toklen ) ||
580
is_spamtext( pbuf, len, &toklen ) )
583
/* skip the bytes covered by the matched pattern */
pthis->pos += toklen;
590
/*
 * lex_nexttoken: scan forward from pthis->pos and describe the next token
 * in *ptok.  Whitespace is skipped, a new line is fetched with
 * lex_nextline() when the current one is exhausted, a set of patterns
 * (HTML tags and comments, "SMTP id ", "boundary=", name=", filename=") is
 * stepped over, and IP addresses and words are handed back as tokens.
 *
 * NOTE(review): only part of the body is visible in this excerpt (token
 * type and length assignments, the "again" label and the returns are
 * missing).  bom/eom appear to delimit the current message, judging by how
 * lex_passthru() uses them -- confirm against the full source.
 */
void lex_nexttoken( lex_t* pthis, tok_t* ptok )
596
/* a buffer must have been loaded first (lex_load() assigns pbuf) */
assert( pthis->pbuf != NULL );
598
/* positioned at the recorded eom: the data from here on starts at bom */
if( pthis->pos == pthis->eom )
600
pthis->bom = pthis->pos;
604
/* skip whitespace between tokens */
605
while( pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf[pthis->pos]) )
610
pbuf = pthis->pbuf + pthis->pos;
611
len = pthis->lineend - pthis->pos;
613
/* possibilities: end-of-line, html-comment, ipaddr, word, junk */
615
/* current line exhausted: fetch the next one */
if( pthis->pos == pthis->lineend )
617
if( !lex_nextline( pthis ) )
619
/* no further line: record the current position as eom */
pthis->eom = pthis->pos;
624
/* lex_nextline() moved pos and lineend, so refresh the local view */
pbuf = pthis->pbuf + pthis->pos;
625
len = pthis->lineend - pthis->pos;
627
if( pthis->mboxtype == mbox )
629
/* in an mbox, a line starting with "From " ends the current message */
if( is_from( pbuf, len, &toklen ) )
631
pthis->eom = pthis->pos;
633
ptok->p = pthis->pbuf + pthis->pos;
635
pthis->pos += toklen;
640
goto again; /* skip lws */
643
/* patterns whose matched length is stepped over */
if( is_htmltag( pbuf, len, &toklen ) ||
644
is_htmlcomment( pbuf, len, &toklen ) ||
645
is_smtpid( pbuf, len, &toklen ) ||
646
is_boundaryequal( pbuf, len, &toklen ) ||
647
is_nameequal( pbuf, len, &toklen ) ||
648
is_filenameequal( pbuf, len, &toklen ) )
651
pthis->pos += toklen;
655
if( is_ipaddr( pbuf, len, &toklen ) )
658
/* the token points into the lexer's buffer; nothing is copied */
ptok->p = pthis->pbuf + pthis->pos;
660
pthis->pos += toklen;
663
if( is_word( pbuf, len, &toklen ) )
666
ptok->p = pthis->pbuf + pthis->pos;
668
pthis->pos += toklen;
669
/* words longer than MAXWORDLEN get special handling (not visible here) */
if( toklen > MAXWORDLEN )
681
/*
 * SpamAssassin style passthru.
 *
 * lex_passthru: copy the message between pthis->bom and pthis->eom to
 * standard output.  Existing header lines are written back except those
 * starting with "X-Spam-" (compared ignoring case), then the X-Spam-*
 * status text formatted into szbuf is written, then the rest of the message
 * up to eom.  Finally bom is set to eom.
 *
 * NOTE(review): only part of the body is visible in this excerpt; the
 * declaration of szbuf, the sprintf() arguments and the branch on is_spam
 * are missing.
 */
void lex_passthru( lex_t* pthis, bool_t is_spam, double hits )
685
bool_t in_headers = true;
687
assert( pthis->bom < pthis->buflen && pthis->eom <= pthis->buflen );
688
assert( pthis->bom <= pthis->eom );
690
pthis->pos = pthis->bom;
693
/*
 * NOTE(review): sprintf() does not bound its output and the size of szbuf
 * is not visible here; snprintf( szbuf, sizeof(szbuf), ... ) would be safer
 * if szbuf is an array.
 */
sprintf( szbuf, "X-Spam-Status: Yes, hits=%f required=%f, tests=bmf\n"
694
"X-Spam-Flag: YES\n",
699
sprintf( szbuf, "X-Spam-Status: No, hits=%f required=%f\n",
703
/* existing headers */
704
while( in_headers && pthis->pos < pthis->eom )
706
cpchar pbuf = pthis->pbuf + pthis->pos;
708
/* measure the line up to, but not including, the LF (bounded by buflen) */
while( pthis->pos + len < pthis->buflen && pbuf[len] != '\n' )
712
if( pthis->pos + len < pthis->buflen )
714
len++; /* bump past the LF */
717
/* check for end of headers */
718
/*
 * An empty line (LF or CR LF) is the end-of-headers test.
 * NOTE(review): pbuf[1] is read without a length check when pbuf[0] is CR.
 */
if( pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\n') )
724
/* write header, ignoring existing spam headers */
725
if( strncasecmp( pbuf, "X-Spam-", 7 ) != 0 )
727
/*
 * NOTE(review): the result of write() is ignored here and below, so a
 * short write or an error goes unnoticed.
 */
write( STDOUT_FILENO, pbuf, len );
734
write( STDOUT_FILENO, szbuf, strlen(szbuf) );
737
/* whatever follows the headers, up to eom, is written in one call */
if( pthis->pos < pthis->eom )
739
write( STDOUT_FILENO, pthis->pbuf+pthis->pos, pthis->eom-pthis->pos );
741
pthis->bom = pthis->eom;
746
/*
 * main: test driver (the closing "#endif" that follows names UNIT_TEST).
 * Opens the file named by argv[1], loads it with lex_load() and prints
 * every token returned by lex_nexttoken() until an eof token is seen.
 *
 * NOTE(review): only part of the body is visible in this excerpt; argument
 * checking, the result check on open() and the declaration of sztok are
 * missing.
 */
int main( int argc, char** argv )
755
fd = open( argv[1], O_RDONLY );
759
if( ! lex_load( &lex, fd ) )
761
fprintf( stderr, "cannot load file\n" );
765
lex_nexttoken( &lex, &tok );
766
while( tok.tt != eof )
769
/* tokens longer than MAXWORDLEN are tested for before the copy below */
if( tok.len > MAXWORDLEN )
771
printf( "*** token too long! ***\n" );
775
/* tokens point into the lexer buffer: copy and terminate before printing */
memcpy( sztok, tok.p, tok.len );
777
sztok[tok.len] = '\0';
778
printf( "get_token: %d '%s'\n", tok.tt, sztok );
780
lex_nexttoken( &lex, &tok );
787
#endif /* def UNIT_TEST */