/* $Id: lex.c,v 1.18 2002/10/20 20:29:15 tommy Exp $ */ /* * Copyright (c) 2002 Tom Marshall * * This program is free software. It may be distributed under the terms * in the file LICENSE, found in the top level of the distribution. * * lex.c: generate token stream for bmf. */ #include "config.h" #include "dbg.h" #include "str.h" #include "lex.h" static cpchar g_htmltags[] = { "abbr", "above", "accesskey", "acronym", "align", "alink", "all", "alt", "applet", "archive", "axis", "basefont", "baseline", "below", "bgcolor", "big", "body", "border", "bottom", "box", "button", "cellpadding", "cellspacing", "center", "char", "charoff", "charset", "circle", "cite", "class", "classid", "clear", "codebase", "codetype", "color", "cols", "colspan", "compact", "content", "coords", "data", "datetime", "declare", "default", "defer", "dfn", "dir", "disabled", "face", "font", "frameborder", "groups", "head", "headers", "height", "href", "hreflang", "hsides", "hspace", "http-equiv", "iframe", "img", "input", "ismap", "justify", "kbd", "label", "lang", "language", "left", "lhs", "link", "longdesc", "map", "marginheight", "marginwidth", "media", "meta", "middle", "multiple", "name", "nohref", "none", "noresize", "noshade", "nowrap", "object", "onblur", "onchange", "onclick", "ondblclick", "onfocus", "onkeydown", "onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover", "onmouseup", "onselect", "onunload", "param", "poly", "profile", "prompt", "readonly", "rect", "rel", "rev", "rhs", "right", "rows", "rowspan", "rules", "samp", "scheme", "scope", "script", "scrolling", "select", "selected", "shape", "size", "small", "span", "src", "standby", "strike", "strong", "style", "sub", "summary", "sup", "tabindex", "table", "target", "textarea", "title", "top", "type", "usemap", "valign", "value", "valuetype", "var", "vlink", "void", "vsides", "vspace", "width" }; static const uint g_nhtmltags = sizeof(g_htmltags)/sizeof(cpchar); static cpchar g_ignoredheaders[] = { "Date:", "Delivery-date:", "Message-ID:", "X-Sorted:", "X-Spam-" }; static const uint g_nignoredheaders = sizeof(g_ignoredheaders)/sizeof(cpchar); static inline bool_t is_whitespace( int c ) { return ( c == ' ' || c == '\t' || c == '\r' ); } static inline bool_t is_base64char(c) { return ( isalnum(c) || (c == '/' || c == '+') ); } static inline bool_t is_wordmidchar(c) { return ( isalnum(c) || c == '$' || c == '\'' || c == '.' || c == '-' ); } static inline bool_t is_wordendchar(c) { return ( isalnum(c) || c == '$' ); } static inline bool_t is_htmltag( cpchar p, uint len, uint* ptoklen ) { int lo, hi, mid, minlen, cmp; *ptoklen = 0; hi = g_nhtmltags-1; lo = -1; while( hi-lo > 1 ) { mid = (hi+lo)/2; minlen = min( strlen(g_htmltags[mid]), len ); cmp = strncmp( g_htmltags[mid], p, minlen ); if( cmp > 0 || (cmp == 0 && minlen < len && !islower(p[minlen])) ) hi = mid; else lo = mid; } minlen = min( strlen(g_htmltags[hi]), len ); if( len == minlen || strncmp(g_htmltags[hi], p, minlen) != 0 ) { return false; } /* check if is_word() will have a longer match */ if( is_wordendchar(p[minlen]) ) { return false; } if( is_wordmidchar(p[minlen]) && is_wordendchar(p[minlen+1]) ) { return false; } *ptoklen = strlen(g_htmltags[hi]); return true; } static inline bool_t is_htmlcomment( cpchar p, uint len, uint* ptoklen ) { *ptoklen = 0; if( len >=4 && memcmp( p, "", 3 ) == 0 ) { *ptoklen = 3; return true; } return false; } static inline bool_t is_base64( cpchar p, uint len, uint* ptoklen ) { *ptoklen = 0; while( len > 0 ) { if( *p != '\n' && *p != '\r' && !is_base64char(*p) ) { return false; } p++; len--; (*ptoklen)++; } return true; } static inline bool_t is_mimeboundary( cpchar p, uint len, uint* ptoklen ) { *ptoklen = 0; if( len < 3 || p[0] != '-' || p[1] != '-' ) { return false; } p += 2; len -= 2; *ptoklen += 2; while( len > 0 ) { if( is_whitespace(*p) ) { return false; } if( *p == '\n' || *p == '\r' ) { break; } p++; len--; (*ptoklen)++; } return true; } static inline bool_t is_ipaddr( cpchar p, uint len, uint* ptoklen ) { uint noctets, ndigits; *ptoklen = 0; noctets = 0; while( len > 0 && noctets < 4 ) { ndigits = 0; while( len > 0 && isdigit(*p) ) { ndigits++; p++; len--; (*ptoklen)++; } if( ndigits == 0 || ndigits > 3 ) { return false; } noctets++; if( noctets < 4 ) { if( *p != '.' ) { return false; } p++; len--; (*ptoklen)++; } } if( noctets < 4 ) { return false; } return true; } static inline bool_t is_word( cpchar p, uint len, uint* ptoklen ) { if( len < 3 ) { return false; } if( !(isalpha(*p) || *p == '$') ) { return false; } *ptoklen = 1; p++; len--; while( len > 0 ) { if( !is_wordmidchar(*p) ) { break; } (*ptoklen)++; p++; len--; } while( *ptoklen >= 3 && !is_wordendchar(*(p-1)) ) { (*ptoklen)--; p--; len++; } if( *ptoklen < 3 ) { return false; } return true; } static inline bool_t is_ignoredheader( cpchar p, uint len, uint* ptoklen ) { int lo, hi, mid, minlen, cmp; hi = g_nignoredheaders-1; lo = -1; while( hi-lo > 1 ) { mid = (hi+lo)/2; minlen = min( strlen(g_ignoredheaders[mid]), len ); cmp = strncasecmp( g_ignoredheaders[mid], p, minlen ); if( cmp >= 0 ) hi = mid; else lo = mid; } minlen = min( strlen(g_ignoredheaders[hi]), len ); if( len == minlen || strncasecmp(g_ignoredheaders[hi], p, minlen) != 0 ) { return false; } *ptoklen = len; return true; } static inline bool_t is_mailerid( cpchar p, uint len, uint* ptoklen ) { if( len < 4 || strncmp( p, "\tid ", 4 ) != 0 ) { return false; } *ptoklen = len; return true; } static inline bool_t is_spamtext( cpchar p, uint len, uint* ptoklen ) { if( len < 5 || strncmp( p, "SPAM:", 5 ) != 0 ) { return false; } *ptoklen = len; return true; } static inline bool_t is_smtpid( cpchar p, uint len, uint* ptoklen ) { if( len < 8 || strncmp( p, "SMTP id ", 8 ) != 0 ) { return false; } *ptoklen = len; return true; } static inline bool_t is_boundaryequal( cpchar p, uint len, uint* ptoklen ) { if( len < 9 || strncmp( p, "boundary=", 9 ) != 0 ) { return false; } *ptoklen = len; return true; } static inline bool_t is_nameequal( cpchar p, uint len, uint* ptoklen ) { if( len < 6 || strncmp( p, "name=\"", 6 ) != 0 ) { return false; } *ptoklen = 6; return true; } static inline bool_t is_filenameequal( cpchar p, uint len, uint* ptoklen ) { if( len < 10 || strncmp( p, "filename=\"", 10 ) != 0 ) { return false; } *ptoklen = 10; return true; } static inline bool_t is_from( cpchar p, uint len, uint* ptoklen ) { if( len < 5 || strncmp( p, "From ", 5 ) != 0 ) { return false; } *ptoklen = 5; return true; } /*****************************************************************************/ void lex_create( lex_t* pthis, mbox_t mboxtype ) { pthis->mboxtype = mboxtype; pthis->section = envelope; pthis->pos = 0; pthis->bom = 0; pthis->eom = 0; pthis->lineend = 0; pthis->buflen = 0; pthis->pbuf = NULL; } void lex_destroy( lex_t* pthis ) { free( pthis->pbuf ); } bool_t lex_load( lex_t* pthis, int fd ) { uint nalloc; ssize_t nread; nalloc = IOBUFSIZE; pthis->pbuf = (char*)malloc( IOBUFSIZE ); if( pthis->pbuf == NULL ) { return false; } while( (nread = read( fd, pthis->pbuf + pthis->buflen, nalloc - pthis->buflen )) > 0 ) { pthis->buflen += nread; if( pthis->buflen == nalloc ) { char* pnewbuf; nalloc += IOBUFSIZE; pnewbuf = (char*)realloc( pthis->pbuf, nalloc ); if( pnewbuf == NULL ) { free( pthis->pbuf ); pthis->pbuf = NULL; return false; } pthis->pbuf = pnewbuf; } } if( nread < 0 ) { free( pthis->pbuf ); pthis->pbuf = NULL; return false; } if( pthis->mboxtype == detect ) { if( pthis->buflen > 5 && memcmp( pthis->pbuf, "From ", 5 ) == 0 ) { verbose( 1, "Input looks like an mbox\n" ); pthis->mboxtype = mbox; } else { verbose( 1, "Input looks like a maildir\n" ); pthis->mboxtype = maildir; } } return true; } static bool_t lex_nextline( lex_t* pthis ) { cpchar pbuf; uint len; uint toklen; again: /* XXX: use and update pthis->section */ pthis->pos = pthis->lineend; if( pthis->lineend == pthis->buflen ) { return false; } pbuf = pthis->pbuf + pthis->pos; len = 0; while( pthis->pos + len < pthis->buflen && pbuf[len] != '\n' ) { len++; } if( pthis->pos + len < pthis->buflen ) { len++; /* bump past the LF */ } pthis->lineend = pthis->pos + len; /* check beginning-of-line patterns */ if( is_base64( pbuf, len, &toklen ) || is_ignoredheader( pbuf, len, &toklen ) || is_mailerid( pbuf, len, &toklen ) || is_mimeboundary( pbuf, len, &toklen ) || is_spamtext( pbuf, len, &toklen ) ) { /* ignore line */ pthis->pos += toklen; goto again; } return true; } void lex_nexttoken( lex_t* pthis, tok_t* ptok ) { cpchar pbuf; uint len; uint toklen; assert( pthis->pbuf != NULL ); if( pthis->pos == pthis->eom ) { pthis->bom = pthis->pos; } again: /* skip whitespace between tokens */ while( pthis->pos != pthis->lineend && is_whitespace(pthis->pbuf[pthis->pos]) ) { pthis->pos++; } pbuf = pthis->pbuf + pthis->pos; len = pthis->lineend - pthis->pos; /* possibilities: end-of-line, html-comment, ipaddr, word, junk */ if( pthis->pos == pthis->lineend ) { if( !lex_nextline( pthis ) ) { pthis->eom = pthis->pos; ptok->tt = eof; return; } pbuf = pthis->pbuf + pthis->pos; len = pthis->lineend - pthis->pos; if( pthis->mboxtype == mbox ) { if( is_from( pbuf, len, &toklen ) ) { pthis->eom = pthis->pos; ptok->tt = from; ptok->p = pthis->pbuf + pthis->pos; ptok->len = toklen; pthis->pos += toklen; return; } } goto again; /* skip lws */ } if( is_htmltag( pbuf, len, &toklen ) || is_htmlcomment( pbuf, len, &toklen ) || is_smtpid( pbuf, len, &toklen ) || is_boundaryequal( pbuf, len, &toklen ) || is_nameequal( pbuf, len, &toklen ) || is_filenameequal( pbuf, len, &toklen ) ) { /* ignore it */ pthis->pos += toklen; goto again; } if( is_ipaddr( pbuf, len, &toklen ) ) { ptok->tt = word; ptok->p = pthis->pbuf + pthis->pos; ptok->len = toklen; pthis->pos += toklen; return; } if( is_word( pbuf, len, &toklen ) ) { ptok->tt = word; ptok->p = pthis->pbuf + pthis->pos; ptok->len = toklen; pthis->pos += toklen; if( toklen > MAXWORDLEN ) { goto again; } return; } /* junk */ pthis->pos++; goto again; } /* SpamAssassin style passthru */ void lex_passthru( lex_t* pthis, bool_t is_spam, double hits ) { char szbuf[256]; bool_t ign_header = false; assert( pthis->bom < pthis->buflen && pthis->eom <= pthis->buflen ); assert( pthis->bom <= pthis->eom ); pthis->pos = pthis->bom; if( is_spam ) { sprintf( szbuf, "X-Spam-Status: Yes, hits=%f required=%f, tests=bmf\n" "X-Spam-Flag: YES\n", hits, SPAM_CUTOFF ); } else { sprintf( szbuf, "X-Spam-Status: No, hits=%f required=%f\n", hits, SPAM_CUTOFF ); } /* existing headers */ while( pthis->pos < pthis->eom ) { cpchar pbuf = pthis->pbuf + pthis->pos; uint len = 0; while( pthis->pos + len < pthis->buflen && pbuf[len] != '\n' ) { len++; } if( pthis->pos + len < pthis->buflen ) { len++; /* bump past the LF */ } /* check for end of headers */ if( pbuf[0] == '\n' || (pbuf[0] == '\r' && pbuf[1] == '\n') ) { /* end of headers */ break; } /* write header, ignoring existing spam headers */ if( ign_header && (pbuf[0] == ' ' || pbuf[0] == '\t') ) /* continuation line of an ignored header */ ; else if( strncasecmp( pbuf, "X-Spam-", 7 ) == 0 ) ign_header = true; else { write( STDOUT_FILENO, pbuf, len ); ign_header = false; } pthis->pos += len; } /* new headers */ write( STDOUT_FILENO, szbuf, strlen(szbuf) ); /* remainder */ if( pthis->pos < pthis->eom ) { write( STDOUT_FILENO, pthis->pbuf+pthis->pos, pthis->eom-pthis->pos ); } pthis->bom = pthis->eom; } #ifdef UNIT_TEST int main( int argc, char** argv ) { int fd; lex_t lex; tok_t tok; fd = STDIN_FILENO; if( argc == 2 ) { fd = open( argv[1], O_RDONLY ); } lex_create( &lex ); if( ! lex_load( &lex, fd ) ) { fprintf( stderr, "cannot load file\n" ); exit( 1 ); } lex_nexttoken( &lex, &tok ); while( tok.tt != eof ) { char sztok[64]; if( tok.len > MAXWORDLEN ) { printf( "*** token too long! ***\n" ); exit( 1 ); } memcpy( sztok, tok.p, tok.len ); strlwr( sztok ); sztok[tok.len] = '\0'; printf( "get_token: %d '%s'\n", tok.tt, sztok ); lex_nexttoken( &lex, &tok ); } lex_destroy( &lex ); return 0; } #endif /* def UNIT_TEST */