/* database routines */ /* ifile - intelligent mail filter for EXMH/MH ifile is Copyright (C) 1997 Jason Rennie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program (see file 'COPYING'); if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include #include #include #include #define LOG_2 0.69314718 extern arguments args; /* info about command line arguments */ extern int msgs_read; /* number of messages actually read in */ /* variables for keeping track of time/speed of ifile -*/ extern clock_t DMZ_start, DMZ_end, DMZ2_start; /* given the age of a word, returns an integer which is the minimum * required frequency for the word to remain in the database */ long int _trim_freq (long int age) { if (age <= 0) return 0; else return (((long int)(log((float) age) / log(2.0))) - 1); } long int _trim_freq0 (long int age) { return 0; } /* Initializes an ifile database */ /* Written by Jason Rennie */ void ifile_db_init(ifile_db * idata) { long int i; idata->num_folders = 0; idata->num_words = 0; idata->total_docs = 0; idata->total_freq = 0; /* words of AGE should be trashed if they have frequency less than returned * by this function */ if (args.keep_infrequent) idata->trim_freq = _trim_freq0; else idata->trim_freq = _trim_freq; EXT_ARRAY_INIT(idata->folder_name, char *, IFILE_INIT_FOLDERS); EXT_ARRAY_INIT(idata->folder_freq, long int, IFILE_INIT_FOLDERS); 
EXT_ARRAY_INIT(idata->folder_msg, long int, IFILE_INIT_FOLDERS); for (i=0; i < IFILE_INIT_FOLDERS; i++) { /* EXT_ARRAY_SET(idata->folder_freq, int, i, 0); EXT_ARRAY_SET(idata->folder_msg, int, i, 0); printf("folder %d defaults: freq = %d msgs = %d\n", i, EXT_ARRAY_GET(idata->folder_freq, int, i), EXT_ARRAY_GET(idata->folder_msg, int, i)); */ } htable_init(&(idata->data), IFILE_INIT_WORDS, (unsigned long (*)(const void *, long int)) hash); } /* Free a word entry and its contents */ /* Written by Jeremy Brown */ void wentry_free(db_word_entry *wentry) { if (!wentry) return; free(wentry->word); if (wentry->freq) { EXT_ARRAY_FREE((*wentry->freq),long int); free(wentry->freq); } free(wentry); } /* Free all storage allocated by an ifile database */ /* Written by Jeremy Brown */ void ifile_db_free(ifile_db *idata) { EXT_ARRAY_FREE_ELEMS(idata->folder_name, char *); EXT_ARRAY_FREE(idata->folder_name, char *); EXT_ARRAY_FREE(idata->folder_freq,long int); EXT_ARRAY_FREE(idata->folder_msg, long int); htable_free_guts(&idata->data, free, (void(*)(void *)) wentry_free); } /* Initializes a word entry of an ifile_db type. Does NOT allocate memory. */ /* Written by Jason Rennie */ void ifile_db_entry_init (db_word_entry * wentry) { wentry->word = NULL; wentry->age = 0; wentry->tot_freq = 0; wentry->freq = NULL; } /* Expects a valid file pointer which points to the beginning of an * ifile database (idata) file along with an ifile database structure. * Function reads in the header lines of file and stores information in IDATA. * Expects that IDATA is allocated and initialized. DATA will be advanced * to the beginning of the end of the idata header. * Returns number of folders upon success, -1 upon failure. 
*/
/* Written by Jason Rennie */
/* Parses the three header lines of an idata buffer into IDATA:
 *   line 1 - folder names separated by spaces/tabs,
 *   line 2 - one word-frequency total per folder (summed into total_freq),
 *   line 3 - one message count per folder (summed into total_docs).
 * Lines are fetched with readline(BUFP), which is defined elsewhere in the
 * project (presumably it returns the next line and advances *BUFP - confirm).
 * A count mismatch on line 2 or 3 is reported but not treated as fatal.
 * Returns the number of folders, or -1 if any of the three lines is
 * missing. */
long int
ifile_read_header (ifile_db * idata, char **bufp)
{
  char *line;
  char *token;
  long int count;
  long int value;

  /* line 1: folder names */
  if ((line = readline(bufp)) == NULL)
    return -1;
  count = 0;
  for (token = strtok(line, " \t"); token != NULL;
       token = strtok(NULL, " \t"))
    {
      EXT_ARRAY_SET(idata->folder_name, char *, count, ifile_strdup(token));
      count++;
    }
  idata->num_folders = count;

  /* line 2: per-folder word frequencies.  strtol() is used rather than
   * atoi() because the counters are long int; atoi() parses into an int. */
  if ((line = readline(bufp)) == NULL)
    return -1;
  count = 0;
  for (token = strtok(line, " \t"); token != NULL;
       token = strtok(NULL, " \t"))
    {
      value = strtol(token, NULL, 10);
      EXT_ARRAY_SET(idata->folder_freq, long int, count, value);
      idata->total_freq += value;
      count++;
    }
  if (count != idata->num_folders)
    ifile_verbosify(ifile_quiet, "Bad data file format - line #2\n");

  /* line 3: per-folder message counts */
  if ((line = readline(bufp)) == NULL)
    return -1;
  count = 0;
  for (token = strtok(line, " \t"); token != NULL;
       token = strtok(NULL, " \t"))
    {
      value = strtol(token, NULL, 10);
      EXT_ARRAY_SET(idata->folder_msg, long int, count, value);
      idata->total_docs += value;
      count++;
    }
  if (count != idata->num_folders)
    ifile_verbosify(ifile_quiet, "Bad data file format - line #3\n");

  return idata->num_folders;
}

/* Expects a valid file pointer which points to the beginning of the word
 * entry section of an ifile database (idata) file along with an ifile
 * database structure.  Function reads to the end of the file and stores
 * information in IDATA.  Expects that IDATA is allocated and initialized.
 * DATA will be advanced to the end of the file.
* Returns number of word entries upon success, -1 upon failure.*/
/* Written by Jason Rennie */
/* Reads every remaining line obtained from readline(BUFP) as a word entry
 * and adds it to IDATA.  Empty lines are reported (numbered as lines of the
 * data file, i.e. counting the three header lines) and skipped.
 * Returns the updated IDATA->num_words. */
long int
ifile_read_word_frequencies (ifile_db * idata, char **bufp)
{
  char *line;
  long int lineno = 4;   /* word entries follow the three header lines */

  while ((line = readline(bufp)) != NULL)
    {
      if (line[0] == '\0')
        ifile_verbosify(ifile_quiet,
                        "Line # %ld not in proper word entry format\n",
                        lineno);
      else
        idata->num_words += ifile_read_word_entry(line, idata);
      lineno++;
    }

  return idata->num_words;
}

/* Given a character array which contains a single word entry from an idata
 * file ("word age folder:freq folder:freq ...") and a pointer to an ifile
 * database, allocates a DB_WORD_ENTRY and adds it to IDATA->DATA.
 * LINE is modified in place by strtok().
 * Returns 1 if an entry was added, 0 if the line was blank or memory for
 * the entry could not be allocated. */
/* written by Jason Rennie */
long int
ifile_read_word_entry (char * line, ifile_db * idata)
{
  db_word_entry *wentry;
  extendable_array *freq_array;
  char *token;
  long int folder, freq;

  /* Tokenize before allocating anything, so that a blank line neither
   * allocates nor leaks. */
  token = strtok(line, " \t");
  if (token == NULL)
    return 0;

  wentry = malloc(sizeof *wentry);
  freq_array = malloc(sizeof *freq_array);
  if (wentry == NULL || freq_array == NULL)
    {
      free(freq_array);
      free(wentry);
      ifile_verbosify(ifile_quiet,
                      "Not able to allocate memory for word \"%s\"\n", token);
      return 0;
    }

  /* malloc() leaves the entry uninitialized; tot_freq is accumulated with
   * += below, so every field must start from a known value. */
  ifile_db_entry_init(wentry);
  wentry->freq = freq_array;

  /* initialize EXT_ARRAY things on the fly - do NOT do initialization
   * when creating the DB */
  EXT_ARRAY_INIT_N_SET((*freq_array), long int, idata->num_folders, 0);

  /* add word entry to database of word entries */
  htable_put(&(idata->data), (void *) token, (void *) wentry);
  wentry->word = ifile_strdup(token);

  /* read the age of the word */
  token = strtok(NULL, " \t");
  wentry->age = (token != NULL) ? strtol(token, NULL, 10) : 0;

  /* read the per folder frequency: a sequence of "folder:freq" pairs */
  for (token = strtok(NULL, ":"); token != NULL; token = strtok(NULL, ":"))
    {
      folder = strtol(token, NULL, 10);
      token = strtok(NULL, " \t");
      if (token == NULL)
        continue;   /* folder index without a frequency; the next strtok ends the loop */
      freq = strtol(token, NULL, 10);

      /* a negative index can only come from a damaged file; never use it
       * to index the frequency array */
      if (folder < 0)
        {
          ifile_verbosify(ifile_quiet,
                          "Ignoring bad folder index %ld for word \"%s\"\n",
                          folder, wentry->word);
          continue;
        }
      wentry->tot_freq += freq;
      EXT_ARRAY_SET((*freq_array), long int, folder, freq);
    }

  return 1;
}

/* Given the name of an idata file this function parses the
entire data file
 * and stored the read information into IDATA.
 * Returns 0 upon success, -1 upon failure. */
/* written by Jason Rennie */
/* Reads the whole of DATA_FILE into one heap buffer, then parses it with
 * ifile_read_header() and ifile_read_word_frequencies().  IDATA must be
 * allocated and initialized.
 * Returns -1 if the file cannot be opened, sized, buffered or read; the
 * descriptor and the buffer are released on every path.  As before, a
 * header that fails to parse is not treated as a failure here. */
long int
ifile_read_db (char * data_file, ifile_db * idata)
{
  int DATA;
  long int folders;
  long int words;
  long int status = -1;
  char *buf;
  char *saved_buf = NULL;
  struct stat st;
  size_t bufsize;
  size_t nread;
  ssize_t rc;

  ifile_verbosify(ifile_progress, "Reading %s from disk...\n", data_file);

  DATA = open(data_file, O_RDONLY, 0);
  if (DATA == -1)
    {
      ifile_verbosify(ifile_quiet, "Not able to open %s for reading!\n",
                      data_file);
      return -1;
    }

  DMZ_start = clock();

  /* Allocate buffer.  fstat() on the open descriptor sizes the file that
   * will actually be read, even if the path is replaced meanwhile. */
  if (fstat(DATA, &st) == -1)
    {
      ifile_verbosify(ifile_quiet, "Not able to stat %s!\n", data_file);
      goto done;
    }
  bufsize = (size_t) st.st_size;

  /* One extra byte so the data can be NUL-terminated.
   * NOTE(review): readline() is defined elsewhere; the terminator is a
   * safety net in case it scans for the end of the buffer - confirm. */
  saved_buf = malloc(bufsize + 1);
  if (saved_buf == NULL)
    {
      ifile_verbosify(ifile_quiet, "Not able to allocate %lu bytes!\n",
                      (unsigned long) bufsize);
      goto done;
    }

  /* Read file. */
  nread = 0;
  while (nread < bufsize)
    {
      rc = read(DATA, saved_buf + nread, bufsize - nread);
      if (rc == -1)
        {
          ifile_verbosify(ifile_quiet, "Not able to read %s!\n", data_file);
          goto done;
        }
      if (rc == 0)
        break;   /* end of file before st_size bytes: stop, don't spin */
      nread += (size_t) rc;
    }
  saved_buf[nread] = '\0';

  /* the parsers advance BUF; SAVED_BUF keeps the address to free */
  buf = saved_buf;
  folders = ifile_read_header (idata, &buf);
  words = ifile_read_word_frequencies (idata, &buf);

  DMZ_end = clock();
  ifile_verbosify(ifile_progress,
                  "Read %ld categories, %ld words. Time used: %.3f sec\n",
                  folders, words,
                  ((float)(DMZ_end-DMZ_start))/CLOCKS_PER_SECOND);
  status = 0;

 done:
  free(saved_buf);
  (void) close(DATA);
  return status;
}

/* Given a message to rate and a database to get information from, calculates
 * a value for each folder which approximates the likelihood that the user
 * would have placed the message in that folder.
Returns an array of * category ratings */ /* written by Jason Rennie */ category_rating * ifile_rate_categories (htable * message, ifile_db * idata) { long int i; float docval, nval, r; char *token = NULL; category_rating * ratings; hash_elem * elem; db_word_entry * wentry; long int print_calc = -1; /* folder of which rating calculations are printed */ float freq; if (args.folder_calcs != NULL) { print_calc = 0; for (i=0; i < idata->num_folders; i++) { if (strcmp(args.folder_calcs, EXT_ARRAY_GET(idata->folder_name, char *, i)) == 0) print_calc = i; } } ratings = malloc(sizeof(category_rating)*idata->num_folders); DMZ_start = clock(); ifile_verbosify(ifile_progress, "Computing category ratings...\n"); docval = (float)(idata->total_docs + idata->num_folders); if (args.folder_calcs) ifile_verbosify(ifile_debug, "Outputting calculations for folder \"%s\"\n", (EXT_ARRAY_GET(idata->folder_name, char *, print_calc))); for (i=0; i < idata->num_folders; i++) { token = EXT_ARRAY_GET(idata->folder_name, char *, i); ratings[i].category = ifile_strdup(token); ratings[i].rating = 0.0; } for (elem = htable_init_traversal(message); elem != NULL; elem = htable_next_traversal(message, elem)) { wentry = htable_lookup(&(idata->data), (char *) elem->index); if (wentry != NULL) { for (i=0; i < idata->num_folders; i++) { nval = (float) (EXT_ARRAY_GET(idata->folder_freq, long int, i) + idata->num_words); freq = (float) EXT_ARRAY_GET((*(wentry->freq)), long int, i); r = log((freq + 1.0) / nval); if (i == print_calc) ifile_verbosify(ifile_quiet, "word = %s msg = %d db = %d +rating = %.5f\n", (char *) elem->index, (long int) elem->entry, wentry->freq[i], r); ratings[i].rating += ((long int) elem->entry) * r; } } else if (print_calc >= 0 && print_calc < idata->num_folders) ifile_verbosify(ifile_quiet, "word = %s msg = %d db = 0\n", (char *) elem->index, (long int) elem->entry); } DMZ_end=clock(); ifile_verbosify(ifile_progress, "Calculated category scores. 
Time used: %.3f sec\n", ((double)(DMZ_end-DMZ_start))/CLOCKS_PER_SECOND); return ratings; } /* Free a category_ratings and all allocated contents */ /* written by Jeremy Brown */ void ifile_free_categories(category_rating *cr, ifile_db *idata) { int i; for (i = 0; i < idata->num_folders; i++) free(cr[i].category); free(cr); } /* Given the name of an idata file this function writes the information * stored in IDATA to disk. The pre-existence of an .idata file in the * location is not checked for. * Returns 0 upon success, -1 upon failure. */ /* Written by Jason Rennie for ifile */ long int ifile_write_db (char * data_file, ifile_db * idata) { FILE * DATA; long int folders; long int words; char *temp_data_file; char *user; char host[128]; struct passwd *pwd = getpwuid (getuid ()); if (!pwd) user = "unknown"; else user = pwd->pw_name; if (gethostname (host, sizeof (host))) strcpy(host, "unknown"); temp_data_file = ifile_sprintf ("%s.%s.%s", data_file, user, host); ifile_verbosify (ifile_progress, "Writing %s to disk...\n", data_file); DMZ_start = clock (); /* Open temporary data file for writing of database */ DATA = fopen (temp_data_file, "w"); if (DATA == NULL) ifile_error ("Not able to open temporary file for writing: %s\n", temp_data_file); folders = ifile_write_header (DATA, idata); words = ifile_write_word_frequencies (DATA, idata); if (ferror(DATA)) ifile_error ("Error while writing data to temporary file: %s\n", temp_data_file); fclose (DATA); /* Rename file to regular database name */ rename (temp_data_file, data_file); free (temp_data_file); DMZ_end = clock (); ifile_verbosify (ifile_progress, "Wrote %d folders, %d words. Time used: %.3f sec\n", folders, words, ((float)(DMZ_end-DMZ_start))/CLOCKS_PER_SECOND); return 0; } /* Expects a valid file pointer which points to the beginning of an * ifile database (idata) file along with an ifile database structure. * Function reads in the header lines of file and stores information in IDATA. 
* Expects that IDATA is allocated and initialized.  DATA will be advanced
 * to the beginning of the end of the idata header.
 * Returns number of folders upon success, -1 upon failure. */
/* Written by Jason Rennie */
/* Writes the three header lines of an idata file to DATA: the folder
 * names, the per-folder word frequency totals and the per-folder message
 * counts.  Every item is followed by a single space and every line ends
 * with a newline.  Returns IDATA->num_folders. */
long int
ifile_write_header (FILE * DATA, ifile_db * idata)
{
  const long int folder_count = idata->num_folders;
  long int folder;
  long int value;

  /* line 1: folder names */
  folder = 0;
  while (folder < folder_count)
    {
      fprintf(DATA, "%s ", EXT_ARRAY_GET(idata->folder_name, char *, folder));
      folder++;
    }
  putc('\n', DATA);

  /* line 2: word frequency total of each folder */
  folder = 0;
  while (folder < folder_count)
    {
      value = EXT_ARRAY_GET(idata->folder_freq, long int, folder);
      fprintf(DATA, "%ld ", value);
      folder++;
    }
  putc('\n', DATA);

  /* line 3: message count of each folder */
  folder = 0;
  while (folder < folder_count)
    {
      value = EXT_ARRAY_GET(idata->folder_msg, long int, folder);
      fprintf(DATA, "%ld ", value);
      folder++;
    }
  putc('\n', DATA);

  return idata->num_folders;
}

/* Expects a valid file pointer which points to the beginning of the word
 * entry section of an ifile database (idata) file along with an ifile
 * database structure.  Function writes one line for each word entry of
 * IDATA.  Expects that IDATA is allocated and initialized.
 * DATA will be advanced to the end of the file.
* Returns number of word entries upon success, -1 upon failure.*/
/* Written by Jason Rennie */
/* Writes one line per database word to DATA in the form
 *     word age folder:freq folder:freq ...
 * listing only the folders in which the word has a positive frequency.
 * Words whose total frequency is 0 are not written.
 * Returns the number of words written. */
long int
ifile_write_word_frequencies (FILE * DATA, ifile_db * idata)
{
  hash_elem *cursor = htable_init_traversal(&(idata->data));
  long int written = 0;

  while (cursor != NULL)
    {
      db_word_entry *entry = (db_word_entry *) cursor->entry;

      if (entry->tot_freq != 0)
        {
          extendable_array *counts = entry->freq;
          long int folder;

          written++;
          fprintf(DATA, "%s %ld ", entry->word, entry->age);
          for (folder = 0; folder < idata->num_folders; folder++)
            {
              long int count = EXT_ARRAY_GET((*counts), long int, folder);

              if (count > 0)
                fprintf(DATA, "%ld:%ld ", folder, count);
            }
          putc('\n', DATA);
        }

      cursor = htable_next_traversal(&(idata->data), cursor);
    }

  return written;
}

/* Adds EPOCHS to the age of every word, then trims each word whose total
 * frequency is below IDATA->trim_freq(age): its per-folder counts are
 * subtracted from the folder totals (never going below 0) and zeroed, and
 * its total frequency is set to 0.  The entry itself stays in the table.
 * Returns the number of trimmed words. */
long int
ifile_age_words (ifile_db * idata, long int epochs)
{
  hash_elem *cursor;
  long int dropped = 0;

  for (cursor = htable_init_traversal(&(idata->data));
       cursor != NULL;
       cursor = htable_next_traversal(&(idata->data), cursor))
    {
      db_word_entry *entry = (db_word_entry *) cursor->entry;
      long int folder;

      entry->age += epochs;

      /* frequent enough for its age: keep it untouched */
      if (idata->trim_freq(entry->age) <= entry->tot_freq)
        continue;

      /* update the word frequency values for each folder */
      for (folder = 0; folder < idata->num_folders; folder++)
        {
          long int word_count = EXT_ARRAY_GET((*entry->freq), long int, folder);
          long int folder_total = EXT_ARRAY_GET(idata->folder_freq, long int, folder);
          long int remaining = 0;

          if (folder_total >= word_count)
            remaining = folder_total - word_count;

          EXT_ARRAY_SET(idata->folder_freq, long int, folder, remaining);
          EXT_ARRAY_SET((*entry->freq), long int, folder, 0);
        }
      entry->tot_freq = 0;
      dropped++;
    }

  return dropped;
}

/* if we wanted to make ifile more efficient, we would allocate our
 * idata->data->freq arrays so that they are one larger than the number
 * of folders.
This would make it so that we would never have to reallocate * these arrays since it is currently not possible to add messages to more * than one folder (per execution) */ /* Adds the word statistics from MESSAGE to FOLDER in IDATA. If CREATE is set to FALSE, nothing is done if FOLDER does not already exist. */ /* Written by Jason Rennie */ void ifile_add_db (char * folder, htable * message, ifile_db * idata, int create) { hash_elem * elem; db_word_entry * wentry; long int folder_index; long int freq = 0; long int folder_freq = 0; long int i; long int num_msgs; folder_index = -1; for (i=0; i < idata->num_folders; i++) { if (strcmp(folder, EXT_ARRAY_GET(idata->folder_name, char *, i)) == 0) folder_index = i; } if (folder_index == -1) { if (!create) return; EXT_ARRAY_SET(idata->folder_name, char *, idata->num_folders, ifile_strdup(folder)); folder_index = idata->num_folders; idata->num_folders++; } for (elem = htable_init_traversal (message); elem != NULL; elem = htable_next_traversal (message, elem)) { ifile_verbosify(ifile_debug, "adding... 
%s %d\n", (char *) elem->index, (long int) elem->entry); wentry = htable_lookup (&(idata->data), (char *) elem->index); if (wentry == NULL) { wentry = (db_word_entry *) malloc(sizeof(db_word_entry)); ifile_db_entry_init(wentry); wentry->freq = (extendable_array *) malloc(sizeof(extendable_array)); wentry->word = ifile_strdup((char *) elem->index); wentry->tot_freq = 0; wentry->age = 0; EXT_ARRAY_INIT((*wentry->freq), long int, IFILE_INIT_FOLDERS); freq = 0; htable_put (&(idata->data), ((char *) elem->index), wentry); idata->num_words++; } else freq = EXT_ARRAY_GET((*wentry->freq), long int, folder_index); folder_freq += (long int) elem->entry; freq += (long int) elem->entry; idata->total_freq += (long int) elem->entry; wentry->tot_freq += (long int) elem->entry; EXT_ARRAY_SET((*wentry->freq), long int, folder_index, freq); } /* increase message count by one */ num_msgs = EXT_ARRAY_GET(idata->folder_msg, long int, folder_index); EXT_ARRAY_SET(idata->folder_msg, long int, folder_index, (num_msgs+1)); /* adjust the folder word frequency count */ folder_freq += EXT_ARRAY_GET(idata->folder_freq, long int, folder_index); EXT_ARRAY_SET(idata->folder_freq, long int, folder_index, folder_freq); } /* Removes the word statistics of MESSAGE from FOLDER in IDATA. * if FOLDER does not exist, or any word of MESSAGE has a lower * frequency than in the database, an error message is printed. 
*/
/* Written by Jason Rennie */
/* Undoes ifile_add_db() for one message: for every word of MESSAGE found
 * in the database, the word's count in FOLDER is reduced by its count in
 * the message, never going below 0.  The amount actually removed is also
 * taken off the word's total frequency, IDATA->total_freq and the folder's
 * word frequency total, so those stay equal to the sums of the per-folder
 * counts.  The folder's message count is reduced by one (not below 0).
 * An unknown FOLDER is reported and nothing is changed.  Words missing from
 * the database, and words whose database count is smaller than their count
 * in the message, are reported at verbose level. */
void
ifile_del_db (char * folder, htable * message, ifile_db * idata)
{
  hash_elem * elem;
  db_word_entry * wentry;
  long int folder_index = -1;
  long int freq;
  long int count;
  long int removed;
  long int folder_removed = 0;
  long int folder_freq;
  long int num_msgs;
  long int i;

  for (i = 0; i < idata->num_folders; i++)
    {
      if (strcmp(folder, EXT_ARRAY_GET(idata->folder_name, char *, i)) == 0)
        folder_index = i;
    }

  if (folder_index == -1)
    {
      ifile_verbosify(ifile_quiet,
                      "Folder \"%s\" does not appear to exist\n", folder);
      return;
    }

  for (elem = htable_init_traversal (message);
       elem != NULL;
       elem = htable_next_traversal (message, elem))
    {
      wentry = htable_lookup (&(idata->data), (char *) elem->index);
      if (wentry == NULL)
        {
          ifile_verbosify(ifile_verbose,
                          "Word \"%s\" does not exist in the database. Skipping.\n",
                          (char *) elem->index);
          continue;
        }

      /* the message table stores each word's count directly in ENTRY */
      count = (long int) elem->entry;
      freq = EXT_ARRAY_GET((*wentry->freq), long int, folder_index);

      /* Only what the database really holds can be removed.  Warn only
       * when the database count is strictly lower; reaching exactly 0 is
       * the normal result of deleting the last message using the word. */
      removed = count;
      if (removed > freq)
        {
          ifile_verbosify(ifile_verbose,
                          "Word \"%s\" has lower frequency in database than in message.\n Setting database frequency to 0\n",
                          (char *) elem->index);
          removed = freq;
        }
      if (removed < 0)
        removed = 0;

      EXT_ARRAY_SET((*wentry->freq), long int, folder_index, (freq - removed));

      /* mirror the bookkeeping done by ifile_add_db() */
      wentry->tot_freq =
        (wentry->tot_freq >= removed) ? (wentry->tot_freq - removed) : 0;
      idata->total_freq =
        (idata->total_freq >= removed) ? (idata->total_freq - removed) : 0;
      folder_removed += removed;
    }

  /* update the word frequency count for the folder */
  folder_freq = EXT_ARRAY_GET(idata->folder_freq, long int, folder_index);
  folder_freq = (folder_freq >= folder_removed)
    ? (folder_freq - folder_removed) : 0;
  EXT_ARRAY_SET(idata->folder_freq, long int, folder_index, folder_freq);

  /* update the message count for the folder */
  num_msgs = EXT_ARRAY_GET(idata->folder_msg, long int, folder_index);
  num_msgs = (num_msgs >= 1) ? (num_msgs-1) : 0;
  EXT_ARRAY_SET(idata->folder_msg, long int, folder_index, num_msgs);
}