2
Copyright (C) 2009- The University of Notre Dame
3
This software is distributed under the GNU General Public License.
4
See the file COPYING for details.
16
#include "memory_info.h"
20
#include "sequence_filter.h"
23
static int kmer_size = 22;
24
static int window_size = 22;
25
static unsigned long max_mem_kb = ULONG_MAX;
27
static char *repeat_filename = 0;
28
static char *sequence_filename = 0;
29
static char *second_sequence_filename = 0;
30
static char *output_filename = 0;
32
#define MEMORY_FOR_MERS(max_mem) (MIN((get_mem_avail()*0.95),(max_mem))-get_mem_usage())
33
#define DYNAMIC_RECTANGLE_SIZE(max_mem) (MEMORY_FOR_MERS((max_mem))/KB_PER_SEQUENCE)
35
static unsigned long get_mem_avail()
37
UINT64_T total, avail;
38
memory_info_get(&total, &avail);
39
return (unsigned long) avail / 1024;
42
static unsigned long get_mem_usage()
45
memory_usage_get(&rss, &total);
49
static void show_version(const char *cmd)
51
printf("%s version %d.%d.%d built by %s@%s on %s at %s\n", cmd, CCTOOLS_VERSION_MAJOR, CCTOOLS_VERSION_MINOR, CCTOOLS_VERSION_MICRO, BUILD_USER, BUILD_HOST, __DATE__, __TIME__);
54
static void show_help(const char *cmd)
56
printf("Use: %s [options] <sequences file> [second sequence file]\n", cmd);
57
printf("where options are:\n");
58
printf(" -s <size> Size of \"rectangle\" for filtering. You can determine\n");
59
printf(" the size dynamically by passing in d rather than a number.\n");
60
printf(" -r <file> A meryl file of repeat mers to be filtered out.\n");
61
printf(" -k <number> The k-mer size to use in candidate selection (default is 22).\n");
62
printf(" -w <number> The minimizer window size to use in candidate selection (default is 22).\n");
63
printf(" -o <filename> The output file. Default is stdout.\n");
64
printf(" output file to indicate it has ended (default is nothing)\n");
65
printf(" -d <subsys> Enable debug messages for this subsystem. Try 'd -all' to start .\n");
66
printf(" -v Show version string\n");
67
printf(" -h Show this help screen\n");
70
static void get_options(int argc, char **argv, const char *progname)
74
while((c = getopt(argc, argv, "d:r:s:k:w:f:o:vh")) != (char) -1) {
77
repeat_filename = optarg;
82
max_mem_kb = strtol(optarg + 1, 0, 10);
84
max_mem_kb = ULONG_MAX;
86
rectangle_size = atoi(optarg);
87
if(rectangle_size == 0) {
88
fprintf(stderr, "Invalid rectangle size %s\n", optarg);
93
kmer_size = atoi(optarg);
96
window_size = atoi(optarg);
99
output_filename = optarg;
102
debug_flags_set(optarg);
105
show_version(progname);
113
if(argc - optind == 1) {
114
sequence_filename = argv[optind++];
115
} else if(argc - optind == 2) {
116
sequence_filename = argv[optind++];
117
second_sequence_filename = argv[optind++];
120
fprintf(stderr, "Incorrect number of arguments. Expected 1 or 2, got %d\n", argc - optind);
126
int main(int argc, char **argv)
132
int start_x, end_x, start_y, end_y;
134
get_options(argc, argv, "sand_filter_kernel");
136
unsigned long start_mem, cand_mem, table_mem;
138
input = fopen(sequence_filename, "r");
139
if(!input) fatal("couldn't open %s: %s\n",sequence_filename,strerror(errno));
141
if(repeat_filename) {
142
repeats = fopen(repeat_filename, "r");
143
if(!repeats) fatal("couldn't open %s: %s\n",repeat_filename,strerror(errno));
146
if(output_filename) {
147
output = fopen(output_filename, "w");
152
// Data is in the form:
161
set_window_size(window_size);
163
// If we only give one file, do an all vs. all
165
if(!second_sequence_filename) {
166
num_seqs = load_seqs(input);
172
// If we had two files, do not compare ones from
173
// the same file to each other.
175
FILE *input2 = fopen(second_sequence_filename, "r");
177
fprintf(stderr, "Could not open file %s for reading.\n", second_sequence_filename);
180
num_seqs = load_seqs_two_files(input, &end_x, input2, &end_y);
183
debug(D_DEBUG,"First file contains %d sequences, stored from (%d,%d].\n", end_x, start_x, end_x);
184
debug(D_DEBUG,"Second file contains %d sequences, stored from (%d,%d].\n", end_y-end_x, start_y, end_y);
188
debug(D_DEBUG,"Loaded %d sequences\n",num_seqs);
190
init_cand_table(num_seqs * 5);
191
init_mer_table(num_seqs * 5);
194
int repeat_count = init_repeat_mer_table(repeats, 2000000, 0);
196
debug(D_DEBUG,"Loaded %d repeated mers\n", repeat_count);
199
if(rectangle_size == -1) {
200
// Do get_mem_avail*0.95 to leave some memory for overhead
201
rectangle_size = DYNAMIC_RECTANGLE_SIZE(max_mem_kb);
202
debug(D_DEBUG,"Mem avail: %lu, rectangle size: %d\n",(unsigned long)MEMORY_FOR_MERS(max_mem_kb), rectangle_size);
205
int curr_start_x = start_x;
206
int curr_start_y = start_y;
208
candidate_t *output_list = 0;
211
while(curr_start_y < end_y) {
212
while(curr_start_x < end_x) {
213
if(start_x == start_y) {
214
debug(D_DEBUG,"Loading mer table (%d,%d)\n", curr_rect_x, curr_rect_y);
216
debug(D_DEBUG,"Loading mer table for [%d,%d) and [%d,%d)\n",curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y));
219
start_mem = get_mem_usage();
221
load_mer_table_subset(curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y), (curr_start_x == curr_start_y));
223
table_mem = get_mem_usage();
225
debug(D_DEBUG,"Finished loading, now generating candidates\n");
226
debug(D_DEBUG,"Memory used: %lu\n", table_mem - start_mem);
228
generate_candidates();
229
cand_mem = get_mem_usage();
231
debug(D_DEBUG,"Total candidates generated: %llu\n", (long long unsigned int) total_cand);
233
output_list = retrieve_candidates(&num_in_list);
234
output_candidate_list(output, output_list, num_in_list);
238
debug(D_DEBUG,"Now freeing\n");
243
debug(D_DEBUG,"Successfully output and freed!\n");
246
curr_start_x += rectangle_size;
249
curr_start_y += rectangle_size;
250
curr_rect_x = curr_rect_y;
252
curr_start_x = curr_start_y;
254
curr_start_x = start_x;