~ubuntu-branches/ubuntu/vivid/cctools/vivid

« back to all changes in this revision

Viewing changes to sand/src/sand_filter_kernel.c

  • Committer: Bazaar Package Importer
  • Author(s): Michael Hanke
  • Date: 2011-05-07 09:05:00 UTC
  • Revision ID: james.westby@ubuntu.com-20110507090500-lqpmdtwndor6e7os
Tags: upstream-3.3.2
ImportĀ upstreamĀ versionĀ 3.3.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
Copyright (C) 2009- The University of Notre Dame
 
3
This software is distributed under the GNU General Public License.
 
4
See the file COPYING for details.
 
5
*/
 
6
 
 
7
#include <stdio.h>
 
8
#include <math.h>
 
9
#include <unistd.h>
 
10
#include <stdlib.h>
 
11
#include <ctype.h>
 
12
#include <string.h>
 
13
#include <errno.h>
 
14
#include <limits.h>
 
15
 
 
16
#include "memory_info.h"
 
17
#include "debug.h"
 
18
#include "macros.h"
 
19
 
 
20
#include "sequence_filter.h"
 
21
 
 
22
static int num_seqs;
 
23
static int kmer_size = 22;
 
24
static int window_size = 22;
 
25
static unsigned long max_mem_kb = ULONG_MAX;
 
26
 
 
27
static char *repeat_filename = 0;
 
28
static char *sequence_filename = 0;
 
29
static char *second_sequence_filename = 0;
 
30
static char *output_filename = 0;
 
31
 
 
32
#define MEMORY_FOR_MERS(max_mem) (MIN((get_mem_avail()*0.95),(max_mem))-get_mem_usage())
 
33
#define DYNAMIC_RECTANGLE_SIZE(max_mem) (MEMORY_FOR_MERS((max_mem))/KB_PER_SEQUENCE)
 
34
 
 
35
static unsigned long get_mem_avail()
 
36
{
 
37
        UINT64_T total, avail;
 
38
        memory_info_get(&total, &avail);
 
39
        return (unsigned long) avail / 1024;
 
40
}
 
41
 
 
42
static unsigned long get_mem_usage()
 
43
{
 
44
        UINT64_T rss, total;
 
45
        memory_usage_get(&rss, &total);
 
46
        return rss / 1024;
 
47
}
 
48
 
 
49
static void show_version(const char *cmd)
 
50
{
 
51
        printf("%s version %d.%d.%d built by %s@%s on %s at %s\n", cmd, CCTOOLS_VERSION_MAJOR, CCTOOLS_VERSION_MINOR, CCTOOLS_VERSION_MICRO, BUILD_USER, BUILD_HOST, __DATE__, __TIME__);
 
52
}
 
53
 
 
54
static void show_help(const char *cmd)
 
55
{
 
56
        printf("Use: %s [options] <sequences file> [second sequence file]\n", cmd);
 
57
        printf("where options are:\n");
 
58
        printf(" -s <size>      Size of \"rectangle\" for filtering. You can determine\n");
 
59
        printf("                the size dynamically by passing in d rather than a number.\n");
 
60
        printf(" -r <file>      A meryl file of repeat mers to be filtered out.\n");
 
61
        printf(" -k <number>    The k-mer size to use in candidate selection (default is 22).\n");
 
62
        printf(" -w <number>    The minimizer window size to use in candidate selection (default is 22).\n");
 
63
        printf(" -o <filename>  The output file. Default is stdout.\n");
 
64
        printf("                output file to indicate it has ended (default is nothing)\n");
 
65
        printf(" -d <subsys>    Enable debug messages for this subsystem.  Try 'd -all' to start .\n");
 
66
        printf(" -v             Show version string\n");
 
67
        printf(" -h             Show this help screen\n");
 
68
}
 
69
 
 
70
static void get_options(int argc, char **argv, const char *progname)
 
71
{
 
72
        char c;
 
73
 
 
74
        while((c = getopt(argc, argv, "d:r:s:k:w:f:o:vh")) != (char) -1) {
 
75
                switch (c) {
 
76
                case 'r':
 
77
                        repeat_filename = optarg;
 
78
                        break;
 
79
                case 's':
 
80
                        if(*optarg == 'd') {
 
81
                                rectangle_size = -1;
 
82
                                max_mem_kb = strtol(optarg + 1, 0, 10);
 
83
                                if(max_mem_kb <= 0)
 
84
                                        max_mem_kb = ULONG_MAX;
 
85
                        } else
 
86
                                rectangle_size = atoi(optarg);
 
87
                        if(rectangle_size == 0) {
 
88
                                fprintf(stderr, "Invalid rectangle size %s\n", optarg);
 
89
                                exit(1);
 
90
                        }
 
91
                        break;
 
92
                case 'k':
 
93
                        kmer_size = atoi(optarg);
 
94
                        break;
 
95
                case 'w':
 
96
                        window_size = atoi(optarg);
 
97
                        break;
 
98
                case 'o':
 
99
                        output_filename = optarg;
 
100
                        break;
 
101
                case 'd':
 
102
                        debug_flags_set(optarg);
 
103
                        break;
 
104
                case 'v':
 
105
                        show_version(progname);
 
106
                        exit(0);
 
107
                case 'h':
 
108
                        show_help(progname);
 
109
                        exit(0);
 
110
                }
 
111
        }
 
112
 
 
113
        if(argc - optind == 1) {
 
114
                sequence_filename = argv[optind++];
 
115
        } else if(argc - optind == 2) {
 
116
                sequence_filename = argv[optind++];
 
117
                second_sequence_filename = argv[optind++];
 
118
        } else {
 
119
                show_help(progname);
 
120
                fprintf(stderr, "Incorrect number of arguments. Expected 1 or 2, got %d\n", argc - optind);
 
121
                exit(1);
 
122
        }
 
123
 
 
124
}
 
125
 
 
126
int main(int argc, char **argv)
 
127
{
 
128
        FILE *input;
 
129
        FILE *repeats = 0;
 
130
        FILE *output;
 
131
 
 
132
        int start_x, end_x, start_y, end_y;
 
133
 
 
134
        get_options(argc, argv, "sand_filter_kernel");
 
135
 
 
136
        unsigned long start_mem, cand_mem, table_mem;
 
137
 
 
138
        input = fopen(sequence_filename, "r");
 
139
        if(!input) fatal("couldn't open %s: %s\n",sequence_filename,strerror(errno));
 
140
 
 
141
        if(repeat_filename) {
 
142
                repeats = fopen(repeat_filename, "r");
 
143
                if(!repeats) fatal("couldn't open %s: %s\n",repeat_filename,strerror(errno));
 
144
        }
 
145
 
 
146
        if(output_filename) {
 
147
                output = fopen(output_filename, "w");
 
148
        } else {
 
149
                output = stdout;
 
150
        }
 
151
 
 
152
        // Data is in the form:
 
153
        // >id metadata
 
154
        // data
 
155
        // >id metadata
 
156
        // data
 
157
        // >>
 
158
        // ...
 
159
 
 
160
        set_k(kmer_size);
 
161
        set_window_size(window_size);
 
162
 
 
163
        // If we only give one file, do an all vs. all
 
164
        // on them.
 
165
        if(!second_sequence_filename) {
 
166
                num_seqs = load_seqs(input);
 
167
                start_x = 0;
 
168
                end_x = num_seqs;
 
169
                start_y = 0;
 
170
                end_y = num_seqs;
 
171
        }
 
172
        // If we had two files, do not compare ones from
 
173
        // the same file to each other.
 
174
        else {
 
175
                FILE *input2 = fopen(second_sequence_filename, "r");
 
176
                if(!input2) {
 
177
                        fprintf(stderr, "Could not open file %s for reading.\n", second_sequence_filename);
 
178
                        exit(1);
 
179
                }
 
180
                num_seqs = load_seqs_two_files(input, &end_x, input2, &end_y);
 
181
                start_x = 0;
 
182
                start_y = end_x;
 
183
                debug(D_DEBUG,"First file contains %d sequences, stored from (%d,%d].\n", end_x, start_x, end_x);
 
184
                debug(D_DEBUG,"Second file contains %d sequences, stored from (%d,%d].\n", end_y-end_x, start_y, end_y);
 
185
        }
 
186
        fclose(input);
 
187
 
 
188
        debug(D_DEBUG,"Loaded %d sequences\n",num_seqs);
 
189
 
 
190
        init_cand_table(num_seqs * 5);
 
191
        init_mer_table(num_seqs * 5);
 
192
 
 
193
        if(repeats) {
 
194
                int repeat_count = init_repeat_mer_table(repeats, 2000000, 0);
 
195
                fclose(repeats);
 
196
                debug(D_DEBUG,"Loaded %d repeated mers\n", repeat_count);
 
197
        }
 
198
 
 
199
        if(rectangle_size == -1) {
 
200
                // Do get_mem_avail*0.95 to leave some memory for overhead
 
201
                rectangle_size = DYNAMIC_RECTANGLE_SIZE(max_mem_kb);
 
202
                debug(D_DEBUG,"Mem avail: %lu, rectangle size: %d\n",(unsigned long)MEMORY_FOR_MERS(max_mem_kb), rectangle_size);
 
203
        }
 
204
 
 
205
        int curr_start_x = start_x;
 
206
        int curr_start_y = start_y;
 
207
 
 
208
        candidate_t *output_list = 0;
 
209
        int num_in_list;
 
210
 
 
211
        while(curr_start_y < end_y) {
 
212
                while(curr_start_x < end_x) {
 
213
                        if(start_x == start_y) {
 
214
                                debug(D_DEBUG,"Loading mer table (%d,%d)\n", curr_rect_x, curr_rect_y);
 
215
                        } else {
 
216
                                debug(D_DEBUG,"Loading mer table for [%d,%d) and [%d,%d)\n",curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y));
 
217
                        }
 
218
 
 
219
                        start_mem = get_mem_usage();
 
220
 
 
221
                        load_mer_table_subset(curr_start_x, MIN(curr_start_x + rectangle_size, end_x), curr_start_y, MIN(curr_start_y + rectangle_size, end_y), (curr_start_x == curr_start_y));
 
222
 
 
223
                        table_mem = get_mem_usage();
 
224
 
 
225
                        debug(D_DEBUG,"Finished loading, now generating candidates\n");
 
226
                        debug(D_DEBUG,"Memory used: %lu\n", table_mem - start_mem);
 
227
 
 
228
                        generate_candidates();
 
229
                        cand_mem = get_mem_usage();
 
230
 
 
231
                        debug(D_DEBUG,"Total candidates generated: %llu\n", (long long unsigned int) total_cand);
 
232
 
 
233
                        output_list = retrieve_candidates(&num_in_list);
 
234
                        output_candidate_list(output, output_list, num_in_list);
 
235
                        free(output_list);
 
236
                        fflush(output);
 
237
 
 
238
                        debug(D_DEBUG,"Now freeing\n");
 
239
 
 
240
                        free_cand_table();
 
241
                        free_mer_table();
 
242
 
 
243
                        debug(D_DEBUG,"Successfully output and freed!\n");
 
244
 
 
245
                        curr_rect_x++;
 
246
                        curr_start_x += rectangle_size;
 
247
                }
 
248
                curr_rect_y++;
 
249
                curr_start_y += rectangle_size;
 
250
                curr_rect_x = curr_rect_y;
 
251
                if(start_y == 0) {
 
252
                        curr_start_x = curr_start_y;
 
253
                } else {
 
254
                        curr_start_x = start_x;
 
255
                }
 
256
        }
 
257
 
 
258
        fclose(output);
 
259
 
 
260
        return 0;
 
261
}