5
// Last Modified: Fri May 19 17:10:05 EDT 2006
7
// This program reads the sequence in the first command-line
8
// file and then takes the start and end positions specified in
9
// the second command-line file and checks for anomalous
10
// start/stop codons and frame shifts.
18
static bool Check_Previous_Stop = false;
19
// Determines whether to check if codon before start coordinate
20
// is a valid stop codon
21
static bool Check_Start_Codon = true;
22
// Determines whether to check if first codon is a valid start
23
static char * Coord_File_Name = NULL;
24
// From the second command-line argument
25
static int Num_Start_Codons;
26
// Number of different start codon patterns
27
static int Num_Stop_Codons;
28
// Number of different stop codon patterns
29
static char * Sequence_File_Name = NULL;
30
// From the first command-line argument
31
static vector <const char *> Start_Codon;
32
// Sequences assumed to be start codons
33
static vector <const char *> Stop_Codon;
34
// Sequences assumed to be stop codons
38
(int argc, char * argv [])
43
char * Buffer, Line [MAX_LINE], Name [MAX_LINE];
44
char Codon [4] = "tag";
45
int Direction, Frame_Shift;
46
long int Buffer_Len, Gene_Len;
47
long int i, j, Begin, End, Len, Start, Stop;
48
int problem_ct = 0, ok_ct = 0;
52
Parse_Command_Line (argc, argv);
54
Set_Start_And_Stop_Codons ();
56
fp = File_Open (Sequence_File_Name, "r");
58
Buffer = (char *) Safe_malloc (INIT_SIZE);
59
Buffer_Len = INIT_SIZE;
61
Fasta_Read (fp, Data, hdr);
65
Len = Data . length ();
66
Data . insert (Data . begin (), 'x');
67
// Put a dummy character at the front of Data so subscripts
70
fp = File_Open (Coord_File_Name, "r");
72
while (fgets (Line, MAX_LINE, fp) != NULL)
76
if (sscanf (Line, "%s %ld %ld", Name, & Start, & End) != 3)
78
printf ("Bad line: %s\n...Skipping\n", Line);
82
if (Start < End && End - Start <= Len / 2 || Start - End > Len / 2)
85
Gene_Len = 1 + End - Start;
89
if (Buffer_Len < Gene_Len + 1)
90
Buffer = (char *) Safe_realloc (Buffer, 1 + Gene_Len);
91
Buffer_Len = 1 + Gene_Len;
92
for (i = 0; i < Gene_Len; i ++)
98
Buffer [i] = tolower (Data [j]);
105
Gene_Len = 1 + Start - End;
109
if (Buffer_Len < Gene_Len + 1)
110
Buffer = (char *) Safe_realloc (Buffer, 1 + Gene_Len);
111
Buffer_Len = 1 + Gene_Len;
112
for (i = 0; i < Gene_Len; i ++)
118
Buffer [i] = Complement (tolower (Data [j]));
123
if (Check_Previous_Stop)
127
for (i = 3; i > 0; i --)
129
Codon [i] = tolower (Data [Start - i + Len]);
131
Codon [i] = tolower (Data [Start - i]);
135
for (i = 3; i > 0; i --)
137
Codon [i] = Complement (tolower (Data [Start + i - Len]));
139
Codon [i] = Complement (tolower (Data [Start + i]));
141
if (! Is_Stop_Codon (Codon))
143
printf ("%-10s %8ld %8ld no stop before start\n",
148
if (Check_Start_Codon && ! Is_Start_Codon (Buffer))
150
printf ("%-10s has bad start codon = %.3s\n", Name, Buffer);
153
if (! Is_Stop_Codon (Buffer + Gene_Len - 3))
155
printf ("%-10s has bad stop codon = %s\n", Name, Buffer + Gene_Len - 3);
157
for (j = Gene_Len; j < Len; j += 3)
159
for (i = 0; i < 3; i ++)
162
if (Start + i + j > Len)
163
Codon [i] = tolower (Data [Start + i + j - Len]);
165
Codon [i] = tolower (Data [Start + i + j]);
169
if (Start - i - j < 1)
170
Codon [i] = Complement (tolower (Data [Start - i - j + Len]));
172
Codon [i] = Complement (tolower (Data [Start - i - j]));
174
if (Is_Stop_Codon (Codon))
178
printf (" next stop occurs at offset %ld"
179
" Gene_Len = %ld diff = %+ld\n",
180
j, Gene_Len, j - Gene_Len + 3);
183
Frame_Shift = (Gene_Len % 3);
186
printf ("%-10s %8ld %8ld has %+d frame shift\n",
187
Name, Start, End, Frame_Shift);
190
for (i = 0; i < Gene_Len - 3; i += 3)
191
if (Is_Stop_Codon (Buffer + i))
193
if (i < Gene_Len - 3)
195
Stop = Start + Direction * (i - 1);
200
printf (" Best prefix is %8ld %8ld Len = %ld\n",
205
printf (" No stop found in start frame\n");
209
for (i = Gene_Len - 6; i >= 0; i -= 3)
210
if (Is_Stop_Codon (Buffer + i))
213
Begin = Start + Direction * i;
218
printf (" Best suffix is %8ld %8ld Len = %ld\n",
219
Begin, End, Gene_Len - i - 3);
224
for (i = 0; i < Gene_Len - 3; i += 3)
225
if (Is_Stop_Codon (Buffer + i))
227
printf ("%-10s has stop codon %.3s at offset %ld"
228
" Gene_Len = %ld diff = %+ld\n",
229
Name, Buffer + i, i, Gene_Len, Gene_Len - 3 - i);
239
fprintf (stderr, " OK orfs = %7d\n", ok_ct);
240
fprintf (stderr, "Problem orfs = %7d\n", problem_ct);
242
catch (std :: exception & e)
244
cerr << "** Standard Exception **" << endl;
254
static bool Is_Start_Codon
257
// Return true iff the first 3 characters of p match a
258
// string in global Start_Codon .
263
for (i = 0; i < Num_Start_Codons; i ++)
264
if (strncmp (p, Start_Codon [i], 3) == 0)
272
static bool Is_Stop_Codon
275
// Return true iff the first 3 characters of p match a
276
// string in global Stop_Codon .
281
for (i = 0; i < Num_Stop_Codons; i ++)
282
if (strncmp (p, Stop_Codon [i], 3) == 0)
290
static void Parse_Command_Line
291
(int argc, char * argv [])
293
// Get options and parameters from command line with argc
294
// arguments in argv [0 .. (argc - 1)] .
303
while (! errflg && ((ch = getopt (argc, argv, "A:stZ:")) != EOF))
307
Start_Codon . clear ();
308
for (p = strtok (optarg, ","); p != NULL; p = strtok (NULL, ","))
312
Start_Codon . push_back (q);
317
Check_Start_Codon = FALSE;
321
Check_Previous_Stop = TRUE;
325
Stop_Codon . clear ();
326
for (p = strtok (optarg, ","); p != NULL; p = strtok (NULL, ","))
330
Stop_Codon . push_back (q);
335
fprintf (stderr, "Unrecognized option -%c\n", optopt);
347
if (optind > argc - 2)
353
Sequence_File_Name = argv [optind ++];
354
Coord_File_Name = argv [optind ++];
361
static void Set_Start_And_Stop_Codons
364
// Set globals Start_Codon and Stop_Codon to the sequences
365
// that are allowed to be start and stop codons for genes.
370
if (Start_Codon . size () == 0)
372
n = sizeof (DEFAULT_START_CODON) / sizeof (char *);
373
for (i = 0; i < n; i ++)
374
Start_Codon . push_back (DEFAULT_START_CODON [i]);
377
if (Stop_Codon . size () == 0)
379
n = sizeof (DEFAULT_STOP_CODON) / sizeof (char *);
380
for (i = 0; i < n; i ++)
381
Stop_Codon . push_back (DEFAULT_STOP_CODON [i]);
384
Num_Start_Codons = Start_Codon . size ();
385
Num_Stop_Codons = Stop_Codon . size ();
395
// Print to stderr description of options and command line for
400
"USAGE: anomaly [options] <sequence-file> <coord-file>\n"
402
"Read DNA sequence in <sequence-file> and for each region specified\n"
403
"by the coordinates in <coord-file>, check whether the region\n"
404
"represents a normal gene, i.e., it begins with a start codon, ends\n"
405
"with a stop codon, and has no frame shifts.\n"
406
"Output goes to standard output.\n"
410
" Use comma-separated list of codons as start codons\n"
411
" Sample format: -A atg,gtg\n"
412
" Default start codons are atg,gtg,ttg\n"
414
" Omit the check that the first codon is a start codon.\n"
416
" Check whether the codon preceding the start coordinate position\n"
417
" is a stop codon. This is useful if the coordinates represent\n"
418
" the entire region between stop codons.\n"
420
" Use comma-separated list of codons as stop codons\n"
421
" Sample format: -Z tag,tga,taa\n"