1
/////////////////////////////////////////////////////////////////
4
// Class for reading/manipulating single sequence character data.
5
/////////////////////////////////////////////////////////////////
15
#include "SafeVector.h"
16
#include "FileBuffer.h"
18
/////////////////////////////////////////////////////////////////
21
// Class for storing sequence information.
22
/////////////////////////////////////////////////////////////////
26
bool isValid; // a boolean indicating whether the sequence data is valid or not
27
string header; // string containing the comment line of the FASTA file
28
SafeVector<char> *data; // pointer to character data
29
int length; // length of the sequence
30
int sequenceLabel; // integer sequence label, typically to indicate the ordering of sequences
31
// in a Multi-FASTA file
32
int inputLabel; // position of sequence in original input
34
/////////////////////////////////////////////////////////////////
35
// Sequence::Sequence()
37
// Default constructor. Does nothing.
38
/////////////////////////////////////////////////////////////////
40
Sequence () : isValid (false), header (""), data (NULL), length (0), sequenceLabel (0), inputLabel (0) {}
44
/////////////////////////////////////////////////////////////////
45
// Sequence::Sequence()
47
// Constructor. Reads the sequence from a FileBuffer.
48
/////////////////////////////////////////////////////////////////
50
Sequence (FileBuffer &infile, bool stripGaps = false) : isValid (false), header ("~"), data (NULL), length(0), sequenceLabel (0), inputLabel (0) {
52
// read until the first non-blank line
53
while (!infile.eof()){
54
infile.GetLine (header);
55
if (header.length() != 0) break;
58
// check to make sure that it is a correct header line
59
if (header[0] == '>'){
61
// if so, remove the leading ">"
62
header = header.substr (1);
64
// remove any leading or trailing white space in the header comment
65
while (header.length() > 0 && isspace (header[0])) header = header.substr (1);
66
while (header.length() > 0 && isspace (header[header.length() - 1])) header = header.substr(0, header.length() - 1);
68
// get ready to read the data[] array; note that data[0] is always '@'
70
data = new SafeVector<char>; assert (data);
71
data->push_back ('@');
73
// get a character from the file
74
while (infile.Get(ch)){
76
// if we've reached a new comment line, put the character back and stop
77
if (ch == '>'){ infile.UnGet(); break; }
80
if (isspace (ch)) continue;
82
// substitute gap character
83
if (ch == '.') ch = '-';
84
if (stripGaps && ch == '-') continue;
86
// check for known characters
87
if (!((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || ch == '*' || ch == '-')){
88
cerr << "ERROR: Unknown character encountered: " << ch << endl;
92
// everything's ok so far, so just store this character.
97
// sequence must contain data in order to be valid
107
/////////////////////////////////////////////////////////////////
108
// Sequence::Sequence()
110
// Constructor. Builds a sequence from existing data. Note
111
// that the data must use one-based indexing where data[0] should
113
/////////////////////////////////////////////////////////////////
115
Sequence (SafeVector<char> *data, string header, int length, int sequenceLabel, int inputLabel) :
116
isValid (data != NULL), header(header), data(data), length (length), sequenceLabel (sequenceLabel), inputLabel (inputLabel) {
118
assert ((*data)[0] == '@');
121
/////////////////////////////////////////////////////////////////
122
// Sequence::Sequence()
124
// Destructor. Release allocated memory.
125
/////////////////////////////////////////////////////////////////
136
/////////////////////////////////////////////////////////////////
137
// Sequence::GetHeader()
139
// Return the string comment associated with this sequence.
140
/////////////////////////////////////////////////////////////////
142
string GetHeader () const {
146
/////////////////////////////////////////////////////////////////
147
// Sequence::GetName()
149
// Return the first word of the string comment associated with this sequence.
150
/////////////////////////////////////////////////////////////////
152
string GetName () const {
154
sscanf (header.c_str(), "%s", name);
158
/////////////////////////////////////////////////////////////////
159
// Sequence::GetDataPtr()
161
// Return the iterator to data associated with this sequence.
162
/////////////////////////////////////////////////////////////////
164
SafeVector<char>::iterator GetDataPtr(){
167
return data->begin();
170
/////////////////////////////////////////////////////////////////
171
// Sequence::GetPosition()
173
// Return the character at position i. Recall that the character
174
// data is stored with one-based indexing.
175
/////////////////////////////////////////////////////////////////
177
char GetPosition (int i) const {
180
assert (i >= 1 && i <= length);
184
/////////////////////////////////////////////////////////////////
185
// Sequence::SetLabel()
187
// Sets the sequence label to i.
188
/////////////////////////////////////////////////////////////////
190
void SetLabel (int i){
196
/////////////////////////////////////////////////////////////////
197
// Sequence::SetSortLabel()
199
// Sets the sequence sorting label to i.
200
/////////////////////////////////////////////////////////////////
202
void SetSortLabel (int i){
207
/////////////////////////////////////////////////////////////////
208
// Sequence::GetLabel()
210
// Retrieves the input label.
211
/////////////////////////////////////////////////////////////////
213
int GetLabel () const {
218
/////////////////////////////////////////////////////////////////
219
// Sequence::GetSortLabel()
221
// Retrieves the sorting label.
222
/////////////////////////////////////////////////////////////////
224
int GetSortLabel () const {
226
return sequenceLabel;
229
/////////////////////////////////////////////////////////////////
232
// Checks to see if the sequence successfully loaded.
233
/////////////////////////////////////////////////////////////////
239
/////////////////////////////////////////////////////////////////
240
// Sequence::Length()
242
// Returns the length of the sequence.
243
/////////////////////////////////////////////////////////////////
245
int GetLength () const {
251
/////////////////////////////////////////////////////////////////
252
// Sequence::WriteMFA()
254
// Writes the sequence to outfile in MFA format. Uses numColumns
255
// columns per line. If useIndex is set to false, then the
256
// header is printed as normal, but if useIndex is true, then
257
// ">S###" is printed where ### represents the sequence label.
258
/////////////////////////////////////////////////////////////////
260
void WriteMFA (ostream &outfile, int numColumns, bool useIndex = false) const {
263
assert (!outfile.fail());
267
outfile << ">S" << GetLabel() << endl;
269
outfile << ">" << header << endl;
271
// print out character data
273
for (; ct <= length; ct++){
274
outfile << (*data)[ct];
275
if (ct % numColumns == 0) outfile << endl;
277
if ((ct-1) % numColumns != 0) outfile << endl;
280
/////////////////////////////////////////////////////////////////
283
// Returns a new deep copy of the seqeuence.
284
/////////////////////////////////////////////////////////////////
286
Sequence *Clone () const {
287
Sequence *ret = new Sequence();
290
ret->isValid = isValid;
291
ret->header = header;
292
ret->data = new SafeVector<char>; assert (ret->data);
293
*(ret->data) = *data;
294
ret->length = length;
295
ret->sequenceLabel = sequenceLabel;
296
ret->inputLabel = inputLabel;
301
/////////////////////////////////////////////////////////////////
302
// Sequence::GetRange()
304
// Returns a new sequence object consisting of a range of
305
// characters from the current seuquence.
306
/////////////////////////////////////////////////////////////////
308
Sequence *GetRange (int start, int end) const {
309
Sequence *ret = new Sequence();
312
assert (start >= 1 && start <= length);
313
assert (end >= 1 && end <= length);
314
assert (start <= end);
316
ret->isValid = isValid;
317
ret->header = header;
318
ret->data = new SafeVector<char>; assert (ret->data);
319
ret->data->push_back ('@');
320
for (int i = start; i <= end; i++)
321
ret->data->push_back ((*data)[i]);
322
ret->length = end - start + 1;
323
ret->sequenceLabel = sequenceLabel;
324
ret->inputLabel = inputLabel;
329
/////////////////////////////////////////////////////////////////
330
// Sequence::AddGaps()
332
// Given an SafeVector<char> containing the skeleton for an
333
// alignment and the identity of the current character, this
334
// routine will create a new sequence with all necesssary gaps added.
336
// alignment = "XXXBBYYYBBYYXX"
338
// will perform the transformation
339
// "ATGCAGTCA" --> "ATGCC---GT--CA"
341
/////////////////////////////////////////////////////////////////
343
Sequence *AddGaps (SafeVector<char> *alignment, char id){
344
Sequence *ret = new Sequence();
347
ret->isValid = isValid;
348
ret->header = header;
349
ret->data = new SafeVector<char>; assert (ret->data);
350
ret->length = (int) alignment->size();
351
ret->sequenceLabel = sequenceLabel;
352
ret->inputLabel = inputLabel;
353
ret->data->push_back ('@');
355
SafeVector<char>::iterator dataIter = data->begin() + 1;
356
for (SafeVector<char>::iterator iter = alignment->begin(); iter != alignment->end(); ++iter){
357
if (*iter == 'B' || *iter == id){
358
ret->data->push_back (*dataIter);
362
ret->data->push_back ('-');
368
/////////////////////////////////////////////////////////////////
369
// Sequence::GetString()
371
// Returns the sequence as a string with gaps removed.
372
/////////////////////////////////////////////////////////////////
376
for (int i = 1; i <= length; i++){
377
if ((*data)[i] != '-') s += (*data)[i];
383
/////////////////////////////////////////////////////////////////
384
// Sequence::GetMapping()
386
// Returns a SafeVector<int> containing the indices of every
387
// character in the sequence. For instance, if the data is
388
// "ATGCC---GT--CA", the method returns {1,2,3,4,5,9,10,13,14}.
389
/////////////////////////////////////////////////////////////////
391
SafeVector<int> *GetMapping () const {
392
SafeVector<int> *ret = new SafeVector<int>(1, 0);
393
for (int i = 1; i <= length; i++){
394
if ((*data)[i] != '-') ret->push_back (i);
399
/////////////////////////////////////////////////////////////////
400
// Sequence::Highlight()
402
// Changes all positions with score >= cutoff to upper case and
403
// all positions with score < cutoff to lower case.
404
/////////////////////////////////////////////////////////////////
406
void Highlight (const SafeVector<float> &scores, const float cutoff){
407
for (int i = 1; i <= length; i++){
408
if (scores[i-1] >= cutoff)
409
(*data)[i] = toupper ((*data)[i]);
411
(*data)[i] = tolower ((*data)[i]);