2
* $Id: alnread.c,v 1.12 2004/09/17 12:21:48 bollin Exp $
4
* ===========================================================================
7
* National Center for Biotechnology Information
9
* This software/database is a "United States Government Work" under the
10
* terms of the United States Copyright Act. It was written as part of
11
* the author's official duties as a United States Government employee and
12
* thus cannot be copyrighted. This software/database is freely available
13
* to the public for use. The National Library of Medicine and the U.S.
14
* Government have not placed any restriction on its use or reproduction.
16
* Although all reasonable efforts have been taken to ensure the accuracy
17
* and reliability of the software and data, the NLM and the U.S.
18
* Government do not and cannot warrant the performance or results that
19
* may be obtained by using this software or data. The NLM and the U.S.
20
* Government disclaim all warranties, express or implied, including
21
* warranties of performance, merchantability or fitness for any particular
24
* Please cite the author in any work or product based on this material.
26
* ===========================================================================
28
* Authors: Colleen Bollin
32
#include <util/creaders/alnread.h>
38
static const int kMaxPrintedIntLen = 10;
39
#define MAX_PRINTED_INT_LEN_PLUS_ONE 11
46
/* structures used internally */
47
typedef struct SLineInfo {
52
struct SLineInfo * next;
53
} SLineInfo, * TLineInfoPtr;
55
typedef struct SLineInfoReader {
56
TLineInfoPtr first_line;
57
TLineInfoPtr curr_line;
60
} SLineInfoReader, * TLineInfoReaderPtr;
62
typedef struct SIntLink {
64
struct SIntLink * next;
65
} SIntLink, * TIntLinkPtr;
67
typedef struct SStringCount {
70
TIntLinkPtr line_numbers;
71
struct SStringCount * next;
72
} SStringCount, * TStringCountPtr;
74
typedef struct SSizeInfo {
77
struct SSizeInfo * next;
78
} SSizeInfo, * TSizeInfoPtr;
80
typedef struct SLengthList {
81
TSizeInfoPtr lengthrepeats;
83
struct SLengthList * next;
84
} SLengthListData, * SLengthListPtr;
86
typedef struct SCommentLoc {
89
struct SCommentLoc * next;
90
} SCommentLoc, * TCommentLocPtr;
92
typedef struct SBracketedCommentList
94
TLineInfoPtr comment_lines;
95
struct SBracketedCommentList * next;
96
} SBracketedCommentList, * TBracketedCommentListPtr;
98
typedef struct SAlignRawSeq {
100
TLineInfoPtr sequence_data;
101
TIntLinkPtr id_lines;
102
struct SAlignRawSeq * next;
103
} SAlignRawSeq, * TAlignRawSeqPtr;
105
typedef struct SAlignFileRaw {
106
TLineInfoPtr line_list;
107
TLineInfoPtr organisms;
108
TAlignRawSeqPtr sequences;
110
TLineInfoPtr deflines;
114
TIntLinkPtr offset_list;
115
FReportErrorFunction report_error;
116
void * report_error_userdata;
118
int expected_num_sequence;
119
int expected_sequence_len;
121
} SAlignRawFileData, * SAlignRawFilePtr;
123
/* These functions are used for storing and transmitting information
124
* about errors encountered while reading the alignment data.
127
/* This function allocates memory for a new error structure and populates
128
* the structure with default values.
129
* The new structure will be added to the end of the linked list of error
130
* structures pointed to by list.
132
extern TErrorInfoPtr ErrorInfoNew (TErrorInfoPtr list)
134
TErrorInfoPtr eip, last;
136
eip = (TErrorInfoPtr) malloc ( sizeof (SErrorInfo));
140
eip->category = eAlnErr_Unknown;
146
while (last != NULL && last->next != NULL) {
156
/* This function recursively frees the memory associated with a list of
157
* error structures as well as the member variables of the error structures.
159
extern void ErrorInfoFree (TErrorInfoPtr eip)
164
ErrorInfoFree (eip->next);
170
/* This function creates and sends an error message regarding a NEXUS comment
174
s_ReportCharCommentError
178
FReportErrorFunction errfunc,
182
const char * errformat = "Specified %s character does not match NEXUS"
183
" comment in file (specified %s, comment %c)";
185
if (errfunc == NULL || val_name == NULL || expected == NULL) {
189
eip = ErrorInfoNew (NULL);
191
eip->category = eAlnErr_BadFormat;
192
eip->message = (char *) malloc (strlen (errformat) + strlen (val_name)
193
+ strlen (expected) + 2);
194
if (eip->message != NULL) {
195
sprintf (eip->message, errformat, val_name, expected, seen);
197
errfunc (eip, errdata);
202
/* This function creates and sends an error message regarding a character
203
* that is unexpected in sequence data.
213
FReportErrorFunction errfunc,
217
const char * err_format =
218
"%d bad characters (%c) found at position %d (%s).";
220
if (errfunc == NULL || num_bad == 0 || bad_char == 0
225
eip = ErrorInfoNew (NULL);
230
eip->category = eAlnErr_BadData;
231
if (id != NULL) eip->id = strdup (id);
232
eip->line_num = line_number;
233
eip->message = (char *) malloc (strlen (err_format) + 2 * kMaxPrintedIntLen
234
+ strlen (reason) + 3);
235
if (eip->message != NULL)
237
sprintf (eip->message, err_format, num_bad, bad_char, offset, reason);
239
errfunc (eip, errdata);
243
/* This function creates and sends an error message regarding an ID that
244
* was found in the wrong location.
247
s_ReportInconsistentID
250
FReportErrorFunction report_error,
251
void * report_error_userdata)
255
if (report_error == NULL) {
258
eip = ErrorInfoNew (NULL);
262
eip->category = eAlnErr_BadFormat;
263
eip->id = strdup (id);
264
eip->line_num = line_number;
265
eip->message = strdup ("Found unexpected ID");
266
report_error (eip, report_error_userdata);
270
/* This function creates and sends an error message regarding a line
271
* of sequence data that was expected to have a different length.
274
s_ReportInconsistentBlockLine
277
FReportErrorFunction report_error,
278
void * report_error_userdata)
282
if (report_error == NULL) {
285
eip = ErrorInfoNew (NULL);
289
eip->category = eAlnErr_BadFormat;
290
eip->id = strdup (id);
291
eip->line_num = line_number;
292
eip->message = strdup ("Inconsistent block line formatting");
293
report_error (eip, report_error_userdata);
297
/* This function creates and sends an error message regarding mismatched
301
s_ReportDefinitionLineMismatch
302
(FReportErrorFunction report_error,
303
void * report_error_userdata)
307
if (report_error == NULL) {
310
eip = ErrorInfoNew (NULL);
315
eip->category = eAlnErr_BadData;
316
eip->message = strdup ("Mismatched definition lines");
317
report_error (eip, report_error_userdata);
321
/* This function recursively creates and sends an error message
322
* regarding the number of times items in list appear.
325
s_ReportDefinitionLines
326
(TStringCountPtr list,
327
FReportErrorFunction report_error,
328
void * report_error_userdata)
331
const char * err_null_format = "Null definition line occurs %d times";
332
const char * err_format = "Definition line %s occurs %d times";
334
if (list == NULL || report_error == NULL) {
337
eip = ErrorInfoNew (NULL);
342
eip->category = eAlnErr_BadData;
343
if (list->string == NULL) {
344
eip->message = malloc (strlen (err_null_format)
345
+ kMaxPrintedIntLen + 1);
346
if (eip->message != NULL) {
347
sprintf (eip->message, err_null_format, list->num_appearances);
350
eip->message = malloc (strlen (err_format)
351
+ strlen (list->string)
352
+ kMaxPrintedIntLen + 1);
353
if (eip->message != NULL) {
354
sprintf (eip->message, err_format, list->string,
355
list->num_appearances);
358
report_error (eip, report_error_userdata);
360
s_ReportDefinitionLines (list->next, report_error, report_error_userdata);
364
/* This function creates and sends an error message regarding a line of
365
* sequence data that was expected to be a different length.
368
s_ReportLineLengthError
372
FReportErrorFunction report_error,
373
void * report_error_userdata)
377
const char * format = "Expected line length %d, actual length %d";
380
if (lip == NULL || report_error == NULL) {
384
eip = ErrorInfoNew (NULL);
388
eip->category = eAlnErr_BadFormat;
389
eip->id = strdup (id);
390
eip->line_num = lip->line_num;
391
msg = (char *) malloc (strlen (format) + kMaxPrintedIntLen + 1);
393
if (lip->data == NULL) {
396
len = strlen (lip->data);
398
sprintf (msg, format, expected_length, len);
401
report_error (eip, report_error_userdata);
405
/* This function creates and sends an error message regarding a block of
406
* sequence data that was expected to contain more lines.
409
s_ReportBlockLengthError
414
FReportErrorFunction report_error,
415
void * report_error_userdata)
418
const char * err_format = "Expected %d lines in block, found %d";
420
if (report_error == NULL) {
424
eip = ErrorInfoNew (NULL);
428
eip->category = eAlnErr_BadFormat;
429
eip->id = strdup (id);
430
eip->line_num = line_num;
431
eip->message = malloc (strlen (err_format) + 2 * kMaxPrintedIntLen + 1);
432
if (eip->message != NULL) {
433
sprintf (eip->message, err_format, expected_num, actual_num);
435
report_error (eip, report_error_userdata);
439
/* This function creates and sends an error message regarding missing
443
s_ReportMissingSequenceData
445
FReportErrorFunction report_error,
446
void * report_error_userdata)
450
if (report_error == NULL) {
453
eip = ErrorInfoNew (NULL);
457
eip->category = eAlnErr_Fatal;
458
eip->id = strdup (id);
459
eip->message = strdup ("No data found");
460
report_error (eip, report_error_userdata);
464
/* This function creates and sends an error message indicating that the
465
* most common length of the sequences in the file does not match a comment
469
s_ReportBadSequenceLength
473
FReportErrorFunction report_error,
474
void * report_error_userdata)
477
const char * format_str = "Expected sequence length %d, actual length %d";
479
if (report_error == NULL) {
482
eip = ErrorInfoNew (NULL);
486
eip->category = eAlnErr_BadFormat;
487
eip->id = strdup (id);
488
eip->message = malloc (strlen (format_str) + 50);
489
if (eip->message != NULL) {
490
sprintf (eip->message, format_str, expected_length, actual_length);
492
report_error (eip, report_error_userdata);
496
/* This function creates and sends an error message indicating that the
497
* number of sequences read does not match a comment in the alignment file.
500
s_ReportIncorrectNumberOfSequences
503
FReportErrorFunction report_error,
504
void * report_error_userdata)
507
const char * err_format = "Expected %d sequences, found %d";
509
if (report_error == NULL) {
512
eip = ErrorInfoNew (NULL);
516
eip->category = eAlnErr_BadFormat;
517
eip->message = (char *) malloc (strlen (err_format) +
518
2 * kMaxPrintedIntLen + 1);
520
if (eip->message != NULL)
522
sprintf (eip->message, err_format, num_expected, num_found);
524
report_error (eip, report_error_userdata);
529
s_ReportIncorrectSequenceLength
532
FReportErrorFunction report_error,
533
void * report_error_userdata)
536
const char * err_format = "Expected sequences of length %d, found %d";
538
if (report_error == NULL) {
541
eip = ErrorInfoNew (NULL);
546
eip->category = eAlnErr_BadFormat;
547
eip->message = (char *)malloc (strlen (err_format)
548
+ 2 * kMaxPrintedIntLen + 1);
549
if (eip->message != NULL)
551
sprintf (eip->message, err_format, len_expected, len_found);
553
report_error (eip, report_error_userdata);
557
/* This function creates and sends an error message regarding a non-unique
561
s_ReportRepeatedOrganismName
566
FReportErrorFunction report_error,
567
void * report_error_userdata)
570
const char * err_format = "Organism name %s also appears at line %d";
572
if (report_error == NULL || org_name == NULL) {
575
eip = ErrorInfoNew (NULL);
579
eip->category = eAlnErr_BadData;
580
eip->line_num = line_num;
582
eip->id = strdup (id);
584
eip->message = malloc (strlen (err_format) + strlen (org_name)
585
+ kMaxPrintedIntLen + 1);
586
if (eip->message != NULL) {
587
sprintf (eip->message, err_format, org_name, second_line_num);
589
report_error (eip, report_error_userdata);
593
/* This function creates and sends an error message indicating that some or
594
* all of the organism information for the sequences is missing.
597
s_ReportMissingOrganismInfo
598
(FReportErrorFunction report_error,
599
void * report_error_userdata)
603
if (report_error == NULL) {
606
eip = ErrorInfoNew (NULL);
611
eip->category = eAlnErr_BadData;
612
eip->message = strdup ("Missing organism information");
613
report_error (eip, report_error_userdata);
617
/* This function creates and sends an error message regarding an ID that is
618
* used for more than one sequence.
622
(TStringCountPtr scp,
623
FReportErrorFunction report_error,
624
void * report_error_userdata)
627
const char * err_format = "ID %s appears in the following locations:";
629
TIntLinkPtr line_number;
631
if (report_error == NULL || scp == NULL || scp->string == NULL) {
635
eip = ErrorInfoNew (NULL);
640
eip->category = eAlnErr_BadData;
641
eip->id = strdup (scp->string);
642
if (scp->line_numbers != NULL) {
643
eip->line_num = scp->line_numbers->ival;
645
eip->message = (char *) malloc ( strlen (err_format)
646
+ strlen (scp->string)
647
+ scp->num_appearances * 15
649
if (eip->message != NULL) {
650
sprintf (eip->message, err_format, scp->string);
651
cp = eip->message + strlen (eip->message);
652
for (line_number = scp->line_numbers;
654
line_number = line_number->next) {
655
sprintf (cp, " %d", line_number->ival);
659
report_error (eip, report_error_userdata);
663
/* This function creates and sends an error message indicating that the file
664
* being read is an ASN.1 file.
668
(FReportErrorFunction errfunc,
672
const char * msg = "This is an ASN.1 file, "
673
"which cannot be read by this function.";
675
if (errfunc == NULL) {
679
eip = ErrorInfoNew (NULL);
681
eip->category = eAlnErr_BadData;
682
eip->message = (char *) malloc (strlen (msg) + 1);
683
if (eip->message != NULL) {
684
sprintf (eip->message, msg);
686
errfunc (eip, errdata);
691
/* This function reports that some sequences are inside brackets (indicating a segmented set)
692
* and that some sequences are outside the brackets.
695
s_ReportSegmentedAlignmentError
696
(TIntLinkPtr offset_list,
697
FReportErrorFunction errfunc,
701
const char * msg = "This file contains sequences in brackets (indicating "
702
"a segmented alignment) as well as sequences not in brackets at lines "
703
"%s. Please either add or remove brackets to correct this problem.";
707
char * line_text_list;
708
char * line_text_list_offset;
710
if (errfunc == NULL || offset_list == NULL) {
714
for (t = offset_list; t != NULL; t = t->next)
718
msg_len = num_lines * (kMaxPrintedIntLen + 2);
723
line_text_list = (char *) malloc (msg_len);
724
if (line_text_list == NULL) return;
725
line_text_list_offset = line_text_list;
726
for (t = offset_list; t != NULL; t = t->next)
730
sprintf (line_text_list_offset, "%d", t->ival);
732
else if (num_lines == 2)
734
sprintf (line_text_list_offset, "%d and ", t->ival);
736
else if (t->next->next == NULL)
738
sprintf (line_text_list_offset, "%d, and ", t->ival);
742
sprintf (line_text_list_offset, "%d, ", t->ival);
744
line_text_list_offset += strlen (line_text_list_offset);
747
msg_len += strlen (msg) + 1;
749
eip = ErrorInfoNew (NULL);
751
eip->category = eAlnErr_BadData;
752
eip->message = (char *) malloc (msg_len);
753
if (eip->message != NULL) {
754
sprintf (eip->message, msg, line_text_list);
756
errfunc (eip, errdata);
758
free (line_text_list);
762
/* This function reports an error if a line looks like it might contain an organism comment
763
* but is somehow improperly formatted
765
static void s_ReportOrgCommentError
767
FReportErrorFunction errfunc,
771
const char * msg = "This line may contain an improperly formatted organism description.\n"
772
"Organism descriptions should be of the form [org=tax name] or [organism=tax name].\n";
774
if (errfunc == NULL || linestring == NULL) {
778
eip = ErrorInfoNew (NULL);
780
eip->category = eAlnErr_BadData;
781
eip->message = (char *) malloc (strlen (msg) + strlen (linestring) + 1);
782
if (eip->message != NULL) {
783
strcpy (eip->message, msg);
784
strcat (eip->message, linestring);
786
errfunc (eip, errdata);
791
/* This function reports that the number of segments in an alignment of
792
* segmented sets is inconsistent.
794
static void s_ReportBadNumSegError
798
FReportErrorFunction errfunc,
802
const char * msg = "This segmented set contains a different number of segments (%d) than expected (%d).\n";
804
if (errfunc == NULL) {
808
eip = ErrorInfoNew (NULL);
810
eip->line_num = line_num;
811
eip->category = eAlnErr_BadData;
812
eip->message = (char *) malloc (strlen (msg) + 2 * kMaxPrintedIntLen + 1);
813
if (eip->message != NULL) {
814
sprintf (eip->message, msg, num_seg, num_seg_exp);
816
errfunc (eip, errdata);
821
/* This function allocates memory for a SSequenceInfo structure and
822
* initializes the member variables. It returns a pointer to the newly
825
extern TSequenceInfoPtr SequenceInfoNew (void)
827
TSequenceInfoPtr sip;
829
sip = (TSequenceInfoPtr) malloc (sizeof (SSequenceInfo));
833
sip->missing = strdup ("?");
834
sip->beginning_gap = strdup (".");
835
sip->middle_gap = strdup ("-");
836
sip->end_gap = strdup (".");
837
sip->match = strdup (".");
838
sip->alphabet = NULL;
843
/* This function frees memory associated with the member variables of
844
* the SSequenceInfo structure and with the structure itself.
846
extern void SequenceInfoFree (TSequenceInfoPtr sip)
851
free (sip->alphabet);
853
free (sip->beginning_gap);
854
free (sip->middle_gap);
857
sip->alphabet = NULL;
862
/* This function creates and sends an error message regarding an unused line.
868
TLineInfoPtr line_val,
869
FReportErrorFunction errfunc,
873
const char * errformat1 = "Line %d could not be assigned to an interleaved block";
874
const char * errformat2 = "Lines %d through %d could not be assigned to an interleaved block";
875
const char * errformat3 = "Contents of unused line: %s";
878
if (errfunc == NULL || line_val == NULL) {
882
eip = ErrorInfoNew (NULL);
884
eip->category = eAlnErr_BadFormat;
885
eip->line_num = line_num_start;
886
if (line_num_start == line_num_stop) {
887
eip->message = (char *) malloc (strlen (errformat1)
888
+ kMaxPrintedIntLen + 1);
889
if (eip->message != NULL) {
890
sprintf (eip->message, errformat1, line_num_start);
893
eip->message = (char *) malloc (strlen (errformat2)
894
+ 2 * kMaxPrintedIntLen + 1);
895
if (eip->message != NULL) {
896
sprintf (eip->message, errformat2, line_num_start,
900
errfunc (eip, errdata);
902
/* report contents of unused lines */
903
for (skip = line_num_start;
904
skip < line_num_stop + 1 && line_val != NULL;
906
if (line_val->data == NULL) {
909
eip = ErrorInfoNew (NULL);
911
eip->category = eAlnErr_BadFormat;
912
eip->line_num = skip;
913
eip->message = (char *) malloc (strlen (errformat3)
914
+ strlen (line_val->data) + 1);
915
if (eip->message != NULL) {
916
sprintf (eip->message, errformat3, line_val->data);
918
errfunc (eip, errdata);
920
line_val = line_val->next;
925
/* The following functions are used to manage a linked list of integer
929
/* This function creates a new SIntLink structure with a value of ival.
930
* The new structure will be placed at the end of list if list is not NULL.
931
* The function will return a pointer to the new structure.
938
TIntLinkPtr ilp, last;
940
ilp = (TIntLinkPtr) malloc (sizeof (SIntLink));
947
while (last != NULL && last->next != NULL) {
957
/* This function recursively frees memory associated with a linked list
958
* of SIntLink structures.
960
static void s_IntLinkFree (TIntLinkPtr ilp)
965
s_IntLinkFree (ilp->next);
970
/* These functions are used to accumulate and retrieve information on
971
* how often a size of data (number of lines or number of characters) occurs.
974
/* This function allocates space for a new SSizeInfo structure and
975
* initializes its member variables. If list is not NULL, the new structure
976
* is added to the end of the list.
977
* The function returns a pointer to the newly allocated structure.
979
static TSizeInfoPtr s_SizeInfoNew (TSizeInfoPtr list)
981
TSizeInfoPtr sip, last;
983
sip = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
989
sip->num_appearances = 0;
992
while (last != NULL && last->next != NULL) {
1002
/* This function recursively frees the memory associated with a linked list
1003
* of SSizeInfo structures.
1005
static void s_SizeInfoFree (TSizeInfoPtr list)
1010
s_SizeInfoFree (list->next);
1016
/* This function returns eTrue if the two SSizeInfo structures have
1017
* the same size_value and number of appearances, eFalse otherwise.
1026
|| s1->size_value != s2->size_value
1027
|| s1->num_appearances != s2->num_appearances) {
1034
/* This function searches list for a SSizeInfo structure with the
1035
* same size_value as size_value. If it finds such a structure, it
1036
* adds the value of num_appearances to the num_appearances for that
1037
* structure, otherwise the function creates a new structure at the end
1038
* of the list with the specified values of size_value and num_appearances.
1039
* The function returns a pointer to the list of SSizeInfo structures.
1041
static TSizeInfoPtr s_AddSizeInfoAppearances
1044
int num_appearances)
1046
TSizeInfoPtr p, last;
1049
for (p = list; p != NULL && p->size_value != size_value; p = p->next) {
1053
p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
1057
p->size_value = size_value;
1058
p->num_appearances = num_appearances;
1066
p->num_appearances += num_appearances;
1072
/* This function searches list for a SSizeInfo structure with the
1073
* same size_value as size_value. If it finds such a structure, it
1074
* adds one to the num_appearances for that structure, otherwise the
1075
* function creates a new structure at the end of the list with the
1076
* specified values of size_value and num_appearances.
1077
* The function returns a pointer to the list of SSizeInfo structures.
1084
return s_AddSizeInfoAppearances (list, size_value, 1);
1088
/* This function searches list for the SSizeInfo structure with the
1089
* highest value for num_appearances. If more than one structure exists
1090
* with the highest value for num_appearances, the function chooses the
1091
* value with the highest value for size_value. The function returns a
1092
* pointer to the structure selected based on the above criteria.
1094
static TSizeInfoPtr s_GetMostPopularSizeInfo (TSizeInfoPtr list)
1096
TSizeInfoPtr p, best;
1103
for (p = list->next; p != NULL; p = p->next) {
1104
if (p->num_appearances > best->num_appearances
1105
|| (p->num_appearances == best->num_appearances
1106
&& p->size_value > best->size_value)) {
1114
/* This function uses s_GetMostPopularSizeInfo function to find the structure
1115
* in list that has the highest value for num_appearances and size_value.
1116
* If such a structure is found and has a num_appearances value greater than
1117
* one, the size_value for that structure will be returned, otherwise the
1118
* function returns 0.
1120
static int s_GetMostPopularSize (TSizeInfoPtr list)
1124
best = s_GetMostPopularSizeInfo (list);
1128
if (best->num_appearances > 1) {
1129
return best->size_value;
1136
/* The following functions are used to keep track of patterns of line or
1137
* token lengths, which will be used to identify errors in formatting.
1139
static SLengthListPtr s_LengthListNew (SLengthListPtr list)
1141
SLengthListPtr llp, last;
1143
llp = (SLengthListPtr) malloc (sizeof (SLengthListData));
1148
llp->lengthrepeats = NULL;
1149
llp->num_appearances = 0;
1153
while (last != NULL && last->next != NULL) {
1163
/* This function recursively frees memory for a list of SLengthListData
1164
* structures and its member variables.
1166
static void s_LengthListFree (SLengthListPtr llp)
1171
s_LengthListFree (llp->next);
1172
s_SizeInfoFree (llp->lengthrepeats);
1177
/* This function examines the last SSizeInfo structure in the
1178
* lengthrepeats member variable of llp. If the last structure
1179
* in the list has the same size_value value as the function argument
1180
* size_value, the value of num_appearances for that SSizeInfo structure
1181
* will be incremented. Otherwise a new SSizeInfo structure will be
1182
* appended to the end of the lengthrepeats list with the specified
1183
* size_value and a num_appearances value of 1.
1187
(SLengthListPtr llp,
1190
TSizeInfoPtr p, last;
1197
for (p = llp->lengthrepeats; p != NULL; p = p->next) {
1200
if (last == NULL || last->size_value != size_value) {
1201
p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
1205
p->size_value = size_value;
1206
p->num_appearances = 1;
1209
llp->lengthrepeats = p;
1214
last->num_appearances ++;
1219
/* This function examines whether two SLengthListData structures "match" -
1220
* the structures match if each SSizeInfo structure in llp1->lengthrepeats
1221
* has the same size_value and num_appearances values as the SSizeInfo
1222
* structure in the corresponding list position in llp2->lengthrepeats.
1223
* If the two structures match, the function returns eTrue, otherwise the
1224
* function returns eFalse.
1227
s_DoLengthPatternsMatch
1228
(SLengthListPtr llp1,
1229
SLengthListPtr llp2)
1231
TSizeInfoPtr sip1, sip2;
1233
if (llp1 == NULL || llp2 == NULL
1234
|| llp1->lengthrepeats == NULL
1235
|| llp2->lengthrepeats == NULL) {
1238
for (sip1 = llp1->lengthrepeats, sip2 = llp2->lengthrepeats;
1239
sip1 != NULL && sip2 != NULL;
1240
sip1 = sip1->next, sip2 = sip2->next) {
1241
if ( ! s_SizeInfoIsEqual (sip1, sip2)
1242
|| (sip1->next == NULL && sip2->next != NULL)
1243
|| (sip1->next != NULL && sip2->next == NULL)) {
1251
/* This function examines a list of SLengthListData structures to see if
1252
* one of them matches llp. If so, the value of num_appearances in that
1253
* list is incremented by one and llp is freed, otherwise llp is added
1254
* to the end of the list.
1255
* The function returns a pointer to the list of SLengthListData structures.
1257
static SLengthListPtr
1259
(SLengthListPtr list,
1262
SLengthListPtr prev_llp;
1268
while ( prev_llp->next && ! s_DoLengthPatternsMatch (prev_llp, llp)) {
1269
prev_llp = prev_llp->next;
1271
if (s_DoLengthPatternsMatch (prev_llp, llp)) {
1272
prev_llp->num_appearances ++;
1273
s_LengthListFree (llp);
1275
prev_llp->next = llp;
1282
/* This function examines the last SLengthListData structure in list to
1283
* see if it matches llp. If so, the function increments the value of
1284
* num_appearances for the last SLengthListData structure in list and
1285
* frees llp, otherwise the function appends llp to the end of list.
1286
* The function returns a pointer to the list of SLengthListData structures.
1288
static SLengthListPtr
1290
(SLengthListPtr list,
1293
SLengthListPtr prev_llp;
1299
while ( prev_llp->next != NULL ) {
1300
prev_llp = prev_llp->next;
1302
if (s_DoLengthPatternsMatch (prev_llp, llp)) {
1303
prev_llp->num_appearances ++;
1304
s_LengthListFree (llp);
1306
prev_llp->next = llp;
1313
/* This set of functions is used for storing and analyzing individual lines
1314
* or tokens from an alignment file.
1317
/* This function allocates memory for a new SLineInfo structure and
1318
* initializes the structure with a saved copy of string and the specified
1319
* values of line_num and line_offset.
1320
* The function returns a pointer to the new SLineInfo structure.
1330
lip = (TLineInfoPtr) malloc (sizeof (SLineInfo));
1334
lip->data = strdup (string);
1335
lip->line_num = line_num;
1336
lip->line_offset = line_offset;
1337
lip->delete_me = eFalse;
1343
/* This function recursively frees the memory associated with the structures
1344
* and members of the structures in a linked list of SLineInfo structures.
1346
static void s_LineInfoFree (TLineInfoPtr lip)
1351
s_LineInfoFree (lip->next);
1358
/* This function deletes from a linked list of SLineInfo structures
1359
* those structures for which the delete_me flag has been set. The function
1360
* returns a pointer to the beginning of the new list.
1362
static TLineInfoPtr s_DeleteLineInfos (TLineInfoPtr list)
1364
TLineInfoPtr prev = NULL;
1365
TLineInfoPtr lip, nextlip;
1368
while (lip != NULL) {
1369
nextlip = lip->next;
1370
if (lip->delete_me) {
1372
prev->next = lip->next;
1377
s_LineInfoFree (lip);
1387
/* This function creates a new SLineInfo structure, populates it with
1388
* a copy of string and the specified line_num and line_offset values,
1389
* and appends it to the end of "list" if list is not NULL.
1390
* The function will return a pointer to the newly created structure
1391
* if list is NULL, otherwise the function will return list.
1400
TLineInfoPtr lip, p;
1402
if (string == NULL) {
1405
lip = s_LineInfoNew (string, line_num, line_offset);
1413
while (p != NULL && p->next != NULL) {
1421
/* This function creates a new bracketed comment */
1422
static TBracketedCommentListPtr s_BracketedCommentListNew
1423
(TBracketedCommentListPtr list,
1428
TBracketedCommentListPtr comment;
1430
comment = (TBracketedCommentListPtr) malloc (sizeof (SBracketedCommentList));
1431
if (comment == NULL) {
1434
comment->comment_lines = s_LineInfoNew (string, line_num, line_offset);
1435
comment->next = NULL;
1438
while (list->next != NULL) {
1441
list->next = comment;
1447
/* This function frees a bracketed comment list. */
1448
static void s_BracketedCommentListFree (TBracketedCommentListPtr list)
1453
s_BracketedCommentListFree (list->next);
1455
s_LineInfoFree (list->comment_lines);
1458
/* This function adds a line to a bracketed comment. */
1459
static void s_BracketedCommentListAddLine
1460
(TBracketedCommentListPtr comment,
1465
if (comment == NULL) {
1469
comment->comment_lines = s_AddLineInfo (comment->comment_lines, string, line_num, line_offset);
1472
/* This function counts the sequences found in a bracketed comment. */
1473
static int s_CountSequencesInBracketedComment (TBracketedCommentListPtr comment)
1476
int num_segments = 0;
1477
EBool skipped_line_since_last_defline = eTrue;
1479
if (comment == NULL || comment->comment_lines == NULL) {
1483
lip = comment->comment_lines;
1484
/* First line must be left bracket on a line by itself */
1485
if (lip->data[0] != '[' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
1490
while (lip != NULL && lip->next != NULL)
1492
if (lip->data[0] == '>')
1494
if (!skipped_line_since_last_defline)
1501
skipped_line_since_last_defline = eFalse;
1506
skipped_line_since_last_defline = eTrue;
1510
/* Last line must be right bracket on a line by itself */
1511
/* (checked the same way as the left bracket on the first line) */
1512
if (lip->data[0] != ']' || strspn (lip->data + 1, " \t\r\n") != strlen (lip->data + 1))
1517
return num_segments;
1520
/* This function counts the number of sequences that appear in
1521
* bracketed comments. If the number of sequences is inconsistent,
1522
* the function will issue error messages and return 1, otherwise
1523
* the function will return the number of sequences that appear in
1524
* each bracketed comment.
1526
static int s_GetNumSegmentsInAlignment
1527
(TBracketedCommentListPtr comment_list,
1528
FReportErrorFunction errfunc,
1531
TBracketedCommentListPtr comment;
1532
TSizeInfoPtr segcount_list = NULL;
1533
int num_segments = 1;
1534
int num_segments_this_bracket;
1535
int num_segments_expected;
1538
if (comment_list == NULL)
1540
return num_segments;
1543
for (comment = comment_list; comment != NULL; comment = comment->next)
1545
num_segments_this_bracket = s_CountSequencesInBracketedComment (comment);
1546
segcount_list = s_AddSizeInfoAppearances (segcount_list,
1547
num_segments_this_bracket,
1549
if (comment != comment_list && segcount_list->next != NULL)
1551
best = s_GetMostPopularSizeInfo (segcount_list);
1552
num_segments_expected = best->size_value;
1554
if (num_segments_expected != num_segments_this_bracket)
1556
s_ReportBadNumSegError (comment->comment_lines->line_num,
1557
num_segments_this_bracket, num_segments_expected,
1562
if (segcount_list != NULL && segcount_list->next == NULL && segcount_list->size_value > 0)
1564
num_segments = segcount_list->size_value;
1566
s_SizeInfoFree (segcount_list);
1567
return num_segments;
1570
/* This function gets a list of the offsets of the
1571
* sequences in bracketed comments.
1573
static TIntLinkPtr GetSegmentOffsetList (TBracketedCommentListPtr comment_list)
1575
TIntLinkPtr new_offset, offset_list = NULL;
1576
TBracketedCommentListPtr comment;
1579
if (comment_list == NULL)
1584
for (comment = comment_list; comment != NULL; comment = comment->next)
1586
if (s_CountSequencesInBracketedComment (comment) == 0)
1590
for (lip = comment->comment_lines; lip != NULL; lip = lip->next)
1592
if (lip->data != NULL && lip->data[0] == '>')
1594
new_offset = s_IntLinkNew (lip->line_num + 1, offset_list);
1595
if (offset_list == NULL) offset_list = new_offset;
1602
static char * s_TokenizeString (char * str, char *delimiter, char **last)
1610
if (delimiter == NULL) {
1615
if (str == NULL || *str == 0) {
1618
skip = strspn (str, delimiter);
1620
length = strcspn (str, delimiter);
1621
*last = str + length;
1630
/* This function creates a new list of SLineInfo structures by tokenizing
1631
* each data element from line_list into multiple tokens at whitespace.
1632
* The function returns a pointer to the new list. The original list is
1635
static TLineInfoPtr s_BuildTokenList (TLineInfoPtr line_list)
1637
TLineInfoPtr first_token, lip;
1645
for (lip = line_list; lip != NULL; lip = lip->next) {
1646
if (lip->data != NULL && (tmp = strdup (lip->data)) != NULL) {
1647
piece = s_TokenizeString (tmp, " \t\r", &last);
1648
while (piece != NULL) {
1649
line_pos = piece - tmp;
1650
line_pos += lip->line_offset;
1651
first_token = s_AddLineInfo (first_token, piece,
1654
piece = s_TokenizeString (NULL, " \t\r", &last);
1663
/* This function takes a list of SLineInfo structures, allocates memory
1664
* to hold their contents contiguously, and stores their contents, minus
1665
* the whitespace, in the newly allocated memory.
1666
* The function returns a pointer to this newly allocated memory.
1668
static char * s_LineInfoMergeAndStripSpaces (TLineInfoPtr list)
1680
for (lip = list; lip != NULL; lip = lip->next) {
1681
if (lip->data != NULL) {
1682
len += strlen (lip->data);
1685
result = (char *) malloc (len + 1);
1686
if (result == NULL) {
1690
for (lip = list; lip != NULL; lip = lip->next) {
1691
if (lip->data != NULL) {
1692
cp_from = lip->data;
1693
while (*cp_from != 0) {
1694
if (! isspace ((int )*cp_from)) {
1707
/* The following functions are used to manage the SLineInfoReader
1708
* structure. The intention is to allow the user to access the data
1709
* from a linked list of SLineInfo structures using a given position
1710
* in the data based on the number of sequence data characters rather than
1711
* any particular line number or position in the line. This is useful
1712
* for matching up a data position in a record with a match character with
1713
* the same data position in the first or master record. This is also useful
1714
* for determining how to interpret special characters that may have
1715
* context-sensitive meanings. For example, a ? could indicate a missing
1716
* character if it is inside a sequence but indicate a gap if it is outside
1720
/* This function is used to advance the current data position pointer
1721
* for a SLineInfoReader structure past white space and blank lines
1724
static void s_LineInfoReaderAdvancePastSpace (TLineInfoReaderPtr lirp)
1726
if (lirp->curr_line_pos == NULL) {
1729
while ( isspace ((int ) *lirp->curr_line_pos)
1730
|| *lirp->curr_line_pos == 0) {
1731
while ( isspace ((int )*lirp->curr_line_pos)) {
1732
lirp->curr_line_pos ++;
1734
if (*lirp->curr_line_pos == 0) {
1735
lirp->curr_line = lirp->curr_line->next;
1736
while (lirp->curr_line != NULL
1737
&& lirp->curr_line->data == NULL) {
1738
lirp->curr_line = lirp->curr_line->next;
1740
if (lirp->curr_line == NULL) {
1741
lirp->curr_line_pos = NULL;
1744
lirp->curr_line_pos = lirp->curr_line->data;
1751
/* This function sets the current data position pointer to the first
1752
* non-whitespace character in the sequence data.
1754
static void s_LineInfoReaderReset (TLineInfoReaderPtr lirp)
1759
lirp->curr_line = lirp->first_line;
1761
while (lirp->curr_line != NULL && lirp->curr_line->data == NULL) {
1762
lirp->curr_line = lirp->curr_line->next;
1764
if (lirp->curr_line == NULL) {
1765
lirp->curr_line_pos = NULL;
1766
lirp->data_pos = -1;
1768
lirp->curr_line_pos = lirp->curr_line->data;
1769
s_LineInfoReaderAdvancePastSpace (lirp);
1770
if (lirp->curr_line_pos == NULL) {
1771
lirp->data_pos = -1;
1779
/* This function creates a new SLineInfoReader structure and initializes
1780
* its member variables. The current data position pointer is set to the
1781
* first non-whitespace character in the sequence data, and the data position
1782
* counter is set to zero. The function returns a pointer to the new
1783
* LineInfoReader data structure.
1785
static TLineInfoReaderPtr s_LineInfoReaderNew (TLineInfoPtr line_list)
1787
TLineInfoReaderPtr lirp;
1789
if (line_list == NULL) {
1792
lirp = (TLineInfoReaderPtr) malloc (sizeof (SLineInfoReader));
1797
lirp->first_line = line_list;
1798
s_LineInfoReaderReset (lirp);
1803
/* This function safely interprets the current line number of the
1804
* SLineInfoReader structure. If the structure is NULL or the
1805
* current line is NULL (usually because the data position has been
1806
* advanced to the end of the available sequence data), the function
1807
* returns -1, since the current data position does not actually exist.
1808
* Otherwise, the line number of the character at the current data position
1811
static int s_LineInfoReaderGetCurrentLineNumber (TLineInfoReaderPtr lirp)
1813
if (lirp == NULL || lirp->curr_line == NULL) {
1816
return lirp->curr_line->line_num;
1821
/* This function safely interprets the position of the current data position
1822
* of the SLineInfoReader structure. If the structure is NULL or the
1823
* current line is NULL or the current line position is NULL (usually because
1824
* the data position has been advanced to the end of the available sequence
1825
* data), the function returns -1, since the current data position does not
1827
* Otherwise, the position within the line of the character at the current
1828
* data position is returned.
1830
static int s_LineInfoReaderGetCurrentLineOffset (TLineInfoReaderPtr lirp)
1832
if (lirp == NULL || lirp->curr_line == NULL
1833
|| lirp->curr_line_pos == NULL) {
1836
return lirp->curr_line->line_offset + lirp->curr_line_pos
1837
- lirp->curr_line->data;
1842
/* This function frees the memory associated with the SLineInfoReader
1843
* structure. Notice that this function does NOT free the SLineInfo list.
1844
* This is by design.
1846
static void s_LineInfoReaderFree (TLineInfoReaderPtr lirp)
1856
/* This function retrieves the "pos"th sequence data character from the lines
1857
* of sequence data. If the data position requested is greater than the
1858
* current position, the current data pointer will be advanced until the
1859
* current position is the requested position or there is no more data. If
1860
* there is no more data, the function returns a 0. If the data position
1861
* requested is lower than the current position, the current position is reset
1862
* to the beginning of the sequence and advanced from there.
1863
* As a result, it is clearly more efficient to read the data in the forward
1864
* direction, but it is still possible to access the data randomly.
1868
(TLineInfoReaderPtr lirp,
1871
if (lirp == NULL || lirp->first_line == NULL || pos < 0
1872
|| lirp->data_pos == -1) {
1876
if (lirp->data_pos == pos) {
1877
if (lirp->curr_line_pos == NULL) {
1880
return *lirp->curr_line_pos;
1883
if (lirp->data_pos > pos) {
1884
s_LineInfoReaderReset (lirp);
1887
while (lirp->data_pos < pos && lirp->curr_line != NULL) {
1888
lirp->curr_line_pos ++;
1889
/* skip over spaces, progress to next line if necessary */
1890
s_LineInfoReaderAdvancePastSpace (lirp);
1893
if (lirp->curr_line_pos != NULL) {
1894
return *lirp->curr_line_pos;
1901
/* The following functions are used to manage the SStringCount structure.
1902
* These functions are useful for determining whether a string is unique
1903
* or whether only one string is used for a particular purpose.
1904
* The structure also tracks the line numbers on which a particular string
1908
/* This function allocates memory for a new SStringCount structure,
1909
* and initializes its member variables. The function also places the
1910
* structure at the end of list if list is not NULL.
1911
* The function returns a pointer to the newly allocated SStringCount
1914
static TStringCountPtr s_StringCountNew (TStringCountPtr list)
1916
TStringCountPtr new_item, last;
1918
new_item = (TStringCountPtr) malloc (sizeof (SStringCount));
1919
if (new_item == NULL) {
1922
new_item->string = NULL;
1923
new_item->num_appearances = 0;
1924
new_item->line_numbers = NULL;
1925
new_item->next = NULL;
1928
while (last != NULL && last->next != NULL) {
1932
last->next = new_item;
1938
/* This function recursively frees data associated with the structures
1939
* and structure member variables in a linked list of SStringCount
1942
static void s_StringCountFree (TStringCountPtr list)
1947
s_StringCountFree (list->next);
1948
s_IntLinkFree (list->line_numbers);
1953
/* This function searches list to see if the string matches any of the
1954
* existing entries. If so, the num_appearances value for that entry is
1955
* increased and the line_num is added to that entry's list of line numbers.
1956
* Otherwise a new entry is created at the end of the list.
1957
* The function returns list if list was not NULL, or a pointer to the
1958
* newly created SStringCount structure otherwise.
1960
static TStringCountPtr s_AddStringCount (
1963
TStringCountPtr list
1966
TStringCountPtr add_to, last;
1967
TIntLinkPtr new_offset;
1969
if (string == NULL) {
1971
add_to != NULL && add_to->string != NULL;
1972
add_to = add_to->next) {
1978
&& (add_to->string == NULL
1979
|| strcmp (string, add_to->string) != 0);
1980
add_to = add_to->next) {
1985
if (add_to == NULL) {
1986
add_to = s_StringCountNew (list);
1987
if (list == NULL) list = add_to;
1988
if (add_to != NULL) {
1989
add_to->string = string;
1992
if (add_to != NULL) {
1993
add_to->num_appearances ++;
1994
new_offset = s_IntLinkNew (line_num, add_to->line_numbers);
1995
if (add_to->line_numbers == NULL) {
1996
add_to->line_numbers = new_offset;
2002
/* The following functions are replacements for strncasecmp and strcasecmp */
2004
/* This function returns -1 if str1 is less than str2 in the first cmp_count
2005
* characters (using case-insensitive comparisons), 0 if they are equal,
2006
* and 1 if str1 is greater than str2.
2008
static int s_StringNICmp (char * str1, char *str2, int cmp_count)
2012
int char_count, diff;
2014
if (str1 == NULL && str2 == NULL) {
2026
while (*cp1 != 0 && *cp2 != 0 && char_count < cmp_count) {
2027
diff = toupper ((int) *cp1) - toupper ((int) *cp2);
2035
if (char_count == cmp_count) {
2037
} else if (*cp1 == 0 && *cp2 != 0) {
2039
} else if (*cp1 != 0 && *cp2 == 0) {
2047
/* This function returns -1 if str1 is less than str2 (using case-insensitive
2048
* comparisons), 0 if they are equal, and 1 if str1 is greater than str2.
2050
static int s_StringICmp (char * str1, char *str2)
2056
if (str1 == NULL && str2 == NULL) {
2067
while (*cp1 != 0 && *cp2 != 0) {
2068
diff = toupper ((int) *cp1) - toupper ((int) *cp2);
2075
if (*cp1 == 0 && *cp2 != 0) {
2077
} else if (*cp1 != 0 && *cp2 == 0) {
2085
/* The following functions are used to analyze specific kinds of lines
2086
* found in alignment files for information regarding the number of
2087
* expected sequences, the expected length of those sequences, and the
2088
* characters used to indicate missing, gap, and match characters.
2091
/* This function reads two numbers separated by whitespace from the
2092
* beginning of the string and uses them to set the expected number of
2093
* sequences and the expected number of characters per sequence.
2096
s_GetFASTAExpectedNumbers
2098
SAlignRawFilePtr afrp)
2105
if (str == NULL || afrp == NULL) {
2109
while (! isdigit ((int )*cp) && *cp != 0) {
2114
while (isdigit ((int )*cpend) && *cpend != 0) {
2126
while (! isdigit ((int )*cp) && *cp != 0) {
2131
while (isdigit ((int )*cpend) && *cpend != 0) {
2142
if (first > 0 && second > 0) {
2143
afrp->expected_num_sequence = first;
2144
afrp->expected_sequence_len = second;
2150
/* This function examines the string str to see if it begins with two
2151
* numbers separated by whitespace. The function returns eTrue if so,
2152
* otherwise it returns eFalse.
2154
static EBool s_IsTwoNumbersSeparatedBySpace (char * str)
2157
EBool found_first_number = eFalse;
2158
EBool found_dividing_space = eFalse;
2159
EBool found_second_number = eFalse;
2160
EBool found_second_number_end = eFalse;
2167
if (! isdigit ((int )*cp) && ! isspace ((int )*cp)) {
2170
if (! found_first_number) {
2171
if (! isdigit ((int )*cp)) {
2174
found_first_number = eTrue;
2175
} else if (! found_dividing_space) {
2176
if ( isspace ((int ) *cp)) {
2177
found_dividing_space = eTrue;
2178
} else if ( ! isdigit ((int )*cp)) {
2181
} else if (! found_second_number) {
2182
if ( isdigit ((int )*cp)) {
2183
found_second_number = eTrue;
2184
} else if (! isspace ((int ) *cp)) {
2187
} else if (! found_second_number_end) {
2188
if ( isspace ((int ) *cp)) {
2189
found_second_number_end = eTrue;
2190
} else if (! isdigit ((int )*cp)) {
2193
} else if (! isspace ((int ) *cp)) {
2198
if (found_second_number) {
2205
/* This function finds a value name in a string, looks for an equals sign
2206
* after the value name, and then looks for an integer value after the
2207
* equals sign. If the integer value is found, the function copies the
2208
* integer value into the val location and returns eTrue, otherwise the
2209
* function returns eFalse.
2212
s_GetOneNexusSizeComment
2217
char buf[MAX_PRINTED_INT_LEN_PLUS_ONE];
2222
if (str == NULL || valname == NULL || val == NULL) {
2226
cpstart = strstr (str, valname);
2227
if (cpstart == NULL) {
2230
cpstart += strlen (valname);
2231
while (*cpstart != 0 && isspace ((int )*cpstart)) {
2234
if (*cpstart != '=') {
2238
while (*cpstart != 0 && isspace ((int )*cpstart)) {
2242
if (! isdigit ((int )*cpstart)) {
2245
cpend = cpstart + 1;
2246
while ( *cpend != 0 && isdigit ((int )*cpend)) {
2249
maxlen = cpend - cpstart;
2250
if (maxlen > kMaxPrintedIntLen) maxlen = kMaxPrintedIntLen;
2252
strncpy (buf, cpstart, maxlen);
2259
/* This function looks for Nexus-style comments to indicate the number of
2260
* sequences and the number of characters per sequence expected from this
2261
* alignment file. If the function finds these comments, it returns eTrue,
2262
* otherwise it returns eFalse.
2265
s_GetNexusSizeComments
2268
EBool * found_nchar,
2269
SAlignRawFilePtr afrp)
2274
if (str == NULL || found_nchar == NULL
2275
|| found_ntax == NULL || afrp == NULL) {
2278
if (! *found_ntax &&
2279
(s_GetOneNexusSizeComment (str, "ntax", &num_sequences)
2280
|| s_GetOneNexusSizeComment (str, "NTAX", &num_sequences))) {
2281
afrp->expected_num_sequence = num_sequences;
2282
*found_ntax = eTrue;
2284
if (! *found_nchar &&
2285
(s_GetOneNexusSizeComment (str, "nchar", &num_chars)
2286
|| s_GetOneNexusSizeComment (str, "NCHAR", &num_chars))) {
2287
afrp->expected_sequence_len = num_chars;
2288
*found_nchar = eTrue;
2293
/* This function looks for characters in Nexus-style comments to
 * indicate values for specific kinds of characters (match, missing, gap...).
 * If the string str contains val_name followed by an equals sign, the function
 * will return the first non-whitespace character following the equals sign,
 * otherwise the function will return a 0.
 *
 * str      - text to search; only the part up to the first ';' is
 *            considered, and a string without any ';' yields 0.
 * val_name - keyword to look for (matched case-sensitively via strstr).
 *
 * Whitespace around the '=' is skipped, as are single-quote characters
 * after it, so both  gap=-  and  gap = '-'  yield '-'.
 */
static char GetNexusTypechar (char * str, char * val_name)
{
    char * cp;
    char * cpend;

    if (str == NULL || val_name == NULL) {
        return 0;
    }
    /* the keyword must start no later than the first semicolon */
    cpend = strstr (str, ";");
    if (cpend == NULL) {
        return 0;
    }
    cp = strstr (str, val_name);
    if (cp == NULL || cp > cpend) {
        return 0;
    }
    cp += strlen (val_name);
    /* cast through unsigned char: passing a negative plain char to
     * isspace is undefined behavior
     */
    while ( isspace ((unsigned char) *cp)) {
        cp ++;
    }
    if (*cp != '=') {
        return 0;
    }
    cp ++;
    while ( isspace ((unsigned char) *cp) || *cp == '\'') {
        cp ++;
    }
    return *cp;
}
2330
/* This function reads a Nexus-style comment line for the characters
2331
* specified for missing, match, and gap and compares the characters from
2332
* the comment with the characters specified in sequence_info. If any
2333
* discrepancies are found, the function reports the errors and returns eFalse,
2334
* otherwise the function returns eTrue.
2336
static EBool s_CheckNexusCharInfo
2338
TSequenceInfoPtr sequence_info,
2339
FReportErrorFunction errfunc,
2345
if (str == NULL || sequence_info == NULL) {
2349
cp = strstr (str, "format ");
2351
cp = strstr (str, "FORMAT ");
2357
if (errfunc == NULL) {
2361
c = GetNexusTypechar (cp + 7, "missing");
2363
c = GetNexusTypechar (cp + 7, "MISSING");
2365
if (c != 0 && sequence_info->missing != NULL
2366
&& strchr (sequence_info->missing, c) == NULL)
2368
s_ReportCharCommentError (sequence_info->missing, c, "MISSING",
2372
c = GetNexusTypechar (cp + 7, "gap");
2374
c = GetNexusTypechar (cp + 7, "GAP");
2376
if (c != 0 && sequence_info->middle_gap != NULL
2377
&& strchr (sequence_info->middle_gap, c) == NULL)
2379
s_ReportCharCommentError (sequence_info->middle_gap, c, "GAP",
2383
c = GetNexusTypechar (cp + 7, "match");
2385
c = GetNexusTypechar (cp + 7, "MATCH");
2387
if (c != 0 && sequence_info->match != NULL
2388
&& strchr (sequence_info->match, c) == NULL)
2390
s_ReportCharCommentError (sequence_info->match, c, "MATCH",
2397
/* This function examines the string str to see if it consists entirely of
2398
* asterisks, colons, periods, and whitespace. If so, this line is assumed
2399
* to be a Clustal-style consensus line and the function returns eTrue.
2400
* Otherwise the function returns eFalse.
2402
static EBool s_IsConsensusLine (char * str)
2405
|| strspn (str, "*:. \t\r\n") < strlen (str)
2406
|| strchr (str, '*') == NULL) {
2414
/* This function identifies lines that begin with a NEXUS keyword and end
2415
* with a semicolon - they will not contain sequence data. The function
2416
* returns eTrue if the line contains only a NEXUS comment, eFalse otherwise.
2418
static EBool s_SkippableNexusComment (char *str)
2420
char * last_semicolon;
2425
last_semicolon = strrchr (str, ';');
2426
if (last_semicolon == NULL
2427
|| strspn (last_semicolon + 1, " \t\r") != strlen (last_semicolon + 1)
2428
|| strchr (str, ';') != last_semicolon) {
2431
if (s_StringNICmp (str, "format ", 7) == 0
2432
|| s_StringNICmp (str, "dimensions ", 11) == 0
2433
|| s_StringNICmp (str, "dimensions ", 11) == 0
2434
|| s_StringNICmp (str, "options ", 8) == 0
2435
|| s_StringNICmp (str, "begin characters", 16) == 0
2436
|| s_StringNICmp (str, "begin data", 10) == 0) {
2444
/* This function determines whether the contents of str are "skippable"
2445
* in that they do not contain sequence data and therefore should not be
2446
* considered part of any block patterns or sequence data.
2448
static EBool s_SkippableString (char * str)
2451
|| s_StringNICmp (str, "matrix", 6) == 0
2452
|| s_StringNICmp (str, "#NEXUS", 6) == 0
2453
|| s_StringNICmp (str, "CLUSTAL W", 8) == 0
2454
|| s_SkippableNexusComment (str)
2455
|| s_IsTwoNumbersSeparatedBySpace (str)
2456
|| s_IsConsensusLine (str)
2457
|| str [0] == ';') {
2465
/* This function determines whether or not str contains a blank line.
2467
static EBool s_IsBlank (char * str)
2474
len = strspn (str, " \t\r");
2475
if (len == strlen (str)) {
2482
/* This function determines whether or not linestring contains a line
2483
* indicating the end of sequence data (organism information and definition
2484
* lines may occur after this line).
2486
static EBool s_FoundStopLine (char * linestring)
2488
if (linestring == NULL) {
2491
if (s_StringNICmp (linestring, "endblock", 8) == 0
2492
|| s_StringNICmp (linestring, "end;", 4) == 0) {
2499
/* This function identifies the beginning line of an ASN.1 file, which
2500
* cannot be read by the alignment reader.
2502
static EBool s_IsASN1 (char * linestring)
2504
if (linestring != NULL && strstr (linestring, "::=") != NULL) {
2512
/* The following functions are used to locate and read comments enclosed
2513
* in brackets. These comments sometimes include organism information.
2516
/* This function frees memory associated with a SCommentLoc structure. */
2517
static void s_CommentLocFree (TCommentLocPtr clp)
2522
s_CommentLocFree (clp->next);
2527
/* This function finds the first comment enclosed in brackets and creates
2528
* a SCommentLoc structure to indicate the position of the comment
2529
* in the string. The function returns a pointer to this structure if a
2530
* comment is found or a NULL if the string does not contain a bracketed
2533
static TCommentLocPtr s_FindComment (char * string)
2539
if (string == NULL) {
2542
cp_start = strstr (string, "[");
2543
if (cp_start != NULL) {
2544
cp_end = strstr (cp_start, "]");
2545
if (cp_end != NULL) {
2546
clp = (TCommentLocPtr) malloc (sizeof (SCommentLoc));
2550
clp->start = cp_start;
2560
/* This function removes a comment from a line. */
2561
static void s_RemoveCommentFromLine (char * linestring)
2565
if (linestring == NULL) {
2569
clp = s_FindComment (linestring);
2570
while (clp != NULL) {
2571
strcpy (clp->start, clp->end + 1);
2572
s_CommentLocFree (clp);
2573
clp = s_FindComment (linestring);
2576
/* if we have read an organism comment and that's all there was on the
2577
* line, get rid of the arrow character as well so it doesn't end up
2578
* in the sequence data
2580
if ( linestring [0] == '>' && linestring [1] == 0) {
2584
/* if the line now contains only space, truncate it */
2585
if (strspn (linestring, " \t\r") == strlen (linestring)) {
2592
/* This function determines whether or not a comment describes an organism
2593
* by looking for org= or organism= inside the brackets.
2595
static EBool s_IsOrganismComment (TCommentLocPtr clp)
2601
if (clp == NULL || clp->start == NULL || clp->end == NULL) {
2610
len = strspn ( clp->start, " \t\r");
2612
cp_end = strstr (cp, "=");
2613
if (cp_end == NULL) {
2617
while (cp_end > cp && isspace ((int )*cp_end)) {
2621
if ((cp_end - cp == 3 && s_StringNICmp (cp, "org", 3) == 0)
2622
|| (cp_end - cp == 8 && s_StringNICmp (cp, "organism", 8) == 0)) {
2629
/* This function finds an organism comment, which includes the first bracketed
2630
* comment with org= or organism=, plus any additional bracketed comments
2631
* separated only by whitespace from the org= or organism= comment.
2632
* The function returns a pointer to a SCommentLoc structure describing
2633
* the location of the organism comment.
2635
static TCommentLocPtr s_FindOrganismComment (char * string)
2637
TCommentLocPtr clp, next_clp;
2639
if (string == NULL) {
2643
clp = s_FindComment (string);
2644
while (clp != NULL && ! s_IsOrganismComment (clp)) {
2645
clp = s_FindComment (clp->end);
2652
next_clp = s_FindComment (clp->end);
2653
while (next_clp != NULL &&
2654
(int) strspn (clp->end + 1, " \t\r") == next_clp->start - clp->end - 1
2655
&& ! s_IsOrganismComment (next_clp))
2657
clp->end = next_clp->end;
2658
next_clp = s_FindComment (clp->end);
2664
/* This function removes an organism comment from a line. */
2665
static void s_RemoveOrganismCommentFromLine (char * string)
2669
while ((clp = s_FindOrganismComment (string)) != NULL) {
2670
strcpy (clp->start, clp->end + 1);
2671
s_CommentLocFree (clp);
2676
/* This function creates an ordered list of comments within an organism
2677
* comment and returns a pointer to the first item in the linked list.
2678
* In an ordered org name, the org= value appears first, followed by other
2679
* bracketed values in alphabetical order.
2681
static TCommentLocPtr s_CreateOrderedOrgCommentList (TCommentLocPtr org_clp)
2683
TCommentLocPtr clp, prev_clp, next_clp, clp_list, ordered_start;
2684
int next_len, this_len, len;
2686
if (org_clp == NULL) {
2690
clp_list = s_FindComment (org_clp->start); /* this is the org= */
2692
ordered_start = s_FindComment (clp_list->end);
2693
if (ordered_start == NULL) {
2696
clp = s_FindComment (ordered_start->end);
2697
while (clp != NULL && clp->start < org_clp->end) {
2698
/* insert new comment into list */
2700
next_clp = ordered_start;
2701
next_len = next_clp->end - next_clp->start;
2702
this_len = clp->end - clp->start;
2703
len = next_len > this_len ? next_len : this_len;
2704
while (next_clp != NULL
2705
&& strncmp (next_clp->start, clp->start, len) < 0)
2707
prev_clp = next_clp;
2708
next_clp = next_clp->next;
2709
if (next_clp != NULL) {
2710
next_len = next_clp->end - next_clp->start;
2711
len = next_len > this_len ? next_len : this_len;
2714
if (prev_clp == NULL) {
2715
clp->next = ordered_start;
2716
ordered_start = clp;
2718
clp->next = prev_clp->next;
2719
prev_clp->next = clp;
2721
clp = s_FindComment (clp->end);
2723
clp_list->next = ordered_start;
2728
/* This function creates an ordered organism name based on the bracketed
2729
* comments contained in the location described by org_clp.
2731
static char * s_CreateOrderedOrgName (TCommentLocPtr org_clp)
2733
TCommentLocPtr clp, clp_list;
2734
char * ordered_org_name;
2737
if (org_clp == NULL) {
2741
ordered_org_name = malloc (org_clp->end - org_clp->start + 2);
2742
if (ordered_org_name == NULL) {
2745
ordered_org_name [0] = 0;
2746
clp_list = s_CreateOrderedOrgCommentList (org_clp);
2747
cp = ordered_org_name;
2748
for (clp = clp_list; clp != NULL; clp = clp->next) {
2749
strncpy (cp, clp->start, clp->end - clp->start + 1);
2750
cp += clp->end - clp->start + 1;
2754
s_CommentLocFree (clp_list);
2756
return ordered_org_name;
2760
/* This function is used to read any organism names that may appear in
2761
* string, including any modifiers that may appear after the organism name.
2763
static void s_ReadOrgNamesFromText
2766
SAlignRawFilePtr afrp)
2775
if (string == NULL || afrp == NULL) {
2779
clp = s_FindOrganismComment (string);
2780
if (clp == NULL && (strstr (string, "org=") != NULL || strstr (string, "organism=") != NULL))
2782
s_ReportOrgCommentError (string, afrp->report_error, afrp->report_error_userdata);
2784
while (clp != NULL) {
2785
org_name = s_CreateOrderedOrgName (clp);
2786
afrp->organisms = s_AddLineInfo (afrp->organisms, org_name, line_num,
2787
clp->start - string);
2789
afrp->num_organisms ++;
2792
if (*clp->end != 0) {
2794
cp += strspn (cp, " \t\r\n");
2796
defline = clp->end + 1;
2797
defline_offset = clp->end - string + 1;
2800
afrp->deflines = s_AddLineInfo (afrp->deflines, defline, line_num,
2802
afrp->num_deflines ++;
2804
comment_end = clp->end;
2805
s_CommentLocFree (clp);
2806
clp = s_FindOrganismComment (comment_end);
2811
/* The following group of functions manages the SAlignRawSeq structure,
2812
* which is used to track the IDs of sequences in the file, the sequence
2813
* characters for those IDs, and the locations of the IDs and sequence
2817
/* This function allocates memory for an SAlignRawSeq structure,
2818
* initializes its member variables, and returns a pointer to the newly
2819
* allocated structure.
2821
static TAlignRawSeqPtr s_AlignRawSeqNew (TAlignRawSeqPtr list)
2823
TAlignRawSeqPtr arsp, last;
2825
arsp = (TAlignRawSeqPtr)malloc (sizeof (SAlignRawSeq));
2830
arsp->sequence_data = NULL;
2831
arsp->id_lines = NULL;
2835
while (last != NULL && last->next != NULL) {
2845
/* This function frees the memory associated with an SAlignRawSeq
2846
* structure's member variables and with the structure itself.
2848
static void s_AlignRawSeqFree (TAlignRawSeqPtr arsp)
2853
s_AlignRawSeqFree (arsp->next);
2855
s_LineInfoFree (arsp->sequence_data);
2856
s_IntLinkFree (arsp->id_lines);
2860
/* This function returns a pointer to the sequence in list with the specified
2861
* ID, unless there is no such sequence, in which case the function returns
2864
static TAlignRawSeqPtr
2865
s_FindAlignRawSeqById
2866
(TAlignRawSeqPtr list,
2869
TAlignRawSeqPtr arsp;
2871
for (arsp = list; arsp != NULL; arsp = arsp->next) {
2872
if (strcmp (arsp->id, id) == 0) {
2880
/* This function finds the position of a given ID in the sequence list,
2881
* unless the ID is not found in the list, in which case the function returns
2885
s_FindAlignRawSeqOffsetById
2886
(TAlignRawSeqPtr list,
2889
TAlignRawSeqPtr arsp;
2892
for (arsp = list, offset = 0; arsp != NULL; arsp = arsp->next, offset++) {
2893
if (strcmp (arsp->id, id) == 0) {
2901
/* This function returns a pointer to the memory in which the ID for the
2902
* Nth sequence is stored, unless there aren't that many sequences, in which
2903
* case NULL is returned.
2906
s_GetAlignRawSeqIDByOffset
2907
(TAlignRawSeqPtr list,
2910
TAlignRawSeqPtr arsp;
2915
while ( arsp != NULL && index != offset ) {
2919
if (index == offset && arsp != NULL) {
2927
/* This function adds data to a sequence by looking for the specified ID in
2928
* the list. If the id is not found, a new sequence with that ID is added to
2929
* the end of the list.
2930
* The function returns a pointer to the first item in the list.
2932
static TAlignRawSeqPtr
2933
s_AddAlignRawSeqById
2934
(TAlignRawSeqPtr list,
2939
int data_line_offset)
2941
TAlignRawSeqPtr arsp;
2944
arsp = s_FindAlignRawSeqById (list, id);
2946
arsp = s_AlignRawSeqNew (list);
2950
if (list == NULL) list = arsp;
2951
arsp->id = strdup (id);
2953
arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
2957
ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
2958
if (arsp->id_lines == NULL) arsp->id_lines = ilp;
2963
/* This function adds data to the Nth sequence in the sequence list and
2964
* returns eTrue, unless there aren't that many sequences in the list, in
2965
* which case the function returns eFalse.
2968
s_AddAlignRawSeqByIndex
2969
(TAlignRawSeqPtr list,
2973
int data_line_offset)
2975
TAlignRawSeqPtr arsp;
2979
for (arsp = list; arsp != NULL && curr < index; arsp = arsp->next) {
2985
arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
2994
/* This function frees memory associated with the SAlignRawFileData structure.
2996
static void s_AlignFileRawFree (SAlignRawFilePtr afrp)
3002
s_LineInfoFree (afrp->organisms);
3003
s_LineInfoFree (afrp->deflines);
3004
s_LineInfoFree (afrp->line_list);
3005
s_AlignRawSeqFree (afrp->sequences);
3006
s_IntLinkFree (afrp->offset_list);
3011
/* This function allocates memory for an SAlignRawFileData structure and
3012
* initializes its member variables. The function returns a pointer to
3013
* the newly allocated structure.
3015
static SAlignRawFilePtr s_AlignFileRawNew (void)
3017
SAlignRawFilePtr afrp;
3019
afrp = (SAlignRawFilePtr)malloc (sizeof (SAlignRawFileData));
3023
afrp->marked_ids = eFalse;
3024
afrp->line_list = NULL;
3025
afrp->organisms = NULL;
3026
afrp->num_organisms = 0;
3027
afrp->deflines = NULL;
3028
afrp->num_deflines = 0;
3029
afrp->block_size = 0;
3030
afrp->offset_list = NULL;
3031
afrp->sequences = NULL;
3032
afrp->report_error = NULL;
3033
afrp->report_error_userdata = NULL;
3034
afrp->alphabet = NULL;
3035
afrp->expected_num_sequence = 0;
3036
afrp->expected_sequence_len = 0;
3037
afrp->num_segments = 1;
3042
/* The following functions are used to analyze the structure of a file and
3043
* assemble the sequences listed in the file.
3044
* Sequence data in a file is organized in one of two general formats -
3045
* interleaved or contiguous. Interleaved data can be recognized by looking
3046
* for repeated blocks of the same number of lines within a file separated
3047
* by blank or skippable lines from other lines in the file. The first of
3048
* these blocks must have at least two elements separated by whitespace
3049
* in each line, the first of these elements is the ID for the sequence in
3050
* that row and for the sequences in that position within the block for the
3051
* remainder of the file.
3052
* Contiguous data can be recognized by either looking for "marked" sequence
3053
* IDs, which begin with a '>' character, or by looking for repeated patterns
3054
* of lines with the same numbers of characters.
3057
/* The following functions are used to analyze interleaved data. */
3059
/* This function creates a SLengthListData structure that describes the pattern
3060
* of character lengths in the string pointed to by cp.
3062
static SLengthListPtr s_GetBlockPattern (char * cp)
3064
SLengthListPtr this_pattern;
3067
this_pattern = s_LengthListNew (NULL);
3068
if (this_pattern == NULL) {
3072
this_pattern->num_appearances = 1;
3074
len = strcspn (cp, " \t\r");
3075
s_AddLengthRepeat (this_pattern, len);
3077
cp += strspn (cp, " \t\r");
3079
return this_pattern;
3083
/* This function attempts to predict whether the following lines will be
3084
* an interleaved block. If so, the function returns the location of the
3085
* beginning of the block, otherwise the function returns -1.
3088
s_ForecastBlockPattern
3089
(SLengthListPtr pattern_list,
3090
TIntLinkPtr next_offset,
3097
line_counter = line_start;
3098
if (next_offset != NULL
3099
&& next_offset->ival - line_counter < block_size) {
3103
for (llp = pattern_list;
3105
&& (next_offset == NULL || line_counter < next_offset->ival - 1)
3106
&& line_counter - line_start < block_size;
3109
if (llp->lengthrepeats == NULL) {
3112
line_counter += llp->num_appearances;
3114
if (line_counter - line_start == block_size) {
3115
if (llp->next == NULL) {
3119
if (llp->lengthrepeats == NULL) {
3127
/* This function looks for malformed blocks between the identified blocks
3128
* indicated by the offset_list. It returns a pointer to the list with the
3129
* new locations inserted at the appropriate locations.
3132
s_AugmentBlockPatternOffsetList
3133
(SLengthListPtr pattern_list,
3134
TIntLinkPtr offset_list,
3139
TIntLinkPtr next_offset, prev_offset, new_offset;
3143
next_offset = offset_list;
3146
while (llp != NULL) {
3147
if (next_offset != NULL && line_counter == next_offset->ival) {
3148
prev_offset = next_offset;
3149
next_offset = next_offset->next;
3150
/* skip past the lines for this block */
3151
while (line_counter - prev_offset->ival < block_size
3154
line_counter += llp->num_appearances;
3158
forecast_pos = s_ForecastBlockPattern (llp, next_offset,
3161
if (forecast_pos > 0) {
3162
new_offset = s_IntLinkNew (forecast_pos, NULL);
3163
if (new_offset == NULL) {
3166
if (prev_offset == NULL) {
3167
new_offset->next = offset_list;
3168
offset_list = new_offset;
3170
new_offset->next = next_offset;
3171
prev_offset->next = new_offset;
3173
prev_offset = new_offset;
3174
/* skip past the lines for this block */
3175
while (line_counter - prev_offset->ival < block_size
3178
line_counter += llp->num_appearances;
3182
line_counter += llp->num_appearances;
3191
/* This function looks for lines that could not be assigned to an interleaved
3192
* block. It returns eTrue if it finds any such lines after the first offset,
3193
* eFalse otherwise, and reports all instances of unused lines as errors.
3197
(SLengthListPtr pattern_list,
3198
SAlignRawFilePtr afrp)
3203
int block_line_counter;
3204
EBool rval = eFalse;
3205
TLineInfoPtr line_val;
3208
if (pattern_list == NULL || afrp == NULL
3209
|| afrp->offset_list == NULL || afrp->block_size < 2) {
3213
offset = afrp->offset_list;
3216
line_val = afrp->line_list;
3218
while (llp != NULL && line_val != NULL) {
3219
while (llp != NULL && line_val != NULL
3220
&& (offset == NULL || line_counter < offset->ival)) {
3221
if (llp->lengthrepeats != NULL) {
3222
s_ReportUnusedLine (line_counter,
3223
line_counter + llp->num_appearances - 1,
3226
afrp->report_error_userdata);
3227
if (offset != afrp->offset_list) {
3231
line_counter += llp->num_appearances;
3233
skip < llp->num_appearances && line_val != NULL;
3235
line_val = line_val->next;
3239
block_line_counter = 0;
3240
while (block_line_counter < afrp->block_size && llp != NULL) {
3241
block_line_counter += llp->num_appearances;
3242
line_counter += llp->num_appearances;
3244
skip < llp->num_appearances && line_val != NULL;
3246
line_val = line_val->next;
3250
if (offset != NULL) {
3251
offset = offset->next;
3258
/* This function examines a list of line lengths, looking for interleaved
3259
* blocks. If it finds them, it will set the SAlignRawFileData offset_list
3260
* member variable to point to a list of locations for the blocks.
3263
s_FindInterleavedBlocks
3264
(SLengthListPtr pattern_list,
3265
SAlignRawFilePtr afrp)
3267
SLengthListPtr llp, llp_next;
3268
TSizeInfoPtr size_list, best_ptr;
3269
TIntLinkPtr new_offset;
3272
afrp->block_size = 0;
3274
afrp->offset_list = NULL;
3275
for (llp = pattern_list; llp != NULL; llp = llp->next) {
3276
llp_next = llp->next;
3277
if (llp->num_appearances > 1
3278
&& (llp_next == NULL || llp_next->lengthrepeats == NULL)) {
3279
size_list = s_AddSizeInfo (size_list, llp->num_appearances);
3282
best_ptr = s_GetMostPopularSizeInfo (size_list);
3283
if (best_ptr != NULL && best_ptr->num_appearances > 1) {
3284
afrp->block_size = best_ptr->size_value;
3286
for (llp = pattern_list; llp != NULL; llp = llp->next) {
3287
llp_next = llp->next;
3288
if (llp->num_appearances == afrp->block_size
3289
&& (llp_next == NULL || llp_next->lengthrepeats == NULL))
3291
new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
3292
if (new_offset == NULL) {
3295
if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
3297
line_counter += llp->num_appearances;
3299
afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
3303
if (s_FindUnusedLines (pattern_list, afrp)) {
3304
s_IntLinkFree (afrp->offset_list);
3305
afrp->offset_list = NULL;
3306
afrp->block_size = 0;
3308
s_SizeInfoFree (size_list);
/* This function removes trailing spaces, tabs, carriage returns and
 * newlines from linestring, in place.  A NULL pointer is ignored, and a
 * string that consists only of those characters becomes the empty string.
 */
static void s_TrimEndSpace (char *linestring)
{
    size_t len;

    if (linestring == NULL) return;
    len = strlen (linestring);
    /* work with an index rather than a pointer so that an empty string
     * never produces a pointer before the start of the buffer
     */
    while (len > 0
           && (linestring [len - 1] == ' ' || linestring [len - 1] == '\t'
               || linestring [len - 1] == '\r' || linestring [len - 1] == '\n'))
    {
        len--;
    }
    linestring [len] = 0;
}
3327
static SAlignRawFilePtr
3329
(FReadLineFunction readfunc,
3331
TSequenceInfoPtr sequence_info,
3332
FReportErrorFunction errfunc,
3336
SAlignRawFilePtr afrp;
3339
int overall_line_count;
3340
EBool found_expected_ntax = eFalse;
3341
EBool found_expected_nchar = eFalse;
3342
EBool found_char_comment = eFalse;
3343
SLengthListPtr pattern_list = NULL;
3344
SLengthListPtr this_pattern;
3347
TIntLinkPtr new_offset;
3348
EBool in_taxa_comment;
3349
EBool in_bracketed_comment = eFalse;
3350
TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;
3354
if (readfunc == NULL || sequence_info == NULL) {
3358
afrp = s_AlignFileRawNew ();
3363
afrp->alphabet = strdup (sequence_info->alphabet);
3364
afrp->report_error = errfunc;
3365
afrp->report_error_userdata = errdata;
3367
overall_line_count = 0;
3368
found_stop = eFalse;
3369
in_taxa_comment = eFalse;
3370
linestring = readfunc (userdata);
3371
if (s_IsASN1 (linestring)) {
3372
s_ReportASN1Error (afrp->report_error, afrp->report_error_userdata);
3373
s_AlignFileRawFree (afrp);
3377
while (linestring != NULL && linestring [0] != EOF) {
3378
s_TrimEndSpace (linestring);
3379
s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
3380
/* we want to remove the comment from the line for the purpose
3381
* of looking for blank lines and skipping,
3382
* but save comments for storing in array if line is not skippable or
3385
len = strspn (linestring, " \t\r\n");
3386
tmp = strdup (linestring + len);
3391
if (! found_stop && ! in_taxa_comment) {
3392
found_stop = s_FoundStopLine (tmp);
3395
if (! found_expected_ntax || ! found_expected_nchar) {
3396
if (s_IsTwoNumbersSeparatedBySpace (tmp)) {
3397
s_GetFASTAExpectedNumbers (tmp, afrp);
3398
found_expected_ntax = eTrue;
3399
found_expected_nchar = eTrue;
3401
s_GetNexusSizeComments (tmp, &found_expected_ntax,
3402
&found_expected_nchar, afrp);
3405
if (! found_char_comment) {
3406
found_char_comment = s_CheckNexusCharInfo (tmp, sequence_info,
3408
afrp->report_error_userdata);
3411
if (in_taxa_comment) {
3412
if (strncmp (tmp, "end;", 4) == 0) {
3413
in_taxa_comment = eFalse;
3416
} else if (strncmp (tmp, "begin taxa;", 11) == 0) {
3418
in_taxa_comment = eTrue;
3421
/* remove complete single-line bracketed comments from line
3422
*before checking for multiline bracketed comments */
3423
s_RemoveCommentFromLine (tmp);
3425
if (in_bracketed_comment) {
3426
len = strspn (linestring, " \t\r\n");
3427
if (last_comment != NULL)
3429
s_BracketedCommentListAddLine (last_comment, linestring + len,
3430
overall_line_count, len);
3432
if (strchr (tmp, ']') != NULL) {
3433
in_bracketed_comment = eFalse;
3436
} else if (tmp [0] == '[' && strchr (tmp, ']') == NULL) {
3437
in_bracketed_comment = eTrue;
3438
len = strspn (linestring, " \t\r\n");
3439
last_comment = s_BracketedCommentListNew (comment_list,
3441
overall_line_count, len);
3442
if (comment_list == NULL)
3444
comment_list = last_comment;
3449
if (s_SkippableString (tmp)) {
3453
if (tmp [0] == '>' && ! found_stop) {
3454
afrp->marked_ids = eTrue;
3455
new_offset = s_IntLinkNew (overall_line_count + 1,
3457
if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
3460
if (! afrp->marked_ids) {
3461
/* add to length list for interleaved block search */
3462
len = strcspn (tmp, " \t\r");
3465
len = strspn (cp, " \t\r");
3470
this_pattern = s_GetBlockPattern (tmp);
3472
this_pattern = s_GetBlockPattern (cp);
3474
pattern_list = s_AddPatternRepeat (pattern_list,
3477
this_pattern = s_GetBlockPattern (tmp);
3478
pattern_list = s_AddPatternRepeat (pattern_list,
3483
len = strspn (linestring, " \t\r\n");
3484
afrp->line_list = s_AddLineInfo (afrp->line_list,
3486
overall_line_count, len);
3490
linestring = readfunc (userdata);
3491
overall_line_count ++;
3493
afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list, errfunc, errdata);
3494
if (afrp->num_segments > 1)
3496
if (afrp->offset_list != NULL)
3498
s_ReportSegmentedAlignmentError (afrp->offset_list,
3500
s_AlignFileRawFree (afrp);
3501
s_LengthListFree (pattern_list);
3502
s_BracketedCommentListFree (comment_list);
3507
afrp->offset_list = GetSegmentOffsetList (comment_list);
3508
afrp->marked_ids = eTrue;
3511
if (! afrp->marked_ids) {
3512
s_FindInterleavedBlocks (pattern_list, afrp);
3514
s_LengthListFree (pattern_list);
3515
s_BracketedCommentListFree (comment_list);
3520
/* This function analyzes a block to see if it contains, as the first element
3521
* of any of its lines, one of the sequence IDs already identified. If the
3522
* one of the lines does begin with a sequence ID, all of the lines are
3523
* assumed to begin with sequence IDs and the function returns eTrue, otherwise
3524
* the function returns eFalse.
3528
(SAlignRawFilePtr afrp,
3529
TLineInfoPtr first_line,
3530
int num_lines_in_block)
3535
TAlignRawSeqPtr arsp;
3539
if (afrp->sequences == NULL) {
3543
for (lip = first_line, block_offset = 0;
3544
lip != NULL && block_offset < num_lines_in_block;
3545
lip = lip->next, block_offset++)
3547
linestring = lip->data;
3548
if (linestring != NULL) {
3549
len = strcspn (linestring, " \t\r");
3550
if (len > 0 && len < strlen (linestring)) {
3551
this_id = (char *) malloc (len + 1);
3552
if (this_id == NULL) {
3555
strncpy (this_id, linestring, len);
3557
arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
3569
/* This function analyzes the lines of the block to see if the pattern of
3570
* the lengths of the whitespace-separated pieces of sequence data matches
3571
* for all lines within the block. The function returns eTrue if this is so,
3572
* otherwise the function returns eFalse.
3576
(SAlignRawFilePtr afrp,
3577
TLineInfoPtr first_line,
3578
int num_lines_in_block,
3583
SLengthListPtr list, this_pattern, best;
3584
int len, block_offset, id_offset;
3591
for (lip = first_line, block_offset = 0;
3592
lip != NULL && block_offset < num_lines_in_block;
3593
lip = lip->next, block_offset ++)
3597
len = strcspn (cp, " \t\r");
3598
tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
3599
if (tmp_id == NULL) {
3602
strncpy (tmp_id, cp, len);
3604
id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
3605
if (id_offset != block_offset && ! first_block) {
3607
s_ReportInconsistentID (tmp_id, lip->line_num,
3609
afrp->report_error_userdata);
3613
cp += strspn (cp, " \t\r");
3615
this_pattern = s_GetBlockPattern (cp);
3616
list = s_AddLengthList (list, this_pattern);
3619
/* Now find the pattern with the most appearances */
3621
for (this_pattern = list;
3622
this_pattern != NULL;
3623
this_pattern = this_pattern->next)
3625
if (this_pattern->num_appearances == 0) continue;
3627
|| this_pattern->num_appearances > best->num_appearances)
3629
best = this_pattern;
3633
/* now identify and report inconsistent lines */
3634
for (lip = first_line, block_offset = 0;
3635
lip != NULL && block_offset < num_lines_in_block;
3636
lip = lip->next, block_offset ++)
3640
len = strcspn (cp, " \t\r");
3641
tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
3642
if (tmp_id == NULL) {
3645
strncpy (tmp_id, cp, len);
3648
cp += strspn (cp, " \t\r");
3650
tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
3652
this_pattern = s_GetBlockPattern (cp);
3653
if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
3655
s_ReportInconsistentBlockLine (tmp_id, lip->line_num,
3657
afrp->report_error_userdata);
3659
s_LengthListFree (this_pattern);
3664
s_LengthListFree (list);
3669
/* This function processes a block of lines and adds the sequence data from
3670
* each line in the block to the appropriate sequence in the list.
3674
(SAlignRawFilePtr afrp,
3676
int num_lines_in_block,
3685
EBool this_block_has_ids;
3688
this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
3689
s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
3691
for (lip = lines, line_number = 0;
3692
lip != NULL && line_number < num_lines_in_block;
3693
lip = lip->next, line_number ++)
3695
linestring = lip->data;
3696
if (linestring != NULL) {
3698
if (this_block_has_ids) {
3699
len = strcspn (linestring, " \t\r");
3700
this_id = (char *) malloc (len + 1);
3701
if (this_id == NULL) {
3704
strncpy (this_id, linestring, len);
3706
cp = linestring + len;
3708
len = strspn (linestring, " \t\r");
3711
afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
3715
lip->line_offset + cp - linestring);
3718
if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
3720
lip->line_num, lip->line_offset))
3722
s_ReportBlockLengthError ("", lip->line_num,
3726
afrp->report_error_userdata);
3734
/* This function removes comments from the lines of an interleaved block of
3738
s_RemoveCommentsFromBlock
3739
(TLineInfoPtr first_line,
3740
int num_lines_in_block)
3745
for (lip = first_line, block_offset = 0;
3746
lip != NULL && block_offset < num_lines_in_block;
3749
s_RemoveCommentFromLine (lip->data);
3754
/* This function processes the interleaved block of data found at each
3755
* location listed in afrp->offset_list.
3757
static void s_ProcessAlignRawFileByBlockOffsets (SAlignRawFilePtr afrp)
3760
TIntLinkPtr offset_ptr;
3762
EBool first_block = eTrue;
3763
EBool in_taxa_comment = eFalse;
3770
offset_ptr = afrp->offset_list;
3771
lip = afrp->line_list;
3772
while (lip != NULL && offset_ptr != NULL
3773
&& (in_taxa_comment || ! s_FoundStopLine (lip->data))) {
3774
if (in_taxa_comment) {
3775
if (strncmp (lip->data, "end;", 4) == 0) {
3776
in_taxa_comment = eFalse;
3778
} else if (lip->data != NULL
3779
&& strncmp (lip->data, "begin taxa;", 11) == 0) {
3780
in_taxa_comment = eTrue;
3782
if (line_counter == offset_ptr->ival) {
3783
s_RemoveCommentsFromBlock (lip, afrp->block_size);
3784
s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
3785
first_block = eFalse;
3786
offset_ptr = offset_ptr->next;
3794
/* The following functions are used to analyze contiguous data. */
3797
s_CreateSequencesBasedOnTokenPatterns
3798
(TLineInfoPtr token_list,
3799
TIntLinkPtr offset_list,
3800
SLengthListPtr * anchorpattern,
3801
SAlignRawFilePtr afrp)
3805
TIntLinkPtr offset_ptr, next_offset_ptr;
3808
int pattern_line_counter;
3811
if (token_list == NULL || offset_list == NULL
3812
|| anchorpattern == NULL
3817
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
3819
if (anchorpattern [curr_seg] == NULL || anchorpattern [curr_seg]->lengthrepeats == NULL)
3827
offset_ptr = offset_list;
3830
for (offset_ptr = offset_list;
3831
offset_ptr != NULL && lip != NULL;
3832
offset_ptr = offset_ptr->next)
3834
next_offset_ptr = offset_ptr->next;
3835
while (line_counter < offset_ptr->ival - 1 && lip != NULL) {
3840
curr_id = lip->data;
3843
for (sip = anchorpattern[curr_seg]->lengthrepeats;
3846
&& (next_offset_ptr == NULL
3847
|| line_counter < next_offset_ptr->ival - 1);
3850
for (pattern_line_counter = 0;
3851
pattern_line_counter < sip->num_appearances
3853
&& (next_offset_ptr == NULL
3854
|| line_counter < next_offset_ptr->ival - 1);
3855
pattern_line_counter ++)
3857
if ((int) strlen (lip->data) != sip->size_value) {
3858
s_ReportLineLengthError (curr_id, lip, sip->size_value,
3860
afrp->report_error_userdata);
3862
afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
3872
if (sip != NULL && lip != NULL) {
3873
s_ReportBlockLengthError (curr_id, lip->line_num,
3875
line_counter - offset_ptr->ival,
3877
afrp->report_error_userdata);
3881
if (curr_seg >= afrp->num_segments)
/* The following functions are used for analyzing contiguous data with
 * marked IDs.
 */
3893
/* This function creates a new LengthList pattern for each marked ID.
3894
* After each new list is created, the function checks to see if the
3895
* new pattern matches any pattern already in the list of patterns seen.
3896
* If so, the function deletes the new pattern and increments
3897
* num_appearances for the pattern in the list, otherwise the function
3898
* adds the new pattern to the list.
3899
* When the list is complete, the function finds the pattern with the
3900
* most appearances and returns that pattern as the anchor pattern to use
3901
* when checking sequence data blocks for consistency with one another.
3903
static SLengthListPtr *
3904
s_CreateAnchorPatternForMarkedIDs
3905
(SAlignRawFilePtr afrp)
3907
SLengthListPtr * list;
3908
SLengthListPtr * best;
3909
SLengthListPtr this_pattern;
3918
/* initialize length lists */
3919
list = (SLengthListPtr *) malloc (afrp->num_segments * sizeof (SLengthListPtr));
3924
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
3926
list[curr_seg] = NULL;
3928
/* initialize best ptrs */
3929
/* list is one element longer, to hold null terminator */
3930
best = (SLengthListPtr *) malloc ((afrp->num_segments + 1) * sizeof (SLengthListPtr));
3935
for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
3937
best[curr_seg] = NULL;
3940
/* initialize pattern */
3941
this_pattern = NULL;
3944
for (lip = afrp->line_list;
3945
lip != NULL && ! s_FoundStopLine (lip->data);
3948
if (lip->data == NULL) continue;
3949
if (lip->data [0] == ']' || lip->data [0] == '[') continue;
3950
if (lip->data [0] == '>') {
3951
if (this_pattern != NULL) {
3952
list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
3954
if (curr_seg >= afrp->num_segments)
3959
this_pattern = s_LengthListNew (NULL);
3960
if (this_pattern == NULL) {
3961
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
3963
s_LengthListFree (list [curr_seg]);
3968
this_pattern->num_appearances = 1;
3969
} else if (this_pattern != NULL) {
3970
/* This section gets rid of base pair number comments */
3972
while ( isspace ((int )*cp) || isdigit ((int )*cp)) {
3975
s_AddLengthRepeat (this_pattern, strlen (cp));
3978
if (this_pattern != NULL) {
3979
list[curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
3982
/* Now find the pattern with the most appearances for each segment*/
3983
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg++)
3985
for (this_pattern = list [curr_seg];
3986
this_pattern != NULL;
3987
this_pattern = this_pattern->next)
3989
if (this_pattern->num_appearances == 0) continue;
3990
if (best [curr_seg] == NULL
3991
|| this_pattern->num_appearances > best[curr_seg]->num_appearances)
3993
best[curr_seg] = this_pattern;
3998
/* free all patterns before and after anchor pattern */
3999
if (best [curr_seg] != NULL) {
4000
s_LengthListFree (best [curr_seg]->next);
4001
best [curr_seg]->next = NULL;
4004
if (best [curr_seg] != list [curr_seg]) {
4005
this_pattern = list [curr_seg];
4006
while ( this_pattern != NULL && this_pattern->next != best[curr_seg] ) {
4007
this_pattern = this_pattern->next;
4009
if (this_pattern != NULL) {
4010
this_pattern->next = NULL;
4011
s_LengthListFree (list [curr_seg]);
4016
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4018
if (best[curr_seg] == NULL)
4020
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
4022
s_LengthListFree (best [curr_seg]);
4032
/* This function removes base pair count comments from the data sections
4033
* for contiguous marked ID sequences.
4035
static void s_RemoveBasePairCountCommentsFromData (SAlignRawFilePtr afrp)
4037
TIntLinkPtr this_offset, next_offset;
4042
if (afrp == NULL || afrp->offset_list == NULL) {
4045
this_offset = afrp->offset_list;
4046
next_offset = this_offset->next;
4047
lip = afrp->line_list;
4049
while (lip != NULL && this_offset != NULL) {
4050
if (line_count == this_offset->ival) {
4051
while (lip != NULL &&
4052
(next_offset == NULL
4053
|| line_count < next_offset->ival - 1)) {
4056
cp += strspn (cp, " \t\r\n1234567890");
4057
if (cp != lip->data) {
4058
strcpy (lip->data, cp);
4064
this_offset = this_offset->next;
4065
if (this_offset != NULL) {
4066
next_offset = this_offset->next;
4076
/* This function assumes that the offset_list has already been populated
4077
* with the locations of the data blocks. It analyzes the blocks of data
4078
* to find the most frequently occurring pattern of lengths of data and then
4079
* uses that pattern to attach the data to the correct IDs and report any
4080
* errors in formatting.
4082
static void s_ProcessAlignFileRawForMarkedIDs (SAlignRawFilePtr afrp)
4084
SLengthListPtr * anchorpattern;
4090
s_RemoveBasePairCountCommentsFromData (afrp);
4091
anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
4092
if (anchorpattern == NULL || afrp->offset_list == NULL) {
4095
s_CreateSequencesBasedOnTokenPatterns (afrp->line_list, afrp->offset_list,
4096
anchorpattern, afrp);
/* The following functions are used for analyzing contiguous sequence data
 * without marked IDs.
 */
/* This function left-shifts a string: the NUL-terminated text that starts
 * at cp_to is moved down so that it starts at cp_from (cp_from is the
 * destination, cp_to is the source).  Both pointers are expected to point
 * into the same buffer.  Nothing is done if either pointer is NULL or if
 * the two are equal.
 */
static void
s_StringLeftShift
(char * cp_from,
 char * cp_to)
{
    if (cp_from == cp_to || cp_from == NULL || cp_to == NULL) {
        return;
    }
    /* the regions overlap, so memmove is used; the terminator is moved
     * along with the text
     */
    memmove (cp_from, cp_to, strlen (cp_to) + 1);
}
4122
/* This function removes bracketed comments from a linked list of
4123
* SLineInfo structures. The function returns a pointer to the
4124
* list without the comments.
4126
static TLineInfoPtr s_RemoveCommentsFromTokens (TLineInfoPtr list)
4129
int num_comment_starts;
4135
num_comment_starts = 0;
4136
in_comment = eFalse;
4137
for (lip = list; lip != NULL; lip = lip->next) {
4138
if (lip->data == NULL) {
4139
lip->delete_me = eTrue;
4143
for (cp = lip->data; *cp != 0; cp++) {
4146
s_StringLeftShift (lip->data, cp + 1);
4149
s_StringLeftShift (cp_r, cp + 1);
4151
if (cp_r > lip->data) {
4153
while (cp_r >= lip->data && *cp_r != '[') {
4156
if (cp_r < lip->data) {
4163
if (num_comment_starts > 0) {
4164
num_comment_starts --;
4166
} else if (*cp == '[') {
4168
num_comment_starts ++;
4172
if (num_comment_starts == 0) {
4173
in_comment = eFalse;
4175
lip->delete_me = eTrue;
4177
} else if (num_comment_starts > 0) {
4178
cp_r = strchr (lip->data, '[');
4184
if (lip->data [0] == 0) {
4185
lip->delete_me = eTrue;
4189
list = s_DeleteLineInfos (list);
4194
/* This function removes Nexus comments from a linked list of SLineInfo
4195
* structures. The function returns a pointer to the list without the
4198
static TLineInfoPtr s_RemoveNexusCommentsFromTokens (TLineInfoPtr list)
4200
TLineInfoPtr lip, start_lip, end_lip;
4205
while (lip != NULL) {
4206
if (s_StringICmp (lip->data, "#NEXUS") == 0) {
4209
while (end_lip != NULL
4210
&& s_StringICmp (end_lip->data, "matrix") != 0) {
4211
end_lip = end_lip->next;
4213
if (end_lip != NULL) {
4214
while (start_lip != end_lip) {
4215
start_lip->delete_me = eTrue;
4216
start_lip = start_lip->next;
4218
end_lip->delete_me = eTrue;
4219
lip = end_lip->next;
4227
list = s_DeleteLineInfos (list);
4232
/* This function finds the number of characters that occur most frequently
4233
* in a token and returns a pointer to a SSizeInfo structure that
4234
* describes the most common length and the number of times it appears.
4237
s_FindMostFrequentlyOccurringTokenLength
4241
TSizeInfoPtr list_ptr, new_list, best_ptr, return_best;
4244
for (list_ptr = list; list_ptr != NULL; list_ptr = list_ptr->next) {
4245
if (not_this_size != list_ptr->size_value) {
4246
new_list = s_AddSizeInfoAppearances (new_list,
4247
list_ptr->size_value,
4248
list_ptr->num_appearances);
4251
best_ptr = s_GetMostPopularSizeInfo (new_list);
4253
if (best_ptr != NULL) {
4254
return_best = s_SizeInfoNew (NULL);
4255
if (return_best != NULL) {
4256
return_best->size_value = best_ptr->size_value;
4257
return_best->num_appearances = best_ptr->num_appearances;
4260
s_SizeInfoFree (new_list);
4265
/* This function examines all instances of an anchor pattern in the data
4266
* and checks to see if the line immediately after the anchor pattern should
4267
* be used as part of the anchor pattern. This function exists because
4268
* frequently, but not always, contiguous data will consist of multiple lines
4269
* of data of the same length (for example, 80 characters), followed by one
4270
* shorter line with the remaining data. We must also make sure that we do
4271
* not accidentally include the ID of the next sequence in the data of the
4272
* previous sequence.
4275
s_ExtendAnchorPattern
4276
(SLengthListPtr anchorpattern,
4277
TSizeInfoPtr line_lengths)
4279
TSizeInfoPtr last_line_lengths, sip, sip_next, twoafter;
4280
int best_last_line_length;
4281
int anchor_line_length;
4283
if (anchorpattern == NULL
4284
|| anchorpattern->lengthrepeats == NULL
4285
|| line_lengths == NULL) {
4289
last_line_lengths = NULL;
4290
anchor_line_length = anchorpattern->lengthrepeats->size_value;
4292
/* also check to make sure that there's more than one line between
4293
* this pattern and the next pattern, otherwise the next line is the
4294
* ID for the next pattern and shouldn't be included in the anchor
4296
for (sip = line_lengths; sip != NULL; sip = sip->next) {
4297
if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
4298
sip_next = sip->next;
4299
if (sip_next != NULL
4300
&& sip_next->size_value > 0
4301
&& sip_next->size_value != anchor_line_length
4302
&& ((twoafter = sip_next->next) == NULL
4303
|| twoafter->size_value != anchor_line_length))
4305
last_line_lengths = s_AddSizeInfo (last_line_lengths,
4306
sip_next->size_value);
4310
best_last_line_length = s_GetMostPopularSize (last_line_lengths);
4311
if (best_last_line_length > 0) {
4312
s_AddLengthRepeat (anchorpattern, best_last_line_length);
4314
s_SizeInfoFree (last_line_lengths);
4318
/* This function looks for the most frequently occurring pattern, where a
4319
* pattern is considered to be N contiguous tokens of M characters. The
4320
* function then checks to see if there is usually a token of a particular
4321
* length that immediately follows this pattern that is not the ID for the
4322
* next sequence. If so, this line length is added to the pattern.
4323
* The function returns a pointer to this pattern.
4325
static SLengthListPtr s_FindMostPopularPattern (TSizeInfoPtr list)
4327
SLengthListPtr patternlist, newpattern;
4328
TSizeInfoPtr sip, popular_line_length;
4329
SLengthListPtr index, best;
4330
int not_this_length;
4333
for (sip = list; sip != NULL; sip = sip->next) {
4334
if (sip->size_value > 0) {
4335
newpattern = s_LengthListNew (NULL);
4336
if (newpattern == NULL) {
4337
s_LengthListFree (patternlist);
4340
newpattern->num_appearances = 1;
4341
newpattern->lengthrepeats = s_SizeInfoNew (NULL);
4342
if (newpattern->lengthrepeats == NULL) {
4343
s_LengthListFree (patternlist);
4346
newpattern->lengthrepeats->size_value = sip->size_value;
4347
newpattern->lengthrepeats->num_appearances = sip->num_appearances;
4348
patternlist = s_AddLengthList (patternlist, newpattern);
4351
if (patternlist == NULL) {
4356
for (index = patternlist; index != NULL; index = index->next) {
4357
if (index->lengthrepeats->num_appearances < 2) {
4360
if (best==NULL || best->num_appearances < index->num_appearances) {
4362
} else if (best->num_appearances == index->num_appearances
4363
&& best->lengthrepeats->size_value <
4364
index->lengthrepeats->size_value) {
4369
/* Free data in list before best pattern */
4370
index = patternlist;
4371
while ( index != NULL && index->next != best ) {
4372
index = index->next;
4374
if (index != NULL) {
4376
s_LengthListFree (patternlist);
4378
/* Free data in list after best pattern */
4380
s_LengthListFree (best->next);
4384
popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);
4386
if (best != NULL && best->lengthrepeats != NULL
4387
&& popular_line_length != NULL
4388
&& best->lengthrepeats->size_value == popular_line_length->size_value)
4390
not_this_length = popular_line_length->size_value;
4391
s_SizeInfoFree (popular_line_length);
4392
popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
4397
|| (popular_line_length != NULL
4398
&& popular_line_length->size_value > best->lengthrepeats->size_value
4399
&& popular_line_length->num_appearances > best->num_appearances))
4402
best = s_LengthListNew (NULL);
4407
best->lengthrepeats = s_SizeInfoNew (NULL);
4408
if (best->lengthrepeats == NULL) {
4411
best->lengthrepeats->size_value = popular_line_length->size_value;
4412
best->lengthrepeats->num_appearances = 1;
4414
/* extend anchor pattern to include best length of last line */
4415
s_ExtendAnchorPattern (best, list);
4418
s_SizeInfoFree (popular_line_length);
4424
/* This function creates an SIntLink list to describe the locations
4425
* of occurrences of the anchorpattern in the SSizeInfo list.
4426
* The function returns a pointer to the SIntLink list.
4431
SLengthListPtr anchorpattern)
4434
TIntLinkPtr offset_list, new_offset;
4435
TSizeInfoPtr sip, prev_sip;
4437
if (list == NULL || anchorpattern == NULL) {
4443
for (sip = list; sip != NULL; sip = sip->next) {
4444
if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
4445
new_offset = s_IntLinkNew (line_counter, offset_list);
4446
if (new_offset == NULL) {
4447
s_IntLinkFree (offset_list);
4450
if (offset_list == NULL) {
4451
offset_list = new_offset;
4455
line_counter += sip->num_appearances;
4462
/* This function determines whether or not the number of expected sequence
4463
* characters are available starting at a token after line_start and stopping
4464
* at least one token before the next known sequence data block in the list.
4465
* If so, the function returns the number of the token at which the sequence
4466
* data begins. Otherwise the function returns -1.
4472
TIntLinkPtr next_offset,
4476
int offset, end_offset;
4478
int line_counter, num_chars;
4484
for (offset = sip_offset; offset < list->num_appearances; offset++) {
4485
line_counter = line_start + offset;
4486
num_chars = list->size_value * (list->num_appearances - offset);
4488
while (num_chars < pattern_length
4489
&& (next_offset == NULL || line_counter < next_offset->ival)
4490
&& sip->next != NULL)
4493
for (end_offset = 0;
4494
end_offset < sip->num_appearances
4495
&& num_chars < pattern_length
4496
&& (next_offset == NULL
4497
|| line_counter < next_offset->ival);
4500
num_chars += sip->size_value;
4504
if (num_chars == pattern_length) {
4505
return line_start + offset;
4512
/* This function examines the offset list and searches for holes where blocks
4513
* of sequence data without the exact expected formatting might exist. The
4514
* function adds the offsets of any new blocks to the list and returns a
4515
* pointer to the augmented offset list.
4519
(TIntLinkPtr offset_list,
4521
SLengthListPtr anchorpattern)
4525
TIntLinkPtr prev_offset, next_offset, new_offset;
4526
int line_counter, forecast_position, line_skip;
4527
EBool skipped_previous = eFalse;
4530
if (list == NULL || anchorpattern == NULL) {
4535
for (sip = anchorpattern->lengthrepeats; sip != NULL; sip = sip->next) {
4536
pattern_length += (sip->size_value * sip->num_appearances);
4538
if (pattern_length == 0) {
4543
next_offset = offset_list;
4546
while (sip != NULL) {
4547
/* if we are somehow out of synch, don't get caught in infinite loop */
4548
if (next_offset != NULL && line_counter > next_offset->ival) {
4549
next_offset = next_offset->next;
4550
} else if (next_offset != NULL && line_counter == next_offset->ival) {
4551
skipped_previous = eFalse;
4552
prev_offset = next_offset;
4553
next_offset = next_offset->next;
4554
/* advance sip and line counter past the end of this pattern */
4556
while (num_chars < pattern_length && sip != NULL) {
4557
num_chars += sip->size_value * sip->num_appearances;
4558
line_counter += sip->num_appearances;
4561
} else if (skipped_previous) {
4563
while (sip != NULL && line_skip < sip->num_appearances
4564
&& (next_offset == NULL
4565
|| line_counter < next_offset->ival)) {
4566
/* see if we can build a pattern that matches the pattern
4569
forecast_position = s_ForecastPattern (line_counter,
4571
next_offset, line_skip,
4573
if (forecast_position > 0) {
4574
new_offset = s_IntLinkNew (forecast_position, NULL);
4575
if (new_offset == NULL) {
4578
if (prev_offset == NULL) {
4579
new_offset->next = offset_list;
4580
offset_list = new_offset;
4582
new_offset->next = next_offset;
4583
prev_offset->next = new_offset;
4585
prev_offset = new_offset;
4586
/* now advance sip and line counter past the end
4587
* of the pattern we have just created
4590
while (num_chars < pattern_length && sip != NULL) {
4592
line_skip < sip->num_appearances
4593
&& num_chars < pattern_length;
4596
num_chars += sip->size_value;
4599
if (line_skip == sip->num_appearances) {
4605
line_counter += sip->num_appearances;
4611
skipped_previous = eTrue;
4612
line_counter += sip->num_appearances;
4620
/* This function finds the most frequently occurring distance between
4621
* two sequence data blocks and returns that value.
4623
static int s_GetMostPopularPatternLength (TIntLinkPtr offset_list)
4625
int line_counter, best_length;
4626
TSizeInfoPtr pattern_length_list;
4629
if (offset_list == NULL) {
4634
pattern_length_list = NULL;
4635
for (offset = offset_list; offset != NULL; offset = offset->next) {
4636
if (line_counter != -1) {
4637
pattern_length_list = s_AddSizeInfo (pattern_length_list,
4638
offset->ival - line_counter);
4640
line_counter = offset->ival;
4642
best_length = s_GetMostPopularSize (pattern_length_list);
4643
s_SizeInfoFree (pattern_length_list);
4648
/* This function finds the most frequently appearing number of characters
4649
* in a block of sequence data and returns that value.
4652
s_GetBestCharacterLength
4653
(TLineInfoPtr token_list,
4654
TIntLinkPtr offset_list,
4658
TIntLinkPtr prev_offset, new_offset;
4659
int line_diff, num_chars, best_num_chars;
4660
TSizeInfoPtr pattern_length_list = NULL;
4662
if (token_list == NULL || offset_list == NULL || block_length < 1) {
4665
/* get length of well-formatted block size */
4668
for (new_offset = offset_list;
4669
new_offset != NULL && lip != NULL;
4670
new_offset = new_offset->next)
4672
if (prev_offset == NULL) {
4673
/* skip first tokens */
4675
line_diff < new_offset->ival && lip != NULL;
4681
if (prev_offset != NULL) {
4684
line_diff < new_offset->ival - prev_offset->ival
4688
if (line_diff < new_offset->ival - prev_offset->ival - 1) {
4689
num_chars += strlen (lip->data);
4693
if (new_offset->ival - prev_offset->ival == block_length) {
4694
pattern_length_list = s_AddSizeInfo (pattern_length_list,
4698
prev_offset = new_offset;
4700
best_num_chars = s_GetMostPopularSize (pattern_length_list);
4701
if (best_num_chars == 0 && pattern_length_list != NULL) {
4702
best_num_chars = pattern_length_list->size_value;
4704
s_SizeInfoFree (pattern_length_list);
4705
pattern_length_list = NULL;
4706
return best_num_chars;
4711
s_CountCharactersBetweenOffsets
4714
int desired_num_chars)
4716
int line_diff, num_chars, total_chars, pattern_length, num_starts;
4718
TIntLinkPtr length_list, start_list, start_ptr, length;
4719
int start_of_unknown;
4720
int num_additional_offsets_needed;
4722
if (list == NULL || distance == 0 || desired_num_chars == 0) {
4726
/* because the first offset is the start of a known pattern, we should
4727
* skip to the end of that pattern and start looking for additional
4731
for (lip = list, line_diff = 0;
4732
lip != NULL && line_diff < distance
4733
&& total_chars < desired_num_chars;
4734
lip = lip->next, line_diff++) {
4735
num_chars = strlen (lip->data);
4736
total_chars += num_chars;
4738
while (lip != NULL && line_diff < distance && s_IsBlank (lip->data)) {
4742
/* skip over line we would need for ID */
4748
if (lip == NULL || line_diff == distance) {
4753
start_of_unknown = line_diff;
4758
lip != NULL && line_diff < distance;
4759
lip = lip->next, line_diff++)
4761
num_chars = strlen (lip->data);
4762
length = s_IntLinkNew (num_chars, length_list);
4763
if (length_list == NULL) {
4764
length_list = length;
4766
total_chars += num_chars;
4769
/* how many offsets do we need? */
4770
num_additional_offsets_needed = (total_chars / desired_num_chars);
4771
if (num_additional_offsets_needed == 0) {
4775
/* Find all the places you could start and get the exact right number
4781
for (start_ptr = length_list, line_diff = start_of_unknown;
4782
start_ptr != NULL && line_diff < distance
4783
&& pattern_length < distance - line_diff ;
4784
start_ptr = start_ptr->next, line_diff++) {
4785
num_chars = start_ptr->ival;
4787
length = start_ptr->next;
4788
while (num_chars < desired_num_chars
4789
&& pattern_length + line_diff < distance
4792
num_chars += length->ival;
4794
length = length->next;
4796
if (num_chars == desired_num_chars) {
4797
length = s_IntLinkNew (line_diff, start_list);
4798
if (start_list == NULL) {
4799
start_list = length;
4805
/* now select best set of start points */
4807
s_IntLinkFree (length_list);
4808
s_IntLinkFree (start_list);
4813
/* This function inserts new block locations into the offset_list
4814
* by looking for likely starts of abnormal patterns.
4816
static void s_InsertNewOffsets
4817
(TLineInfoPtr token_list,
4818
TIntLinkPtr offset_list,
4823
TLineInfoPtr lip, prev_start;
4824
TIntLinkPtr prev_offset, new_offset, splice_offset;
4825
int line_diff, num_chars, line_start;
4827
if (token_list == NULL || offset_list == NULL
4828
|| block_length < 1 || best_num_chars < 1)
4835
for (new_offset = offset_list;
4836
new_offset != NULL && lip != NULL;
4837
new_offset = new_offset->next) {
4838
if (prev_offset == NULL) {
4839
/* just advance through tokens */
4841
line_diff < new_offset->ival && lip != NULL;
4846
if (new_offset->ival - prev_offset->ival == block_length) {
4847
/* just advance through tokens */
4849
line_diff < new_offset->ival - prev_offset->ival
4855
/* look for intermediate breaks */
4859
line_diff < new_offset->ival - prev_offset->ival
4860
&& lip != NULL && num_chars < best_num_chars;
4862
num_chars += strlen (lip->data);
4868
/* set new offset at first line of next pattern */
4871
if (line_diff < new_offset->ival - prev_offset->ival) {
4872
line_start = line_diff + prev_offset->ival;
4873
/* advance token pointer to new piece */
4874
while (line_diff < new_offset->ival - prev_offset->ival
4880
/* insert new offset value */
4881
splice_offset = s_IntLinkNew (line_start, NULL);
4882
if (splice_offset == NULL) {
4885
splice_offset->next = new_offset;
4886
prev_offset->next = splice_offset;
4888
s_CountCharactersBetweenOffsets (lip,
4889
new_offset->ival - splice_offset->ival,
4894
prev_offset = new_offset;
4897
/* iterate through the last block */
4899
line_diff < block_length && lip != NULL;
4904
/* if we have room for one more sequence, or even most of one more sequence, add it */
4905
if (lip != NULL && ! s_SkippableString (lip->data)) {
4906
splice_offset = s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
4911
/* This function returns true if the string contains digits, false otherwise */
4912
static EBool s_ContainsDigits (char *data)
4916
if (data == NULL) return eFalse;
4917
for (cp = data; *cp != 0; cp++) {
4918
/* isdigit is only defined for unsigned char values (or EOF) */
if (isdigit ((unsigned char) *cp)) {
4926
/* This function processes the alignment file data by dividing the original
4927
* lines into pieces based on whitespace and looking for patterns of length
4930
static void s_ProcessAlignFileRawByLengthPattern (SAlignRawFilePtr afrp)
4932
TLineInfoPtr token_list;
4933
SLengthListPtr list;
4935
SLengthListPtr anchorpattern[2];
4936
TIntLinkPtr offset_list;
4940
if (afrp == NULL || afrp->line_list == NULL) {
4944
token_list = s_BuildTokenList (afrp->line_list);
4945
token_list = s_RemoveCommentsFromTokens (token_list);
4946
token_list = s_RemoveNexusCommentsFromTokens (token_list);
4948
list = s_LengthListNew ( NULL );
4949
for (lip = token_list;
4950
lip != NULL && ! s_FoundStopLine (lip->data);
4953
if (s_SkippableString (lip->data) || s_ContainsDigits(lip->data)) {
4954
s_AddLengthRepeat (list, 0);
4956
s_AddLengthRepeat (list, strlen (lip->data));
4960
anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
4961
anchorpattern [1] = NULL;
4962
if (anchorpattern [0] == NULL || anchorpattern[0]->lengthrepeats == NULL) {
4966
/* find anchor patterns in original list,
4967
* find distances between anchor patterns
4969
offset_list = s_CreateOffsetList (list->lengthrepeats, anchorpattern[0]);
4970
offset_list = s_AugmentOffsetList (offset_list,
4971
list->lengthrepeats,
4974
/* resolve unusual distances between anchor patterns */
4975
best_length = s_GetMostPopularPatternLength (offset_list);
4976
if (best_length < 1 && offset_list != NULL && offset_list->next != NULL) {
4977
best_length = offset_list->next->ival - offset_list->ival;
4979
best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
4981
s_InsertNewOffsets (token_list, offset_list, best_length, best_num_chars,
4984
/* use token before each anchor pattern as ID, use tokens for distance
4985
* between anchor patterns for sequence data
4987
s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
4988
anchorpattern, afrp);
4990
s_LengthListFree (anchorpattern[0]);
4991
s_LengthListFree (list);
4992
s_LineInfoFree (token_list);
4996
/* The following functions are used to convert data from the internal
4997
* representation into the form that will be passed to the calling
4998
* program. Information from the ID strings is parsed to remove
4999
* definition lines and organism information, the gap characters are
5000
* standardized to '-', the missing characters are standardized to 'N',
5001
* match characters are replaced with characters from the first record,
5002
* and bad characters are reported.
5005
/* This function allocates memory for a new SAlignmentFile structure
5006
* and initializes its member variables.
5008
extern TAlignmentFilePtr AlignmentFileNew (void)
5010
TAlignmentFilePtr afp;
5012
afp = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
5016
afp->num_sequences = 0;
5017
afp->num_organisms = 0;
5018
afp->num_deflines = 0;
5019
afp->num_segments = 0;
5021
afp->sequences = NULL;
5022
afp->organisms = NULL;
5023
afp->deflines = NULL;
5028
/* This function frees the memory associated with an SAlignmentFile
5029
* structure and its member variables.
5031
extern void AlignmentFileFree (TAlignmentFilePtr afp)
5038
if (afp->ids != NULL) {
5039
for (index = 0; index < afp->num_sequences; index++) {
5040
free (afp->ids [index]);
5045
if (afp->sequences != NULL) {
5046
for (index = 0; index < afp->num_sequences; index++) {
5047
free (afp->sequences [index]);
5049
free (afp->sequences);
5050
afp->sequences = NULL;
5052
if (afp->organisms != NULL) {
5053
for (index = 0; index < afp->num_organisms; index++) {
5054
free (afp->organisms [index]);
5056
free (afp->organisms);
5057
/* reset the pointer that was just freed */
afp->organisms = NULL;
5059
if (afp->deflines != NULL) {
5060
for (index = 0; index < afp->num_deflines; index++) {
5061
free (afp->deflines [index]);
5063
free (afp->deflines);
5064
afp->deflines = NULL;
5070
/* This function parses the identifier string used by the alignment file
5071
* to identify a sequence to find the portion of the string that is actually
5072
* an ID, as opposed to organism information or definition line.
5074
static char * s_GetIdFromString (char * str)
5085
cp += strspn (str, " >\t");
5086
len = strcspn (cp, " \t\r\n");
5090
id = malloc (len + 1);
5094
strncpy (id, cp, len);
5100
/* This function pulls defline information from the ID string, if there is
5103
static char * s_GetDeflineFromIdString (char * str)
5113
cp += strspn (str, " >\t");
5114
len = strcspn (cp, " \t\r\n");
5119
len = strspn (cp, " \t\r\n");
5131
/* This function takes the ID strings read from the file and parses them
5132
* to obtain a defline (if there is extra text after the ID and/or
5133
* organism information) and to obtain the actual ID for the sequence.
5135
static EBool s_ReprocessIds (SAlignRawFilePtr afrp)
5137
TStringCountPtr list, scp;
5138
TAlignRawSeqPtr arsp;
5150
lip = afrp->deflines;
5151
for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
5152
if (arsp->id_lines != NULL) {
5153
line_num = arsp->id_lines->ival;
5157
s_RemoveOrganismCommentFromLine (arsp->id);
5158
id = s_GetIdFromString (arsp->id);
5160
defline = s_GetDeflineFromIdString (arsp->id);
5161
afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
5164
afrp->num_deflines ++;
5168
list = s_AddStringCount (arsp->id, line_num, list);
5171
for (scp = list; scp != NULL; scp = scp->next) {
5172
if (scp->num_appearances > 1) {
5174
s_ReportRepeatedId (scp, afrp->report_error,
5175
afrp->report_error_userdata);
5182
/* This function reports unacceptable characters in a sequence. Frequently
5183
* there will be more than one character of the same kind (for instance,
5184
* when the user has incorrectly specified a gap character), so repeated
5185
* characters are reported together. The function advances the data
5186
* position in the SLineInfoReader structure lirp, and returns the
5187
* current data position for lirp.
5190
s_ReportRepeatedBadCharsInSequence
5191
(TLineInfoReaderPtr lirp,
5194
FReportErrorFunction report_error,
5195
void * report_error_userdata)
5197
int bad_line_num, bad_line_offset;
5199
char bad_char, curr_char;
5202
bad_line_num = s_LineInfoReaderGetCurrentLineNumber (lirp);
5203
bad_line_offset = s_LineInfoReaderGetCurrentLineOffset (lirp);
5204
bad_char = *lirp->curr_line_pos;
5206
data_position = lirp->data_pos + 1;
5207
while ((curr_char = s_FindNthDataChar (lirp, data_position)) == bad_char) {
5211
s_ReportBadCharError (id, bad_char, num_bad_chars,
5212
bad_line_offset, bad_line_num, reason,
5213
report_error, report_error_userdata);
5214
return data_position;
5218
/* This function does context-sensitive replacement of the missing,
5219
* match, and gap characters and also identifies bad characters.
5220
* Gap characters found in the wrong location in the sequence are
5221
* considered an error. Characters that are not missing, match, or
5222
* gap characters and are not in the specified sequence alphabet are
5223
* reported as errors. Match characters in the first sequence are also
5224
* reported as errors.
5225
* The function will return eTrue if any errors were found, or eFalse
5226
* if there were no errors.
5229
s_FindBadDataCharsInSequence
5230
(TAlignRawSeqPtr arsp,
5231
TAlignRawSeqPtr master_arsp,
5232
TSequenceInfoPtr sip,
5234
FReportErrorFunction report_error,
5235
void * report_error_userdata)
5237
TLineInfoReaderPtr lirp, master_lirp;
5241
char curr_char, master_char;
5242
EBool found_middle_start;
5243
EBool rval = eFalse;
5244
EBool match_not_in_beginning_gap;
5245
EBool match_not_in_end_gap;
5247
if (arsp == NULL || master_arsp == NULL || sip == NULL) {
5250
lirp = s_LineInfoReaderNew (arsp->sequence_data);
5254
if (arsp != master_arsp) {
5255
master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
5256
if (master_lirp == NULL) {
5257
s_LineInfoReaderFree (lirp);
5264
if (strcspn (sip->beginning_gap, sip->match)
5265
== strlen (sip->beginning_gap)) {
5266
match_not_in_beginning_gap = eTrue;
5268
match_not_in_beginning_gap = eFalse;
5271
if (strcspn (sip->end_gap, sip->match) == strlen (sip->end_gap)) {
5272
match_not_in_end_gap = eTrue;
5274
match_not_in_end_gap = eFalse;
5277
/* First, find middle start and end positions and report characters
5278
* that are not beginning gap before the middle
5280
found_middle_start = eFalse;
5282
curr_char = s_FindNthDataChar (lirp, data_position);
5283
while (curr_char != 0) {
5284
if (strchr (sip->alphabet, curr_char) != NULL) {
5285
if (! found_middle_start) {
5286
middle_start = data_position;
5287
found_middle_start = eTrue;
5289
middle_end = data_position + 1;
5291
} else if (! found_middle_start) {
5292
if (match_not_in_beginning_gap
5293
&& strchr (sip->match, curr_char) != NULL)
5295
middle_start = data_position;
5296
found_middle_start = eTrue;
5297
middle_end = data_position + 1;
5299
} else if (strchr (sip->beginning_gap, curr_char) == NULL) {
5300
/* Report error - found character that is not beginning gap
5302
data_position = s_ReportRepeatedBadCharsInSequence (lirp,
5304
"expect only beginning gap characters here",
5305
report_error, report_error_userdata);
5308
*lirp->curr_line_pos = '-';
5312
if (match_not_in_end_gap
5313
&& strchr (sip->match, curr_char) != NULL)
5315
middle_end = data_position + 1;
5319
curr_char = s_FindNthDataChar (lirp, data_position);
5322
if (! found_middle_start) {
5323
if (num_segments > 1)
5329
s_ReportMissingSequenceData (arsp->id,
5330
report_error, report_error_userdata);
5331
s_LineInfoReaderFree (lirp);
5337
/* Now complain about bad middle characters */
5338
data_position = middle_start;
5339
while (data_position < middle_end)
5341
curr_char = s_FindNthDataChar (lirp, data_position);
5342
while (data_position < middle_end
5343
&& strchr (sip->alphabet, curr_char) != NULL) {
5345
curr_char = s_FindNthDataChar (lirp, data_position);
5347
if (curr_char == 0 || data_position >= middle_end) {
5348
/* do nothing, done with middle */
5349
} else if (strchr (sip->missing, curr_char) != NULL) {
5350
*lirp->curr_line_pos = 'N';
5352
} else if (strchr (sip->match, curr_char) != NULL) {
5353
master_char = s_FindNthDataChar (master_lirp, data_position);
5354
if (master_char == 0) {
5355
/* report error - unable to get master char */
5356
if (master_arsp == arsp) {
5357
data_position = s_ReportRepeatedBadCharsInSequence (lirp,
5359
"can't specify match chars in first sequence",
5360
report_error, report_error_userdata);
5362
data_position = s_ReportRepeatedBadCharsInSequence (lirp,
5364
"can't find source for match chars",
5365
report_error, report_error_userdata);
5369
*lirp->curr_line_pos = master_char;
5372
} else if (strchr (sip->middle_gap, curr_char) != NULL) {
5373
*lirp->curr_line_pos = '-';
5376
/* Report error - found bad character in middle */
5377
data_position = s_ReportRepeatedBadCharsInSequence (lirp,
5379
"expect only sequence, missing, match,"
5380
" and middle gap characters here",
5381
report_error, report_error_userdata);
5386
/* Now find and complain about end characters */
5387
data_position = middle_end;
5388
curr_char = s_FindNthDataChar (lirp, data_position);
5389
while (curr_char != 0) {
5390
if (strchr (sip->end_gap, curr_char) == NULL) {
5391
/* Report error - found bad character in end gap */
5392
data_position = s_ReportRepeatedBadCharsInSequence (lirp, arsp->id,
5393
"expect only end gap characters here",
5394
report_error, report_error_userdata);
5397
*lirp->curr_line_pos = '-';
5400
curr_char = s_FindNthDataChar (lirp, data_position);
5402
s_LineInfoReaderFree (lirp);
5403
s_LineInfoReaderFree (master_lirp);
5408
/* This function examines each sequence and replaces the special characters
5409
* and reports bad characters in each one. The function will return eTrue
5410
* if any of the sequences contained bad characters or eFalse if no errors
5414
s_s_FindBadDataCharsInSequenceList
5415
(SAlignRawFilePtr afrp,
5416
TSequenceInfoPtr sip)
5418
TAlignRawSeqPtr arsp;
5419
EBool rval = eFalse;
5421
if (afrp == NULL || afrp->sequences == NULL) {
5424
for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
5425
if (s_FindBadDataCharsInSequence (arsp, afrp->sequences, sip,
5428
afrp->report_error_userdata)) {
5436
/* This function examines the organisms listed for the alignment and determines
5437
* whether any of the organism names (including the associated comments) are
5440
static EBool s_AreOrganismsUnique (SAlignRawFilePtr afrp)
5442
TLineInfoPtr this_org, lip;
5443
TAlignRawSeqPtr arsp;
5446
if (afrp == NULL || afrp->num_organisms == 0
5447
|| afrp->organisms == NULL) {
5451
for (this_org = afrp->organisms;
5453
this_org = this_org->next) {
5454
lip = afrp->organisms;
5455
arsp = afrp->sequences;
5456
while (lip != NULL && lip != this_org
5457
&& strcmp (lip->data, this_org->data) != 0 && arsp != NULL) {
5461
if (lip != NULL && lip != this_org) {
5462
are_unique = eFalse;
5463
s_ReportRepeatedOrganismName (arsp->id, this_org->line_num,
5467
afrp->report_error_userdata);
5474
#if 0 /* this step was removed by indexer request */
5475
/* This function reports whether the definition lines are identical for
5476
* each sequence or not.
5478
static EBool s_AreDeflinesIdentical (SAlignRawFilePtr afrp)
5481
TStringCountPtr list;
5489
for (lip = afrp->deflines; lip != NULL; lip = lip->next) {
5490
list = s_AddStringCount (lip->data, lip->line_num, list);
5493
if (list != NULL && list->next != NULL) {
5495
s_ReportDefinitionLineMismatch (afrp->report_error,
5496
afrp->report_error_userdata);
5497
s_ReportDefinitionLines (list, afrp->report_error,
5498
afrp->report_error_userdata);
5500
s_StringCountFree (list);
5506
/* This function uses the contents of an SAlignRawFileData structure to
5507
* create an SAlignmentFile structure with the appropriate information.
5509
static TAlignmentFilePtr
5510
s_ConvertDataToOutput
5511
(SAlignRawFilePtr afrp,
5512
TSequenceInfoPtr sip)
5514
TAlignRawSeqPtr arsp;
5516
TSizeInfoPtr * lengths;
5518
TAlignmentFilePtr afp;
5522
if (afrp == NULL || sip == NULL || afrp->sequences == NULL) {
5525
afp = AlignmentFileNew ();
5530
afp->num_organisms = afrp->num_organisms;
5531
afp->num_deflines = afrp->num_deflines;
5532
afp->num_segments = afrp->num_segments;
5533
afp->num_sequences = 0;
5536
for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
5537
afp->num_sequences++;
5540
if (afp->num_sequences != afrp->num_organisms
5541
&& afp->num_sequences / afp->num_segments != afrp->num_organisms) {
5542
s_ReportMissingOrganismInfo (afrp->report_error,
5543
afrp->report_error_userdata);
5545
s_AreOrganismsUnique (afrp);
5548
afp->sequences = (char **)malloc (afp->num_sequences
5550
if (afp->sequences == NULL) {
5551
AlignmentFileFree (afp);
5554
afp->ids = (char **)malloc (afp->num_sequences * sizeof (char *));
5555
if (afp->ids == NULL) {
5556
AlignmentFileFree (afp);
5559
if (afp->num_organisms > 0) {
5560
afp->organisms = (char **) malloc (afp->num_organisms
5562
if (afp->organisms == NULL) {
5563
AlignmentFileFree (afp);
5567
if (afp->num_deflines > 0) {
5568
afp->deflines = (char **)malloc (afp->num_deflines
5570
if (afp->deflines == NULL) {
5571
AlignmentFileFree (afp);
5576
/* copy in deflines */
5577
for (lip = afrp->deflines, index = 0;
5578
lip != NULL && index < afp->num_deflines;
5579
lip = lip->next, index++) {
5580
if (lip->data == NULL) {
5581
afp->deflines [index] = NULL;
5583
afp->deflines [index] = strdup (lip->data);
5586
while (index < afp->num_deflines) {
5587
afp->deflines [index ++] = NULL;
5590
/* copy in organism information */
5591
for (lip = afrp->organisms, index = 0;
5592
lip != NULL && index < afp->num_organisms;
5593
lip = lip->next, index++) {
5594
afp->organisms [index] = strdup (lip->data);
5597
/* we need to store length information about different segments separately */
5598
lengths = (TSizeInfoPtr *) malloc (sizeof (TSizeInfoPtr) * afrp->num_segments);
5599
if (lengths == NULL) {
5600
AlignmentFileFree (afp);
5603
best_length = (int *) malloc (sizeof (int) * afrp->num_segments);
5604
if (best_length == NULL) {
5606
AlignmentFileFree (afp);
5609
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++) {
5610
lengths [curr_seg] = NULL;
5611
best_length [curr_seg] = 0;
5614
/* copy in sequence data */
5616
for (arsp = afrp->sequences, index = 0;
5617
arsp != NULL && index < afp->num_sequences;
5618
arsp = arsp->next, index++) {
5619
afp->sequences [index] =
5620
s_LineInfoMergeAndStripSpaces (arsp->sequence_data);
5622
if (afp->sequences [index] != NULL) {
5623
lengths [curr_seg] = s_AddSizeInfo (lengths [curr_seg], strlen (afp->sequences [index]));
5625
afp->ids [index] = strdup (arsp->id);
5627
if (curr_seg >= afrp->num_segments) {
5631
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
5633
best_length [curr_seg] = s_GetMostPopularSize (lengths [curr_seg]);
5634
if (best_length [curr_seg] == 0 && lengths [curr_seg] != NULL) {
5635
best_length [curr_seg] = lengths [curr_seg]->size_value;
5640
for (index = 0; index < afp->num_sequences; index++) {
5641
if (afp->sequences [index] == NULL) {
5642
s_ReportMissingSequenceData (afp->ids [index],
5644
afrp->report_error_userdata);
5645
} else if ((int) strlen (afp->sequences [index]) != best_length [curr_seg]) {
5646
s_ReportBadSequenceLength (afp->ids [index], best_length [curr_seg],
5647
strlen (afp->sequences [index]),
5649
afrp->report_error_userdata);
5652
if (curr_seg >= afrp->num_segments) {
5657
if (afrp->expected_num_sequence > 0
5658
&& afrp->expected_num_sequence != afp->num_sequences)
5660
s_ReportIncorrectNumberOfSequences (afrp->expected_num_sequence,
5663
afrp->report_error_userdata);
5665
if (afrp->expected_sequence_len > 0
5666
&& afrp->expected_sequence_len != best_length [0])
5668
s_ReportIncorrectSequenceLength (afrp->expected_sequence_len,
5671
afrp->report_error_userdata);
5675
for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
5677
s_SizeInfoFree (lengths [curr_seg]);
5685
/* This is the function called by the calling program to read an alignment
5686
* file. The readfunc argument is a function pointer supplied by the
5687
* calling program which this library will use to read in data from the
5688
* file one line at a time. The fileuserdata argument is a pointer to
5689
* data used by the calling program's readfunc function and will be passed
5690
* back with each call to readfunc.
5691
* The errfunc argument is a function pointer supplied by the calling
5692
* program for reporting errors. The erroruserdata argument is a pointer
5693
* to data used by the calling program's errfunc function and will be
5694
* passed back with each call to errfunc.
5695
* The sequence_info argument contains the sequence alphabet and missing,
5696
* match, and gap characters to use in interpreting the sequence data.
5698
extern TAlignmentFilePtr
5700
(FReadLineFunction readfunc,
5701
void * fileuserdata,
5702
FReportErrorFunction errfunc,
5703
void * erroruserdata,
5704
TSequenceInfoPtr sequence_info)
5706
SAlignRawFilePtr afrp;
5707
TAlignmentFilePtr afp;
5709
if (sequence_info == NULL || sequence_info->alphabet == NULL) {
5712
afrp = s_ReadAlignFileRaw ( readfunc, fileuserdata, sequence_info,
5713
errfunc, erroruserdata);
5718
if (afrp->block_size > 1) {
5719
s_ProcessAlignRawFileByBlockOffsets (afrp);
5720
} else if (afrp->marked_ids) {
5721
s_ProcessAlignFileRawForMarkedIDs (afrp);
5723
s_ProcessAlignFileRawByLengthPattern (afrp);
5726
s_ReprocessIds (afrp);
5728
#if 0 /* this step was removed by indexer request */
5729
/* Note - have to check deflines after reprocessing IDs */
5730
s_AreDeflinesIdentical (afrp);
5733
if (s_s_FindBadDataCharsInSequenceList (afrp, sequence_info)) {
5734
s_AlignFileRawFree (afrp);
5738
afp = s_ConvertDataToOutput (afrp, sequence_info);
5739
s_AlignFileRawFree (afrp);
5745
* ===========================================================================
5746
* $Log: alnread.c,v $
5747
* Revision 1.12 2004/09/17 12:21:48 bollin
5748
* allow all-gap segments in segmented alignments
5750
* Revision 1.11 2004/08/11 15:23:07 vakatov
5751
* Compilation warning fix (unused static func)
5753
* Revision 1.10 2004/05/20 19:40:24 bollin
5754
* Made chnages to allow reading of alignments of segmented sets.
5755
* Also added warnings for when organism lines may be present but improperly
5758
* Revision 1.9 2004/03/16 21:05:15 bollin
5759
* Added some improvements to the portion of the alignment reader that deals
5760
* with contiguous alignments that do not have a '>' at the beginning of each
5763
* Revision 1.8 2004/03/16 16:25:38 bollin
5764
* Added function to recognize a file as ASN.1 and reject immediately
5766
* Revision 1.7 2004/03/09 21:27:39 bollin
5767
* in s_InsertNewOffsets, if the list ends while searching for the next pattern, exit immediately (prevents NULL pointer access)
5769
* Revision 1.6 2004/03/04 19:15:07 bollin
5770
* file reading now skips over multi-line bracketed comments
5772
* Revision 1.5 2004/03/04 16:29:32 bollin
5773
* added skip of taxa comment for PAUP format alignment files
5775
* Revision 1.4 2004/02/10 16:15:13 bollin
5776
* now checks for unused lines when finding interleaved blocks, will reject and try other methods if unused lines found after first block found.
5778
* Revision 1.3 2004/02/05 16:29:32 bollin
5779
* smarter function for skipping NEXUS comment lines
5781
* Revision 1.2 2004/02/04 19:49:11 bollin
5782
* fixed infinite loop condition in s_AugmentOffsetList, properly skip over first non-space column when looking for interleaved block patterns in s_ReadAlignFileRaw
5784
* Revision 1.1 2004/02/03 16:47:02 ucko
5785
* Add Colleen Bollin's Toolkit-independent alignment reader.
5787
* Revision 1.38 2004/01/30 22:46:08 bollin
5788
* renamed defined variable, fixed typo in comment
5790
* Revision 1.37 2004/01/30 21:48:14 bollin
5791
* changes for compatibility with Windows
5793
* Revision 1.36 2004/01/30 21:33:41 bollin
5794
* replaced strncasecmp and strncase function calls
5796
* Revision 1.35 2004/01/29 19:16:27 bollin
5797
* use EBool for boolean values
5799
* Revision 1.34 2004/01/29 17:58:11 bollin
5800
* aligned assignment blocks in New functions
5802
* Revision 1.33 2004/01/29 17:43:40 bollin
5803
* added directory specification to alnread.h include line
5805
* Revision 1.32 2004/01/29 17:41:29 bollin
5806
* added comment block, id tags, log
5808
* ===========================================================================