1
// Conversion between the GenBank and Macke formats
6
// Copies characters of 'line', starting at position 'index', into 'pstring'
// while looking for '(' and ')' characters.
// NOTE(review): what happens on '(' / ')' and which value is returned is not
// shown by the statements below; the caller in get_atcc() keeps looping while
// the result is > 0 and feeds it back in as the next 'index' - confirm.
static int paren_string(char *line, char *pstring, int index) {
7
int len = str0len(line);
11
// 'index' is the read position in 'line', 'indk' the write position in 'pstring'
for (indk = 0; index < len; index++) {
13
pstring[indk++] = line[index];
14
if (line[index] == '(')
16
if (line[index] == ')')
21
// step back one position and terminate the collected string there
pstring[--indk] = '\0';
25
// Copies characters from 'line', beginning at 'index', into 'temp'.
// 'temp' must be large enough for the copied part; no bound other than the
// length of 'line' is checked in the loop header.
static void get_atcc_string(const char *line, char *temp, int index) {
26
// Get the rest of the line until reaching certain terminators, such as ';', ',', '.',...
28
int len = str0len(line);
32
// read position ('index') and write position ('indk') advance together
for (indk = 0; index < len; index++, indk++) {
33
temp[indk] = line[index];
34
if (temp[indk] == '(')
36
if (temp[indk] == ')')
41
// terminators (';', '.', ',', '/', newline) are only tested while paren_num is 0
// (presumably paren_num counts currently open parentheses - confirm)
else if (paren_num == 0 && (temp[indk] == ';' || temp[indk] == '.' || temp[indk] == ',' || temp[indk] == '/' || temp[indk] == '\n'))
47
// Searches 'source' for the culture collection abbreviations listed in CC and
// builds a string of the form "<abbreviation> <text following it>".
// Returns a copy of the local 'atcc' buffer made with nulldup().
static char *get_atcc(const Macke& macke, char *source) {
48
static int cc_num = 16;
49
// known culture collection abbreviations (cc_num entries)
static const char *CC[16] = {
50
"ATCC", "CCM", "CDC", "CIP", "CNCTC",
51
"DSM", "EPA", "JCM", "NADC", "NCDO", "NCTC", "NRCC",
52
"NRRL", "PCC", "USDA", "VPI"
55
int indi, indj, index;
57
char buffer[LONGTEXT], temp[LONGTEXT], pstring[LONGTEXT];
61
// try every abbreviation in turn
for (indi = 0; indi < cc_num; indi++) {
63
// paren_string() fills 'pstring' and yields the position to continue from
while ((index = paren_string(source, pstring, index)) > 0) {
64
if ((indj = find_pattern(pstring, CC[indi])) >= 0) {
66
// move behind the abbreviation ...
indj += str0len(CC[indi]);
68
// ... and behind any white space following it
indj = Skip_white_space(pstring, indj);
70
get_atcc_string(pstring, buffer, indj);
71
sprintf(temp, "%s %s", CC[indi], buffer);
72
length = str0len(atcc);
81
// append eoln to the atcc string
82
length = str0len(atcc);
84
// NOTE(review): 'length' was measured on the local 'atcc' but is used here
// as an index into 'macke.atcc' - confirm this is intended
macke.atcc[length] = '\0';
87
return (nulldup(atcc));
90
// Determines the culture collection entry for a GenBank record.
// First tries gbk.source; if that yields no content and macke.strain has
// content, retries with the strain wrapped in parentheses.
static char *genbank_get_atcc(const GenBank& gbk, const Macke& macke) {
91
// Get atcc from SOURCE line in Genbank data file.
94
// get culture collection #
95
if (has_content(gbk.source)) {
96
atcc = get_atcc(macke, gbk.source);
98
// fall back to the strain if the SOURCE line gave no result
if (!has_content(atcc) && has_content(macke.strain)) {
99
// add () to macke strain to be processed correctly
101
sprintf(temp, "(%s)", macke.strain);
102
atcc = get_atcc(macke, temp);
107
// Builds the remark text "<end35>' end complete: Yes|No".
// end35: character printed in front of "' end" (callers pass '3' or '5').
// yn:    ' ' means "nothing to report" (function returns at once),
//        'y' produces "Yes", any other value produces "No".
void Macke::add_35end_remark(char end35, char yn) {
108
if (yn == ' ') return;
110
char *content = strf("%c' end complete: %s\n", end35, yn == 'y' ? "Yes" : "No");
115
// Passes each field of a GenBank reference to add_remark_if_content(),
// using the keys "ref:", "auth:", "jour:", "title:" and "standard:".
void Macke::add_remarks_from(const GenbankRef& ref) {
116
add_remark_if_content("ref:", ref.ref);
117
add_remark_if_content("auth:", ref.author);
118
add_remark_if_content("jour:", ref.journal);
119
add_remark_if_content("title:", ref.title);
120
add_remark_if_content("standard:", ref.standard);
123
// Passes the organism information fields to add_remark_if_content(),
// one call per field, each with its own remark key.
void Macke::add_remarks_from(const OrgInfo& orginf) {
124
add_remark_if_content("Source of strain:", orginf.source); // copy source of strain
125
add_remark_if_content("Former name:", orginf.formname); // copy former name
126
add_remark_if_content("Alternate name:", orginf.nickname); // copy alternate name
127
add_remark_if_content("Common name:", orginf.commname); // copy common name
128
add_remark_if_content("Host organism:", orginf.hostorg); // copy host organism
131
// Adds remarks for RDP comments: first the organism and sequence information,
// then the free-form text in comments.others, which is walked character by
// character.
void Macke::add_remarks_from(const RDP_comments& comments) {
132
add_remarks_from(comments.orginf);
133
add_remarks_from(comments.seqinf);
135
// other comments, not RDP DataBase specially defined
136
int len = str0len(comments.others);
138
// indi: read position in comments.others, indj: write position in temp
for (int indi = 0, indj = 0; indi < len; indi++) {
140
temp[indj++] = comments.others[indi];
141
// a newline (or NUL) marks the end of the piece collected in temp
if (comments.others[indi] == '\n' || comments.others[indi] == '\0') {
150
// Passes the sequence information to add_remark_if_content() (RDP ID and
// sequencing methods) and to add_35end_remark() (3' and 5' completeness flags).
void Macke::add_remarks_from(const SeqInfo& seqinf) {
151
add_remark_if_content("RDP ID:", seqinf.RDPid); // copy RDP ID
152
add_remark_if_content("Sequencing methods:", seqinf.methods); // copy methods
154
add_35end_remark('3', seqinf.comp3);
155
add_35end_remark('5', seqinf.comp5);
158
// Adds remarks for a whole GenBank entry: the "ref:" field of reference 0,
// all fields of the references 1..n-1, the keywords, the accession and
// finally the RDP comments.
void Macke::add_remarks_from(const GenBank& gbk) {
159
// Create Macke remarks.
161
// REFERENCE the first reference
163
// only the 'ref' field of reference 0 is passed on here
add_remark_if_content("ref:", gbk.get_ref(0).ref);
165
// The rest of the REFERENCES
166
for (int indi = 1; indi < gbk.get_refcount(); indi++) {
167
add_remarks_from(gbk.get_ref(indi));
170
add_remark_if_content("KEYWORDS:", gbk.keywords); // copy keywords as remark
171
add_remark_if_content("GenBank ACCESSION:", gbk.accession); // copy accession as remark when genbank entry also exists.
172
add_remarks_from(gbk.comments);
175
// Cuts 'subspecies' (modified in place) at the position where strain
// information starts, leaving a newline as the last character.
static void correct_subspecies(char *subspecies) {
176
// Remove the strain information in subspecies which is sometimes mistakenly written into it.
179
// look for the literal "str\n" first, then fall back to find_strain()
if ((indj = find_pattern(subspecies, "str\n")) >= 0 || (indj = find_strain(subspecies, 0)) >= 0) {
180
ca_assert(subspecies[indj-1] == ' '); // assume to overwrite a space
181
// replace the space in front of the match by a newline and end the string after it
subspecies[indj - 1] = '\n';
182
subspecies[indj] = '\0';
186
// Compares an already stored value 'var' with a newly found value 'New'.
// If 'var' has content and differs from 'New', warning 20 is issued, naming
// the kind of information ('what') and both values.
static void check_consistency(const char *what, char* const& var, const char *New) {
187
if (has_content(var)) {
188
if (!str_equal(var, New)) {
189
warningf(20, "Inconsistent %s definitions detected:\n"
191
"and %s", what, var, New);
199
// Copies a part of 'line' into 'temp'. Copying starts at the first
// non-white-space position at or after 'index'.
static void get_string(char *temp, const char *line, int index) {
200
// Get the rest of the line until reaching certain terminators,
201
// such as ';', ',', '.',...
202
// Always append "\n" at the end of the result.
204
index = Skip_white_space(line, index);
206
int len = str0len(line);
210
// 'index' reads from 'line', 'indk' writes into 'temp'
for (indk = 0; index < len; index++, indk++) {
211
temp[indk] = line[index];
212
if (temp[indk] == '(')
214
if (temp[indk] == ')')
219
// a newline always matches here; a ';' only while paren_num is 0
else if (temp[indk] == '\n' || (paren_num == 0 && temp[indk] == ';'))
222
// checks whether the character before the current position is an end mark
if (indk > 1 && is_end_mark(temp[indk - 1]))
228
// Extracts a subspecies string from 'from' (starting at 'indj'), strips strain
// information from it and compares it with the value already in 'subspecies'.
static void copy_subspecies_and_check_consistency(char* const& subspecies, const char *from, int indj) {
230
get_string(temp, from, indj);
231
correct_subspecies(temp);
232
check_consistency("subspecies", subspecies, temp);
234
// Extracts a strain string from 'from' (starting at 'indj') and compares it
// with the value already in 'strain'.
static void copy_strain_and_check_consistency(char* const& strain, const char *from, int indj) {
236
get_string(temp, from, indj);
237
check_consistency("strain", strain, temp);
240
// Looks for strain information in 'from' and, if skip_strain() reports a
// position (>= 0), hands it to copy_strain_and_check_consistency().
// Does nothing when 'from' has no content.
static void check_strain_from(char* const& strain, const char *from) {
241
if (has_content(from)) {
242
int indj = skip_strain(from, ' ');
243
if (indj >= 0) copy_strain_and_check_consistency(strain, from, indj);
247
// Collects the strain from a GenBank record into a local buffer and returns a
// copy of that buffer made with nulldup().
static char *genbank_get_strain(const GenBank& gbk) {
248
// Get strain from DEFINITION, COMMENT or SOURCE line in Genbank data file.
249
char strain[LONGTEXT];
253
// 1. comments: look for "strain=" behind the "*source:" marker
if (has_content(gbk.comments.others)) {
254
int indj = find_pattern(gbk.comments.others, "*source:");
256
// indk is relative to the "*source:" position, hence indj+indk below
int indk = skip_pattern(gbk.comments.others + indj, "strain=");
257
if (indk >= 0) copy_strain_and_check_consistency(strain, gbk.comments.others, indj+indk);
261
// 2. DEFINITION and 3. SOURCE
check_strain_from(strain, gbk.definition);
262
check_strain_from(strain, gbk.source);
264
return nulldup(strain);
267
// Collects the subspecies from a GenBank record into a local buffer and
// returns a copy of that buffer made with nulldup().
// The sources are consulted in the order DEFINITION, comments, SOURCE.
static char *genbank_get_subspecies(const GenBank& gbk) {
268
// Get subspecies information from SOURCE, DEFINITION, or COMMENT line of Genbank data file.
270
char subspecies[LONGTEXT];
272
// start with an empty result
subspecies[0] = '\0';
274
if (has_content(gbk.definition)) {
275
// text following "subsp. " in the definition
if ((indj = skip_pattern(gbk.definition, "subsp. ")) >= 0) {
276
copy_subspecies_and_check_consistency(subspecies, gbk.definition, indj);
279
if (has_content(gbk.comments.others)) {
280
if ((indj = find_pattern(gbk.comments.others, "*source:")) >= 0) {
281
// indk is relative to the "*source:" position, hence indj+indk below
int indk = skip_subspecies(gbk.comments.others + indj, '=');
283
copy_subspecies_and_check_consistency(subspecies, gbk.comments.others, indj+indk);
288
if (has_content(gbk.source)) {
289
if ((indj = skip_subspecies(gbk.source, ' ')) >= 0) {
290
copy_subspecies_and_check_consistency(subspecies, gbk.source, indj);
294
return nulldup(subspecies);
297
// Fills the references, keywords, accession and RDP comment fields of 'gbk'
// from the author/journal/title fields and the remarks of 'macke'.
// Each remark is dispatched on its key word; remarks with an unknown key are
// appended to comments.others.
// Expects 'gbk' to contain no references on entry (asserted).
static void mtog_decode_ref_and_remarks(const Macke& macke, GenBank& gbk) {
298
// Decode remarks of Macke to GenBank format.
299
ca_assert(gbk.get_refcount() == 0);
301
// NOTE(review): a new reference is only created for the author; journal and
// title use get_latest_ref() - confirm that this is safe when macke.author
// has no content
if (has_content(macke.author)) freedup(gbk.get_new_ref().author, macke.author);
302
if (has_content(macke.journal)) freedup(gbk.get_latest_ref().journal, macke.journal);
303
if (has_content(macke.title)) freedup(gbk.get_latest_ref().title, macke.title);
305
bool first_ref = true;
307
RDP_comments& comments = gbk.comments;
308
OrgInfo& orginf = comments.orginf;
309
SeqInfo& seqinf = comments.seqinf;
311
for (int ridx = 0; ridx < macke.get_rem_count(); ridx++) {
313
// 'key' receives the key word of the remark, 'offset' is passed on below
// as the position from which the remark's value is read
int offset = macke_key_word(macke.get_rem(ridx), 0, key);
315
if (str_equal(key, "ref")) {
316
// while first_ref is set the latest reference is reused, otherwise a new one is created
GenbankRef& ref = first_ref ? gbk.get_latest_ref() : gbk.get_new_ref();
317
freeset(ref.ref, macke.copy_multi_rem(ridx, offset));
320
else if (str_equal(key, "auth")) {
321
freeset(gbk.get_latest_ref().author, macke.copy_multi_rem(ridx, offset));
323
else if (str_equal(key, "title")) {
324
freeset(gbk.get_latest_ref().title, macke.copy_multi_rem(ridx, offset));
326
else if (str_equal(key, "jour")) {
327
freeset(gbk.get_latest_ref().journal, macke.copy_multi_rem(ridx, offset));
329
else if (str_equal(key, "standard")) {
330
freeset(gbk.get_latest_ref().standard, macke.copy_multi_rem(ridx, offset));
332
else if (str_equal(key, "KEYWORDS")) {
333
freeset(gbk.keywords, macke.copy_multi_rem(ridx, offset));
334
terminate_with(gbk.keywords, '.');
336
else if (str_equal(key, "GenBank ACCESSION")) {
337
freeset(gbk.accession, macke.copy_multi_rem(ridx, offset));
339
else if (str_equal(key, "Source of strain")) {
340
freeset(orginf.source, macke.copy_multi_rem(ridx, offset));
342
else if (str_equal(key, "Former name")) {
343
freeset(orginf.formname, macke.copy_multi_rem(ridx, offset));
345
else if (str_equal(key, "Alternate name")) {
346
freeset(orginf.nickname, macke.copy_multi_rem(ridx, offset));
348
else if (str_equal(key, "Common name")) {
349
freeset(orginf.commname, macke.copy_multi_rem(ridx, offset));
351
else if (str_equal(key, "Host organism")) {
352
freeset(orginf.hostorg, macke.copy_multi_rem(ridx, offset));
354
else if (str_equal(key, "RDP ID")) {
355
freeset(seqinf.RDPid, macke.copy_multi_rem(ridx, offset));
357
else if (str_equal(key, "Sequencing methods")) {
358
freeset(seqinf.methods, macke.copy_multi_rem(ridx, offset));
360
else if (str_equal(key, "3' end complete")) {
361
// 'key' is reused as buffer for the value token; "Yes" maps to 'y', anything else to 'n'
scan_token_or_die(key, macke.get_rem(ridx) + offset);
362
seqinf.comp3 = str_equal(key, "Yes") ? 'y' : 'n';
364
else if (str_equal(key, "5' end complete")) {
365
// same handling as for the 3' end
scan_token_or_die(key, macke.get_rem(ridx) + offset);
366
seqinf.comp5 = str_equal(key, "Yes") ? 'y' : 'n';
368
else { // other (non-interpreted) comments
369
Append(comments.others, macke.get_rem(ridx));
374
// Builds gbk.definition from the Macke name, subspecies and strain, copies the
// result to gbk.source and then extends the definition by the keywords.
static void mtog_genbank_def_and_source(const Macke& macke, GenBank& gbk) {
375
// Define GenBank DEFINITION and SOURCE lines the way RDP group likes.
376
copy_content(gbk.definition, macke.name);
377
if (has_content(macke.subspecies)) {
378
if (!has_content(gbk.definition)) {
379
warning(22, "Genus and Species not defined");
380
// empty definition: no blank in front of "subsp. "
skip_eolnl_and_append(gbk.definition, "subsp. ");
383
// variant with a leading blank (presumably used when the definition already has content - confirm)
skip_eolnl_and_append(gbk.definition, " subsp. ");
385
Append(gbk.definition, macke.subspecies);
388
if (has_content(macke.strain)) {
389
if (!has_content(gbk.definition)) {
390
warning(23, "Genus and Species and Subspecies not defined");
391
// empty definition: no blank in front of "str. "
skip_eolnl_and_append(gbk.definition, "str. ");
394
// variant with a leading blank (presumably used when the definition already has content - confirm)
skip_eolnl_and_append(gbk.definition, " str. ");
396
Append(gbk.definition, macke.strain);
399
// create SOURCE line, temp.
400
// the source is the definition built so far, terminated with '.'
if (copy_content(gbk.source, gbk.definition)) terminate_with(gbk.source, '.');
402
// append keyword to definition, if there is keyword.
403
if (has_content(gbk.keywords)) {
404
if (has_content(gbk.definition))
405
skip_eolnl_and_append(gbk.definition, "; \n");
407
// Here keywords must be ended by a '.' already
408
skip_eolnl_and_append(gbk.definition, gbk.keywords);
411
skip_eolnl_and_append(gbk.definition, ".\n");
414
// Fills 'gbk' from 'macke': LOCUS line, organism, RDP comment fields,
// references/remarks and finally DEFINITION and SOURCE.
// 'seq' is only used for its length, which is written into the LOCUS line.
int mtog(const Macke& macke, GenBank& gbk, const Seq& seq) { // __ATTR__USERESULT
415
// Convert Macke format to Genbank format.
419
// LOCUS line: starts with the sequence abbreviation ...
strcpy(temp, macke.seqabbr);
421
// ... padded with blanks up to position 13 ...
for (indi = str0len(temp); indi < 13; temp[indi++] = ' ') {}
423
// ... followed (from position 10 on) by length, molecule type and date;
// the entry's own date is used if present, today's date otherwise
if (has_content(macke.date))
424
sprintf((temp + 10), "%7d bp RNA RNA %s\n", seq.get_len(), genbank_date(macke.date));
426
sprintf((temp + 10), "%7d bp RNA RNA %s\n", seq.get_len(), genbank_date(today_date()));
428
freedup(gbk.locus, temp);
431
if (copy_content(gbk.organism, macke.name)) terminate_with(gbk.organism, '.');
433
RDP_comments& comments = gbk.comments;
434
OrgInfo& orginf = comments.orginf;
435
SeqInfo& seqinf = comments.seqinf;
437
copy_content(seqinf.methods, macke.rna);
439
// GenBank entry: macke.acs is tried first, macke.nbk only if that copied nothing
if (!copy_content(seqinf.gbkentry, macke.acs))
440
copy_content(seqinf.gbkentry, macke.nbk);
442
copy_content(orginf.cultcoll, macke.atcc);
443
mtog_decode_ref_and_remarks(macke, gbk);
445
// final conversion of cultcoll
446
if (!has_content(orginf.cultcoll)) copy_content(orginf.cultcoll, macke.atcc);
448
// define GenBank DEFINITION, after GenBank KEYWORD is defined.
449
mtog_genbank_def_and_source(macke, gbk);
454
// Fills 'macke' from 'gbk': sequence abbreviation, name, culture collection,
// methods, date, accession, first reference, remarks and the derived
// strain / subspecies / atcc values.
int gtom(const GenBank& gbk, Macke& macke) { // __ATTR__USERESULT
455
// Convert from Genbank format to Macke format.
457
// copy sequence abbr, assume every entry in gbk must end with \n\0
458
// no '\n' at the end of the string
461
// the first key word of the LOCUS line becomes the sequence abbreviation
genbank_key_word(gbk.locus, 0, temp);
462
freedup(macke.seqabbr, temp);
465
// copy name and definition
466
// if the organism yields no name, genus and species are taken from the definition
if (!copy_content(macke.name, gbk.organism) && has_content(gbk.definition)) {
467
char genus[TOKENSIZE];
468
char species[TOKENSIZE];
470
// NOTE(review): "%s" has no field width, so nothing limits the tokens to
// TOKENSIZE here - confirm that the input is bounded elsewhere
ASSERT_RESULT(int, 2, sscanf(gbk.definition, "%s %s", genus, species));
472
// drop a ';' directly following the species
int last = str0len(species)-1;
473
if (species[last] == ';') species[last] = '\0';
475
freeset(macke.name, strf("%s %s\n", genus, species));
478
const OrgInfo& orginf = gbk.comments.orginf;
479
const SeqInfo& seqinf = gbk.comments.seqinf;
481
copy_content(macke.atcc, orginf.cultcoll); // copy cultcoll name and number
482
copy_content(macke.rna, seqinf.methods); // copy rna(methods)
484
freeset(macke.date, gbk.get_date()); Append(macke.date, "\n");
486
// copy genbank entry (gbkentry has higher priority than gbk.accession)
487
if (!copy_content(macke.acs, seqinf.gbkentry)) {
488
char buffer[TOKENSIZE];
489
// the literal text "No information\n" is treated like a missing accession
if (has_content(gbk.accession) && !str_equal(gbk.accession, "No information\n")) {
490
// only the first token of the accession is used
scan_token_or_die(buffer, gbk.accession);
491
strcat(buffer, "\n");
494
strcpy(buffer, "\n");
496
freedup(macke.acs, buffer);
499
// copy the first reference from GenBank to Macke
500
if (gbk.has_refs()) {
501
copy_content(macke.author, gbk.get_ref(0).author);
502
copy_content(macke.journal, gbk.get_ref(0).journal);
503
copy_content(macke.title, gbk.get_ref(0).title);
505
// the rest of references are put into remarks, rem:.....
506
macke.add_remarks_from(gbk);
508
// adjust the strain, subspecies, and atcc information
509
freeset(macke.strain, genbank_get_strain(gbk));
510
freeset(macke.subspecies, genbank_get_subspecies(gbk));
511
// the culture collection is only derived if none was copied from the comments above
if (!has_content(macke.atcc)) {
512
freeset(macke.atcc, genbank_get_atcc(gbk, macke));