2
Copyright (c) 2007-2009 Stefan Kurtz <kurtz@zbh.uni-hamburg.de>
3
Copyright (c) 2007-2009 Center for Bioinformatics, University of Hamburg
5
Permission to use, copy, modify, and distribute this software for any
6
purpose with or without fee is hereby granted, provided that the above
7
copyright notice and this permission notice appear in all copies.
9
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24
#include "core/fileutils.h"
26
#include "core/assert_api.h"
27
#include "core/divmodmul.h"
28
#include "core/error.h"
30
#include "core/fasta.h"
31
#include "core/fileutils.h"
32
#include "core/format64.h"
33
#include "core/ma_api.h"
34
#include "core/progressbar.h"
35
#include "core/seqiterator_sequence_buffer.h"
36
#include "core/xansi_api.h"
37
#include "giextract.h"
40
#define COMPLETE(VALUE)\
41
((VALUE)->frompos == 1UL && (VALUE)->topos == 0)
45
#define CHECKPOSITIVE(VAL,FORMAT,WHICH)\
48
gt_error_set(err,"file \"%s\", line " Formatuint64_t \
49
": illegal format: %s element = " FORMAT \
50
" is not a positive integer",\
51
gt_str_get(fileofkeystoextract),\
52
PRINTuint64_tcast(linenum+1),\
61
unsigned long frompos, topos;
65
static int comparefastakeys(const void *a,const void *b)
67
int cmp = strcmp(((Fastakeyquery *) a)->fastakey,
68
((Fastakeyquery *) b)->fastakey);
78
if (((Fastakeyquery *) a)->frompos < ((Fastakeyquery *) b)->frompos)
82
if (((Fastakeyquery *) a)->frompos > ((Fastakeyquery *) b)->frompos)
86
if (((Fastakeyquery *) a)->topos < ((Fastakeyquery *) b)->topos)
90
if (((Fastakeyquery *) a)->topos > ((Fastakeyquery *) b)->topos)
97
static unsigned long remdupsfastakeyqueries(Fastakeyquery *fastakeyqueries,
98
unsigned long numofqueries,
101
if (numofqueries == 0)
106
Fastakeyquery *storeptr, *readptr;
107
unsigned long newnumofqueries;
109
for (storeptr = fastakeyqueries, readptr = fastakeyqueries+1;
110
readptr < fastakeyqueries + numofqueries;
113
if (strcmp(storeptr->fastakey,readptr->fastakey) != 0 ||
114
storeptr->frompos != readptr->frompos ||
115
storeptr->topos != readptr->topos)
118
if (storeptr != readptr)
122
storeptr->frompos = readptr->frompos;
123
storeptr->topos = readptr->topos;
124
len = strlen(readptr->fastakey);
125
storeptr->fastakey = gt_realloc(storeptr->fastakey,
126
sizeof (char) * (len+1));
127
strcpy(storeptr->fastakey,readptr->fastakey);
131
newnumofqueries = (unsigned long) (storeptr - fastakeyqueries + 1);
132
if (newnumofqueries < numofqueries)
136
printf("# removed %lu duplicate queries\n",
137
numofqueries - newnumofqueries);
139
for (storeptr = fastakeyqueries + newnumofqueries;
140
storeptr < fastakeyqueries + numofqueries;
143
gt_free(storeptr->fastakey);
144
storeptr->fastakey = NULL;
147
return newnumofqueries;
151
static void fastakeyqueries_delete(Fastakeyquery *fastakeyqueries,
152
unsigned long numofqueries)
154
if (fastakeyqueries != NULL)
158
for (idx=0; idx<numofqueries; idx++)
160
gt_free(fastakeyqueries[idx].fastakey);
162
gt_free(fastakeyqueries);
166
static int extractkeyfromcurrentline(Fastakeyquery *fastakeyptr,
167
unsigned long keysize,
168
const GtStr *currentline,
170
const GtStr *fileofkeystoextract,
173
char *lineptr = gt_str_get(currentline);
174
long readlongfrompos, readlongtopos;
178
for (idx = 0; lineptr[idx] != '\0' && !isspace(lineptr[idx]); idx++)
182
fastakeyptr->fastakey = gt_malloc(sizeof (char) * (idx+1));
185
if (idx != (size_t) keysize)
187
gt_error_set(err,"key \"%*.*s\" is not of size %lu",(int) idx,
188
(int) idx,lineptr,keysize);
194
strncpy(fastakeyptr->fastakey,lineptr,idx);
195
fastakeyptr->fastakey[idx] = '\0';
196
fastakeyptr->frompos = 1UL;
197
fastakeyptr->topos = 0;
198
if (sscanf(lineptr+idx,"%ld %ld\n",&readlongfrompos,&readlongtopos) == 2)
200
CHECKPOSITIVE(readlongfrompos,"%ld","second");
203
fastakeyptr->frompos = (unsigned long) readlongfrompos;
204
CHECKPOSITIVE(readlongtopos,"%ld","third");
208
fastakeyptr->topos = (unsigned long) readlongtopos;
214
fastakeyptr->markhit = false;
215
if (!COMPLETE(fastakeyptr) && fastakeyptr->frompos > fastakeyptr->topos)
217
gt_error_set(err, "file \"%s\", line " Formatuint64_t
218
"illegal format: second value "
219
"%lu is larger than third value %lu",
220
gt_str_get(fileofkeystoextract),
221
PRINTuint64_tcast(linenum+1),
222
fastakeyptr->frompos,
227
return haserr ? -1 : 0;
230
static Fastakeyquery *readfileofkeystoextract(bool verbose,
231
unsigned long *numofqueries,
232
const GtStr *fileofkeystoextract,
239
Fastakeyquery *fastakeyqueries;
246
*numofqueries = gt_file_number_of_lines(gt_str_get(fileofkeystoextract));
247
if (*numofqueries == 0)
249
gt_error_set(err,"empty file \"%s\" not allowed",
250
gt_str_get(fileofkeystoextract));
253
fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
260
printf("# opened keyfile \"%s\"\n",gt_str_get(fileofkeystoextract));
262
fastakeyqueries = gt_malloc(sizeof (*fastakeyqueries) * (*numofqueries));
263
currentline = gt_str_new();
264
for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++)
266
if (extractkeyfromcurrentline(fastakeyqueries + linenum,
276
gt_str_reset(currentline);
278
gt_str_delete(currentline);
282
fastakeyqueries_delete(fastakeyqueries,*numofqueries);
285
qsort(fastakeyqueries,(size_t) *numofqueries,sizeof (*fastakeyqueries),
289
printf("# %lu fastakey-queries successfully parsed and sorted\n",
292
*numofqueries = remdupsfastakeyqueries(fastakeyqueries,*numofqueries,verbose);
294
for (i=0; i<*numofqueries; i++)
296
printf("%lu %s\n",i,fastakeyqueries[i].fastakey);
299
return fastakeyqueries;
302
static unsigned long searchdesinfastakeyqueries(const char *extractkey,
305
unsigned long numofqueries)
307
const Fastakeyquery *leftptr, *rightptr, *midptr;
310
leftptr = fastakeyqueries;
311
rightptr = fastakeyqueries + numofqueries - 1;
312
while (leftptr <= rightptr)
314
midptr = leftptr + GT_DIV2((unsigned long) (rightptr-leftptr));
315
cmp = strcmp(extractkey,midptr->fastakey);
318
if (midptr > fastakeyqueries &&
319
strcmp(extractkey,(midptr-1)->fastakey) == 0)
321
rightptr = midptr - 1;
324
return (unsigned long) (midptr - fastakeyqueries);
333
leftptr = midptr + 1;
340
static void outputnonmarked(const Fastakeyquery *fastakeyqueries,
341
unsigned long numofqueries)
343
unsigned long idx, countmissing = 0;
345
for (idx=0; idx<numofqueries; idx++)
347
if (!fastakeyqueries[idx].markhit)
349
printf("unsatisfied %s",fastakeyqueries[idx].fastakey);
350
if (COMPLETE(fastakeyqueries + idx))
352
printf(" complete\n");
355
printf(" %lu %lu\n",fastakeyqueries[idx].frompos,
356
fastakeyqueries[idx].topos);
361
printf("# number of unsatified fastakey-queries: %lu\n",countmissing);
364
static const char *desc2key(unsigned long *keylen,const char *desc,
367
unsigned long i, firstpipe = 0, secondpipe = 0;
370
for (i=0; desc[i] != '\0'; i++)
384
if (firstpipe == 0 || secondpipe == 0)
386
gt_error_set(err,"Cannot find key in description \"%s\"",desc);
389
gt_assert(firstpipe < secondpipe);
390
*keylen = secondpipe - firstpipe - 1;
391
return desc + firstpipe + 1;
394
static int giextract_encodedseq2fasta(FILE *fpout,
395
const GtEncseq *encseq,
396
unsigned long seqnum,
397
const Fastakeyquery *fastakeyquery,
398
unsigned long linewidth,
399
GT_UNUSED GtError *err)
402
unsigned long desclen;
405
desc = gt_encseq_description(encseq, &desclen, seqnum);
406
gt_xfputc('>',fpout);
407
if (fastakeyquery != NULL && !COMPLETE(fastakeyquery))
409
printf("%s %lu %lu ",fastakeyquery->fastakey,
410
fastakeyquery->frompos,
411
fastakeyquery->topos);
413
gt_xfwrite(desc,sizeof *desc,(size_t) desclen,fpout);
416
unsigned long frompos, topos, seqstartpos, seqlength ;
418
gt_xfputc('\n',fpout);
419
seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
420
seqlength = gt_encseq_seqlength(encseq, seqnum);
421
if (fastakeyquery != NULL && !COMPLETE(fastakeyquery))
423
frompos = fastakeyquery->frompos-1;
424
topos = fastakeyquery->topos - fastakeyquery->frompos + 1;
430
gt_encseq2symbolstring(fpout,
433
seqstartpos + frompos,
437
return haserr ? -1 : 0;
440
#define MAXFIXEDKEYSIZE 11
444
char key[MAXFIXEDKEYSIZE+1];
445
unsigned long seqnum;
448
static int compareFixedkeys(const void *a,const void *b)
450
const Fixedsizekey *fa = a;
451
const Fixedsizekey *fb = b;
452
return strcmp(((const Fixedsizekey *) fa)->key,
453
((const Fixedsizekey *) fb)->key);
456
#define GT_KEYSTABFILESUFFIX ".kys"
458
int gt_extractkeysfromdesfile(const char *indexname,
463
FILE *fpin, *fpout = NULL;
466
unsigned long keylen, constantkeylen = 0, linenum;/* incorrectorder = 0;*/
467
bool haserr = false, firstdesc = true;
468
char *previouskey = NULL;
469
Fixedsizekey *keytab = NULL, *keytabptr = NULL;
470
GtEncseq *encseq = NULL;
471
unsigned long numofentries = 0;
472
const unsigned long linewidth = 60UL;
474
fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
481
fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
491
for (linenum = 0; !haserr && gt_str_read_next_line(line, fpin) != EOF;
494
keyptr = desc2key(&keylen,gt_str_get(line),err);
502
gt_error_set(err,"key of length 0 in \"%s\" not expected",
509
if (keylen > (unsigned long) CHAR_MAX)
511
gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
512
"no key must be larger than %d",
513
(int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
517
constantkeylen = keylen;
518
previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
522
gt_xfputc((char) constantkeylen,fpout);
526
if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
528
gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
529
"no key must be larger than %d",
530
(int) keylen,(int) keylen,keyptr,keylen,
535
el = gt_encseq_loader_new();
536
gt_encseq_loader_set_logger(el, logger);
537
encseq = gt_encseq_loader_load(el, indexname, err);
538
gt_encseq_loader_delete(el);
544
numofentries = gt_encseq_num_of_sequences(encseq);
545
gt_assert(numofentries > 0);
546
keytab = gt_malloc(sizeof (*keytab) * numofentries);
551
if (constantkeylen != keylen)
553
gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
554
"the same length which for all previously seen "
556
(int) keylen,(int) keylen,keyptr,keylen,
561
gt_assert(previouskey != NULL);
562
if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
564
gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
565
"than current key \"%*.*s\"",
566
previouskey,(int) keylen,(int) keylen,keyptr);
570
printf("previous key \"%s\" (no %lu) is lexicographically larger "
571
"than current key \"%*.*s\"\n",
572
previouskey,linenum,(int) keylen,(int) keylen,keyptr);
579
gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
580
gt_xfputc('\0',fpout);
583
gt_assert(keytabptr != NULL);
584
strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
585
keytabptr->key[constantkeylen] = '\0';
586
keytabptr->seqnum = linenum;
589
strncpy(previouskey,keyptr,(size_t) constantkeylen);
590
previouskey[constantkeylen] = '\0';
595
gt_logger_log(logger,"number of keys of length %lu = %lu",
596
constantkeylen,linenum);
598
gt_logger_log(logger,"number of incorrectly ordered keys = %lu",
605
gt_free(previouskey);
606
if (!haserr && sortkeys)
608
gt_assert(keytabptr != NULL);
609
gt_assert(numofentries > 0);
610
gt_assert(keytabptr == keytab + numofentries);
611
qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
612
gt_assert(keytabptr != NULL);
613
for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
616
if (giextract_encodedseq2fasta(stdout,
630
gt_encseq_delete(encseq);
634
return haserr ? -1 : 0;
637
bool gt_deskeysfileexists(const char *indexname)
639
return gt_file_with_suffix_exists(indexname,GT_KEYSTABFILESUFFIX);
642
static unsigned long searchfastaqueryindes(const char *extractkey,
644
unsigned long numofkeys,
645
unsigned long keysize)
647
unsigned long left = 0, right = numofkeys - 1, mid;
650
while (left <= right)
652
mid = left + GT_DIV2((unsigned long) (right-left));
653
cmp = strcmp(extractkey,keytab + 1UL + mid * (keysize+1));
665
gt_assert(mid < numofkeys);
673
static int itersearchoverallkeys(const GtEncseq *encseq,
675
unsigned long numofkeys,
676
unsigned long keysize,
677
const GtStr *fileofkeystoextract,
678
unsigned long linewidth,
684
unsigned long seqnum, countmissing = 0;
686
Fastakeyquery fastakeyquery;
690
gt_error_set(err,"use option width to specify line width for formatting");
693
fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
698
currentline = gt_str_new();
699
fastakeyquery.fastakey = gt_malloc(sizeof (char) * (keysize+1));
700
for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++)
702
if (extractkeyfromcurrentline(&fastakeyquery,
712
seqnum = searchfastaqueryindes(fastakeyquery.fastakey,keytab,numofkeys,
714
if (seqnum < numofkeys)
716
if (giextract_encodedseq2fasta(stdout,
730
gt_str_reset(currentline);
732
if (!haserr && countmissing > 0)
734
printf("# number of unsatified fastakey-queries: %lu\n",countmissing);
736
gt_str_delete(currentline);
738
gt_free(fastakeyquery.fastakey);
739
return haserr ? - 1 : 0;
742
static int readkeysize(const char *indexname,GtError *err)
749
fp = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"rb",err);
756
GT_UNUSED size_t ret;
758
ret = fread(&cc,sizeof cc, (size_t) 1, fp);
761
gt_error_set(err,"error when trying to read first byte of file %s%s: %s",
762
indexname,GT_KEYSTABFILESUFFIX,strerror(errno));
768
return haserr ? -1 : (int) cc;
771
int gt_extractkeysfromfastaindex(const char *indexname,
772
const GtStr *fileofkeystoextract,
773
unsigned long linewidth,GtError *err)
775
GtEncseq *encseq = NULL;
776
GtEncseqLoader *el = NULL;
778
unsigned long numofdbsequences = 0, keysize = 0;
780
el = gt_encseq_loader_new();
781
encseq = gt_encseq_loader_load(el, indexname, err);
782
gt_encseq_loader_delete(el);
791
numofdbsequences = gt_encseq_num_of_sequences(encseq);
792
retval = readkeysize(indexname,err);
797
keysize = (unsigned long) retval;
802
unsigned long keytablength;
804
keytablength = 1UL + numofdbsequences * (keysize+1);
805
keytab = gt_fa_mmap_check_size_with_suffix(indexname,
806
GT_KEYSTABFILESUFFIX,
815
if (itersearchoverallkeys(encseq,keytab,numofdbsequences,
816
keysize,fileofkeystoextract,
822
gt_fa_xmunmap(keytab);
826
gt_encseq_delete(encseq);
829
return haserr ? -1 : 0;
832
int gt_extractkeysfromfastafile(bool verbose,
835
const GtStr *fileofkeystoextract,
836
GtStrArray *referencefiletab,
839
GtSeqIterator *seqit;
840
const GtUchar *sequence;
841
char *desc, *headerbufferspace = NULL, *keyspace = NULL;
843
unsigned long allockeyspace = 0, len, keylen, numofqueries, keyposition,
847
Fastakeyquery *fastakeyqueries;
848
size_t headerbuffersize = 0, headerlength;
851
fastakeyqueries = readfileofkeystoextract(verbose,&numofqueries,
852
fileofkeystoextract,err);
853
if (fastakeyqueries == NULL)
857
totalsize = gt_files_estimate_total_size(referencefiletab);
860
printf("# estimated total size is " Formatuint64_t "\n",
861
PRINTuint64_tcast(totalsize));
863
seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
868
if (!had_err && verbose)
870
gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
876
while (had_err != -1 && countmarkhit < numofqueries)
878
had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
883
keyptr = desc2key(&keylen,desc,err);
889
if (allockeyspace < keylen)
891
keyspace = gt_realloc(keyspace,sizeof (*keyspace) * (keylen+1));
892
allockeyspace = keylen;
894
gt_assert(keyspace != NULL);
895
strncpy(keyspace,keyptr,(size_t) keylen);
896
keyspace[keylen] = '\0';
897
keyposition = searchdesinfastakeyqueries(keyspace,fastakeyqueries,
899
if (keyposition < numofqueries)
901
while (keyposition < numofqueries &&
902
strcmp(fastakeyqueries[keyposition].fastakey,keyspace) == 0)
905
if (fastakeyqueries[keyposition].markhit)
907
fprintf(stderr,"key %s was already found before\n",
908
fastakeyqueries[keyposition].fastakey);
909
exit(GT_EXIT_PROGRAMMING_ERROR);
912
headerlength = strlen(desc);
913
if (headerbuffersize < headerlength + EXTRABUF + 1)
915
headerbuffersize = headerlength + EXTRABUF + 1;
916
headerbufferspace = gt_realloc(headerbufferspace,
917
sizeof (*headerbufferspace)
920
if (COMPLETE(fastakeyqueries + keyposition))
923
(void) snprintf(headerbufferspace,headerbuffersize,
925
(int) keylen,(int) keylen,keyspace,
928
gt_fasta_show_entry(desc, (const char *) sequence, len, width,
932
(void) snprintf(headerbufferspace,headerbuffersize,
934
(int) keylen,(int) keylen,keyspace,
935
fastakeyqueries[keyposition].frompos,
936
fastakeyqueries[keyposition].topos,
938
gt_fasta_show_entry(headerbufferspace,
940
(sequence+fastakeyqueries[keyposition].
942
fastakeyqueries[keyposition].topos -
943
fastakeyqueries[keyposition].frompos+1,
946
fastakeyqueries[keyposition].markhit = true;
952
printf("%s 1 %lu\n",keyspace, len);
956
gt_free(headerbufferspace);
960
gt_progressbar_stop();
964
outputnonmarked(fastakeyqueries,numofqueries);
966
fastakeyqueries_delete(fastakeyqueries,numofqueries);
967
gt_seqiterator_delete(seqit);