1
/* $Id: blast_util.h,v 1.56 2004/09/07 17:20:37 dondosha Exp $
2
* ===========================================================================
5
* National Center for Biotechnology Information
7
* This software/database is a "United States Government Work" under the
8
* terms of the United States Copyright Act. It was written as part of
9
* the author's official duties as a United States Government employee and
10
* thus cannot be copyrighted. This software/database is freely available
11
* to the public for use. The National Library of Medicine and the U.S.
12
* Government have not placed any restriction on its use or reproduction.
14
* Although all reasonable efforts have been taken to ensure the accuracy
15
* and reliability of the software and data, the NLM and the U.S.
16
* Government do not and cannot warrant the performance or results that
17
* may be obtained by using this software or data. The NLM and the U.S.
18
* Government disclaim all warranties, express or implied, including
19
* warranties of performance, merchantability or fitness for any particular
22
* Please cite the author in any work or product based on this material.
24
* ===========================================================================
26
* Author: Ilya Dondoshansky
 */
30
/** @file blast_util.h
 * Various auxiliary BLAST utility functions
 */
34
#ifndef __BLAST_UTIL__
35
#define __BLAST_UTIL__
37
#include <algo/blast/core/blast_def.h>
43
/** Different types of sequence encodings used for sequence retrieval. */
#define BLASTP_ENCODING 0 /**< NCBIstdaa */
#define BLASTNA_ENCODING 1 /**< Special encoding for preliminary stage of
                                BLAST: permutation of NCBI8na */
#define NCBI4NA_ENCODING 2 /**< NCBI8na */
#define NCBI2NA_ENCODING 3 /**< NCBI2na */
#define ERROR_ENCODING 255 /**< Error value for encoding */
54
/** Does character encode a residue?
 * Evaluates to non-zero when x is less than or equal to 250.
 * The argument is parenthesized so that compound arguments (for example a
 * bitwise or conditional expression) are compared as a whole.
 */
#define IS_residue(x) ((x) <= 250)
58
/** Bit mask for obtaining a single base from a byte in ncbi2na format */
#define NCBI2NA_MASK 0x03

/** Macro to extract base N from a byte x (N >= 0, N < 4).
 * The byte is shifted right by 2*N bits and the two lowest bits of the
 * result are kept.
 */
#define NCBI2NA_UNPACK_BASE(x, N) (((x)>>(2*(N))) & NCBI2NA_MASK)
65
/** Deallocate memory only for the sequence in the sequence block */
67
Int2 BlastSequenceBlkClean(BLAST_SequenceBlk* seq_blk);
69
/** Deallocate memory for a sequence block */
71
BLAST_SequenceBlk* BlastSequenceBlkFree(BLAST_SequenceBlk* seq_blk);
73
/** Copies contents of the source sequence block without copying sequence
74
* buffers; sets all "field_allocated" booleans to FALSE, to make sure
75
* fields are not freed on the call to BlastSequenceBlkFree.
76
* @param copy New sequence block [out]
77
* @param src Input sequence block [in]
80
void BlastSequenceBlkCopy(BLAST_SequenceBlk** copy,
81
BLAST_SequenceBlk* src);
83
/** Set number for a given program type. Return is zero on success.
84
* @param program string name of program [in]
85
* @param number Enumerated value of program [out]
87
Int2 BlastProgram2Number(const char *program, EBlastProgramType *number);
89
/** Return string name for program given a number. Return is zero on success.
90
* @param number Enumerated value of program [in]
91
* @param program string name of program (memory should be deallocated by called) [out]
94
Int2 BlastNumber2Program(EBlastProgramType number, char* *program);
96
/** Allocates memory for *sequence_blk and then populates it.
97
* @param buffer start of sequence [in]
98
* @param length query sequence length [in]
99
* @param context context number [in]
100
* @param seq_blk SequenceBlk to be allocated and filled in [out]
101
* @param buffer_allocated Is the buffer allocated? If yes, 'sequence_start' is
102
* the start of the sequence, otherwise it is 'sequence'. [in]
103
* @deprecated Use BlastSeqBlkNew and BlastSeqBlkSet* functions instead
107
BlastSetUp_SeqBlkNew (const Uint1* buffer, Int4 length, Int4 context,
108
BLAST_SequenceBlk* *seq_blk, Boolean buffer_allocated);
110
/** Allocates a new sequence block structure.
111
* @param retval Pointer to where the sequence block structure will be
115
Int2 BlastSeqBlkNew(BLAST_SequenceBlk** retval);
117
/** Stores the sequence in the sequence block structure.
118
* @param seq_blk The sequence block structure to modify [in/out]
119
* @param sequence Actual sequence buffer. The first byte must be a sentinel
121
* @param seqlen Length of the sequence buffer above [in]
124
Int2 BlastSeqBlkSetSequence(BLAST_SequenceBlk* seq_blk,
125
const Uint1* sequence,
128
/** Stores the compressed nucleotide sequence in the sequence block structure
129
* for the subject sequence when BLASTing 2 sequences. This sequence should be
130
* encoded in NCBI2NA_ENCODING and NOT have sentinel bytes.
131
* @param seq_blk The sequence block structure to modify [in/out]
132
* @param sequence Actual sequence buffer. [in]
135
Int2 BlastSeqBlkSetCompressedSequence(BLAST_SequenceBlk* seq_blk,
136
const Uint1* sequence);
139
/** GetTranslation to get the translation of the nucl. sequence in the
140
* appropriate frame and with the appropriate GeneticCode.
141
* The function return an allocated char*, the caller must delete this.
142
* The first and last spaces of this char* contain NULLB's.
143
* @param query_seq Forward strand of the nucleotide sequence [in]
144
* @param query_seq_rev Reverse strand of the nucleotide sequence [in]
145
* @param nt_length Length of the nucleotide sequence [in]
146
* @param frame What frame to translate into? [in]
147
* @param buffer Preallocated buffer for the translated sequence [in][out]
148
* @param genetic_code Genetic code to use for translation,
149
* in ncbistdaa encoding [in]
150
* @return Length of the traslated protein sequence.
153
Int4 BLAST_GetTranslation(const Uint1* query_seq,
154
const Uint1* query_seq_rev, Int4 nt_length, Int2 frame, Uint1* buffer,
155
const Uint1* genetic_code);
159
/** Translate a nucleotide sequence without ambiguity codes.
160
* This is used for the first-pass translation of the database.
161
* The genetic code to be used is determined by the translation_table
162
* This function translates a packed (ncbi2na) nucl. alphabet. It
163
* views a basepair as being in one of four sets of 2-bits:
164
* |0|1|2|3||0|1|2|3||0|1|2|3||...
166
* 1st byte | 2 byte | 3rd byte...
168
* A codon that starts at the beginning of the above sequence starts in
169
* state "0" and includes basepairs 0, 1, and 2. The next codon, in the
170
* same frame, after that starts in state "3" and includes 3, 0, and 1.
173
* changed the single main loop to
174
* - advance to state 0,
175
* - optimized inner loop does two (3 byte->4 codon) translation per iteration
176
* (loads are moved earlier so they can be done in advance.)
179
* @param translation The translation table [in]
180
* @param length Length of the nucleotide sequence [in]
181
* @param nt_seq The original nucleotide sequence [in]
182
* @param frame What frame to translate to? [in]
183
* @param prot_seq Preallocated buffer for the (translated) protein sequence,
184
* with NULLB sentinels on either end. [out]
187
Int4 BLAST_TranslateCompressedSequence(Uint1* translation, Int4 length,
188
const Uint1* nt_seq, Int2 frame, Uint1* prot_seq);
190
/** Reverse a nucleotide sequence in the blastna encoding, adding sentinel
191
* bytes on both ends.
192
* @param sequence Forward strand of the sequence [in]
193
* @param length Length of the sequence plus 1 for the sentinel byte [in]
194
* @param rev_sequence_ptr Reverse strand of the sequence [out]
197
Int2 GetReverseNuclSequence(const Uint1* sequence, Int4 length,
198
Uint1** rev_sequence_ptr);
200
/** This function translates the context number of a context into the frame of
202
* @param prog_number Integer corresponding to the BLAST program
203
* @param context_number Context number
204
* @return Sequence frame (+-1 for nucleotides, -3..3 for translations)
207
Int2 BLAST_ContextToFrame(EBlastProgramType prog_number, Int4 context_number);
209
/** Given a context from BLAST engine core, return the query index.
210
* @param context Context saved in a BlastHSP structure [in]
211
* @param program Type of BLAST program [in]
212
* @return Query index in a set of queries.
215
Int4 Blast_GetQueryIndexFromContext(Int4 context, EBlastProgramType program);
217
/** Find the length of an individual query within a concatenated set of
219
* @param query_info Queries information structure containing offsets into
220
* the concatenated sequence [in]
221
* @param context Index of the query/strand/frame within the concatenated
223
* @return Length of the individual sequence/strand/frame.
226
Int4 BLAST_GetQueryLength(const BlastQueryInfo* query_info, Int4 context);
228
/** Deallocate memory for query information structure */
230
BlastQueryInfo* BlastQueryInfoFree(BlastQueryInfo* query_info);
232
/** Duplicates the query information structure */
234
BlastQueryInfo* BlastQueryInfoDup(BlastQueryInfo* query_info);
237
Int2 BLAST_PackDNA(Uint1* buffer, Int4 length, Uint1 encoding,
240
/** Initialize the mixed-frame sequence for out-of-frame gapped extension.
241
* @param query_blk Sequence block containing the concatenated frames of the
242
* query. The mixed-frame sequence is saved here. [in] [out]
243
* @param query_info Query information structure containing offsets into the*
244
* concatenated sequence. [in]
247
Int2 BLAST_InitDNAPSequence(BLAST_SequenceBlk* query_blk,
248
BlastQueryInfo* query_info);
250
/** Translate nucleotide into 6 frames. All frames are put into a
251
* translation buffer, with sentinel NULLB bytes in between.
252
* Array of offsets into the translation buffer is also returned.
253
* For out-of-frame gapping option, a mixed frame sequence is created.
254
* @param nucl_seq The nucleotide sequence [in]
255
* @param encoding Sequence encoding: ncbi2na or ncbi4na [in]
256
* @param nucl_length Length of the nucleotide sequence [in]
257
* @param genetic_code The genetic code to be used for translations,
258
* in ncbistdaa encoding [in]
259
* @param translation_buffer_ptr Buffer to hold the frames of the translated
261
* @param frame_offsets_ptr Offsets into the translation buffer for each
263
* @param mixed_seq_ptr Pointer to buffer for the mixed frame sequence [out]
266
Int2 BLAST_GetAllTranslations(const Uint1* nucl_seq, Uint1 encoding,
267
Int4 nucl_length, const Uint1* genetic_code,
268
Uint1** translation_buffer_ptr, Int4** frame_offsets_ptr,
269
Uint1** mixed_seq_ptr);
271
/** Get one frame translation - needed when only parts of subject sequences
273
* @param nucl_seq Pointer to start of nucleotide sequence to be translated [in]
274
* @param nucl_length Length of nucleotide sequence to be translated [in]
275
* @param frame What frame to translate into [in]
276
* @param genetic_code What genetic code to use? [in]
277
* @param translation_buffer_ptr Pointer to buffer with translated
279
* @param protein_length Length of the translation buffer [out]
280
* @param mixed_seq_ptr Pointer to buffer with mixed frame sequence, in case
281
* of out-of-frame gapping; buffer filled only if argument
285
int GetPartialTranslation(const Uint1* nucl_seq,
286
Int4 nucl_length, Int2 frame, const Uint1* genetic_code,
287
Uint1** translation_buffer_ptr, Int4* protein_length,
288
Uint1** mixed_seq_ptr);
291
/** Convert translation frame into a context for the concatenated translation
295
Int4 FrameToContext(Int2 frame);
298
/** The following binary search routine assumes that array A is filled. */
300
Int4 BSearchInt4(Int4 n, Int4* A, Int4 size);
305
#endif /* !__BLAST_UTIL__ */