~ubuntu-branches/ubuntu/quantal/ncbi-tools6/quantal

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* $Id: debruijn.c,v 1.4 2004/02/09 21:24:59 ucko Exp $

* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================

*/

/* Revision-control "$Id$" keyword string identifying this file's revision. */
static const char rcsid[] =
  "$Id: debruijn.c,v 1.4 2004/02/09 21:24:59 ucko Exp $";

/*
 * example driver for de Bruijn sequences.
 *
 * this code generates all n-mers over a protein
 * or dna alphabet. useful for creating test sequences.
 */

#include <ncbi.h>
#include <algo/blast/core/lookup_util.h>

#include <limits.h>

/*
 * Command-line argument table handed to GetArgs() in Main().
 *
 * Each entry is a positional initializer for the toolkit's Args structure
 * (declared outside this file), so the order of values inside an entry
 * must not be changed.  Main() also depends on the order of the entries:
 *   myargs[0] - word size, flag 'n', ARG_INT    (read via .intvalue)
 *   myargs[1] - alphabet,  flag 'a', ARG_STRING (read via .strvalue)
 *
 * NOTE(review): the FALSE in the fifth position is presumably the
 * "optional" flag, i.e. both arguments are mandatory - confirm against
 * the Args declaration.
 */
static Args myargs[] = {
  /* [0] word size n */
  { "word size",
    NULL, NULL, NULL, FALSE, 'n', ARG_INT, 0.0, 0, NULL },
  /* [1] alphabet: a built-in alphabet name or the literal letters to use */
  { "alphabet\n"
    "(supply 'ncbistdaa' or 'ncbi2na' for standard\n"
    "alphabets, or supply your own alphabet)\n",
    NULL, NULL, NULL, FALSE, 'a', ARG_STRING, 0.0, 0, NULL },
};

/*
 * Built-in alphabets.  Main() substitutes one of these when the alphabet
 * argument is exactly "ncbistdaa" or "ncbi2na"; any other argument value
 * is used as the alphabet itself.  Each array is a NUL-terminated string
 * and its strlen() becomes the alphabet size k.
 */
Uint1 ncbistdaa[] = "-abcdefghiklmnpqrstvwxyzu*";
Uint1 ncbi2na[] = "acgt";

Int2 Main(void)
{
  Int4 i;
  Int4 n, k;
  Uint1 *output;
  Int4 outputsize;
  Uint1 *alphabet=NULL;
  
  if ( ! GetArgs("debruijn", sizeof(myargs)/sizeof(myargs[0]), myargs) )
   return 1;
  
  n = myargs[0].intvalue;

  if (n < 1)
    {
      fprintf(stderr,"n must be greater than one.\n");
      return 1;
    }

  alphabet = myargs[1].strvalue;

  if (strcmp("ncbistdaa", myargs[1].strvalue) == 0)
    alphabet = ncbistdaa;

  if (strcmp("ncbi2na", myargs[1].strvalue) == 0)
    alphabet = ncbi2na;

  k = strlen(alphabet);
  
  /* output array needs:
   * k^n bytes - to store the de Bruijn sequence
   * n-1 bytes - to unwrap (see below)
   * 1   byte  - for the terminating NUL
   */

  outputsize = iexp(k,n) + (n-1);
  output = (char *) malloc(outputsize + 1);

  /* compute the (n,k) de Bruijn sequence */  
  debruijn(n,k,output,alphabet);
  
  /* We don't want a true cyclical de Bruijn sequence; we want
   * all words in a straight line- copy the first n-1 letters
   * to the end.
   */

  for(i=0;i<(n-1);i++)
    output[outputsize-n+1+i] = output[i];

  /* Terminate the string. */

  output[outputsize] = '\0';

  puts(output);

  free(output);
}