~ubuntu-branches/ubuntu/saucy/fastqc/saucy-proposed

« back to all changes in this revision

Viewing changes to uk/ac/babraham/FastQC/Modules/KmerContent.java

  • Committer: Package Import Robot
  • Author(s): Andreas Tille
  • Date: 2012-11-20 13:38:32 UTC
  • Revision ID: package-import@ubuntu.com-20121120133832-psohzlsak64g7bdy
Tags: upstream-0.10.1+dfsg
ImportĀ upstreamĀ versionĀ 0.10.1+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/**
 
2
 * Copyright Copyright 2010-12 Simon Andrews
 
3
 *
 
4
 *    This file is part of FastQC.
 
5
 *
 
6
 *    FastQC is free software; you can redistribute it and/or modify
 
7
 *    it under the terms of the GNU General Public License as published by
 
8
 *    the Free Software Foundation; either version 3 of the License, or
 
9
 *    (at your option) any later version.
 
10
 *
 
11
 *    FastQC is distributed in the hope that it will be useful,
 
12
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 
13
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
14
 *    GNU General Public License for more details.
 
15
 *
 
16
 *    You should have received a copy of the GNU General Public License
 
17
 *    along with FastQC; if not, write to the Free Software
 
18
 *    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 
19
 */
 
20
package uk.ac.babraham.FastQC.Modules;
 
21
 
 
22
import java.awt.BorderLayout;
 
23
import java.awt.Graphics;
 
24
import java.awt.image.BufferedImage;
 
25
import java.io.IOException;
 
26
import java.util.Arrays;
 
27
import java.util.Hashtable;
 
28
import java.util.Iterator;
 
29
import java.util.Vector;
 
30
import java.util.zip.ZipEntry;
 
31
import java.util.zip.ZipOutputStream;
 
32
 
 
33
import javax.imageio.ImageIO;
 
34
import javax.swing.JLabel;
 
35
import javax.swing.JPanel;
 
36
import javax.swing.JScrollPane;
 
37
import javax.swing.JSplitPane;
 
38
import javax.swing.JTable;
 
39
import javax.swing.table.AbstractTableModel;
 
40
import javax.swing.table.TableModel;
 
41
 
 
42
import uk.ac.babraham.FastQC.Graphs.BaseGroup;
 
43
import uk.ac.babraham.FastQC.Graphs.LineGraph;
 
44
import uk.ac.babraham.FastQC.Report.HTMLReportArchive;
 
45
import uk.ac.babraham.FastQC.Sequence.Sequence;
 
46
 
 
47
public class KmerContent implements QCModule {
 
48
 
 
49
        private Hashtable<String, Kmer> kmers = new Hashtable<String, Kmer>((int)Math.pow(4, MAX_KMER_SIZE));
 
50
        private long gCount = 0;
 
51
        private long aCount = 0;
 
52
        private long tCount = 0;
 
53
        private long cCount = 0;
 
54
        
 
55
        private int longestSequence = 0;
 
56
        private long [][] totalKmerCounts = new long [0][0];
 
57
        private long skipCount = 0;
 
58
        
 
59
        private static int MIN_KMER_SIZE = 5;
 
60
        private static int MAX_KMER_SIZE = 5;
 
61
        
 
62
        public boolean calculated = false;
 
63
        
 
64
        private Kmer [] enrichedKmers = null;
 
65
        private double [][] enrichments = null;
 
66
        private String [] xCategories = new String[0];
 
67
        private String [] xLabels = new String[0];
 
68
        
 
69
        BaseGroup [] groups;
 
70
 
 
71
        public KmerContent () {
 
72
                if (System.getProperty("fastqc.kmer_size") != null) {
 
73
                        int kmerSize = Integer.parseInt(System.getProperty("fastqc.kmer_size"));
 
74
                        MIN_KMER_SIZE = kmerSize;
 
75
                        MAX_KMER_SIZE = kmerSize;
 
76
                }
 
77
        }
 
78
        
 
79
        public boolean ignoreFilteredSequences() {
 
80
                return true;
 
81
        }
 
82
        
 
83
        public JPanel getResultsPanel() {
 
84
                
 
85
                if (!calculated) calculateEnrichment();
 
86
                JPanel returnPanel = new JPanel();
 
87
                returnPanel.setLayout(new BorderLayout());
 
88
                returnPanel.add(new JLabel("Overrepresented Kmers",JLabel.CENTER),BorderLayout.NORTH);
 
89
                
 
90
                JSplitPane splitPanel = new JSplitPane(JSplitPane.VERTICAL_SPLIT);
 
91
                
 
92
                if (enrichedKmers.length > 0) {
 
93
                        TableModel model = new ResultsTable(enrichedKmers);
 
94
                        splitPanel.setBottomComponent(new JScrollPane(new JTable(model)));
 
95
                        splitPanel.setTopComponent(new LineGraph(enrichments, 0d, 100d, "Position in read (bp)", xLabels, xCategories, "Relative enrichment over read length"));
 
96
                        returnPanel.add(splitPanel,BorderLayout.CENTER);
 
97
                }
 
98
                else {
 
99
                        returnPanel.add(new JLabel("There are no overrepresented Kmers",JLabel.CENTER),BorderLayout.CENTER);
 
100
                }
 
101
                
 
102
                return returnPanel;
 
103
        }
 
104
        
 
105
        /**
 
106
         * This method simply keeps a count of the number of Kmers of a given size
 
107
         * seen at each position within the run.  We can use this later on to calculate
 
108
         * the enrichment of the Kmers we actually count.
 
109
         * 
 
110
         * We take in the Kmer sequence even though this isn't used in the total counts
 
111
         * we do this because we don't want to count Kmers with Ns in them, but we do
 
112
         * need to ensure that the data structure is expanded to the right size, and if
 
113
         * we have libraries where later positions are Ns in all sequences then our
 
114
         * data structure ends up too short and we crash. 
 
115
         * 
 
116
         * @param position Position within the read.  0 indexed
 
117
         * @param kmerLength Actual length of the Kmer analysed
 
118
         */
 
119
        private void addKmerCount (int position,int kmerLength, String kmer) {
 
120
        
 
121
                
 
122
                if (position >= totalKmerCounts.length) {
 
123
                        // We need to expand the array
 
124
                        long [][] newCounts = new long[position+1][];
 
125
                        for (int i=0;i<totalKmerCounts.length;i++) {
 
126
                                newCounts[i] = totalKmerCounts[i];
 
127
                        }
 
128
                        for (int i=totalKmerCounts.length;i<newCounts.length;i++) {
 
129
                                newCounts[i] = new long[MAX_KMER_SIZE];
 
130
                        }
 
131
                        
 
132
                        totalKmerCounts = newCounts;
 
133
                }
 
134
                
 
135
                if (kmer.indexOf("N") >=0) return;
 
136
 
 
137
                ++totalKmerCounts[position][kmerLength-1];
 
138
                
 
139
        }
 
140
 
 
141
        private synchronized void calculateEnrichment () {
 
142
                
 
143
                // For each kmer we work out the number of times we should have
 
144
                // seen this by chance, and then the obs/exp for each of the
 
145
                // kmers.  We can then filter for a specific level of enrichment
 
146
                // and show only those Kmers
 
147
                
 
148
                float totalBases = gCount+aCount+tCount+cCount;
 
149
                
 
150
                float gProb = ((float)gCount)/totalBases;
 
151
                float aProb = ((float)aCount)/totalBases;
 
152
                float tProb = ((float)tCount)/totalBases;
 
153
                float cProb = ((float)cCount)/totalBases;
 
154
                
 
155
                // We'll be grouping together positions later so make up the groups now
 
156
                groups = BaseGroup.makeBaseGroups((longestSequence-MIN_KMER_SIZE)+1);
 
157
 
 
158
                Vector<Kmer>enrichedKmers = new Vector<Kmer>();
 
159
                                
 
160
                Iterator<Kmer> rawKmers = kmers.values().iterator();
 
161
                
 
162
                KMER: while (rawKmers.hasNext()) {
 
163
                        Kmer k = rawKmers.next();
 
164
                        
 
165
                        long totalKmerCount = 0;
 
166
 
 
167
                        for (int i=0;i<totalKmerCounts.length;i++) {
 
168
                                totalKmerCount += totalKmerCounts[i][k.sequence().length()-1];
 
169
                        }
 
170
                        
 
171
                        
 
172
                        float prob = 1;
 
173
                        char [] chars = k.sequence().toCharArray();
 
174
                        for (int c=0;c<chars.length;c++) {
 
175
                                switch (chars[c]) {
 
176
                                case 'G': prob *= gProb;break;
 
177
                                case 'A': prob *= aProb;break;
 
178
                                case 'T': prob *= tProb;break;
 
179
                                case 'C': prob *= cProb;break;
 
180
                                default: continue KMER; // Ignore Kmers containing non-GATC chars
 
181
                                }
 
182
                        }
 
183
                        // Now work out how many of these kmers we should have seen
 
184
                        float predicted = prob * totalKmerCount;
 
185
                                                
 
186
                        k.setObsExp(k.count()/predicted);
 
187
                                
 
188
                        // We shall also calculate the positional variation in obs/exp
 
189
                                
 
190
                        float [] obsExpPositions = new float[groups.length];
 
191
                                
 
192
                        long [] positionCounts = k.getPositions();
 
193
                                
 
194
                        for (int g=0;g<groups.length;g++) {
 
195
                                // This is a summation of the number of Kmers of this length which
 
196
                                // fall into this base group
 
197
                                long totalGroupCount = 0;
 
198
                                
 
199
                                // This is a summation of the number of hit Kmers which fall within
 
200
                                // this base group.
 
201
                                long totalGroupHits = 0;
 
202
                                for (int p=groups[g].lowerCount()-1;p<groups[g].upperCount() && p < positionCounts.length ;p++) {
 
203
                                        totalGroupCount += totalKmerCounts[p][chars.length-1];
 
204
                                        totalGroupHits += positionCounts[p];
 
205
                                }
 
206
                        
 
207
                                // We used to have a filter here which provided a default value where there
 
208
                                // were fewer than 1000 observations at a given position.  This caused breakage
 
209
                                // in small files where every position had fewer than 1000 observations and since
 
210
                                // I can't see why this exclusion was there in the first place, I've now removed
 
211
                                // it.  If the reason for its original inclusion reappears then we can put it back
 
212
                                // (with a more sensible default value), but for the moment this appears to be the
 
213
                                // better fix.
 
214
                                
 
215
                                predicted = prob * totalGroupCount;
 
216
                                obsExpPositions[g] = totalGroupHits/predicted;
 
217
                        }
 
218
                        k.setObsExpPositions(obsExpPositions);
 
219
                        
 
220
                        if (k.obsExp() > 3 || k.maxObsExp() > 5) {
 
221
                                enrichedKmers.add(k);
 
222
                        }                       
 
223
                        
 
224
                }
 
225
                
 
226
                Kmer [] finalKMers = enrichedKmers.toArray(new Kmer[0]);
 
227
                Arrays.sort(finalKMers);                                
 
228
                
 
229
                // Now we take the enrichment positions for the top 6 hits and
 
230
                // record these so we can plot them on a line graph
 
231
                enrichments = new double [Math.min(6, finalKMers.length)][];
 
232
                xLabels = new String[enrichments.length];
 
233
                
 
234
                xCategories = new String [groups.length];
 
235
                
 
236
                for (int i=0;i<xCategories.length;i++) {
 
237
                        xCategories[i] = groups[i].toString();
 
238
                }
 
239
                
 
240
                for (int k=0;k<enrichments.length;k++) {
 
241
                        enrichments[k] = new double[groups.length];
 
242
                        
 
243
                        float [] obsExpPos = finalKMers[k].getObsExpPositions();
 
244
                        
 
245
                        // Find the max enrichment for this sequence
 
246
                        float max = 0;
 
247
                        for (int i=0;i<obsExpPos.length;i++) {                                                          
 
248
                                if (obsExpPos[i] > max) max = obsExpPos[i]; 
 
249
                        }
 
250
                        
 
251
                        for (int g=0;g<groups.length;g++) {                             
 
252
                                enrichments[k][g] = obsExpPos[g]/max * 100; 
 
253
                        }
 
254
                        
 
255
                        xLabels[k] = finalKMers[k].sequence();
 
256
                        
 
257
                }
 
258
                
 
259
                this.enrichedKmers = finalKMers;                
 
260
                
 
261
                calculated = true;
 
262
        }
 
263
        
 
264
                
 
265
        public void processSequence(Sequence sequence) {
 
266
                calculated = false;
 
267
                
 
268
                ++skipCount;
 
269
                if (skipCount % 5 != 0) return;
 
270
                
 
271
                char [] seq = sequence.getSequence().toCharArray();
 
272
 
 
273
                if (seq.length > longestSequence) {
 
274
                        longestSequence = seq.length;
 
275
                }
 
276
                                
 
277
                for (int i=0;i<seq.length;i++) {
 
278
                        switch (seq[i]) {
 
279
                        case 'G': ++gCount;break;
 
280
                        case 'A': ++aCount;break;
 
281
                        case 'T': ++tCount;break;
 
282
                        case 'C': ++cCount;break;
 
283
                        }
 
284
                }
 
285
                
 
286
                // Now we go through all of the Kmers to count these
 
287
                for (int kmerSize=MIN_KMER_SIZE;kmerSize<=MAX_KMER_SIZE;kmerSize++) {
 
288
                        for (int i=0;i<=seq.length-kmerSize;i++) {
 
289
                                
 
290
                                String kmer = sequence.getSequence().substring(i, i+kmerSize);
 
291
                                
 
292
                                // Add to the counts before skipping Kmers containing Ns (see
 
293
                                // explanation in addKmerCount for the reasoning).
 
294
                                addKmerCount(i, kmerSize, kmer);
 
295
                                
 
296
                                // Skip Kmers containing N
 
297
                                if (kmer.indexOf("N") >=0) continue;
 
298
 
 
299
                                if (kmers.containsKey(kmer)) {
 
300
                                        kmers.get(kmer).incrementCount(i);
 
301
                                }
 
302
                                else {
 
303
                                        kmers.put(kmer, new Kmer(kmer,i,(seq.length-kmerSize)+1));
 
304
                                }
 
305
 
 
306
                        }
 
307
                }
 
308
        }
 
309
        
 
310
        public void reset () {
 
311
                calculated = false;
 
312
                gCount = 0;
 
313
                aCount = 0;
 
314
                tCount = 0;
 
315
                cCount = 0;
 
316
                totalKmerCounts = new long[0][0];
 
317
                longestSequence = 0;
 
318
                skipCount = 0;
 
319
                enrichedKmers = null;
 
320
                kmers.clear();
 
321
        }
 
322
 
 
323
        public String description() {
 
324
                return "Identifies short sequences which are overrepresented";
 
325
        }
 
326
 
 
327
        public String name() {
 
328
                return "Kmer Content";
 
329
        }
 
330
 
 
331
        public boolean raisesError() {
 
332
                if (!calculated) calculateEnrichment();
 
333
                
 
334
                // We raise an error if the most enriched kmer is seen more than 100 times
 
335
                // more frequently than we expect.
 
336
                
 
337
                if (enrichedKmers.length > 0 && enrichedKmers[0].maxObsExp() > 10) return true;
 
338
                return false;
 
339
        }
 
340
 
 
341
        public boolean raisesWarning() {
 
342
                if (!calculated) calculateEnrichment();
 
343
                
 
344
                // We raise a warning if there are any enriched kmers
 
345
                if (enrichedKmers.length > 0) return true;
 
346
                return false;
 
347
        }
 
348
 
 
349
        public void makeReport(HTMLReportArchive report) throws IOException {
 
350
                if (!calculated) calculateEnrichment();
 
351
                
 
352
                if (enrichedKmers.length > 0) {
 
353
                        ZipOutputStream zip = report.zipFile();
 
354
                        zip.putNextEntry(new ZipEntry(report.folderName()+"/Images/kmer_profiles.png"));
 
355
 
 
356
                        BufferedImage b = new BufferedImage(800,600,BufferedImage.TYPE_INT_RGB);
 
357
                        Graphics g = b.getGraphics();
 
358
 
 
359
                        LineGraph lg = new LineGraph(enrichments, 0d, 100d, "Position in read (bp)", xLabels, xCategories, "Relative enrichment over read length");
 
360
                        lg.paint(g,800,600);
 
361
 
 
362
                        ImageIO.write((BufferedImage)(b),"PNG",zip);
 
363
 
 
364
                        StringBuffer sb = report.htmlDocument();
 
365
 
 
366
                        sb.append("<p><img class=\"indented\" src=\"Images/kmer_profiles.png\" alt=\"Kmer graph\"></p>\n");
 
367
 
 
368
                }
 
369
                
 
370
                
 
371
                ResultsTable table = new ResultsTable(enrichedKmers);
 
372
                
 
373
                StringBuffer b = report.htmlDocument();
 
374
                StringBuffer d = report.dataDocument();
 
375
                
 
376
                if (enrichedKmers.length == 0) {
 
377
                        b.append("<p>No overrepresented Kmers</p>\n");
 
378
                }
 
379
                
 
380
                else {
 
381
                        b.append("<table>\n");
 
382
                        // Do the headers
 
383
                        b.append("<tr>\n");
 
384
                        d.append("#");
 
385
                        for (int c=0;c<table.getColumnCount();c++) {
 
386
                                b.append("<th>");
 
387
                                b.append(table.getColumnName(c));
 
388
                                d.append(table.getColumnName(c));
 
389
                                b.append("</th>\n");
 
390
                                if (c<table.getColumnCount()-1) {
 
391
                                        d.append("\t");
 
392
                                }
 
393
                        }
 
394
                        b.append("</tr>\n");
 
395
                        d.append("\n");
 
396
                        
 
397
                        // Do the rows
 
398
                        for (int r=0;r<table.getRowCount();r++) {
 
399
                                b.append("<tr>\n");
 
400
                                for (int c=0;c<table.getColumnCount();c++) {
 
401
                                        b.append("<td>");
 
402
                                        b.append(table.getValueAt(r, c));
 
403
                                        d.append(table.getValueAt(r, c));
 
404
                                        b.append("</td>\n");
 
405
                                        if (c<table.getColumnCount()-1) {
 
406
                                                d.append("\t");
 
407
                                        }
 
408
                                }
 
409
                                b.append("</tr>\n");
 
410
                                d.append("\n");
 
411
                        }
 
412
                        
 
413
                        b.append("</table>\n");
 
414
                }       
 
415
        }
 
416
        
 
417
        private class Kmer implements Comparable<Kmer>{
 
418
                
 
419
                private String sequence;
 
420
                private long count = 0;
 
421
                private float obsExp = 0;
 
422
                private float [] obsExpPositions = null;
 
423
                private long [] positions = new long[0];
 
424
                
 
425
                public Kmer (String sequence, int position, int seqLength) {
 
426
 
 
427
                        // Do this slightly convoluted dance to try to avoid
 
428
                        // keeping the whole original sequence in memory
 
429
                        char [] chars = sequence.toCharArray();
 
430
                        this.sequence = new String(chars);
 
431
                        count = 1;
 
432
                        positions = new long[seqLength];
 
433
                        ++positions[position];
 
434
                }
 
435
                
 
436
                public void incrementCount (int position) {
 
437
                        ++count;
 
438
                        
 
439
                        if (position >= positions.length) {
 
440
                                long [] newPositions = new long[position+1];
 
441
                                for (int i=0;i<positions.length;i++) {
 
442
                                        newPositions[i] = positions[i];
 
443
                                }
 
444
                                positions = newPositions;
 
445
                        }
 
446
                        
 
447
                        ++positions[position];
 
448
                        
 
449
                }
 
450
                
 
451
                public long [] getPositions () {
 
452
                        return positions;
 
453
                }
 
454
                
 
455
                public String sequence () {
 
456
                        return sequence;
 
457
                }
 
458
                
 
459
                public long count () {
 
460
                        return count;
 
461
                }
 
462
                
 
463
                public void setObsExp (float oe) {
 
464
                        this.obsExp = oe;
 
465
                }
 
466
                
 
467
                public void setObsExpPositions (float [] oePositions) {
 
468
                        this.obsExpPositions = oePositions;
 
469
                }
 
470
                
 
471
                public float [] getObsExpPositions () {
 
472
                        return obsExpPositions;
 
473
                }
 
474
                
 
475
                public float obsExp () {
 
476
                        return obsExp;
 
477
                }
 
478
                
 
479
                public float maxObsExp () {
 
480
                        float max = 0;
 
481
                        for (int i=0;i<obsExpPositions.length;i++) {
 
482
                                if (obsExpPositions[i]>max) max = obsExpPositions[i];
 
483
                        }
 
484
                        return max;
 
485
                }
 
486
 
 
487
                public int maxPosition () {
 
488
                        float max = 0;
 
489
                        int position = 0;
 
490
                        for (int i=0;i<obsExpPositions.length;i++) {
 
491
                                if (obsExpPositions[i]>max) {
 
492
                                        max = obsExpPositions[i];
 
493
                                        position = i+1;
 
494
                                }
 
495
                        }
 
496
                        
 
497
                        if (position == 0) {
 
498
                                System.err.println("No value > 0 for "+sequence);
 
499
                                position = 1;
 
500
                        }
 
501
                        
 
502
                        return position;
 
503
                }
 
504
 
 
505
                public int compareTo(Kmer o) {
 
506
                        return Float.compare(o.obsExp(), obsExp());
 
507
                }
 
508
        }
 
509
        
 
510
        private class ResultsTable extends AbstractTableModel {
 
511
                
 
512
                private Kmer [] kmers;
 
513
                
 
514
                public ResultsTable (Kmer [] kmers) {
 
515
                        this.kmers = kmers;
 
516
                }
 
517
                
 
518
                
 
519
                // Sequence - Count - Obs/Exp
 
520
                public int getColumnCount() {
 
521
                        return 5;
 
522
                }
 
523
 
 
524
                public int getRowCount() {
 
525
                        return kmers.length;
 
526
                }
 
527
 
 
528
                public Object getValueAt(int rowIndex, int columnIndex) {
 
529
                        switch (columnIndex) {
 
530
                                case 0: return kmers[rowIndex].sequence();
 
531
                                case 1: return kmers[rowIndex].count()*5;
 
532
                                case 2: return kmers[rowIndex].obsExp();
 
533
                                case 3: return kmers[rowIndex].maxObsExp();
 
534
                                case 4: return groups[kmers[rowIndex].maxPosition()-1].toString();
 
535
                                                
 
536
                        }
 
537
                        return null;
 
538
                }
 
539
                
 
540
                public String getColumnName (int columnIndex) {
 
541
                        switch (columnIndex) {
 
542
                                case 0: return "Sequence";
 
543
                                case 1: return "Count";
 
544
                                case 2: return "Obs/Exp Overall";
 
545
                                case 3: return "Obs/Exp Max";
 
546
                                case 4: return "Max Obs/Exp Position";
 
547
                        }
 
548
                        return null;
 
549
                }
 
550
                
 
551
                public Class<?> getColumnClass (int columnIndex) {
 
552
                        switch (columnIndex) {
 
553
                        case 0: return String.class;
 
554
                        case 1: return Integer.class;
 
555
                        case 2: return Float.class;
 
556
                        case 3: return Float.class;
 
557
                        case 4: return String.class;
 
558
                }
 
559
                return null;
 
560
                        
 
561
                }
 
562
        }
 
563
 
 
564
}