2
* Copyright Copyright 2010-12 Simon Andrews
4
* This file is part of FastQC.
6
* FastQC is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; either version 3 of the License, or
9
* (at your option) any later version.
11
* FastQC is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU General Public License for more details.
16
* You should have received a copy of the GNU General Public License
17
* along with FastQC; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
package uk.ac.babraham.FastQC.Modules;
22
import java.awt.BorderLayout;
23
import java.awt.Graphics;
24
import java.awt.image.BufferedImage;
25
import java.io.IOException;
26
import java.util.Arrays;
27
import java.util.Hashtable;
28
import java.util.Iterator;
29
import java.util.Vector;
30
import java.util.zip.ZipEntry;
31
import java.util.zip.ZipOutputStream;
33
import javax.imageio.ImageIO;
34
import javax.swing.JLabel;
35
import javax.swing.JPanel;
36
import javax.swing.JScrollPane;
37
import javax.swing.JSplitPane;
38
import javax.swing.JTable;
39
import javax.swing.table.AbstractTableModel;
40
import javax.swing.table.TableModel;
42
import uk.ac.babraham.FastQC.Graphs.BaseGroup;
43
import uk.ac.babraham.FastQC.Graphs.LineGraph;
44
import uk.ac.babraham.FastQC.Report.HTMLReportArchive;
45
import uk.ac.babraham.FastQC.Sequence.Sequence;
47
public class KmerContent implements QCModule {
49
private Hashtable<String, Kmer> kmers = new Hashtable<String, Kmer>((int)Math.pow(4, MAX_KMER_SIZE));
50
private long gCount = 0;
51
private long aCount = 0;
52
private long tCount = 0;
53
private long cCount = 0;
55
private int longestSequence = 0;
56
private long [][] totalKmerCounts = new long [0][0];
57
private long skipCount = 0;
59
private static int MIN_KMER_SIZE = 5;
60
private static int MAX_KMER_SIZE = 5;
62
public boolean calculated = false;
64
private Kmer [] enrichedKmers = null;
65
private double [][] enrichments = null;
66
private String [] xCategories = new String[0];
67
private String [] xLabels = new String[0];
71
public KmerContent () {
72
if (System.getProperty("fastqc.kmer_size") != null) {
73
int kmerSize = Integer.parseInt(System.getProperty("fastqc.kmer_size"));
74
MIN_KMER_SIZE = kmerSize;
75
MAX_KMER_SIZE = kmerSize;
79
public boolean ignoreFilteredSequences() {
83
public JPanel getResultsPanel() {
85
if (!calculated) calculateEnrichment();
86
JPanel returnPanel = new JPanel();
87
returnPanel.setLayout(new BorderLayout());
88
returnPanel.add(new JLabel("Overrepresented Kmers",JLabel.CENTER),BorderLayout.NORTH);
90
JSplitPane splitPanel = new JSplitPane(JSplitPane.VERTICAL_SPLIT);
92
if (enrichedKmers.length > 0) {
93
TableModel model = new ResultsTable(enrichedKmers);
94
splitPanel.setBottomComponent(new JScrollPane(new JTable(model)));
95
splitPanel.setTopComponent(new LineGraph(enrichments, 0d, 100d, "Position in read (bp)", xLabels, xCategories, "Relative enrichment over read length"));
96
returnPanel.add(splitPanel,BorderLayout.CENTER);
99
returnPanel.add(new JLabel("There are no overrepresented Kmers",JLabel.CENTER),BorderLayout.CENTER);
106
* This method simply keeps a count of the number of Kmers of a given size
107
* seen at each position within the run. We can use this later on to calculate
108
* the enrichment of the Kmers we actually count.
110
* We take in the Kmer sequence even though this isn't used in the total counts
111
* we do this because we don't want to count Kmers with Ns in them, but we do
112
* need to ensure that the data structure is expanded to the right size, and if
113
* we have libraries where later positions are Ns in all sequences then our
114
* data structure ends up too short and we crash.
116
* @param position Position within the read. 0 indexed
117
* @param kmerLength Actual length of the Kmer analysed
119
private void addKmerCount (int position,int kmerLength, String kmer) {
122
if (position >= totalKmerCounts.length) {
123
// We need to expand the array
124
long [][] newCounts = new long[position+1][];
125
for (int i=0;i<totalKmerCounts.length;i++) {
126
newCounts[i] = totalKmerCounts[i];
128
for (int i=totalKmerCounts.length;i<newCounts.length;i++) {
129
newCounts[i] = new long[MAX_KMER_SIZE];
132
totalKmerCounts = newCounts;
135
if (kmer.indexOf("N") >=0) return;
137
++totalKmerCounts[position][kmerLength-1];
141
private synchronized void calculateEnrichment () {
143
// For each kmer we work out the number of times we should have
144
// seen this by chance, and then the obs/exp for each of the
145
// kmers. We can then filter for a specific level of enrichment
146
// and show only those Kmers
148
float totalBases = gCount+aCount+tCount+cCount;
150
float gProb = ((float)gCount)/totalBases;
151
float aProb = ((float)aCount)/totalBases;
152
float tProb = ((float)tCount)/totalBases;
153
float cProb = ((float)cCount)/totalBases;
155
// We'll be grouping together positions later so make up the groups now
156
groups = BaseGroup.makeBaseGroups((longestSequence-MIN_KMER_SIZE)+1);
158
Vector<Kmer>enrichedKmers = new Vector<Kmer>();
160
Iterator<Kmer> rawKmers = kmers.values().iterator();
162
KMER: while (rawKmers.hasNext()) {
163
Kmer k = rawKmers.next();
165
long totalKmerCount = 0;
167
for (int i=0;i<totalKmerCounts.length;i++) {
168
totalKmerCount += totalKmerCounts[i][k.sequence().length()-1];
173
char [] chars = k.sequence().toCharArray();
174
for (int c=0;c<chars.length;c++) {
176
case 'G': prob *= gProb;break;
177
case 'A': prob *= aProb;break;
178
case 'T': prob *= tProb;break;
179
case 'C': prob *= cProb;break;
180
default: continue KMER; // Ignore Kmers containing non-GATC chars
183
// Now work out how many of these kmers we should have seen
184
float predicted = prob * totalKmerCount;
186
k.setObsExp(k.count()/predicted);
188
// We shall also calculate the positional variation in obs/exp
190
float [] obsExpPositions = new float[groups.length];
192
long [] positionCounts = k.getPositions();
194
for (int g=0;g<groups.length;g++) {
195
// This is a summation of the number of Kmers of this length which
196
// fall into this base group
197
long totalGroupCount = 0;
199
// This is a summation of the number of hit Kmers which fall within
201
long totalGroupHits = 0;
202
for (int p=groups[g].lowerCount()-1;p<groups[g].upperCount() && p < positionCounts.length ;p++) {
203
totalGroupCount += totalKmerCounts[p][chars.length-1];
204
totalGroupHits += positionCounts[p];
207
// We used to have a filter here which provided a default value where there
208
// were fewer than 1000 observations at a given position. This caused breakage
209
// in small files where every position had fewer than 1000 observations and since
210
// I can't see why this exclusion was there in the first place, I've now removed
211
// it. If the reason for its original inclusion reappears then we can put it back
212
// (with a more sensible default value), but for the moment this appears to be the
215
predicted = prob * totalGroupCount;
216
obsExpPositions[g] = totalGroupHits/predicted;
218
k.setObsExpPositions(obsExpPositions);
220
if (k.obsExp() > 3 || k.maxObsExp() > 5) {
221
enrichedKmers.add(k);
226
Kmer [] finalKMers = enrichedKmers.toArray(new Kmer[0]);
227
Arrays.sort(finalKMers);
229
// Now we take the enrichment positions for the top 6 hits and
230
// record these so we can plot them on a line graph
231
enrichments = new double [Math.min(6, finalKMers.length)][];
232
xLabels = new String[enrichments.length];
234
xCategories = new String [groups.length];
236
for (int i=0;i<xCategories.length;i++) {
237
xCategories[i] = groups[i].toString();
240
for (int k=0;k<enrichments.length;k++) {
241
enrichments[k] = new double[groups.length];
243
float [] obsExpPos = finalKMers[k].getObsExpPositions();
245
// Find the max enrichment for this sequence
247
for (int i=0;i<obsExpPos.length;i++) {
248
if (obsExpPos[i] > max) max = obsExpPos[i];
251
for (int g=0;g<groups.length;g++) {
252
enrichments[k][g] = obsExpPos[g]/max * 100;
255
xLabels[k] = finalKMers[k].sequence();
259
this.enrichedKmers = finalKMers;
265
public void processSequence(Sequence sequence) {
269
if (skipCount % 5 != 0) return;
271
char [] seq = sequence.getSequence().toCharArray();
273
if (seq.length > longestSequence) {
274
longestSequence = seq.length;
277
for (int i=0;i<seq.length;i++) {
279
case 'G': ++gCount;break;
280
case 'A': ++aCount;break;
281
case 'T': ++tCount;break;
282
case 'C': ++cCount;break;
286
// Now we go through all of the Kmers to count these
287
for (int kmerSize=MIN_KMER_SIZE;kmerSize<=MAX_KMER_SIZE;kmerSize++) {
288
for (int i=0;i<=seq.length-kmerSize;i++) {
290
String kmer = sequence.getSequence().substring(i, i+kmerSize);
292
// Add to the counts before skipping Kmers containing Ns (see
293
// explanation in addKmerCount for the reasoning).
294
addKmerCount(i, kmerSize, kmer);
296
// Skip Kmers containing N
297
if (kmer.indexOf("N") >=0) continue;
299
if (kmers.containsKey(kmer)) {
300
kmers.get(kmer).incrementCount(i);
303
kmers.put(kmer, new Kmer(kmer,i,(seq.length-kmerSize)+1));
310
public void reset () {
316
totalKmerCounts = new long[0][0];
319
enrichedKmers = null;
323
public String description() {
324
return "Identifies short sequences which are overrepresented";
327
public String name() {
328
return "Kmer Content";
331
public boolean raisesError() {
332
if (!calculated) calculateEnrichment();
334
// We raise an error if the most enriched kmer is seen more than 100 times
335
// more frequently than we expect.
337
if (enrichedKmers.length > 0 && enrichedKmers[0].maxObsExp() > 10) return true;
341
public boolean raisesWarning() {
342
if (!calculated) calculateEnrichment();
344
// We raise a warning if there are any enriched kmers
345
if (enrichedKmers.length > 0) return true;
349
public void makeReport(HTMLReportArchive report) throws IOException {
350
if (!calculated) calculateEnrichment();
352
if (enrichedKmers.length > 0) {
353
ZipOutputStream zip = report.zipFile();
354
zip.putNextEntry(new ZipEntry(report.folderName()+"/Images/kmer_profiles.png"));
356
BufferedImage b = new BufferedImage(800,600,BufferedImage.TYPE_INT_RGB);
357
Graphics g = b.getGraphics();
359
LineGraph lg = new LineGraph(enrichments, 0d, 100d, "Position in read (bp)", xLabels, xCategories, "Relative enrichment over read length");
362
ImageIO.write((BufferedImage)(b),"PNG",zip);
364
StringBuffer sb = report.htmlDocument();
366
sb.append("<p><img class=\"indented\" src=\"Images/kmer_profiles.png\" alt=\"Kmer graph\"></p>\n");
371
ResultsTable table = new ResultsTable(enrichedKmers);
373
StringBuffer b = report.htmlDocument();
374
StringBuffer d = report.dataDocument();
376
if (enrichedKmers.length == 0) {
377
b.append("<p>No overrepresented Kmers</p>\n");
381
b.append("<table>\n");
385
for (int c=0;c<table.getColumnCount();c++) {
387
b.append(table.getColumnName(c));
388
d.append(table.getColumnName(c));
390
if (c<table.getColumnCount()-1) {
398
for (int r=0;r<table.getRowCount();r++) {
400
for (int c=0;c<table.getColumnCount();c++) {
402
b.append(table.getValueAt(r, c));
403
d.append(table.getValueAt(r, c));
405
if (c<table.getColumnCount()-1) {
413
b.append("</table>\n");
417
private class Kmer implements Comparable<Kmer>{
419
private String sequence;
420
private long count = 0;
421
private float obsExp = 0;
422
private float [] obsExpPositions = null;
423
private long [] positions = new long[0];
425
public Kmer (String sequence, int position, int seqLength) {
427
// Do this slightly convoluted dance to try to avoid
428
// keeping the whole original sequence in memory
429
char [] chars = sequence.toCharArray();
430
this.sequence = new String(chars);
432
positions = new long[seqLength];
433
++positions[position];
436
public void incrementCount (int position) {
439
if (position >= positions.length) {
440
long [] newPositions = new long[position+1];
441
for (int i=0;i<positions.length;i++) {
442
newPositions[i] = positions[i];
444
positions = newPositions;
447
++positions[position];
451
public long [] getPositions () {
455
public String sequence () {
459
public long count () {
463
public void setObsExp (float oe) {
467
public void setObsExpPositions (float [] oePositions) {
468
this.obsExpPositions = oePositions;
471
public float [] getObsExpPositions () {
472
return obsExpPositions;
475
public float obsExp () {
479
public float maxObsExp () {
481
for (int i=0;i<obsExpPositions.length;i++) {
482
if (obsExpPositions[i]>max) max = obsExpPositions[i];
487
public int maxPosition () {
490
for (int i=0;i<obsExpPositions.length;i++) {
491
if (obsExpPositions[i]>max) {
492
max = obsExpPositions[i];
498
System.err.println("No value > 0 for "+sequence);
505
public int compareTo(Kmer o) {
506
return Float.compare(o.obsExp(), obsExp());
510
private class ResultsTable extends AbstractTableModel {
512
private Kmer [] kmers;
514
public ResultsTable (Kmer [] kmers) {
519
// Sequence - Count - Obs/Exp
520
public int getColumnCount() {
524
public int getRowCount() {
528
public Object getValueAt(int rowIndex, int columnIndex) {
529
switch (columnIndex) {
530
case 0: return kmers[rowIndex].sequence();
531
case 1: return kmers[rowIndex].count()*5;
532
case 2: return kmers[rowIndex].obsExp();
533
case 3: return kmers[rowIndex].maxObsExp();
534
case 4: return groups[kmers[rowIndex].maxPosition()-1].toString();
540
public String getColumnName (int columnIndex) {
541
switch (columnIndex) {
542
case 0: return "Sequence";
543
case 1: return "Count";
544
case 2: return "Obs/Exp Overall";
545
case 3: return "Obs/Exp Max";
546
case 4: return "Max Obs/Exp Position";
551
public Class<?> getColumnClass (int columnIndex) {
552
switch (columnIndex) {
553
case 0: return String.class;
554
case 1: return Integer.class;
555
case 2: return Float.class;
556
case 3: return Float.class;
557
case 4: return String.class;