2
* Copyright 2010-12 Simon Andrews
4
* This file is part of FastQC.
6
* FastQC is free software; you can redistribute it and/or modify
7
* it under the terms of the GNU General Public License as published by
8
* the Free Software Foundation; either version 3 of the License, or
9
* (at your option) any later version.
11
* FastQC is distributed in the hope that it will be useful,
12
* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
* GNU General Public License for more details.
16
* You should have received a copy of the GNU General Public License
17
* along with FastQC; if not, write to the Free Software
18
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20
package uk.ac.babraham.FastQC.Modules;
22
import java.awt.BorderLayout;
23
import java.util.ArrayList;
24
import java.util.Arrays;
25
import java.util.HashMap;
26
import java.util.Iterator;
27
import java.util.List;
29
import javax.swing.JLabel;
30
import javax.swing.JPanel;
31
import javax.swing.JScrollPane;
32
import javax.swing.JTable;
33
import javax.swing.table.AbstractTableModel;
34
import javax.swing.table.TableModel;
36
import uk.ac.babraham.FastQC.Report.HTMLReportArchive;
37
import uk.ac.babraham.FastQC.Sequence.Sequence;
38
import uk.ac.babraham.FastQC.Sequence.Contaminant.ContaminantHit;
39
import uk.ac.babraham.FastQC.Sequence.Contaminant.ContaminentFinder;
41
public class OverRepresentedSeqs implements QCModule {
43
protected HashMap<String, Integer>sequences = new HashMap<String, Integer>();
44
protected int count = 0;
45
private OverrepresentedSeq [] overrepresntedSeqs = null;
46
private boolean calculated = false;
47
private boolean frozen = false;
48
private DuplicationLevel duplicationModule;
50
// This is the number of different sequences we want to track
51
private final int OBSERVATION_CUTOFF = 200000;
52
// This is a count of how many unique sequences we've seen so far
53
// so we know when to stop adding them.
54
private int uniqueSequenceCount = 0;
55
// This was the total count at the point at which we saw our total
56
// number of unique sequences, so we know what to correct by when
57
// extrapolating to the whole file
58
protected int countAtUniqueLimit = 0;
61
/**
 * Creates the module and its companion DuplicationLevel module, which is
 * handed a reference to this instance.
 */
public OverRepresentedSeqs () {
62
duplicationModule = new DuplicationLevel(this);
65
public String description() {
66
return "Identifies sequences which are overrepresented in the set";
69
/**
 * NOTE(review): judging by the name, reports whether this module wants
 * filtered sequences to be withheld from it; the returned value is not
 * visible here - confirm before relying on it.
 */
public boolean ignoreFilteredSequences() {
73
public DuplicationLevel duplicationLevelModule () {
74
return duplicationModule;
77
/**
 * Builds the Swing panel for this module: a centred title label in the NORTH
 * slot and, in the CENTER slot, a scrollable table of the overrepresented
 * sequences or a label stating that there are none.
 */
public JPanel getResultsPanel() {
78
JPanel returnPanel = new JPanel();
79
returnPanel.setLayout(new BorderLayout());
80
returnPanel.add(new JLabel("Overrepresented sequences",JLabel.CENTER),BorderLayout.NORTH);
82
// Run the calculation on first use so that overrepresntedSeqs is populated.
if (!calculated) getOverrepresentedSeqs();
84
if (overrepresntedSeqs.length > 0) {
85
// Wrap the results in a table model so a JTable can display them.
TableModel model = new ResultsTable(overrepresntedSeqs);
86
JTable table = new JTable(model);
87
// Let the user select individual cells rather than only whole rows.
table.setCellSelectionEnabled(true);
88
returnPanel.add(new JScrollPane(table),BorderLayout.CENTER);
91
// NOTE(review): presumably the alternative branch of the length check above,
// shown when nothing was flagged - confirm.
returnPanel.add(new JLabel("There are no overrepresented sequences",JLabel.CENTER),BorderLayout.CENTER);
98
public DuplicationLevel getDuplicationLevelModule () {
99
return duplicationModule;
101
/**
 * Works out which tracked sequences are overrepresented and stores them,
 * sorted, in overrepresntedSeqs. Declared synchronized, so only one thread
 * at a time can run it on a given instance.
 */
private synchronized void getOverrepresentedSeqs () {
103
// If the duplication module hasn't already done
104
// its calculation it needs to do it now before
105
// we stomp all over the data
106
duplicationModule.calculateLevels();
108
Iterator<String> s = sequences.keySet().iterator();
109
// Collects the sequences that exceed the percentage threshold.
List<OverrepresentedSeq>keepers = new ArrayList<OverrepresentedSeq>();
111
while (s.hasNext()) {
112
String seq = s.next();
113
// Share of this sequence relative to count, expressed as a percentage.
double percentage = ((double)sequences.get(seq)/count)*100;
114
// Only sequences making up more than 0.1% are considered further.
if (percentage > 0.1) {
115
// Constructing the record also triggers a contaminant lookup for seq.
OverrepresentedSeq os = new OverrepresentedSeq(seq, sequences.get(seq), percentage);
120
overrepresntedSeqs = keepers.toArray(new OverrepresentedSeq[0]);
121
// Natural ordering of OverrepresentedSeq puts the highest counts first.
Arrays.sort(overrepresntedSeqs);
127
/**
 * NOTE(review): judging by the name, returns the module to its initial
 * state; the statements that do so are not visible here - confirm which
 * fields are cleared.
 */
public void reset () {
132
public String name() {
133
return "Overrepresented sequences";
136
/**
 * Records one incoming sequence in the sequences map, truncating long reads
 * first so that they can be matched by exact string identity.
 *
 * @param sequence the read whose sequence string is to be counted
 */
public void processSequence(Sequence sequence) {
142
// Since we rely on identity to match sequences we can't trust really long
143
// sequences, so anything over 75bp gets truncated to 50bp.
144
String seq = sequence.getSequence();
145
if (seq.length() > 75) {
146
// Copy the prefix into a fresh String rather than keeping the substring result.
seq = new String(seq.substring(0, 50));
149
// Already tracked: bump the stored count by one.
if (sequences.containsKey(seq)) {
150
sequences.put(seq, sequences.get(seq)+1);
154
// NOTE(review): presumably the branch for a sequence not seen before -
// start its count at one and update the unique-sequence bookkeeping. Confirm.
sequences.put(seq, 1);
155
++uniqueSequenceCount;
156
// Remember the running total at the moment this new sequence was added.
countAtUniqueLimit = count;
157
// Reaching the cutoff marks the point where tracking of new sequences changes.
if (uniqueSequenceCount == OBSERVATION_CUTOFF) {
165
/**
 * Table model that presents an array of OverrepresentedSeq records as rows,
 * one column per reported property.
 */
private class ResultsTable extends AbstractTableModel {
167
// The records backing the table rows.
private OverrepresentedSeq [] seqs;
169
public ResultsTable (OverrepresentedSeq [] seqs) {
174
// Columns: Sequence - Count - Percentage - Possible Source
175
public int getColumnCount() {
179
public int getRowCount() {
183
// Maps a column index onto the matching accessor of the row's record.
public Object getValueAt(int rowIndex, int columnIndex) {
184
switch (columnIndex) {
185
case 0: return seqs[rowIndex].seq();
186
case 1: return seqs[rowIndex].count();
187
case 2: return seqs[rowIndex].percentage();
188
case 3: return seqs[rowIndex].contaminantHit();
194
// Header text for each column; indices match getValueAt.
public String getColumnName (int columnIndex) {
195
switch (columnIndex) {
196
case 0: return "Sequence";
197
case 1: return "Count";
198
case 2: return "Percentage";
199
case 3: return "Possible Source";
204
// Value type of each column; indices match getValueAt.
public Class<?> getColumnClass (int columnIndex) {
205
switch (columnIndex) {
206
case 0: return String.class;
207
case 1: return Integer.class;
208
case 2: return Double.class;
209
case 3: return String.class;
216
/**
 * One overrepresented sequence together with its count, its percentage and
 * the result of a contaminant lookup. Instances compare by count.
 */
private class OverrepresentedSeq implements Comparable<OverrepresentedSeq>{
220
private double percentage;
221
// Result of the contaminant lookup; may be null (see contaminantHit()).
private ContaminantHit contaminantHit;
223
public OverrepresentedSeq (String seq, int count, double percentage) {
226
this.percentage = percentage;
227
// The lookup is performed once, at construction time.
this.contaminantHit = ContaminentFinder.findContaminantHit(seq);
230
public String seq () {
234
public int count () {
238
public double percentage () {
242
// Text form of the contaminant lookup result; a null hit is handled separately.
public String contaminantHit () {
243
if (contaminantHit == null) {
247
return contaminantHit.toString();
251
// Orders records by descending count: a larger count in o gives a positive result.
// NOTE(review): subtraction-based comparison overflows if the two counts ever
// differ by more than Integer.MAX_VALUE (only possible with a negative count).
public int compareTo(OverrepresentedSeq o) {
252
return o.count-count;
256
/**
 * Decides whether this module reports an error, based on the first entry of
 * the sorted overrepresented-sequence array.
 */
public boolean raisesError() {
257
// Run the calculation on first use so that overrepresntedSeqs is populated.
if (!calculated) getOverrepresentedSeqs();
258
if (overrepresntedSeqs.length>0) {
259
// The array is sorted with the highest count first, so element 0 is the
// most frequent sequence; the test is whether it exceeds 1%.
if (overrepresntedSeqs[0].percentage > 1) {
266
/**
 * Decides whether this module reports a warning: the presence of any
 * overrepresented sequence is enough to return true.
 */
public boolean raisesWarning() {
267
// Run the calculation on first use so that overrepresntedSeqs is populated.
if (!calculated) getOverrepresentedSeqs();
269
if (overrepresntedSeqs.length > 0) return true;
273
/**
 * Writes this module's results into the report: the column names and cell
 * values go to both the HTML document and the data document, and the HTML
 * document additionally receives the surrounding markup.
 *
 * @param report the archive whose HTML and data buffers are appended to
 */
public void makeReport(HTMLReportArchive report) {
274
// Run the calculation on first use so that overrepresntedSeqs is populated.
if (!calculated) getOverrepresentedSeqs();
275
// Reuse the table model so column names and values match the GUI table.
ResultsTable table = new ResultsTable(overrepresntedSeqs);
277
StringBuffer b = report.htmlDocument();
278
StringBuffer d = report.dataDocument();
280
// With nothing to report, the HTML gets a short message.
if (overrepresntedSeqs.length == 0) {
281
b.append("<p>No overrepresented sequences</p>\n");
285
b.append("<table>\n");
289
// Header row: one entry per column.
for (int c=0;c<table.getColumnCount();c++) {
291
b.append(table.getColumnName(c));
292
d.append(table.getColumnName(c));
294
// Every column except the last gets special handling here.
if (c<table.getColumnCount()-1) {
302
// Body: one row per overrepresented sequence, one entry per column.
for (int r=0;r<table.getRowCount();r++) {
304
for (int c=0;c<table.getColumnCount();c++) {
306
// NOTE(review): values are appended to the HTML buffer without escaping;
// confirm that sequence and contaminant text cannot contain markup.
b.append(table.getValueAt(r, c));
307
d.append(table.getValueAt(r, c));
309
// Every column except the last gets special handling here.
if (c<table.getColumnCount()-1) {
317
b.append("</table>\n");