Rewrite ReadLengthDistribution to count the read lengths into a hash table first and only at the end to produce a GATK report table.

Before that fix, the tool was couldn't work with more then one RG before.
- Address all review comments
This commit is contained in:
Ami Levy-Moonshine 2013-11-07 14:54:35 -05:00
parent cc85e373d0
commit 6ad841cec5
1 changed files with 77 additions and 24 deletions

View File

@ -38,7 +38,10 @@ import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream; import java.io.PrintStream;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/** /**
* Outputs the read lengths of all the reads in a file. * Outputs the read lengths of all the reads in a file.
@ -77,51 +80,101 @@ public class ReadLengthDistribution extends ReadWalker<Integer, Integer> {
@Output @Output
public PrintStream out; public PrintStream out;
private GATKReport report; //A map from RG to its column number (its index in an int[] array)
private Map<SAMReadGroupRecord,Integer> readGroupsLocation;
//Each line in the table is a read length and each column it the number of reads of a specific RG with that length. Thus a table is a map between read lengths to array of values (one for each RG).
private Map<Integer,int[]> table;
private List<SAMReadGroupRecord> readGroups;
public void initialize() { public void initialize() {
final List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups(); readGroups = getToolkit().getSAMFileHeader().getReadGroups();
readGroupsLocation = new HashMap<>();
table = new TreeMap<>();
int readGroupsNum = 0;
report = new GATKReport(); if (!readGroups.isEmpty()){
report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroups.isEmpty() ? 1 : readGroups.size())); for (SAMReadGroupRecord rg : readGroups){
GATKReportTable table = report.getTable("ReadLengthDistribution"); readGroupsLocation.put(rg,readGroupsNum);
readGroupsNum++;
table.addColumn("readLength"); }
}
if (readGroups.isEmpty())
table.addColumn("SINGLE_SAMPLE");
else
for (SAMReadGroupRecord rg : readGroups)
table.addColumn(rg.getSample());
}
public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag());
} }
@Override @Override
public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { public Integer map(final ReferenceContext referenceContext,final GATKSAMRecord samRecord,final RefMetaDataTracker RefMetaDataTracker) {
GATKReportTable table = report.getTable("ReadLengthDistribution");
int length = Math.abs(samRecord.getReadLength()); final int length = Math.abs(samRecord.getReadLength());
String sample = samRecord.getReadGroup().getSample(); final SAMReadGroupRecord rg = samRecord.getReadGroup();
table.increment(length, sample); increment(table,length, rg);
return null; return null;
} }
final private void increment(final Map<Integer,int[]> table,final int length,final SAMReadGroupRecord rg){
if(readGroupsLocation.isEmpty()){
if(table.containsKey(length))
table.get(length)[0]++;
else{
final int[] newLength = {1};
table.put(length,newLength);
}
}
else{
final int rgLocation = readGroupsLocation.get(rg);
if(table.containsKey(length))
table.get(length)[rgLocation]++;
else{
table.put(length,new int[readGroupsLocation.size()]);
table.get(length)[rgLocation]++;
}
}
}
@Override @Override
public Integer reduceInit() { public Integer reduceInit() {
return null; return null;
} }
@Override @Override
public Integer reduce(Integer integer, Integer integer1) { public Integer reduce(final Integer integer,final Integer integer1) {
return null; return null;
} }
public void onTraversalDone(Integer sum) { public void onTraversalDone(final Integer sum) {
final GATKReport report = createGATKReport();
report.print(out); report.print(out);
} }
final private GATKReport createGATKReport(){
final GATKReport report = new GATKReport();
report.addTable("ReadLengthDistribution", "Table of read length distributions", 1 + (readGroupsLocation.isEmpty() ? 1 : readGroupsLocation.size()));
final GATKReportTable tableReport = report.getTable("ReadLengthDistribution");
tableReport.addColumn("readLength");
if (readGroupsLocation.isEmpty()){
tableReport.addColumn("SINGLE_SAMPLE");
int rowIndex = 0;
for (Integer length : table.keySet()){
tableReport.set(rowIndex,0,length);
tableReport.set(rowIndex,1,table.get(length)[0]);
rowIndex++;
}
}
else{
for (SAMReadGroupRecord rg : readGroups)
tableReport.addColumn(rg.getSample());
int rowIndex = 0;
for (Integer length : table.keySet()){
tableReport.set(rowIndex,0,length);
for (int i=0; i < readGroupsLocation.size(); i++)
tableReport.set(rowIndex,i+1,table.get(length)[i]);
rowIndex++;
}
}
return report;
}
} }