Making ReadLengthDistribution Public

Found this neat little walker Kiran wrote stashed in the private tree. Very useful. Generalized it a bit, added GATKDocs and moved it to public. I might include it as a QC step on the pacbio processing pipeline.
* generalize it so it works with non pair ended reads.
* generalize it to work with no read group information
This commit is contained in:
Mauricio Carneiro 2011-08-31 15:34:58 -04:00
parent d604019362
commit 4b5a7046c5
1 changed files with 101 additions and 0 deletions

View File

@ -0,0 +1,101 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import java.io.PrintStream;
import java.util.List;
/**
* Outputs the read lengths of all the reads in a file.
*
* <p>
* Generates a table with the read lengths categorized per sample. If the file has no sample information
* (no read groups) it considers all reads to come from the same sample.
* </p>
*
*
* <h2>Input</h2>
* <p>
* A BAM file.
* </p>
*
* <h2>Output</h2>
* <p>
* A human/R readable table of tab separated values with one column per sample and one row per read.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ReadLengthDistribution
* -I example.bam
* -R reference.fasta
* -o example.tbl
* </pre>
*
* @author Kiran Garimela
*/
public class ReadLengthDistribution extends ReadWalker<Integer, Integer> {
@Output
public PrintStream out;
private GATKReport report;
public void initialize() {
report = new GATKReport();
report.addTable("ReadLengthDistribution", "Table of read length distributions");
GATKReportTable table = report.getTable("ReadLengthDistribution");
table.addPrimaryKey("readLength");
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
if (readGroups.isEmpty())
table.addColumn("SINGLE_SAMPLE", 0);
else
for (SAMReadGroupRecord rg : readGroups)
table.addColumn(rg.getSample(), 0);
}
public boolean filter(ReferenceContext ref, SAMRecord read) {
return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag());
}
@Override
public Integer map(ReferenceContext referenceContext, SAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) {
GATKReportTable table = report.getTable("ReadLengthDistribution");
int length = Math.abs(samRecord.getReadLength());
String sample = samRecord.getReadGroup().getSample();
table.increment(length, sample);
return null;
}
@Override
public Integer reduceInit() {
return null;
}
@Override
public Integer reduce(Integer integer, Integer integer1) {
return null;
}
public void onTraversalDone(Integer sum) {
report.print(out);
}
}