Making ReadLengthDistribution Public
Found this neat little walker Kiran wrote stashed in the private tree. Very useful. Generalized it a bit, added GATKDocs and moved it to public. I might include it as a QC step in the PacBio processing pipeline. * generalize it so it works with reads that are not paired-end. * generalize it to work with no read group information
This commit is contained in:
parent
d604019362
commit
4b5a7046c5
|
|
@ -0,0 +1,101 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.diagnostics;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMReadGroupRecord;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||||
|
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||||
|
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Outputs the read length distribution of the reads in a file (for paired reads, only the first mate of each pair is counted).
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Generates a table with the read lengths categorized per sample. If the file has no sample information
|
||||||
|
* (no read groups) it considers all reads to come from the same sample.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* A BAM file.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* A human/R readable table of tab separated values with one column per sample and one row per read length.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java
|
||||||
|
* -jar GenomeAnalysisTK.jar
|
||||||
|
* -T ReadLengthDistribution
|
||||||
|
* -I example.bam
|
||||||
|
* -R reference.fasta
|
||||||
|
* -o example.tbl
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
* @author Kiran Garimela
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
public class ReadLengthDistribution extends ReadWalker<Integer, Integer> {
|
||||||
|
@Output
|
||||||
|
public PrintStream out;
|
||||||
|
|
||||||
|
private GATKReport report;
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
report = new GATKReport();
|
||||||
|
report.addTable("ReadLengthDistribution", "Table of read length distributions");
|
||||||
|
GATKReportTable table = report.getTable("ReadLengthDistribution");
|
||||||
|
|
||||||
|
table.addPrimaryKey("readLength");
|
||||||
|
|
||||||
|
List<SAMReadGroupRecord> readGroups = getToolkit().getSAMFileHeader().getReadGroups();
|
||||||
|
if (readGroups.isEmpty())
|
||||||
|
table.addColumn("SINGLE_SAMPLE", 0);
|
||||||
|
|
||||||
|
else
|
||||||
|
for (SAMReadGroupRecord rg : readGroups)
|
||||||
|
table.addColumn(rg.getSample(), 0);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean filter(ReferenceContext ref, SAMRecord read) {
|
||||||
|
return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer map(ReferenceContext referenceContext, SAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) {
|
||||||
|
GATKReportTable table = report.getTable("ReadLengthDistribution");
|
||||||
|
|
||||||
|
int length = Math.abs(samRecord.getReadLength());
|
||||||
|
String sample = samRecord.getReadGroup().getSample();
|
||||||
|
|
||||||
|
table.increment(length, sample);
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer reduce(Integer integer, Integer integer1) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer sum) {
|
||||||
|
report.print(out);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue