Improved, public version of ErrorRateByCycle

-- A cleaner table output (molten).  For those interested in seeing how this can be done with GATKReports look here for a nice clean example
-- Integration tests
-- Minor improvements to GATKReportTable with methods to getPrimaryKeys
This commit is contained in:
Mark DePristo 2012-03-07 13:08:52 -05:00
parent a6a8fc0521
commit 0376d73ece
4 changed files with 207 additions and 3 deletions

View File

@ -296,6 +296,10 @@ public class GATKReportTable {
return primaryKeyColumn.contains(primaryKey);
}
public Collection<Object> getPrimaryKeys() {
return Collections.unmodifiableCollection(primaryKeyColumn);
}
/**
* Set the value for a given position in the table
*

View File

@ -0,0 +1,162 @@
package org.broadinstitute.sting.gatk.walkers.diagnostics;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
/**
* Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine)
*
* Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read
* group in the input BAMs FOR ONLY THE FIRST OF PAIR READS.
*
* <h2>Input</h2>
* <p>
* Any number of BAM files
* </p>
*
* <h2>Output</h2>
* <p>
* GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate.
*
* For example, running this tool on the NA12878 data sets:
*
* <pre>
* ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
* readgroup cycle mismatches counts qual errorrate
* 20FUK.1 0 80 23368 25 3.47e-03
* 20FUK.1 1 40 23433 28 1.75e-03
* 20FUK.1 2 36 23453 28 1.58e-03
* 20FUK.1 3 26 23476 29 1.15e-03
* 20FUK.1 4 32 23495 29 1.40e-03
* up to 101 cycles
* 20FUK.2 0 77 20886 24 3.73e-03
* 20FUK.2 1 28 20920 29 1.39e-03
* 20FUK.2 2 24 20931 29 1.19e-03
* 20FUK.2 3 30 20940 28 1.48e-03
* 20FUK.2 4 25 20948 29 1.24e-03
* up to 101 cycles
* 20FUK.3 0 78 22038 24 3.58e-03
* 20FUK.3 1 40 22091 27 1.86e-03
* 20FUK.3 2 23 22108 30 1.09e-03
* 20FUK.3 3 36 22126 28 1.67e-03
* </pre>
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ErrorRatePerCycle
* -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
* -R bundle/current/b37/human_g1k_v37.fasta
* -o example.gatkreport.txt
* </pre>
*
* @author Kiran Garimella, Mark DePristo
*/
public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
@Output PrintStream out;
@Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false)
public Integer MIN_BASE_QUAL = 0;
@Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling", required=false)
public Integer MIN_MAPPING_QUAL = 20;
private GATKReport report;
private GATKReportTable table;
private final static String reportName = "ErrorRatePerCycle";
private final static String reportDescription = "The error rate per sequenced position in the reads";
/**
* Allows us to use multiple records for the key (read group x cycle)
*/
private static class TableKey implements Comparable<TableKey> {
final String readGroup;
final int cycle;
private TableKey(final String readGroup, final int cycle) {
this.readGroup = readGroup;
this.cycle = cycle;
}
@Override
public int compareTo(final TableKey tableKey) {
final int scmp = readGroup.compareTo(tableKey.readGroup);
if ( scmp == 0 )
return Integer.valueOf(cycle).compareTo(tableKey.cycle);
else
return scmp;
}
}
public void initialize() {
report = new GATKReport();
report.addTable(reportName, reportDescription);
table = report.getTable(reportName);
table.addPrimaryKey("key", false);
table.addColumn("readgroup", 0);
table.addColumn("cycle", 0);
table.addColumn("mismatches", 0);
table.addColumn("counts", 0);
table.addColumn("qual", 0);
table.addColumn("errorrate", 0.0f, "%.2e");
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
for ( final PileupElement p : context.getBasePileup() ) {
final GATKSAMRecord read = p.getRead();
final int offset = p.getOffset();
final boolean firstOfPair = ! read.getReadPairedFlag() || read.getFirstOfPairFlag();
if ( firstOfPair && read.getMappingQuality() >= MIN_MAPPING_QUAL && p.getQual() >= MIN_BASE_QUAL ) {
final byte readBase = p.getBase();
final byte refBase = ref.getBase();
final int cycle = offset;
if ( BaseUtils.isRegularBase(readBase) && BaseUtils.isRegularBase(refBase) ) {
final TableKey key = new TableKey(read.getReadGroup().getReadGroupId(), cycle);
if ( ! table.containsKey(key) ) {
table.set(key, "cycle", cycle);
table.set(key, "readgroup", read.getReadGroup().getReadGroupId());
}
table.increment(key, "counts");
if (readBase != refBase)
table.increment(key, "mismatches");
}
}
}
return null;
}
public Integer reduceInit() { return null; }
public Integer reduce(Integer value, Integer sum) { return null; }
public void onTraversalDone(Integer sum) {
for ( final Object key : table.getPrimaryKeys() ) {
final int mismatches = (Integer)table.get(key, "mismatches");
final int count = (Integer)table.get(key, "counts");
final double errorRate = (mismatches + 1) / (1.0*(count + 1));
final int qual = QualityUtils.probToQual(1-errorRate, 0.0);
table.set(key, "qual", qual);
table.set(key, "errorrate", errorRate);
}
report.print(out);
}
}

View File

@ -94,9 +94,6 @@ import java.util.Map;
*
* @author Mark DePristo
*/
public class ReadGroupProperties extends ReadWalker<Integer, Integer> {
@Output
public PrintStream out;

View File

@ -0,0 +1,41 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.diagnostics;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class ErrorRatePerCycleIntegrationTest extends WalkerTest {
@Test
public void basicTest() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s",
1,
Arrays.asList("0cc212ecb6df300e321784039ff29f13"));
executeTest("ErrorRatePerCycle:", spec);
}
}