From 0376d73ece4e337c1b29935f200fe9e478945d2a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 7 Mar 2012 13:08:52 -0500 Subject: [PATCH] Improved, public version of ErrorRateByCycle -- A cleaner table output (molten). For those interested in seeing how this can be done with GATKReports look here for a nice clean example -- Integration tests -- Minor improvements to GATKReportTable with methods to getPrimaryKeys --- .../sting/gatk/report/GATKReportTable.java | 4 + .../diagnostics/ErrorRatePerCycle.java | 162 ++++++++++++++++++ .../diagnostics/ReadGroupProperties.java | 3 - .../ErrorRatePerCycleIntegrationTest.java | 41 +++++ 4 files changed, 207 insertions(+), 3 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index b72b20e0b..b59b550e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -296,6 +296,10 @@ public class GATKReportTable { return primaryKeyColumn.contains(primaryKey); } + public Collection getPrimaryKeys() { + return Collections.unmodifiableCollection(primaryKeyColumn); + } + /** * Set the value for a given position in the table * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java new file mode 100755 index 000000000..e7a2f74e2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -0,0 +1,162 @@ +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; + +/** + * Computes the read error rate per position in read (in the original 5'->3' orientation that the read had coming off the machine) + * + * Emits a GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate for each read + * group in the input BAMs FOR ONLY THE FIRST OF PAIR READS. + * + *

Input

+ *

+ * Any number of BAM files + *

+ * + *

Output

+ *

+ * GATKReport containing readgroup, cycle, mismatches, counts, qual, and error rate. + * + * For example, running this tool on the NA12878 data sets: + * + *

+ *      ##:GATKReport.v0.2 ErrorRatePerCycle : The error rate per sequenced position in the reads
+ *      readgroup  cycle  mismatches  counts  qual  errorrate
+ *      20FUK.1        0          80   23368    25   3.47e-03
+ *      20FUK.1        1          40   23433    28   1.75e-03
+ *      20FUK.1        2          36   23453    28   1.58e-03
+ *      20FUK.1        3          26   23476    29   1.15e-03
+ *      20FUK.1        4          32   23495    29   1.40e-03
+ *      up to 101 cycles
+ *      20FUK.2        0          77   20886    24   3.73e-03
+ *      20FUK.2        1          28   20920    29   1.39e-03
+ *      20FUK.2        2          24   20931    29   1.19e-03
+ *      20FUK.2        3          30   20940    28   1.48e-03
+ *      20FUK.2        4          25   20948    29   1.24e-03
+ *      up to 101 cycles
+ *      20FUK.3        0          78   22038    24   3.58e-03
+ *      20FUK.3        1          40   22091    27   1.86e-03
+ *      20FUK.3        2          23   22108    30   1.09e-03
+ *      20FUK.3        3          36   22126    28   1.67e-03
+ *      
+ *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ErrorRatePerCycle
+ *      -I bundle/current/b37/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam
+ *      -R bundle/current/b37/human_g1k_v37.fasta
+ *      -o example.gatkreport.txt
+ *  
+ * + * @author Kiran Garimella, Mark DePristo + */ +public class ErrorRatePerCycle extends LocusWalker { + @Output PrintStream out; + @Argument(fullName="min_base_quality_score", shortName="mbq", doc="Minimum base quality required to consider a base for calling", required=false) + public Integer MIN_BASE_QUAL = 0; + @Argument(fullName="min_mapping_quality_score", shortName="mmq", doc="Minimum read mapping quality required to consider a read for calling", required=false) + public Integer MIN_MAPPING_QUAL = 20; + + private GATKReport report; + private GATKReportTable table; + private final static String reportName = "ErrorRatePerCycle"; + private final static String reportDescription = "The error rate per sequenced position in the reads"; + + /** + * Allows us to use multiple records for the key (read group x cycle) + */ + private static class TableKey implements Comparable { + final String readGroup; + final int cycle; + + private TableKey(final String readGroup, final int cycle) { + this.readGroup = readGroup; + this.cycle = cycle; + } + + @Override + public int compareTo(final TableKey tableKey) { + final int scmp = readGroup.compareTo(tableKey.readGroup); + if ( scmp == 0 ) + return Integer.valueOf(cycle).compareTo(tableKey.cycle); + else + return scmp; + } + } + + public void initialize() { + report = new GATKReport(); + report.addTable(reportName, reportDescription); + table = report.getTable(reportName); + table.addPrimaryKey("key", false); + table.addColumn("readgroup", 0); + table.addColumn("cycle", 0); + table.addColumn("mismatches", 0); + table.addColumn("counts", 0); + table.addColumn("qual", 0); + table.addColumn("errorrate", 0.0f, "%.2e"); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + for ( final PileupElement p : context.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + final int offset = p.getOffset(); + final boolean firstOfPair = ! read.getReadPairedFlag() || read.getFirstOfPairFlag(); + + if ( firstOfPair && read.getMappingQuality() >= MIN_MAPPING_QUAL && p.getQual() >= MIN_BASE_QUAL ) { + final byte readBase = p.getBase(); + final byte refBase = ref.getBase(); + final int cycle = offset; + + if ( BaseUtils.isRegularBase(readBase) && BaseUtils.isRegularBase(refBase) ) { + final TableKey key = new TableKey(read.getReadGroup().getReadGroupId(), cycle); + + if ( ! table.containsKey(key) ) { + table.set(key, "cycle", cycle); + table.set(key, "readgroup", read.getReadGroup().getReadGroupId()); + } + + table.increment(key, "counts"); + if (readBase != refBase) + table.increment(key, "mismatches"); + } + } + } + + return null; + } + + public Integer reduceInit() { return null; } + + public Integer reduce(Integer value, Integer sum) { return null; } + + public void onTraversalDone(Integer sum) { + for ( final Object key : table.getPrimaryKeys() ) { + final int mismatches = (Integer)table.get(key, "mismatches"); + final int count = (Integer)table.get(key, "counts"); + final double errorRate = (mismatches + 1) / (1.0*(count + 1)); + final int qual = QualityUtils.probToQual(1-errorRate, 0.0); + table.set(key, "qual", qual); + table.set(key, "errorrate", errorRate); + } + + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index d7a48d321..14985907d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -94,9 +94,6 @@ import java.util.Map; * * @author Mark DePristo */ - - - public class ReadGroupProperties extends ReadWalker { @Output public PrintStream out; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java new file mode 100644 index 000000000..accb9c0cf --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycleIntegrationTest.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class ErrorRatePerCycleIntegrationTest extends WalkerTest { + @Test + public void basicTest() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ErrorRatePerCycle -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-10,100,000 -o %s", + 1, + Arrays.asList("0cc212ecb6df300e321784039ff29f13")); + executeTest("ErrorRatePerCycle:", spec); + } +} \ No newline at end of file