From 538cdf9210dd3e6b80bf7614db9c7fd3f5f0d471 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Mon, 11 Jun 2012 16:19:57 -0400 Subject: [PATCH 1/3] Created the FindCoveredIntervals Moved some stuff in the DiagnoseTargets walker to the more general ThresHolder class Minor tweaks FindCoveredIntervals supports Gathering FindCoveredIntervals outputs an interval list instead of GATKReport Signed-off-by: Mauricio Carneiro --- .../diagnostics/targets/DiagnoseTargets.java | 33 +------- .../targets/FindCoveredIntervals.java | 84 +++++++++++++++++++ .../diagnostics/targets/SampleStatistics.java | 2 +- .../diagnostics/targets/ThresHolder.java | 47 +++++++++-- 4 files changed, 127 insertions(+), 39 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index b96e6cf1c..62477a87c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -33,7 +33,8 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; @@ -147,7 +148,7 @@ public class DiagnoseTargets extends LocusWalker { intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); // get all of the unique sample names for the VCF Header - vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // initialize the VCF header + vcfWriter.writeHeader(new VCFHeader(ThresHolder.getHeaderInfo(), samples)); // initialize the VCF header } @Override @@ -289,34 +290,6 @@ public class DiagnoseTargets extends LocusWalker { } - /** - * Gets the header lines for the VCF writer - * - * @return A set of VCF header lines - */ - private static Set getHeaderInfo() { - Set headerLines = new HashSet(); - - // INFO fields for overall data - headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); - - // FORMAT fields for each genotype - // todo -- find the appropriate VCF constants - headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); - headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); - headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); - - - // FILTER fields - for (CallableStatus stat : CallableStatus.values()) - headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); - - return headerLines; - } - /** * Function that process a set of statuses into strings * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java new file mode 100644 index 000000000..ac60f5f28 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/FindCoveredIntervals.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; + +import java.io.PrintStream; + +@PartitionBy(PartitionType.CONTIG) +@ActiveRegionExtension(extension = 0, maxRegion = 50000) +public class FindCoveredIntervals extends ActiveRegionWalker { + @Output(required = true) + private PrintStream out; + + @Override + // Look to see if the region has sufficient coverage + public double isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + + int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); + + // note the linear probability scale + int coverageThreshold = 20; + return Math.min((double) depth / coverageThreshold, 1); + + } + + @Override + public GenomeLoc map(final ActiveRegion activeRegion, final RefMetaDataTracker tracker) { + if (activeRegion.isActive) + return activeRegion.getLocation(); + else + return null; + } + + @Override + public Long reduceInit() { + return 0L; + } + + @Override + public Long reduce(final GenomeLoc value, Long reduce) { + if (value != null) { + out.println(value.toString()); + return reduce++; + } else + return reduce; + } + + @Override + public void onTraversalDone(Long reduce) { + logger.info(String.format("Found %d intervals", reduce)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index 958a44d4a..25dda1da5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -146,7 +146,7 @@ class SampleStatistics { int locusIndex = locus.getStart() - interval.getStart(); int rawCoverage = pileup.depthOfCoverage(); - int coverage = pileup.getBaseAndMappingFilteredPileup(thresholds.getMinimumBaseQuality(), thresholds.getMinimumMappingQuality()).depthOfCoverage(); + int coverage = thresholds.getFilteredCoverage(pileup); LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index 17098567f..20d1dee62 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -24,6 +24,12 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.HashSet; +import java.util.Set; + class ThresHolder { public static final ThresHolder DEFAULTS = new ThresHolder(20, 20, 5, 700, 20, 50, 0.5, 0.2, 0.5, 0.2, 0.2, 0.5); @@ -69,14 +75,6 @@ class ThresHolder { this.qualityStatusThreshold = qualityStatusThreshold; } - public int getMinimumBaseQuality() { - return minimumBaseQuality; - } - - public int getMinimumMappingQuality() { - return minimumMappingQuality; - } - public int getMinimumCoverage() { return minimumCoverage; } @@ -116,4 +114,37 @@ class ThresHolder { public double getQualityStatusThreshold() { return qualityStatusThreshold; } + + public int getFilteredCoverage(ReadBackedPileup pileup) { + return pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); + } + + /** + * Gets the header lines for the VCF writer + * + * @return A set of VCF header lines + */ + public static Set getHeaderInfo() { + Set headerLines = new HashSet(); + + // INFO fields for overall data + headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); + + // FORMAT fields for each genotype + // todo -- find the appropriate VCF constants + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); + headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); + headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); + + + // FILTER fields + for (CallableStatus stat : CallableStatus.values()) + headerLines.add(new VCFFilterHeaderLine(stat.name(), stat.description)); + + return headerLines; + } + } From bdf5945dcca003bcdc4becdd48816ef354b42f97 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Tue, 12 Jun 2012 16:49:52 -0400 Subject: [PATCH 2/3] Fixed bugs in DiagnoseTargets DT would not report bad mates! that has been fixed Signed-off-by: Mauricio Carneiro --- .../diagnostics/targets/DiagnoseTargets.java | 13 +++++---- .../diagnostics/targets/SampleStatistics.java | 27 ++++++++++++++----- .../diagnostics/targets/ThresHolder.java | 4 +-- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 62477a87c..d35a72602 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -250,6 +250,7 @@ public class DiagnoseTargets extends LocusWalker { private void outputStatsToVCF(IntervalStatistics stats, Allele refAllele) { GenomeLoc interval = stats.getInterval(); + List alleles = new ArrayList(); Map attributes = new HashMap(); ArrayList genotypes = new ArrayList(); @@ -265,7 +266,9 @@ public class DiagnoseTargets extends LocusWalker { attributes.put(VCFConstants.DEPTH_KEY, stats.averageCoverage()); vcb = vcb.attributes(attributes); - + if (debug) { + System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); + } for (String sample : samples) { Map infos = new HashMap(); SampleStatistics sampleStat = stats.getSample(sample); @@ -277,14 +280,14 @@ public class DiagnoseTargets extends LocusWalker { Set filters = new HashSet(); filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses(thresholds))); + if (debug) { + System.out.printf("Found %d bad mates out of %d reads %n", sampleStat.getnBadMates(), sampleStat.getnReads()); + } genotypes.add(new Genotype(sample, null, VariantContext.NO_LOG10_PERROR, filters, infos, false)); } vcb = vcb.genotypes(genotypes); - if (debug) { - System.out.printf("Output -- Interval: %s, Coverage: %.2f%n", stats.getInterval(), stats.averageCoverage()); - } vcfWriter.add(vcb.make()); @@ -306,6 +309,6 @@ public class DiagnoseTargets extends LocusWalker { } private IntervalStatistics createIntervalStatistic(GenomeLoc interval) { - return new IntervalStatistics(samples, interval /*, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality*/); + return new IntervalStatistics(samples, interval); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java index 25dda1da5..ca6070630 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -104,19 +104,19 @@ class SampleStatistics { double intervalSize = interval.size(); - if ((nBadMates / nReads) > thresholds.getBadMateStatusThreshold()) + if (((double) nBadMates / nReads) >= thresholds.getBadMateStatusThreshold()) output.add(CallableStatus.BAD_MATE); - if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) > thresholds.getCoverageStatusThreshold()) + if ((totals.get(CallableStatus.COVERAGE_GAPS) / intervalSize) >= thresholds.getCoverageStatusThreshold()) output.add(CallableStatus.COVERAGE_GAPS); - if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > thresholds.getCoverageStatusThreshold()) + if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) >= thresholds.getCoverageStatusThreshold()) output.add(CallableStatus.LOW_COVERAGE); - if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > thresholds.getExcessiveCoverageThreshold()) + if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) >= thresholds.getExcessiveCoverageThreshold()) output.add(CallableStatus.EXCESSIVE_COVERAGE); - if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > thresholds.getQualityStatusThreshold()) + if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) >= thresholds.getQualityStatusThreshold()) output.add(CallableStatus.POOR_QUALITY); if (totals.get(CallableStatus.REF_N) > 0) @@ -161,7 +161,7 @@ class SampleStatistics { // Was this read already processed? if (read.getTemporaryAttribute("checkedBadMate") == null) { nReads++; - if (hasValidMate(read, thresholds)) + if (!hasValidMate(read, thresholds)) nBadMates++; read.setTemporaryAttribute("checkedBadMate", true); } @@ -254,7 +254,7 @@ class SampleStatistics { * reasonable insert size? * inverted? * same orientation? - * todo - same contig? + * same contig? * is pair mapped? * todo - is forced mate? * @@ -264,6 +264,10 @@ class SampleStatistics { if (!read.getReadPairedFlag()) return false; + // different contigs + if (read.getMateReferenceIndex() != read.getReferenceIndex()) + return false; + // unmapped if (read.getMateUnmappedFlag() || read.getReadUnmappedFlag()) return false; @@ -277,10 +281,19 @@ class SampleStatistics { read.getAlignmentStart() < read.getMateAlignmentStart()) return false; + // TODO note: IGV uses a different alorithm for insert size, there should be a common util class that does this for you // mates are too far apart if (Math.abs(read.getAlignmentStart() - read.getMateAlignmentStart()) > thresholds.getMaximumInsertSize()) return false; return true; } + + public int getnReads() { + return nReads; + } + + public int getnBadMates() { + return nBadMates; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java index 20d1dee62..ae4296449 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/ThresHolder.java @@ -129,12 +129,12 @@ class ThresHolder { // INFO fields for overall data headerLines.add(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "Stop position of the interval")); - headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFInfoHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode")); // FORMAT fields for each genotype // todo -- find the appropriate VCF constants - headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size.")); + headerLines.add(new VCFFormatHeaderLine(VCFConstants.DEPTH_KEY, 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a loci divided by interval size.")); headerLines.add(new VCFFormatHeaderLine("Q1", 1, VCFHeaderLineType.Float, "Lower Quartile of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("MED", 1, VCFHeaderLineType.Float, "Median of depth distribution.")); headerLines.add(new VCFFormatHeaderLine("Q3", 1, VCFHeaderLineType.Float, "Upper Quartile of depth Distribution.")); From 81993b08e205d8d93112a40d0504648e783ee9c0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 13 Jun 2012 11:43:44 -0400 Subject: [PATCH 3/3] Don't put null entries into the key array --- .../sting/gatk/walkers/bqsr/BQSRKeyManager.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java index 9880af405..f0c7c080e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRKeyManager.java @@ -110,7 +110,7 @@ public class BQSRKeyManager { * @return one key in long representation per covariate */ public Long[] longsFromAllKeys(final Long[] allKeys, final EventType eventType) { - final Long[] allFinalKeys = new Long[optionalCovariatesInfo.length > 0 ? optionalCovariatesInfo.length : 1]; // Generate one key per optional covariate + final ArrayList allFinalKeys = new ArrayList(); // Generate one key per optional covariate int covariateIndex = 0; long masterKey = 0L; // This will be a master key holding all the required keys, to replicate later on @@ -128,13 +128,13 @@ public class BQSRKeyManager { long newKey = masterKey | (covariateKey << optionalCovariateOffset); newKey |= (optionalCovariatesInfo[i].covariateID << optionalCovariateIDOffset); - allFinalKeys[i] = newKey; // add this key to the list of keys + allFinalKeys.add(newKey); // add this key to the list of keys } if (optionalCovariatesInfo.length == 0) // special case when we have no optional covariates - allFinalKeys[0] = masterKey; + allFinalKeys.add(masterKey); - return allFinalKeys; + return allFinalKeys.toArray(new Long[allFinalKeys.size()]); } /**