From 6ddf2170b690c6f4c94e48cd5ecb7d1464de7958 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 9 Apr 2012 11:46:16 -0400 Subject: [PATCH 01/12] More efficient implementation of the sum of the allele frequency posteriors matrix using a pre-allocated cache as discussed in group meeting last week. Now, when the cache is filled, we safely collapse down to a single value in real space and put the un-re-centered log10 value back into the front of the cache. Thanks to all for the help and advice. --- .../AlleleFrequencyCalculationResult.java | 36 +++++++++++-------- .../genotyper/UnifiedGenotyperEngine.java | 10 +++--- .../broadinstitute/sting/utils/MathUtils.java | 10 ++++-- .../org/broadinstitute/sting/utils/Utils.java | 26 -------------- 4 files changed, 34 insertions(+), 48 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index 0867d949e..c93e780bf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.MathUtils; -import java.util.ArrayList; import java.util.Arrays; /** @@ -42,12 +41,13 @@ public class AlleleFrequencyCalculationResult { // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles private double log10MLE; private double log10MAP; - final private int[] alleleCountsOfMLE; - final private int[] alleleCountsOfMAP; + private final int[] alleleCountsOfMLE; + private final int[] alleleCountsOfMAP; // The posteriors seen, not including that of AF=0 - // TODO -- better implementation needed here (see below) - private ArrayList log10PosteriorMatrixValues = new ArrayList(100000); + private static final int POSTERIORS_CACHE_SIZE = 5000; + private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE]; + private int currentPosteriorsCacheIndex = 0; private Double log10PosteriorMatrixSum = null; // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) @@ -69,14 +69,9 @@ public class AlleleFrequencyCalculationResult { return log10MAP; } - public double getLog10PosteriorMatrixSum() { + public double getLog10PosteriorsMatrixSumWithoutAFzero() { if ( log10PosteriorMatrixSum == null ) { - // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory; - // TODO -- will discuss with the team what the best option is - final double[] tmp = new double[log10PosteriorMatrixValues.size()]; - for ( int i = 0; i < tmp.length; i++ ) - tmp[i] = log10PosteriorMatrixValues.get(i); - log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp); + log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); } return log10PosteriorMatrixSum; } @@ -103,7 +98,7 @@ public class AlleleFrequencyCalculationResult { alleleCountsOfMLE[i] = 0; alleleCountsOfMAP[i] = 0; } - log10PosteriorMatrixValues.clear(); + currentPosteriorsCacheIndex = 0; log10PosteriorMatrixSum = null; } @@ -116,7 +111,8 @@ public class AlleleFrequencyCalculationResult { } public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - log10PosteriorMatrixValues.add(log10LofK); + addToPosteriorsCache(log10LofK); + if ( log10LofK > log10MAP ) { log10MAP = log10LofK; for ( int i = 0; i < alleleCountsForK.length; i++ ) @@ -124,6 +120,18 @@ public class AlleleFrequencyCalculationResult { } } + private void addToPosteriorsCache(final double log10LofK) { + // add to the cache + log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK; + + // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell + if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) { + final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); + log10PosteriorMatrixValues[0] = temporarySum; + currentPosteriorsCacheIndex = 1; + } + } + public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; if ( log10LikelihoodOfAFzero > log10MLE ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index d4206e8ef..5d926a865 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -326,7 +326,7 @@ public class UnifiedGenotyperEngine { } else { phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); if ( Double.isInfinite(phredScaledConfidence) ) { - final double sum = AFresult.getLog10PosteriorMatrixSum(); + final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); } } @@ -369,7 +369,7 @@ public class UnifiedGenotyperEngine { // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List alternateAllelesToUse = builder.make().getAlternateAlleles(); @@ -380,7 +380,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod @@ -389,7 +389,7 @@ public class UnifiedGenotyperEngine { afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum(); + double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -424,7 +424,7 @@ public class UnifiedGenotyperEngine { public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum(); + normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); return MathUtils.normalizeFromLog10(normalizedPosteriors); } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index c4b0165ca..5e3160452 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -237,7 +237,7 @@ public class MathUtils { public static double log10sumLog10(double[] log10p, int start, int finish) { double sum = 0.0; - double maxValue = Utils.findMaxEntry(log10p); + double maxValue = arrayMax(log10p, finish); if(maxValue == Double.NEGATIVE_INFINITY) return sum; @@ -554,7 +554,7 @@ public class MathUtils { // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = Utils.findMaxEntry(array); + double maxValue = arrayMax(array); // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { @@ -627,10 +627,14 @@ public class MathUtils { return maxI; } - public static double arrayMax(double[] array) { + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + public static double arrayMin(double[] array) { return array[minElementIndex(array)]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index 4817966fe..c2c608903 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -290,32 +290,6 @@ public class Utils { return m; } - - // returns the maximum value in the array - public static double findMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).first; - } - - // returns the index of the maximum value in the array - public static int findIndexOfMaxEntry(double[] array) { - return findIndexAndMaxEntry(array).second; - } - - // returns the the maximum value and its index in the array - private static Pair findIndexAndMaxEntry(double[] array) { - if ( array.length == 0 ) - return new Pair(0.0, -1); - int index = 0; - double max = array[0]; - for (int i = 1; i < array.length; i++) { - if ( array[i] > max ) { - max = array[i]; - index = i; - } - } - return new Pair(max, index); - } - /** * Splits expressions in command args by spaces and returns the array of expressions. * Expressions may use single or double quotes to group any individual expression, but not both. From ea4300d58373660a10c38750b3f74c2617f17db3 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 9 Apr 2012 13:45:17 -0400 Subject: [PATCH 02/12] Refactoring so that Unified Argument Collection doesn't use deprecated classes. --- ...ploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java | 2 ++ .../gatk/walkers/genotyper/UnifiedArgumentCollection.java | 4 ++-- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java index 5f374e597..5d6cf9f7d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.java @@ -72,6 +72,8 @@ import static java.lang.Math.pow; */ public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable { + public final static double DEFAULT_PCR_ERROR_RATE = 1e-4; + protected final static int FIXED_PLOIDY = 2; protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1; protected final static double ploidyAdjustment = log10(FIXED_PLOIDY); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 58b8af493..9f606cdfb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -45,7 +45,7 @@ public class UnifiedArgumentCollection { * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 */ @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY; + public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily @@ -53,7 +53,7 @@ public class UnifiedArgumentCollection { * effectively acts as a cap on the base qualities. */ @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) - public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; + public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE; /** * Specifies how to determine the alternate allele to use for genotyping diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 5d926a865..9241482d4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -53,6 +53,9 @@ public class UnifiedGenotyperEngine { public static final int DEFAULT_PLOIDY = 2; + public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; + public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; + public enum OUTPUT_MODE { /** produces calls only at variant sites */ EMIT_VARIANTS_ONLY, @@ -622,8 +625,6 @@ public class UnifiedGenotyperEngine { } - public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3; - public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4; protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) { if( model.name().contains("SNP") ) return HUMAN_SNP_HETEROZYGOSITY; From f82986ee62cb2f65c31990798c0d4ff0bfa94e8d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 9 Apr 2012 14:28:25 -0400 Subject: [PATCH 04/12] Adding unit tests for the very important log10sumLog10 util method. --- .../sting/utils/MathUtilsUnitTest.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index adc7927a7..5327d4cf2 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -284,6 +284,18 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); } + @Test + public void testLog10sumLog10() { + final double log3 = 0.477121254719662; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0); + + final double log2 = 0.301029995663981; + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0); + Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0); + } + @Test public void testDotProduct() { Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3); From 550179a1f7221fb8d58f3f3f0fc695abac629724 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 9 Apr 2012 14:53:05 -0400 Subject: [PATCH 05/12] Major refactorings/optimizations of pool caller, output still bit-true to older version: a) Move DEFAULT_PLOIDY from UnifiedGenotyperEngine to VariantContextUtils. b) Optimize iteration through all possible allele combinations. c) Don't store log PL's in hashmap from allele conformations to double, it was too slow. Things can still be optimized much more down the line if needed. d) Remove remaining traces of genotype priors. --- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 3 ++- .../sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java | 4 +--- .../sting/utils/variantcontext/GenotypeLikelihoods.java | 4 ++++ .../sting/utils/variantcontext/VariantContextUtils.java | 5 +++-- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index e3d0efaa1..8df501e1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; import java.util.*; @@ -216,7 +217,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP"); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); - UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UnifiedGenotyperEngine.DEFAULT_PLOIDY); + UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, VariantContextUtils.DEFAULT_PLOIDY); // initialize the header Set headerInfo = getHeaderInfo(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index d4206e8ef..e561fc511 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -50,8 +50,6 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; - - public static final int DEFAULT_PLOIDY = 2; public enum OUTPUT_MODE { /** produces calls only at variant sites */ @@ -111,7 +109,7 @@ public class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- @Requires({"toolkit != null", "UAC != null"}) public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) { - this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); + this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()), VariantContextUtils.DEFAULT_PLOIDY*(SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()).size())); } @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0","ploidy>0"}) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index 7aa0b2605..a6b2bbb21 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -278,6 +278,10 @@ public class GenotypeLikelihoods { public static int calculateNumLikelihoods(final int numAlleles, final int ploidy) { + // fast, closed form solution for diploid samples (most common use case) + if (ploidy==2) + return numAlleles*(numAlleles+1)/2; + if (numAlleles == 1) return 1; else if (ploidy == 1) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 2a121b6b0..584e76cf9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -30,7 +30,6 @@ import org.apache.commons.jexl2.JexlEngine; import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -48,6 +47,8 @@ public class VariantContextUtils { public final static String MERGE_FILTER_PREFIX = "filterIn"; final public static JexlEngine engine = new JexlEngine(); + public static final int DEFAULT_PLOIDY = 2; + static { engine.setSilent(false); // will throw errors now for selects that don't evaluate properly engine.setLenient(false); @@ -1123,7 +1124,7 @@ public class VariantContextUtils { } // calculateNumLikelihoods takes total # of alleles. Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, UnifiedGenotyperEngine.DEFAULT_PLOIDY); + final int numLikelihoods = GenotypeLikelihoods.calculateNumLikelihoods(1+numOriginalAltAlleles, DEFAULT_PLOIDY); for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); // consider this entry only if both of the alleles are good From 9ece93ae9cedeb4e3d9224fbcf9f449492ca5012 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Fri, 30 Mar 2012 12:44:04 -0400 Subject: [PATCH 06/12] DiagnoseTargets now outputs a VCF file - refactored the statistics classes - concurrent callable statuses by sample are now available. Signed-off-by: Mauricio Carneiro --- .../walkers/coverage/CallableLociWalker.java | 173 +++++++------ .../diagnostics/targets/CallableStatus.java | 72 +++++- .../diagnostics/targets/DiagnoseTargets.java | 236 ++++++++++++++---- .../targets/IntervalStatisticLocus.java | 34 --- .../targets/IntervalStatistics.java | 150 +++++------ .../diagnostics/targets/LocusStatistics.java | 83 ++++++ .../diagnostics/targets/SampleStatistics.java | 175 +++++++++++++ 7 files changed, 659 insertions(+), 264 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 1dfc6fea0..2a8940de0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -1,23 +1,25 @@ /* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * * OTHER DEALINGS IN THE SOFTWARE. + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.coverage; @@ -42,40 +44,40 @@ import java.io.PrintStream; /** * Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome - * + *

*

* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system * considers the coverage at each locus and emits either a per base state or a summary interval BED file that * partitions the genomic intervals into the following callable states: *

- *
REF_N
- *
the reference base was an N, which is not considered callable the GATK
- *
CALLABLE
- *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
- *
NO_COVERAGE
- *
absolutely no reads were seen at this locus, regardless of the filtering parameters
- *
LOW_COVERAGE
- *
there were less than min. depth bases at the locus, after applying filters
- *
EXCESSIVE_COVERAGE
- *
more than -maxDepth read at the locus, indicating some sort of mapping problem
- *
POOR_MAPPING_QUALITY
- *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
REF_N
+ *
the reference base was an N, which is not considered callable the GATK
+ *
PASS
+ *
the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ *
NO_COVERAGE
+ *
absolutely no reads were seen at this locus, regardless of the filtering parameters
+ *
LOW_COVERAGE
+ *
there were less than min. depth bases at the locus, after applying filters
+ *
EXCESSIVE_COVERAGE
+ *
more than -maxDepth read at the locus, indicating some sort of mapping problem
+ *
POOR_MAPPING_QUALITY
+ *
more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
*
*

- * + *

*

Input

*

- * A BAM file containing exactly one sample. + * A BAM file containing exactly one sample. *

- * + *

*

Output

*

*

    - *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • - *
  • -summary: a table of callable status x count of all examined bases
  • + *
  • -o: a OutputFormatted (recommended BED) file with the callable status covering each base
  • + *
  • -summary: a table of callable status x count of all examined bases
  • *
*

- * + *

*

Examples

*
  *     -T CallableLociWalker \
@@ -83,31 +85,31 @@ import java.io.PrintStream;
  *     -summary my.summary \
  *     -o my.bed
  * 
- * + *

* would produce a BED file (my.bed) that looks like: - * + *

*

- *     20 10000000 10000864 CALLABLE
+ *     20 10000000 10000864 PASS
  *     20 10000865 10000985 POOR_MAPPING_QUALITY
- *     20 10000986 10001138 CALLABLE
+ *     20 10000986 10001138 PASS
  *     20 10001139 10001254 POOR_MAPPING_QUALITY
- *     20 10001255 10012255 CALLABLE
+ *     20 10001255 10012255 PASS
  *     20 10012256 10012259 POOR_MAPPING_QUALITY
- *     20 10012260 10012263 CALLABLE
+ *     20 10012260 10012263 PASS
  *     20 10012264 10012328 POOR_MAPPING_QUALITY
- *     20 10012329 10012550 CALLABLE
+ *     20 10012329 10012550 PASS
  *     20 10012551 10012551 LOW_COVERAGE
- *     20 10012552 10012554 CALLABLE
+ *     20 10012552 10012554 PASS
  *     20 10012555 10012557 LOW_COVERAGE
- *     20 10012558 10012558 CALLABLE
+ *     20 10012558 10012558 PASS
  *     et cetera...
  * 
* as well as a summary table that looks like: - * + *

*

  *                        state nBases
  *                        REF_N 0
- *                     CALLABLE 996046
+ *                     PASS 996046
  *                  NO_COVERAGE 121
  *                 LOW_COVERAGE 928
  *           EXCESSIVE_COVERAGE 0
@@ -139,21 +141,21 @@ public class CallableLociWalker extends LocusWalker minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+     * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the PASS
      * state.
      */
     @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
     byte minMappingQuality = 10;
 
     /**
-     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+     * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the PASS state
      */
     @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
     byte minBaseQuality = 20;
 
     /**
      * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
-     * value and is less than maxDepth the site is considered CALLABLE.
+     * value and is less than maxDepth the site is considered PASS.
      */
     @Advanced
     @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
@@ -191,7 +193,7 @@ public class CallableLociWalker extends LocusWalker= minMappingQuality && ( e.getQual() >= minBaseQuality || e.isDeletion() ) ) {
+                if (e.getMappingQual() >= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) {
                     QCDepth++;
                 }
             }
 
             //System.out.printf("%s rawdepth = %d QCDepth = %d lowMAPQ = %d%n", context.getLocation(), rawDepth, QCDepth, lowMAPQDepth);
-            if ( rawDepth == 0 ) {
+            if (rawDepth == 0) {
                 state = CalledState.NO_COVERAGE;
-            } else if ( rawDepth >= minDepthLowMAPQ && MathUtils.ratio( lowMAPQDepth, rawDepth ) >= maxLowMAPQFraction ) {
+            } else if (rawDepth >= minDepthLowMAPQ && MathUtils.ratio(lowMAPQDepth, rawDepth) >= maxLowMAPQFraction) {
                 state = CalledState.POOR_MAPPING_QUALITY;
-            } else if ( QCDepth < minDepth ) {
+            } else if (QCDepth < minDepth) {
                 state = CalledState.LOW_COVERAGE;
-            } else if ( rawDepth >= maxDepth && maxDepth != -1 ) {
+            } else if (rawDepth >= maxDepth && maxDepth != -1) {
                 state = CalledState.EXCESSIVE_COVERAGE;
             } else {
                 state = CalledState.CALLABLE;
             }
         }
 
-        return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
+        return new CallableBaseState(getToolkit().getGenomeLocParser(), context.getLocation(), state);
     }
 
     @Override
@@ -328,15 +345,15 @@ public class CallableLociWalker extends LocusWalker
  * 

* [Long description of the walker] *

- * - * + *

+ *

*

Input

*

* [Description of the Input] *

- * + *

*

Output

*

* [Description of the Output] *

- * + *

*

Examples

*
  *    java
@@ -51,12 +73,13 @@ import java.util.TreeSet;
  * @since 2/1/12
  */
 @By(value = DataSource.READS)
-public class DiagnoseTargets extends LocusWalker {
+@PartitionBy(PartitionType.INTERVAL)
+public class DiagnoseTargets extends LocusWalker implements AnnotatorCompatibleWalker {
     @Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
     private IntervalBinding intervalTrack = null;
 
-    @Output
-    private PrintStream out = System.out;
+    @Output(doc = "File to which variants should be written", required = true)
+    protected VCFWriter vcfWriter = null;
 
     @Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
     private int expandInterval = 50;
@@ -73,13 +96,13 @@ public class DiagnoseTargets extends LocusWalker {
     @Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
     private int maximumCoverage = 700;
 
-    private TreeSet intervalList = null;                     // The list of intervals of interest (plus expanded intervals if user wants them)
-    private HashMap intervalMap = null;  // interval => statistics
-    private Iterator intervalListIterator;                   // An iterator to go over all the intervals provided as we traverse the genome
-    private GenomeLoc currentInterval = null;                           // The "current" interval loaded and being filled with statistics
-    private IntervalStatistics currentIntervalStatistics = null;                 // The "current" interval loaded and being filled with statistics
-
-    private GenomeLocParser parser;                                     // just an object to allow us to create genome locs (for the expanded intervals)
+    private TreeSet intervalList = null;                                                                     // The list of intervals of interest (plus expanded intervals if user wants them)
+    private HashMap intervalMap = null;                                                  // interval => statistics
+    private Iterator intervalListIterator;                                                                   // An iterator to go over all the intervals provided as we traverse the genome
+    private GenomeLoc currentInterval = null;                                                                           // The "current" interval loaded
+    private IntervalStatistics currentIntervalStatistics = null;                                                        // The "current" interval being filled with statistics
+    private Set samples = null;                                                                                 // All the samples being processed
+    private GenomeLocParser parser;                                                                                     // just an object to allow us to create genome locs (for the expanded intervals)
 
     @Override
     public void initialize() {
@@ -88,38 +111,72 @@ public class DiagnoseTargets extends LocusWalker {
         if (intervalTrack == null)
             throw new UserException("This tool currently only works if you provide an interval track");
 
-        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary());       // Important to initialize the parser before creating the intervals below
+        parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary());                                       // Important to initialize the parser before creating the intervals below
 
-        List originalList = intervalTrack.getIntervals(getToolkit());        // The original list of targets provided by the user that will be expanded or not depending on the options provided
+        List originalList = intervalTrack.getIntervals(getToolkit());                                        // The original list of targets provided by the user that will be expanded or not depending on the options provided
         intervalList = new TreeSet(new GenomeLocComparator());
-        intervalMap = new HashMap(originalList.size() * 2);
+        intervalMap = new HashMap();
         for (GenomeLoc interval : originalList)
-            addAndExpandIntervalToLists(interval);
+            intervalList.add(interval);
+        //addAndExpandIntervalToMap(interval);
 
         intervalListIterator = intervalList.iterator();
+
+        // get all of the unique sample names
+        samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+
+        // initialize the header
+        Set headerInfo = getHeaderInfo();
+
+        vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));
+    }
+
+    /**
+     * Gets the header lines for the VCF writer
+     *
+     * @return A set of VCF header lines
+     */
+    private Set getHeaderInfo() {
+        Set headerLines = new HashSet();
+
+        // INFO fields for overall data
+        headerLines.add(new VCFInfoHeaderLine("END", 1, VCFHeaderLineType.Integer, "Stop position of the interval"));
+        headerLines.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
+        headerLines.add(new VCFInfoHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
+        headerLines.add(new VCFInfoHeaderLine("Diagnose Targets", 0, VCFHeaderLineType.Flag, "DiagnoseTargets mode"));
+
+        // FORMAT fields for each genotype
+        headerLines.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Total depth in the site. Sum of the depth of all pools"));
+        headerLines.add(new VCFFormatHeaderLine("AD", 1, VCFHeaderLineType.Float, "Average depth across the interval. Sum of the depth in a lci divided by interval size."));
+
+        // FILTER fields
+
+        for (CallableStatus stat : CallableStatus.values()) {
+            headerLines.add(new VCFHeaderLine(stat.name(), stat.description));
+        }
+
+        return headerLines;
     }
 
     @Override
     public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
         GenomeLoc refLocus = ref.getLocus();
-        while (currentInterval == null || currentInterval.isBefore(refLocus)) {
+        while (currentInterval == null || currentInterval.isBefore(refLocus)) {                                         // do this for first time and while currentInterval is behind current locus
             if (!intervalListIterator.hasNext())
                 return 0L;
 
+            if (currentInterval != null)
+                processIntervalStats(currentInterval, Allele.create(ref.getBase(), true));
+
             currentInterval = intervalListIterator.next();
+            addAndExpandIntervalToMap(currentInterval);
             currentIntervalStatistics = intervalMap.get(currentInterval);
         }
 
-        if (currentInterval.isPast(refLocus))
+        if (currentInterval.isPast(refLocus))                                                                           // skip if we are behind the current interval
             return 0L;
 
-        byte[] mappingQualities = context.getBasePileup().getMappingQuals();
-        byte[] baseQualities = context.getBasePileup().getQuals();
-        int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
-        int rawCoverage = context.size();
-
-        IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
-        currentIntervalStatistics.addLocus(refLocus, locusData);
+        currentIntervalStatistics.addLocus(context);                                                                    // Add current locus to stats
 
         return 1L;
     }
@@ -129,6 +186,13 @@ public class DiagnoseTargets extends LocusWalker {
         return 0L;
     }
 
+    /**
+     * Not sure what we are going to do here
+     *
+     * @param value result of the map.
+     * @param sum   accumulator for the reduce.
+     * @return a long
+     */
     @Override
     public Long reduce(Long value, Long sum) {
         return sum + value;
@@ -136,14 +200,25 @@ public class DiagnoseTargets extends LocusWalker {
 
     @Override
     public void onTraversalDone(Long result) {
-        super.onTraversalDone(result);
-        out.println("Interval\tCallStatus\tCOV\tAVG");
-        for (GenomeLoc interval : intervalList) {
-            IntervalStatistics stats = intervalMap.get(interval);
-            out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
-        }
+        for (GenomeLoc interval : intervalMap.keySet()) 
+            processIntervalStats(interval, Allele.create("
", true)); } + @Override + public RodBinding getSnpEffRodBinding() {return null;} + + @Override + public RodBinding getDbsnpRodBinding() {return null;} + + @Override + public List> getCompRodBindings() {return null;} + + @Override + public List> getResourceRodBindings() {return null;} + + @Override + public boolean alwaysAppendDbsnpId() {return false;} + private GenomeLoc createIntervalBefore(GenomeLoc interval) { int start = Math.max(interval.getStart() - expandInterval, 0); int stop = Math.max(interval.getStart() - 1, 0); @@ -157,16 +232,75 @@ public class DiagnoseTargets extends LocusWalker { return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop); } - private void addAndExpandIntervalToLists(GenomeLoc interval) { + /** + * Takes an interval and commits it to memory. + * It will expand it if so told by the -exp command line argument + * + * @param interval The new interval to process + */ + private void addAndExpandIntervalToMap(GenomeLoc interval) { if (expandInterval > 0) { GenomeLoc before = createIntervalBefore(interval); GenomeLoc after = createIntervalAfter(interval); intervalList.add(before); intervalList.add(after); - intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); - intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(before, new IntervalStatistics(samples, before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + intervalMap.put(after, new IntervalStatistics(samples, after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); } - intervalList.add(interval); - intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + if (!intervalList.contains(interval)) + intervalList.add(interval); + intervalMap.put(interval, new IntervalStatistics(samples, interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality)); + } + + /** + * Takes the interval, finds it in the stash, prints it to the VCF, and removes it + * + * @param interval The interval in memory that you want to write out and clear + * @param allele the allele + */ + private void processIntervalStats(GenomeLoc interval, Allele allele) { + IntervalStatistics stats = intervalMap.get(interval); + + List alleles = new ArrayList(); + Map attributes = new HashMap(); + ArrayList genotypes = new ArrayList(); + + alleles.add(allele); + VariantContextBuilder vcb = new VariantContextBuilder("DiagnoseTargets", interval.getContig(), interval.getStart(), interval.getStop(), alleles); + + vcb = vcb.log10PError(VariantContext.NO_LOG10_PERROR); // QUAL field makes no sense in our VCF + vcb.filters(statusesToStrings(stats.callableStatuses())); + + attributes.put(VCFConstants.END_KEY, interval.getStop()); + attributes.put(VCFConstants.DEPTH_KEY, stats.totalCoverage()); + attributes.put("AV", stats.averageCoverage()); + + vcb = vcb.attributes(attributes); + + for (String sample : samples) { + Map infos = new HashMap(); + infos.put("DP", stats.getSample(sample).totalCoverage()); + infos.put("AV", stats.getSample(sample).averageCoverage()); + + Set filters = new HashSet(); + filters.addAll(statusesToStrings(stats.getSample(sample).getCallableStatuses())); + + + genotypes.add(new Genotype(sample, alleles, VariantContext.NO_LOG10_PERROR, filters, infos, false)); + } + vcb = vcb.genotypes(genotypes); + + vcfWriter.add(vcb.make()); + + intervalMap.remove(interval); + } + + private static Set statusesToStrings(Set statuses) { + Set output = new HashSet(statuses.size()); + + for (CallableStatus status : statuses) + output.add(status.name()); + + return output; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java deleted file mode 100644 index 5620c3902..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatisticLocus.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; - -/** - * The definition of a locus for the DiagnoseTargets walker statistics calculation - * - * @author Mauricio Carneiro - * @since 2/3/12 - */ -class IntervalStatisticLocus { - private final byte[] mappingQuality; - private final byte[] baseQuality; - private final int coverage; - private final int rawCoverage; - - public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) { - this.mappingQuality = mappingQuality; - this.baseQuality = baseQuality; - this.coverage = coverage; - this.rawCoverage = rawCoverage; - } - - public IntervalStatisticLocus() { - this(new byte[1], new byte[1], 0, 0); - } - - public int getCoverage() { - return coverage; - } - - public int getRawCoverage() { - return rawCoverage; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java index 8ee5f76fb..75f56808f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/IntervalStatistics.java @@ -1,44 +1,62 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; -/** - * Short one line description of the walker. - * - * @author Mauricio Carneiro - * @since 2/1/12 - */ -class IntervalStatistics { +public class IntervalStatistics { + + private final Map samples; private final GenomeLoc interval; - private final ArrayList loci; - private final int minimumCoverageThreshold; - private final int maximumCoverageThreshold; - private final int minimumMappingQuality; - private final int minimumBaseQuality; + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) - private IntervalStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + public IntervalStatistics(Set samples, GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { this.interval = interval; - this.loci = loci; - this.minimumCoverageThreshold = minimumCoverageThreshold; - this.maximumCoverageThreshold = maximumCoverageThreshold; - this.minimumMappingQuality = minimumMappingQuality; - this.minimumBaseQuality = minimumBaseQuality; + this.samples = new HashMap(samples.size()); + for (String sample : samples) + this.samples.put(sample, new SampleStatistics(interval, minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality)); } - public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { - this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + public SampleStatistics getSample(String sample) { + return samples.get(sample); + } - // Initialize every loci (this way we don't have to worry about non-existent loci in the object - for (int i = 0; i < interval.size(); i++) - this.loci.add(i, new IntervalStatisticLocus()); + public void addLocus(AlignmentContext context) { + ReadBackedPileup pileup = context.getBasePileup(); + for (String sample : samples.keySet()) + getSample(sample).addLocus(context.getLocation(), pileup.getPileupForSample(sample)); } public long totalCoverage() { @@ -50,73 +68,27 @@ class IntervalStatistics { public double averageCoverage() { if (preComputedTotalCoverage < 0) calculateTotalCoverage(); - return (double) preComputedTotalCoverage / loci.size(); - } - - /** - * Calculates the callable status of the entire interval - * - * @return the callable status of the entire interval - */ - public CallableStatus callableStatus() { - long max = -1; - CallableStatus maxCallableStatus = null; - HashMap statusCounts = new HashMap(CallableStatus.values().length); - - // initialize the statusCounts with all callable states - for (CallableStatus key : CallableStatus.values()) - statusCounts.put(key, 0); - - // calculate the callable status for each locus - for (int i = 0; i < loci.size(); i++) { - CallableStatus status = callableStatus(i); - int count = statusCounts.get(status) + 1; - statusCounts.put(status, count); - - if (count > max) { - max = count; - maxCallableStatus = status; - } - } - - return maxCallableStatus; - } - - public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) { - if (!interval.containsP(locus)) - throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); - - int locusIndex = locus.getStart() - interval.getStart(); - - loci.add(locusIndex, locusData); - } - - /** - * returns the callable status of this locus without taking the reference base into account. - * - * @param locusIndex location in the genome to inquire (only one locus) - * @return the callable status of a locus - */ - private CallableStatus callableStatus(int locusIndex) { - if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold) - return CallableStatus.EXCESSIVE_COVERAGE; - - if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold) - return CallableStatus.CALLABLE; - - if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold) - return CallableStatus.POOR_QUALITY; - - if (loci.get(locusIndex).getRawCoverage() > 0) - return CallableStatus.LOW_COVERAGE; - - return CallableStatus.NO_COVERAGE; + return (double) preComputedTotalCoverage / interval.size(); } private void calculateTotalCoverage() { preComputedTotalCoverage = 0; - for (IntervalStatisticLocus locus : loci) - preComputedTotalCoverage += locus.getCoverage(); + for (SampleStatistics sample : samples.values()) + preComputedTotalCoverage += sample.totalCoverage(); } + /** + * Return the Callable statuses for the interval as a whole + * todo -- add a voting system for sample flags and add interval specific statuses + * + * @return the callable status(es) for the whole interval + */ + public Set callableStatuses() { + Set output = new HashSet(); + + for (SampleStatistics sample : samples.values()) + output.addAll(sample.getCallableStatuses()); + + return output; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java new file mode 100644 index 000000000..237ca1b1c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/LocusStatistics.java @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import java.util.HashSet; +import java.util.Set; + +public class LocusStatistics { + final int coverage; + final int rawCoverage; + + public LocusStatistics() { + this.coverage = 0; + this.rawCoverage = 0; + } + + public LocusStatistics(int coverage, int rawCoverage) { + this.coverage = coverage; + this.rawCoverage = rawCoverage; + } + + public int getCoverage() { + return coverage; + } + + public int getRawCoverage() { + return rawCoverage; + } + + /** + * Generates all applicable statuses from the coverages in this locus + * + * @param minimumCoverageThreshold the minimum threshold for determining low coverage/poor quality + * @param maximumCoverageThreshold the maximum threshold for determining excessive coverage + * @return a set of all statuses that apply + */ + public Set callableStatuses(int minimumCoverageThreshold, int maximumCoverageThreshold) { + Set output = new HashSet(); + + // if too much coverage + if (getCoverage() > maximumCoverageThreshold) + output.add(CallableStatus.EXCESSIVE_COVERAGE); + + // if not enough coverage + if (getCoverage() < minimumCoverageThreshold) { + // was there a lot of low Qual coverage? + if (getRawCoverage() >= minimumCoverageThreshold) + output.add(CallableStatus.POOR_QUALITY); + // no? + else { + // is there any coverage? + if (getRawCoverage() > 0) + output.add(CallableStatus.LOW_COVERAGE); + else + output.add(CallableStatus.NO_COVERAGE); + } + } + + return output; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java new file mode 100644 index 000000000..9e4993853 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/SampleStatistics.java @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics.targets; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.util.*; + +/** + * Short one line description of the walker. + * + * @author Mauricio Carneiro + * @since 2/1/12 + */ +class SampleStatistics { + private final GenomeLoc interval; + private final ArrayList loci; + + private final int minimumCoverageThreshold; + private final int maximumCoverageThreshold; + private final int minimumMappingQuality; + private final int minimumBaseQuality; + + private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet) + + private SampleStatistics(GenomeLoc interval, ArrayList loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this.interval = interval; + this.loci = loci; + this.minimumCoverageThreshold = minimumCoverageThreshold; + this.maximumCoverageThreshold = maximumCoverageThreshold; + this.minimumMappingQuality = minimumMappingQuality; + this.minimumBaseQuality = minimumBaseQuality; + } + + public SampleStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) { + this(interval, new ArrayList(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality); + + // Initialize every loci (this way we don't have to worry about non-existent loci in the object + for (int i = 0; i < interval.size(); i++) + this.loci.add(i, new LocusStatistics()); + + } + + public long totalCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return preComputedTotalCoverage; + } + + public double averageCoverage() { + if (preComputedTotalCoverage < 0) + calculateTotalCoverage(); + return (double) preComputedTotalCoverage / loci.size(); + } + + /** + * Calculates the callable statuses of the entire interval + * + * @return the callable statuses of the entire interval + */ + public Set getCallableStatuses() { + + Map totals = new HashMap(CallableStatus.values().length); + + // initialize map + for (CallableStatus status : CallableStatus.values()) + totals.put(status, 0); + + // sum up all the callable statuses for each locus + for (int i = 0; i < interval.size(); i++) { + for (CallableStatus status : callableStatus(i)) { + int count = totals.get(status); + + totals.put(status, count + 1); + } + } + + + Set output = new HashSet(); + + // double to avoid type casting + double intervalSize = interval.size(); + + double coverageStatusThreshold = 0.20; + if ((totals.get(CallableStatus.NO_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.NO_COVERAGE); + + if ((totals.get(CallableStatus.LOW_COVERAGE) / intervalSize) > coverageStatusThreshold) + output.add(CallableStatus.LOW_COVERAGE); + + double excessiveCoverageThreshold = 0.20; + if ((totals.get(CallableStatus.EXCESSIVE_COVERAGE) / intervalSize) > excessiveCoverageThreshold) + output.add(CallableStatus.EXCESSIVE_COVERAGE); + + double qualityStatusThreshold = 0.50; + if ((totals.get(CallableStatus.POOR_QUALITY) / intervalSize) > qualityStatusThreshold) + output.add(CallableStatus.POOR_QUALITY); + + if (totals.get(CallableStatus.REF_N) > 0) + output.add(CallableStatus.REF_N); + + if (output.isEmpty()) { + output.add(CallableStatus.PASS); + } + return output; + } + + /** + * Adds a locus to the interval wide stats + * + * @param locus The locus given as a GenomeLoc + * @param pileup The pileup of that locus + */ + public void addLocus(GenomeLoc locus, ReadBackedPileup pileup) { + if (!interval.containsP(locus)) + throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus)); + + // a null pileup means there nothing ot add + if (pileup != null) { + + int locusIndex = locus.getStart() - interval.getStart(); + + int rawCoverage = pileup.depthOfCoverage(); + int coverage = pileup.getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage(); + + LocusStatistics locusData = new LocusStatistics(coverage, rawCoverage); + + loci.add(locusIndex, locusData); + } + } + + /** + * returns the callable status of this locus without taking the reference base into account. + * + * @param locusIndex location in the genome to inquire (only one locus) + * @return the callable status of a locus + */ + private Set callableStatus(int locusIndex) { + LocusStatistics locus = loci.get(locusIndex); + + return locus.callableStatuses(minimumCoverageThreshold, maximumCoverageThreshold); + } + + + private void calculateTotalCoverage() { + preComputedTotalCoverage = 0; + for (LocusStatistics locus : loci) + preComputedTotalCoverage += locus.getCoverage(); + } + +} From cd9bf1bfc35a9cf18aa4c9b4e521bc09f8c22dd2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 10 Apr 2012 00:22:40 -0400 Subject: [PATCH 07/12] Changing IndelSummary eval module so that PostCallingQC.scala can run with MIXED-record VCFs. --- .../gatk/walkers/varianteval/evaluators/IndelSummary.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java index 786b7296b..c22f82969 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelSummary.java @@ -207,7 +207,9 @@ public class IndelSummary extends VariantEvaluator implements StandardEval { break; default: - throw new UserException.BadInput("Unexpected variant context type: " + eval); + // TODO - MIXED, SYMBOLIC, and MNP records are skipped over + //throw new UserException.BadInput("Unexpected variant context type: " + eval); + break; } return; From 8507cd7440f3aa58b9f47bbc8a2d434eeb5de9d3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Apr 2012 07:22:43 -0400 Subject: [PATCH 08/12] Throw UserException for bad dict / chain files --- .../sting/gatk/walkers/variantutils/LiftoverVariants.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 50fafa202..43aa273c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -24,7 +24,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; -import net.sf.picard.PicardException; import net.sf.picard.liftover.LiftOver; import net.sf.picard.util.Interval; import net.sf.samtools.SAMFileHeader; @@ -73,7 +72,7 @@ public class LiftoverVariants extends RodWalker { public void initialize() { try { liftOver = new LiftOver(CHAIN); - } catch (PicardException e) { + } catch (RuntimeException e) { throw new UserException.BadInput("there is a problem with the chain file you are using: " + e.getMessage()); } @@ -82,7 +81,7 @@ public class LiftoverVariants extends RodWalker { try { final SAMFileHeader toHeader = new SAMFileReader(NEW_SEQ_DICT).getFileHeader(); liftOver.validateToSequences(toHeader.getSequenceDictionary()); - } catch (PicardException e) { + } catch (RuntimeException e) { throw new UserException.BadInput("the chain file you are using is not compatible with the reference you are trying to lift over to; please use the appropriate chain file for the given reference"); } From 6885e2d065d32dfa1dc5786560b83aeed6982969 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 10 Apr 2012 07:37:42 -0400 Subject: [PATCH 09/12] UserException fixes for GATK_logs recent errors -- SamFileReader.java:525 -- BlockCompressedInputStream:376 These were both instances were we weren't catching and rethrowing picard exceptions as UserExceptions. --- .../gatk/datasources/reads/SAMDataSource.java | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index bf7afe4f0..bbcbe6dbc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -51,11 +51,10 @@ import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; -import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; -import java.util.concurrent.*; +import java.util.concurrent.Callable; /** * User: aaron @@ -474,7 +473,6 @@ public class SAMDataSource { /** * Fill the given buffering shard with reads. * @param shard Shard to fill. - * @return true if at the end of the stream. False otherwise. */ public void fillShard(Shard shard) { if(!shard.buffersReads()) @@ -569,15 +567,19 @@ public class SAMDataSource { if(shard.getFileSpans().get(id) == null) throw new ReviewedStingException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); - BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); - codec.setInputStream(inputStream); - iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); - } - else { - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + try { + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); + } + else { + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + } + } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes + throw new UserException.MalformedBAM(id.samFile, e.getMessage()); } iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); @@ -932,10 +934,7 @@ public class SAMDataSource { blockInputStream = new BlockInputStream(dispatcher,readerID,false); reader = new SAMFileReader(readerID.samFile,indexFile,false); } catch ( RuntimeIOException e ) { - if ( e.getCause() != null && e.getCause() instanceof FileNotFoundException ) - throw new UserException.CouldNotReadInputFile(readerID.samFile, e); - else - throw e; + throw new UserException.CouldNotReadInputFile(readerID.samFile, e); } catch ( SAMFormatException e ) { throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); } From 10e74a71ebb101cd46827861d403bf68f73266fd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 10 Apr 2012 12:30:35 -0400 Subject: [PATCH 12/12] We now allow arbitrary annotations other than dbSNP (e.g. HM3) to come out of the Unified Genotyper. This was already set up in the Variant Annotator Engine and was just a matter of hooking UG up to it. Added integration test to ensure correct behavior. --- .../gatk/walkers/genotyper/UnifiedGenotyper.java | 13 ++++++++++++- .../genotyper/UnifiedGenotyperIntegrationTest.java | 8 ++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 8df501e1b..79ec08558 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -127,8 +127,19 @@ public class UnifiedGenotyper extends LocusWalker, Unif @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + /** + * If a call overlaps with a record from the provided comp track, the INFO field will be annotated + * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field). + * Records that are filtered in the comp track will be ignored. + * Note that 'dbSNP' has been special-cased (see the --dbsnp argument). + */ + @Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false) + public List> comps = Collections.emptyList(); + public List> getCompRodBindings() { return comps; } + + // The following are not used by the Unified Genotyper public RodBinding getSnpEffRodBinding() { return null; } - public List> getCompRodBindings() { return Collections.emptyList(); } public List> getResourceRodBindings() { return Collections.emptyList(); } public boolean alwaysAppendDbsnpId() { return false; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 78167e7e9..e7c10a623 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -142,6 +142,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { executeTest("test SLOD", spec); } + @Test + public void testCompTrack() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("71251d8893649ea9abd5d9aa65739ba1")); + executeTest("test using comp track", spec); + } + @Test public void testOutputParameter() { HashMap e = new HashMap();