From b7b57ef39a313ca2c743a96f92d5951b8e2b390a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 19 Nov 2011 15:57:33 -0500 Subject: [PATCH] Updating MD5 to reflect canonical ordering of calculation -- We should no longer have md5s changing because of hashmaps changing their sort order on us -- Added GenotypeLikelihoodsUnitTests -- Refactored ExactAFCaclculation to put the PL -> QUAL calculation in the GenotypeLikelihoods class to avoid the code copy. --- .../genotyper/ExactAFCalculationModel.java | 31 ++--------- .../variantcontext/GenotypeLikelihoods.java | 30 ++++++----- ...tTest.java => GenotypePriorsUnitTest.java} | 2 +- .../UnifiedGenotyperIntegrationTest.java | 51 ++++++++++--------- .../GenotypeLikelihoodsUnitTest.java | 16 +++++- 5 files changed, 63 insertions(+), 67 deletions(-) rename public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{GenotypeLikelihoodsUnitTest.java => GenotypePriorsUnitTest.java} (98%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index e071de15b..354702dad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -29,10 +29,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; @@ -354,11 +351,10 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // and will add no-call genotype to GL's in a second pass ArrayList myAlleles = new ArrayList(); - double qual = Double.NEGATIVE_INFINITY; double[] likelihoods = g.getLikelihoods().getAsVector(); if (SIMPLE_GREEDY_GENOTYPER || !vc.isBiallelic()) { - bestGTguess = Utils.findIndexOfMaxEntry(g.getLikelihoods().getAsVector()); + bestGTguess = Utils.findIndexOfMaxEntry(likelihoods); } else { int newIdx = tracebackArray[k][startIdx];; @@ -366,20 +362,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { startIdx = newIdx; } - /* System.out.format("Sample: %s GL:",sample); - for (int i=0; i < likelihoods.length; i++) - System.out.format("%1.4f, ",likelihoods[i]); - */ - - for (int i=0; i < likelihoods.length; i++) { - if (i==bestGTguess) - continue; - if (likelihoods[i] >= qual) - qual = likelihoods[i]; - } - // qual contains now max(likelihoods[k]) for all k != bestGTguess - qual = likelihoods[bestGTguess] - qual; - // likelihoods are stored row-wise in lower triangular matrix. IE // for 2 alleles they have ordering AA,AB,BB // for 3 alleles they are ordered AA,AB,BB,AC,BC,CC @@ -407,16 +389,9 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { break; } - if (qual < 0) { - // QUAL can be negative if the chosen genotype is not the most likely one individually. - // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen on - double[] normalized = MathUtils.normalizeFromLog10(likelihoods); - double chosenGenotype = normalized[bestGTguess]; - qual = -1.0 * Math.log10(1.0 - chosenGenotype); - } + final double qual = GenotypeLikelihoods.getQualFromLikelihoods(bestGTguess, likelihoods); //System.out.println(myAlleles.toString()); calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false)); - } for ( final Genotype genotype : GLs.iterateInSampleNameOrder() ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java index bbe5308a9..a5e4e5774 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoods.java @@ -117,28 +117,34 @@ public class GenotypeLikelihoods { //Return the neg log10 Genotype Quality (GQ) for the given genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype public double getLog10GQ(Genotype.Type genotype){ - EnumMap likelihoods = getAsMap(false); + return getQualFromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); + } + + public static double getQualFromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ if(likelihoods == null) return Double.NEGATIVE_INFINITY; double qual = Double.NEGATIVE_INFINITY; - for(Map.Entry likelihood : likelihoods.entrySet()){ - if(likelihood.getKey() == genotype) + for (int i=0; i < likelihoods.length; i++) { + if (i==iOfChoosenGenotype) continue; - if(likelihood.getValue() > qual) - qual = likelihood.getValue(); + if (likelihoods[i] >= qual) + qual = likelihoods[i]; } - //Quality of the most likely genotype = likelihood(most likely) - likelihood (2nd best) - qual = likelihoods.get(genotype) - qual; + // qual contains now max(likelihoods[k]) for all k != bestGTguess + qual = likelihoods[iOfChoosenGenotype] - qual; - //Quality of other genotypes 1-P(G) if (qual < 0) { - double[] normalized = MathUtils.normalizeFromLog10(getAsVector()); - double chosenGenotype = normalized[genotype.ordinal()-1]; - qual = Math.log10(1.0 - chosenGenotype); + // QUAL can be negative if the chosen genotype is not the most likely one individually. + // In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one + double[] normalized = MathUtils.normalizeFromLog10(likelihoods); + double chosenGenotype = normalized[iOfChoosenGenotype]; + return Math.log10(1.0 - chosenGenotype); + } else { + // invert the size, as this is the probability of making an error + return -1 * qual; } - return -1 * qual; } private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java similarity index 98% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java index 425b969e2..a87f121f6 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypePriorsUnitTest.java @@ -7,7 +7,7 @@ import org.testng.annotations.Test; import static java.lang.Math.log10; -public class GenotypeLikelihoodsUnitTest extends BaseTest { +public class GenotypePriorsUnitTest extends BaseTest { private final static double DELTA = 1e-8; @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 6d4a971a5..fc234ec24 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -29,7 +29,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("c93def488de12fb3b8199e001b7a24a8")); + Arrays.asList("f5c8bd653aed02059b9f377833eae5bb")); executeTest("test MultiSample Pilot1", spec); } @@ -37,7 +37,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("cef4bd72cbbe72f28e9c72e2818b4708")); + Arrays.asList("ea5b5dcea3a6eef7ec60070b551c994e")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -45,7 +45,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("9834f0cef1cd6ba4943a5aaee1ee8be8")); + Arrays.asList("030ce4feb4bbcf700caba82a45cc45f2")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -53,7 +53,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("6762b72ae60155ad71738d7c76b80e4b")); + Arrays.asList("3ccce5d909f8f128e496f6841836e5f7")); executeTest("test SingleSample Pilot2", spec); } @@ -63,7 +63,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "bc71dba7bbdb23e7d5cc60461fdd897b"; + private final static String COMPRESSED_OUTPUT_MD5 = "890143b366050e78d6c6ba6b2c6b6864"; @Test public void testCompressedOutput() { @@ -84,7 +84,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "b9504e446b9313559c3ed97add7e8dc1"; + String md5 = "95614280c565ad90f8c000376fef822c"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -115,8 +115,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "bb3f294eab3e2cf52c70e63b23aac5ee" ); - e.put( "--computeSLOD", "eb34979efaadba1e34bd82bcacf5c722" ); + e.put( "--min_base_quality_score 26", "7acb1a5aee5fdadb0cc0ea07a212efc6" ); + e.put( "--computeSLOD", "e9d23a08472e4e27b4f25e844f5bad57" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -129,9 +129,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameter() { HashMap e = new HashMap(); - e.put( "-sites_only", "d40114aa201aa33ff5f174f15b6b73af" ); - e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "3c681b053fd2280f3c42041d24243752" ); - e.put( "--output_mode EMIT_ALL_SITES", "eafa6d71c5ecd64dfee5d7a3f60e392e" ); + e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" ); + e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" ); + e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -145,12 +145,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("c71ca370947739cb7d87b59452be7a07")); + Arrays.asList("902327e8a45fe585c8dfd1a7c4fcf60f")); executeTest("test confidence 1", spec1); + } + @Test + public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("1c0a599d475cc7d5e745df6e9b6c0d29")); + Arrays.asList("2343ac8113791f4e79643b333b34afc8")); executeTest("test confidence 2", spec2); } @@ -162,8 +165,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "f84da90c310367bd51f2ab6e346fa3d8" ); - e.put( 1.0 / 1850, "5791e7fef40d4412b6d8f84e0a809c6c" ); + e.put( 0.01, "46243ecc2b9dc716f48ea280c9bb7e72" ); + e.put( 1.0 / 1850, "6b2a59dbc76984db6d4d6d6b5ee5d62c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -187,7 +190,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("9cc9538ac83770e12bd0830d285bfbd0")); + Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); executeTest(String.format("test multiple technologies"), spec); } @@ -206,7 +209,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("eaf8043edb46dfbe9f97ae03baa797ed")); + Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); executeTest(String.format("test calling with BAQ"), spec); } @@ -225,7 +228,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); + Arrays.asList("a64d2e65b5927260e4ce0d948760cc5c")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -240,7 +243,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); + Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -253,7 +256,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); + Arrays.asList("69107157632714150fc068d412e31939")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -263,7 +266,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("81a1035e59cd883e413e62d34265c1a2")); + Arrays.asList("4ffda07590e06d58ed867ae326d74b2d")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -273,7 +276,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("102b7d915f21dff0a9b6ea64c4c7d409")); + Arrays.asList("6e182a58472ea17c8b0eb01f80562fbd")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -283,7 +286,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("5900344f97bbac35d147a0a7c2bf1d0c")); + Arrays.asList("f93f8a35b47bcf96594ada55e2312c73")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -292,7 +295,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("45e7bf21cd6358921626404e7ae76c69")); + Arrays.asList("1e02f57fafaa41db71c531eb25e148e1")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java index 70a18856f..a66c78f3c 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -105,8 +105,8 @@ public class GenotypeLikelihoodsUnitTest { double[] test = MathUtils.normalizeFromLog10(gl.getAsVector()); //GQ for the other genotypes - Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); - Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), -1 * Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_REF), Math.log10(1.0 - test[Genotype.Type.HOM_REF.ordinal()-1])); + Assert.assertEquals(gl.getLog10GQ(Genotype.Type.HOM_VAR), Math.log10(1.0 - test[Genotype.Type.HOM_VAR.ordinal()-1])); //Test missing likelihoods gl = new GenotypeLikelihoods("."); @@ -116,6 +116,18 @@ public class GenotypeLikelihoodsUnitTest { } + @Test + public void testgetQualFromLikelihoods(){ + double[] likelihoods = new double[]{-1, 0, -2}; + // qual values we expect for each possible "best" genotype + double[] expectedQuals = new double[]{-0.04100161, -1, -0.003930294}; + + for ( int i = 0; i < likelihoods.length; i++ ) { + Assert.assertEquals(GenotypeLikelihoods.getQualFromLikelihoods(i, likelihoods), expectedQuals[i], 1e-6, + "GQ value for genotype " + i + " was not calculated correctly"); + } + } + private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { Assert.assertEquals(v1.length, v2.length); for ( int i = 0; i < v1.length; i++ ) {