From 11d950abe0e4ebfd342a676fd17e7606327d936e Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 3 Nov 2009 16:18:51 +0000 Subject: [PATCH] No longer allow the lod_threshold argument - use confidence instead. Have UG output qscores in all cases. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1968 348d0f76-0448-11de-a6fe-93d51630548a --- .../genotyper/EMGenotypeCalculationModel.java | 32 +++++++++++++++---- .../genotyper/GenotypeCalculationModel.java | 2 -- ...PointEstimateGenotypeCalculationModel.java | 27 +++++++++++----- .../genotyper/UnifiedArgumentCollection.java | 6 ++-- .../walkers/genotyper/UnifiedGenotyper.java | 8 +++++ .../UnifiedGenotyperIntegrationTest.java | 22 ++++++------- 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 3086665cb..b02c58945 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -26,28 +26,46 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode // run the EM calculation EMOutput overall = runEM(ref, contexts, priors, StratifiedContext.OVERALL); - double lod = overall.getPofD() - overall.getPofNull(); - logger.debug(String.format("LOD=%f", lod)); - // return a null call if we don't pass the lod cutoff - if ( !ALL_BASE_MODE && lod < LOD_THRESHOLD ) - return new Pair, GenotypeLocusData>(null, null); + double PofD = Math.pow(10, overall.getPofD()); + double PofNull = Math.pow(10, overall.getPofNull()); + double sum = PofD + PofNull; + + // calculate the phred-scaled confidence score + double phredScaledConfidence; + boolean bestIsRef = false; + if ( PofD > PofNull ) { + phredScaledConfidence = -10.0 * Math.log10(PofNull / sum); + } else { + phredScaledConfidence = -10.0 * Math.log10(PofD / sum); + bestIsRef = true; + } + + // are we above the lod threshold for emitting calls (and not in all-bases mode)? + if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) { + return new Pair, GenotypeLocusData>(null, null); + } // generate the calls GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation()); if ( locusdata != null ) { if ( locusdata instanceof ConfidenceBacked ) { - ((ConfidenceBacked)locusdata).setConfidence(lod); + ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); } if ( locusdata instanceof SLODBacked ) { + // calculate strand score + + double lod = overall.getPofD() - overall.getPofNull(); + logger.debug(String.format("LOD=%f", lod)); + EMOutput forward = runEM(ref, contexts, priors, StratifiedContext.FORWARD); EMOutput reverse = runEM(ref, contexts, priors, StratifiedContext.REVERSE); double forwardLod = (forward.getPofD() + reverse.getPofNull()) - overall.getPofNull(); double reverseLod = (reverse.getPofD() + forward.getPofNull()) - overall.getPofNull(); logger.debug("forward lod=" + forwardLod + ", reverse lod=" + reverseLod); double strandScore = Math.max(forwardLod - lod, reverseLod - lod); - logger.debug(String.format("SLOD=%f", lod, strandScore)); + logger.debug(String.format("SLOD=%f", strandScore)); ((SLODBacked)locusdata).setSLOD(strandScore); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java index ca9452383..e7d5120bb 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModel.java @@ -35,7 +35,6 @@ public abstract class GenotypeCalculationModel implements Cloneable { protected boolean GENOTYPE_MODE; protected boolean POOLED_INPUT; protected int POOL_SIZE; - protected double LOD_THRESHOLD; protected double CONFIDENCE_THRESHOLD; protected double MINIMUM_ALLELE_FREQUENCY; protected int maxDeletionsInPileup; @@ -68,7 +67,6 @@ public abstract class GenotypeCalculationModel implements Cloneable { GENOTYPE_MODE = UAC.GENOTYPE; POOLED_INPUT = UAC.POOLED; POOL_SIZE = UAC.POOLSIZE; - LOD_THRESHOLD = UAC.LOD_THRESHOLD; CONFIDENCE_THRESHOLD = UAC.CONFIDENCE_THRESHOLD; MINIMUM_ALLELE_FREQUENCY = UAC.MINIMUM_ALLELE_FREQUENCY; maxDeletionsInPileup = UAC.MAX_DELETIONS; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index b8f90aa8b..c14615b1e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -45,16 +45,27 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation // get the genotype likelihoods Pair discoveryGL = getSingleSampleLikelihoods(ref, sampleContext, priors, StratifiedContext.OVERALL); - // calculate the lod threshold - double[] posteriors = discoveryGL.second.getPosteriors(); + // find the index of the best genotype + double[] posteriors = discoveryGL.second.getNormalizedPosteriors(); Integer sortedPosteriors[] = Utils.SortPermutation(posteriors); - double bestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 1]]; - double nextBestGenotype = posteriors[sortedPosteriors[sortedPosteriors.length - 2]]; - double refGenotype = posteriors[DiploidGenotype.createHomGenotype(ref).ordinal()]; - double lodConfidence = (GENOTYPE_MODE ? (bestGenotype - nextBestGenotype) : (bestGenotype - refGenotype)); + int bestIndex = sortedPosteriors[sortedPosteriors.length - 1]; + + // flag to determine if ref is the best call (not necessary in genotype mode) + boolean bestIsRef = false; + + // calculate the phred-scaled confidence score + double phredScaledConfidence; + if ( GENOTYPE_MODE ) { + phredScaledConfidence = -10.0 * Math.log10(1.0 - posteriors[bestIndex]); + } else { + int refIndex = DiploidGenotype.createHomGenotype(ref).ordinal(); + bestIsRef = (refIndex == bestIndex); + double pError = (bestIsRef ? 1.0 - posteriors[refIndex] : posteriors[refIndex]); + phredScaledConfidence = -10.0 * Math.log10(pError); + } // are we above the lod threshold for emitting calls (and not in all-bases mode)? - if ( !ALL_BASE_MODE && lodConfidence < LOD_THRESHOLD ) { + if ( !ALL_BASE_MODE && (bestIsRef || phredScaledConfidence < CONFIDENCE_THRESHOLD) ) { return new Pair, GenotypeLocusData>(null, null); } @@ -77,7 +88,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation GenotypeLocusData locusdata = GenotypeWriterFactory.createSupportedGenotypeLocusData(OUTPUT_FORMAT, ref, context.getLocation()); if ( locusdata != null ) { if ( locusdata instanceof ConfidenceBacked ) { - ((ConfidenceBacked)locusdata).setConfidence(lodConfidence); + ((ConfidenceBacked)locusdata).setConfidence(phredScaledConfidence); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 1cea4843b..406a2c3ac 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -71,11 +71,11 @@ public class UnifiedArgumentCollection { // control the various parameters to be used - @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold by which variants should be filtered (for EM_POINT_ESTIMATE model)", required = false) + @Argument(fullName = "lod_threshold", shortName = "lod", doc = "[DEPRECATED] The lod threshold by which variants should be filtered", required = false) public double LOD_THRESHOLD = Double.MIN_VALUE; - @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered (for JOINT_ESTIMATE model)", required = false) - public double CONFIDENCE_THRESHOLD = Double.MIN_VALUE; + @Argument(fullName = "min_confidence_threshold", shortName = "confidence", doc = "The phred-scaled confidence threshold by which variants should be filtered", required = false) + public double CONFIDENCE_THRESHOLD = 0.0; @Argument(fullName = "max_deletions", shortName = "deletions", doc = "Maximum reads with deletions spanning this locus for it to be callable [default:1]", required = false) public Integer MAX_DELETIONS = 1; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 47003fc12..8b7d7ef06 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -100,9 +100,17 @@ public class UnifiedGenotyper extends LocusWalker, GenotypeL **/ public void initialize() { + // deal with input errors if ( UAC.POOLED && UAC.genotypeModel == GenotypeCalculationModel.Model.EM_POINT_ESTIMATE ) { throw new StingException("This was an attempt to use an EM Point Estimate model with pooled genotype calculations. This model does not work with pooled data."); } + if ( UAC.LOD_THRESHOLD > Double.MIN_VALUE ) { + StringBuilder sb = new StringBuilder(); + sb.append("\n***\tThe --lod_threshold argument is no longer supported; instead, please use --min_confidence_threshold."); + sb.append("\n***\tThere is approximately a 10-to-1 mapping from confidence to LOD."); + sb.append("\n***\tUse Q" + (10.0 * UAC.LOD_THRESHOLD) + " as an approximate equivalent to your LOD " + UAC.LOD_THRESHOLD + " cutoff"); + throw new StingException(sb.toString()); + } // get all of the unique sample names samples = new HashSet(); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7e1c44ab0..9ad9adf09 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -13,7 +13,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { } public static String testGeliLod5() { - return baseTestString() + " --variant_output_format GELI -lod 5"; + return baseTestString() + " --variant_output_format GELI -confidence 50"; } private static String OneMb1StateMD5 = "7e3fc1d8427329eb2a3e05a81011749a"; @@ -42,16 +42,16 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1, - Arrays.asList("dba7f51b0ddb1afebcd8f229444f70e5")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, + Arrays.asList("9985a92060b512b5d27b4074faf8b60b")); executeTest("testMultiSamplePilot1", spec); } @Test public void testMultiSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -lod 5", 1, - Arrays.asList("c2a7dd9bec6819b2d7702564955c993b")); + "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 50", 1, + Arrays.asList("394f009c9bad34eb584fa10d133d79e0")); executeTest("testMultiSamplePilot2", spec); } @@ -108,7 +108,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void genotypeTest() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( testGeliLod5() + " -L 1:10,000,000-10,100,000 -bm empirical --genotype", 1, - Arrays.asList("6a930a8f115495f5319541c21a892d8e")); + Arrays.asList("45da29e3b1306d546a7b80c30c979ad4")); executeTest("genotypeTest", spec); } @@ -167,12 +167,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testLOD() { HashMap e = new HashMap(); - e.put( 10.0, "94c5b48c0c956fcdacbffaa38a80d926" ); - e.put( 3.0, "f1f53b944b821f05d5dd1ad8fda2f57b" ); + e.put( 100.0, "94c5b48c0c956fcdacbffaa38a80d926" ); + e.put( 30.0, "df455abc1b2bc533aa1dc6eb088a835a" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -lod " + entry.getKey(), 1, + baseTestString() + " --variant_output_format GELI -L 1:10,000,000-11,000,000 -bm EMPIRICAL -confidence " + entry.getKey(), 1, Arrays.asList(entry.getValue())); executeTest("testLOD", spec); } @@ -186,7 +186,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "953fe510a81e357d0b56feac2a90095d" ); + e.put( 0.01, "b8837be7e8beb3ab2ed7150cdc022c65" ); e.put( 0.0001, "ef0f2af7d13f166829d86b15fabc2b81" ); e.put( 1.0 / 1850, "a435c8c966c11f4393a25a9d01c4fc3d" ); @@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void empirical1MbTestBinaryGeli() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -lod 5", 1, + baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -confidence 50", 1, Arrays.asList("915abf82a04fcd1842f6865501bae67c")); executeTest("empirical1MbTestBinaryGeli", spec); }