From b5e148140b4731df53130ded3232f2502ce956ae Mon Sep 17 00:00:00 2001 From: ebanks Date: Tue, 5 Oct 2010 18:35:36 +0000 Subject: [PATCH] Officially fixed the UG priors; updated the default min MQ/BQs to pipeline values of q20 and min calling threshold to Q50 git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4431 348d0f76-0448-11de-a6fe-93d51630548a --- .../DiploidGenotypeCalculationModel.java | 4 +- .../GenotypeCalculationModelFactory.java | 2 +- ...JointEstimateGenotypeCalculationModel.java | 58 +++++-------------- .../genotyper/UnifiedArgumentCollection.java | 12 ++-- .../genotyper/UnifiedArgumentCollection.java | 12 ++-- .../UnifiedGenotyperIntegrationTest.java | 32 +++++----- 6 files changed, 46 insertions(+), 74 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java index 46a935f78..80a936b71 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java @@ -39,7 +39,9 @@ import java.util.*; public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalculationModel { - protected DiploidGenotypeCalculationModel() {} + protected DiploidGenotypeCalculationModel(int N, double heterozygosity) { + super(N, heterozygosity); + } // the GenotypeLikelihoods map private HashMap GLs = new HashMap(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java index 79fef9aac..cf20c0e9e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeCalculationModelFactory.java @@ -55,7 +55,7 @@ public class GenotypeCalculationModelFactory { GenotypeCalculationModel gcm; switch ( UAC.genotypeModel ) { case JOINT_ESTIMATE: - gcm = new DiploidGenotypeCalculationModel(); + gcm = new DiploidGenotypeCalculationModel(samples.size(), UAC.heterozygosity); break; case DINDEL: throw new UnsupportedOperationException("The Dindel-based genotype likelihoods model is not currently supported"); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index d52fb2ff5..c6d344c9b 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -19,15 +19,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc protected static final double VALUE_NOT_CALCULATED = -1.0 * Double.MAX_VALUE; private int minAlleleFrequencyToTest; - // because the null allele frequencies are constant for a given N, - // we cache the results to avoid having to recompute everything - private HashMap nullAlleleFrequencyCache = new HashMap(); - - // because the Hardy-Weinberg values for a given frequency are constant, - // we cache the results to avoid having to recompute everything - // private HashMap hardyWeinbergValueCache = new HashMap(); - - // the allele frequency priors + // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything protected double[] log10AlleleFrequencyPriors; // the allele frequency posteriors and P(f>0) for each alternate allele @@ -45,8 +37,9 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc protected static final Set filter = new HashSet(1); - protected JointEstimateGenotypeCalculationModel() { + protected JointEstimateGenotypeCalculationModel(int N, double heterozygosity) { filter.add(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME); + computeAlleleFrequencyPriors(N, heterozygosity); } public VariantCallContext callExtendedLocus(RefMetaDataTracker tracker, byte[] ref, GenomeLoc loc, Map stratifiedContexts) { @@ -79,8 +72,6 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc } // calculate likelihoods if there are non-ref bases - initializeAlleleFrequencies(frequencyEstimationPoints); - initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); calculatePofFs(ref, frequencyEstimationPoints); @@ -142,41 +133,20 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc return; } - protected void initializeAlleleFrequencies(int frequencyEstimationPoints) { - // set up the allele frequency priors - log10AlleleFrequencyPriors = getNullAlleleFrequencyPriors(frequencyEstimationPoints); - } + protected void computeAlleleFrequencyPriors(int N, double heterozygosity) { + int AFs = (2 * N) + 1; + log10AlleleFrequencyPriors = new double[AFs]; - protected double[] getNullAlleleFrequencyPriors(int N) { - double[] AFs = nullAlleleFrequencyCache.get(N); - - // if it hasn't been calculated yet, do so now - if ( AFs == null ) { - - // calculate sum(1/i) - double sigma_1_over_I = 0.0; - for (int i = 1; i < N; i++) - sigma_1_over_I += 1.0 / (double)i; - - // delta = theta / sum(1/i) - double delta = UAC.heterozygosity / sigma_1_over_I; - - // calculate the null allele frequencies for 1-N - AFs = new double[N]; - double sum = 0.0; - for (int i = 1; i < N; i++) { - double value = delta / (double)i; - AFs[i] = Math.log10(value); - sum += value; - } - - // null frequency for AF=0 is (1 - sum(all other frequencies)) - AFs[0] = Math.log10(1.0 - sum); - - nullAlleleFrequencyCache.put(N, AFs); + // calculate the allele frequency priors for 1-N + double sum = 0.0; + for (int i = 1; i < AFs; i++) { + double value = heterozygosity / (double)i; + log10AlleleFrequencyPriors[i] = Math.log10(value); + sum += value; } - return AFs; + // null frequency for AF=0 is (1 - sum(all other frequencies)) + log10AlleleFrequencyPriors[0] = Math.log10(1.0 - sum); } private void estimateReferenceConfidence(VariantCallContext vcc, Map contexts, double theta, boolean ignoreCoveredSamples) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e82a34caf..ec33827b4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -49,16 +49,16 @@ public class UnifiedArgumentCollection { public boolean ALL_BASES_MODE = false; @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + public double STANDARD_CONFIDENCE_FOR_CALLING = 50.0; @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + public double STANDARD_CONFIDENCE_FOR_EMITTING = 50.0; @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) - public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; + public double TRIGGER_CONFIDENCE_FOR_CALLING = 50.0; @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; + public double TRIGGER_CONFIDENCE_FOR_EMITTING = 50.0; @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; @@ -76,10 +76,10 @@ public class UnifiedArgumentCollection { // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public int MIN_BASE_QUALTY_SCORE = 10; + public int MIN_BASE_QUALTY_SCORE = 20; @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public int MIN_MAPPING_QUALTY_SCORE = 10; + public int MIN_MAPPING_QUALTY_SCORE = 20; @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) public int MAX_MISMATCHES = 3; diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java index abac31b5e..1438b0a8c 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -52,16 +52,16 @@ public class UnifiedArgumentCollection { public boolean ALL_BASES_MODE = false; @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + public double STANDARD_CONFIDENCE_FOR_CALLING = 50.0; @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + public double STANDARD_CONFIDENCE_FOR_EMITTING = 50.0; @Argument(fullName = "trigger_min_confidence_threshold_for_calling", shortName = "trig_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be called", required = false) - public double TRIGGER_CONFIDENCE_FOR_CALLING = 30.0; + public double TRIGGER_CONFIDENCE_FOR_CALLING = 50.0; @Argument(fullName = "trigger_min_confidence_threshold_for_emitting", shortName = "trig_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false) - public double TRIGGER_CONFIDENCE_FOR_EMITTING = 30.0; + public double TRIGGER_CONFIDENCE_FOR_EMITTING = 50.0; @Argument(fullName = "noSLOD", shortName = "nsl", doc = "If provided, we will not calculate the SLOD", required = false) public boolean NO_SLOD = false; @@ -79,10 +79,10 @@ public class UnifiedArgumentCollection { // control the various parameters to be used @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public int MIN_BASE_QUALTY_SCORE = 10; + public int MIN_BASE_QUALTY_SCORE = 20; @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public int MIN_MAPPING_QUALTY_SCORE = 10; + public int MIN_MAPPING_QUALTY_SCORE = 20; @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) public int MAX_MISMATCHES = 3; diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7a9fb7112..ea59b9eb8 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -25,7 +25,7 @@ public class public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("42f589f8743ec16e72a4697c728502ed")); + Arrays.asList("9870dd089d0f76c8dd3e74bc9f2ba0f0")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -33,7 +33,7 @@ public class public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,050,000", 1, - Arrays.asList("32ba9f34185a0aec3107efafe5130556")); + Arrays.asList("eeed5d52f4bdeed2c73ad5d202cbba26")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -41,7 +41,7 @@ public class public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("3d9de73b764a55deac6a956c56c46373")); + Arrays.asList("c34a080e6799ef84f73c6e8d16f7d6ea")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -53,7 +53,7 @@ public class @Test public void testParallelization() { - String md5 = "6fdddf70e8320e04dba50a9ed0f26854"; + String md5 = "ce616a641a2e00d29510e99c6606d151"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -80,11 +80,11 @@ public class @Test public void testParameter() { HashMap e = new HashMap(); - e.put( "-genotype", "8428439ee41ea0e024fcfe1c267b5e2d" ); - e.put( "-all_bases", "b8fd0a213362743b174bd2aeba7d0f8c" ); - e.put( "--min_base_quality_score 26", "be88f3992f7da095e788ff372bf94190" ); - e.put( "--min_mapping_quality_score 26", "266e9f95bd577a3b8ebdb5d66925a74d" ); - e.put( "--max_mismatches_in_40bp_window 5", "61d7de457fd591344a7f8561aeb816a0" ); + e.put( "-genotype", "0bcea27abc956352d18ae2de23dcaa9c" ); + e.put( "-all_bases", "aab0acf3e7f494a57cb2ad673ae8d32e" ); + e.put( "--min_base_quality_score 26", "48837ae22a75002d448806f6b46dd3b3" ); + e.put( "--min_mapping_quality_score 26", "d388e847b655abf56a7fd945bdff5dee" ); + e.put( "--max_mismatches_in_40bp_window 5", "416ba33cf1cfc0b4b4a2ad3256c8b56b" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -98,12 +98,12 @@ public class public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("758555ebda5a145e21edd3cba5193817")); + Arrays.asList("aedfce6b54d3138e0459f62421185760")); executeTest("testConfidence1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("291e2b7d0f8c3b262306c18d61782711")); + Arrays.asList("b442c0fd6e0d3365e8bc2d9149eeb6f5")); executeTest("testConfidence2", spec2); } @@ -115,8 +115,8 @@ public class @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "61a54794bc262479a981b6eb83ce6243" ); - e.put( 1.0 / 1850, "b34d30518143ea407e6570eff101f708" ); + e.put( 0.01, "2588b41cd3debfc2d58fdf6b463bcc07" ); + e.put( 1.0 / 1850, "18fa98796906658402cedd6cc3f28be6" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -135,8 +135,8 @@ public class @Test public void testOtherBaseCallModel() { HashMap e = new HashMap(); - e.put( "one_state", "97d22192536b1efe0163c062ecf64612" ); - e.put( "three_state", "fd48388392b877afcfa28507067fbe63" ); + e.put( "one_state", "62ee552b32d9193b700c1dd8e3eeb4b9" ); + e.put( "three_state", "f2971d0964780b9abfd38e860080c342" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -159,7 +159,7 @@ public class " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("bb112d6f907608a0e8de37cd0c887956")); + Arrays.asList("4469242939216eac5eb6b6b73ae6a509")); executeTest(String.format("testMultiTechnologies"), spec); }