diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 17461de2f..5e7bf3575 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -44,6 +44,7 @@ import java.util.List; public class GaussianMixtureModel { protected final static Logger logger = Logger.getLogger(GaussianMixtureModel.class); + public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0; private final ArrayList gaussians; private final double shrinkage; @@ -207,14 +208,23 @@ public class GaussianMixtureModel { for( final boolean isNull : datum.isNull ) { if( isNull ) { return evaluateDatumMarginalized( datum ); } } + // Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly final double[] pVarInGaussianLog10 = new double[gaussians.size()]; int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum ); } - return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + + // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians + // Cap the values at an extremely negative value and spread them out randomly + if( lod < MIN_ACCEPTABLE_LOD_SCORE ) { + lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE; + } + return lod; } + // Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) { if(datum.isNull[iii]) { return null; } @@ -225,11 +235,18 @@ public class GaussianMixtureModel { normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) ); pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) ); } - return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k)) + + // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians + // Cap the values at an extremely negative value and spread them out randomly + if( lod < MIN_ACCEPTABLE_LOD_SCORE ) { + lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE; + } + return lod; } public double evaluateDatumMarginalized( final VariantDatum datum ) { - int numSamples = 0; + int numRandomDraws = 0; double sumPVarInGaussian = 0.0; final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization final double[] pVarInGaussianLog10 = new double[gaussians.size()]; @@ -248,10 +265,17 @@ public class GaussianMixtureModel { // add this sample's probability to the pile in order to take an average in the end sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k)) - numSamples++; + numRandomDraws++; } } } - return Math.log10( sumPVarInGaussian / ((double) numSamples) ); + double lod = Math.log10( sumPVarInGaussian / ((double) numRandomDraws) ); + + // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians + // Cap the values at an extremely negative value and spread them out randomly + if( lod < MIN_ACCEPTABLE_LOD_SCORE ) { + lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE; + } + return lod; } } \ No newline at end of file diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index e4385a282..a28f6f949 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -5,17 +5,6 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.gatk.phonehome.GATKRunReport - - // ToDos: - // reduce the scope of the datasets so the script is more nimble - // create gold standard BAQ'd bam files, no reason to always do it on the fly - - // Analysis to add at the end of the script: - // auto generation of the cluster plots - // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls - // produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF - - class MethodsDevelopmentCallingPipeline extends QScript { qscript => @@ -28,15 +17,12 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) var datasets: List[String] = Nil - @Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false) - var skipGoldStandard: Boolean = false + @Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false) + var runGoldStandard: Boolean = false @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) var noBAQ: Boolean = false - @Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false) - var eval: Boolean = false - @Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false) var callIndels: Boolean = false @@ -52,8 +38,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false) var samples: List[String] = Nil - - class Target( val baseName: String, val reference: File, @@ -118,15 +102,15 @@ class MethodsDevelopmentCallingPipeline extends QScript { "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.17, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1), "HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37, new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"), new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"), - "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1), "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18, "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), @@ -177,9 +161,9 @@ class MethodsDevelopmentCallingPipeline extends QScript { add(new snpCall(target)) add(new VQSR(target, !goldStandard)) add(new applyVQSR(target, !goldStandard)) - if (eval) add(new snpEvaluation(target)) + add(new snpEvaluation(target)) } - if ( !skipGoldStandard ) { + if ( runGoldStandard ) { add(new VQSR(target, goldStandard)) add(new applyVQSR(target, goldStandard)) } @@ -200,7 +184,8 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.memoryLimit = 3 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) - this.scatterCount = 120 + this.scatterCount = 140 + this.nt = 2 this.dcov = if ( t.isLowpass ) { 50 } else { 250 } this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 } @@ -259,9 +244,8 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.training :+= new TaggedFile( omni_b37, "prior=12.0") this.truth :+= new TaggedFile( omni_b37, "prior=12.0") this.training :+= new TaggedFile( training_1000G, "prior=10.0" ) - this.badSites :+= new TaggedFile( badSites_1000G, "prior=2.0" ) this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" ) - this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=10.0" ) + this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" ) this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS") if(t.nSamples >= 10) { this.use_annotation ++= List("InbreedingCoeff") @@ -275,12 +259,12 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0") this.rscript_file = t.vqsrRscript this.analysisName = t.name + "_VQSR" - this.jobName = queueLogDir + t.name + ".VQSR" + this.jobName = queueLogDir + t.name + ".VQSR" } // 4.) Apply the recalibration table to the appropriate tranches class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS { - this.memoryLimit = 4 + this.memoryLimit = 6 this.reference_sequence = t.reference this.intervalsString ++= List(t.intervals) this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } ) @@ -289,7 +273,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.ts_filter_level = t.trancheTarget this.out = t.recalibratedVCF this.analysisName = t.name + "_AVQSR" - this.jobName = queueLogDir + t.name + ".applyVQSR" + this.jobName = queueLogDir + t.name + ".applyVQSR" } // 5.) Variant Evaluation Base(OPTIONAL) @@ -308,7 +292,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.eval :+= t.recalibratedVCF this.out = t.evalFile this.analysisName = t.name + "_VEs" - this.jobName = queueLogDir + t.name + ".snp.eval" + this.jobName = queueLogDir + t.name + ".snp.eval" } // 5b.) Indel Evaluation (OPTIONAL) @@ -317,6 +301,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { this.evalModule :+= "IndelStatistics" this.out = t.evalIndelFile this.analysisName = t.name + "_VEi" - this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" + this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval" } }