Fixing misc parameters in MDCP. The pipeline now does VariantEval of output by default. Fix for NaN vqslod values in VQSR

This commit is contained in:
Ryan Poplin 2011-08-20 11:28:48 -04:00
parent 824583e007
commit 539e157ecd
2 changed files with 44 additions and 36 deletions

View File

@ -44,6 +44,7 @@ import java.util.List;
public class GaussianMixtureModel {
protected final static Logger logger = Logger.getLogger(GaussianMixtureModel.class);
public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0;
private final ArrayList<MultivariateGaussian> gaussians;
private final double shrinkage;
@ -207,14 +208,23 @@ public class GaussianMixtureModel {
for( final boolean isNull : datum.isNull ) {
if( isNull ) { return evaluateDatumMarginalized( datum ); }
}
// Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
}
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
// Cap the values at an extremely negative value and spread them out randomly
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
}
return lod;
}
// Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
if(datum.isNull[iii]) { return null; }
@ -225,11 +235,18 @@ public class GaussianMixtureModel {
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
}
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
// Cap the values at an extremely negative value and spread them out randomly
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
}
return lod;
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numSamples = 0;
int numRandomDraws = 0;
double sumPVarInGaussian = 0.0;
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
@ -248,10 +265,17 @@ public class GaussianMixtureModel {
// add this sample's probability to the pile in order to take an average in the end
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
numSamples++;
numRandomDraws++;
}
}
}
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
double lod = Math.log10( sumPVarInGaussian / ((double) numRandomDraws) );
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
// Cap the values at an extremely negative value and spread them out randomly
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
}
return lod;
}
}

View File

@ -5,17 +5,6 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
// ToDos:
// reduce the scope of the datasets so the script is more nimble
// create gold standard BAQ'd bam files, no reason to always do it on the fly
// Analysis to add at the end of the script:
// auto generation of the cluster plots
// spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls
// produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF
class MethodsDevelopmentCallingPipeline extends QScript {
qscript =>
@ -28,15 +17,12 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil
@Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false)
var skipGoldStandard: Boolean = false
@Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false)
var runGoldStandard: Boolean = false
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
var noBAQ: Boolean = false
@Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false)
var eval: Boolean = false
@Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false)
var callIndels: Boolean = false
@ -52,8 +38,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false)
var samples: List[String] = Nil
class Target(
val baseName: String,
val reference: File,
@ -118,15 +102,15 @@ class MethodsDevelopmentCallingPipeline extends QScript {
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.17, 99.0, !lowPass, !exome, 1),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1),
"HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
"GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
@ -177,9 +161,9 @@ class MethodsDevelopmentCallingPipeline extends QScript {
add(new snpCall(target))
add(new VQSR(target, !goldStandard))
add(new applyVQSR(target, !goldStandard))
if (eval) add(new snpEvaluation(target))
add(new snpEvaluation(target))
}
if ( !skipGoldStandard ) {
if ( runGoldStandard ) {
add(new VQSR(target, goldStandard))
add(new applyVQSR(target, goldStandard))
}
@ -200,7 +184,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.memoryLimit = 3
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.scatterCount = 120
this.scatterCount = 140
this.nt = 2
this.dcov = if ( t.isLowpass ) { 50 } else { 250 }
this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
@ -259,9 +244,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.training :+= new TaggedFile( omni_b37, "prior=12.0")
this.truth :+= new TaggedFile( omni_b37, "prior=12.0")
this.training :+= new TaggedFile( training_1000G, "prior=10.0" )
this.badSites :+= new TaggedFile( badSites_1000G, "prior=2.0" )
this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" )
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=10.0" )
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
if(t.nSamples >= 10) {
this.use_annotation ++= List("InbreedingCoeff")
@ -275,12 +259,12 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
this.rscript_file = t.vqsrRscript
this.analysisName = t.name + "_VQSR"
this.jobName = queueLogDir + t.name + ".VQSR"
this.jobName = queueLogDir + t.name + ".VQSR"
}
// 4.) Apply the recalibration table to the appropriate tranches
class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS {
this.memoryLimit = 4
this.memoryLimit = 6
this.reference_sequence = t.reference
this.intervalsString ++= List(t.intervals)
this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } )
@ -289,7 +273,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.ts_filter_level = t.trancheTarget
this.out = t.recalibratedVCF
this.analysisName = t.name + "_AVQSR"
this.jobName = queueLogDir + t.name + ".applyVQSR"
this.jobName = queueLogDir + t.name + ".applyVQSR"
}
// 5.) Variant Evaluation Base(OPTIONAL)
@ -308,7 +292,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.eval :+= t.recalibratedVCF
this.out = t.evalFile
this.analysisName = t.name + "_VEs"
this.jobName = queueLogDir + t.name + ".snp.eval"
this.jobName = queueLogDir + t.name + ".snp.eval"
}
// 5b.) Indel Evaluation (OPTIONAL)
@ -317,6 +301,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.evalModule :+= "IndelStatistics"
this.out = t.evalIndelFile
this.analysisName = t.name + "_VEi"
this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval"
this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval"
}
}