Fixing misc parameters in MDCP. The pipeline now does VariantEval of output by default. Fix for NaN vqslod values in VQSR
This commit is contained in:
parent
824583e007
commit
539e157ecd
|
|
@ -44,6 +44,7 @@ import java.util.List;
|
|||
public class GaussianMixtureModel {
|
||||
|
||||
protected final static Logger logger = Logger.getLogger(GaussianMixtureModel.class);
|
||||
public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0;
|
||||
|
||||
private final ArrayList<MultivariateGaussian> gaussians;
|
||||
private final double shrinkage;
|
||||
|
|
@ -207,14 +208,23 @@ public class GaussianMixtureModel {
|
|||
for( final boolean isNull : datum.isNull ) {
|
||||
if( isNull ) { return evaluateDatumMarginalized( datum ); }
|
||||
}
|
||||
// Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
int gaussianIndex = 0;
|
||||
for( final MultivariateGaussian gaussian : gaussians ) {
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + gaussian.evaluateDatumLog10( datum );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
|
||||
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
|
||||
// Cap the values at an extremely negative value and spread them out randomly
|
||||
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
|
||||
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
|
||||
}
|
||||
return lod;
|
||||
}
|
||||
|
||||
// Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation
|
||||
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
|
||||
if(datum.isNull[iii]) { return null; }
|
||||
|
||||
|
|
@ -225,11 +235,18 @@ public class GaussianMixtureModel {
|
|||
normal.setState( gaussian.mu[iii], gaussian.sigma.get(iii, iii) );
|
||||
pVarInGaussianLog10[gaussianIndex++] = gaussian.pMixtureLog10 + Math.log10( normal.pdf( datum.annotations[iii] ) );
|
||||
}
|
||||
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
double lod = MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
|
||||
|
||||
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
|
||||
// Cap the values at an extremely negative value and spread them out randomly
|
||||
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
|
||||
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
|
||||
}
|
||||
return lod;
|
||||
}
|
||||
|
||||
public double evaluateDatumMarginalized( final VariantDatum datum ) {
|
||||
int numSamples = 0;
|
||||
int numRandomDraws = 0;
|
||||
double sumPVarInGaussian = 0.0;
|
||||
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
|
||||
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
|
||||
|
|
@ -248,10 +265,17 @@ public class GaussianMixtureModel {
|
|||
|
||||
// add this sample's probability to the pile in order to take an average in the end
|
||||
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
|
||||
numSamples++;
|
||||
numRandomDraws++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
|
||||
double lod = Math.log10( sumPVarInGaussian / ((double) numRandomDraws) );
|
||||
|
||||
// Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
|
||||
// Cap the values at an extremely negative value and spread them out randomly
|
||||
if( lod < MIN_ACCEPTABLE_LOD_SCORE ) {
|
||||
lod = MIN_ACCEPTABLE_LOD_SCORE - GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE;
|
||||
}
|
||||
return lod;
|
||||
}
|
||||
}
|
||||
|
|
@ -5,17 +5,6 @@ import org.broadinstitute.sting.queue.extensions.gatk._
|
|||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
|
||||
|
||||
|
||||
// ToDos:
|
||||
// reduce the scope of the datasets so the script is more nimble
|
||||
// create gold standard BAQ'd bam files, no reason to always do it on the fly
|
||||
|
||||
// Analysis to add at the end of the script:
|
||||
// auto generation of the cluster plots
|
||||
// spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls
|
||||
// produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF
|
||||
|
||||
|
||||
class MethodsDevelopmentCallingPipeline extends QScript {
|
||||
qscript =>
|
||||
|
||||
|
|
@ -28,15 +17,12 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
|
||||
var datasets: List[String] = Nil
|
||||
|
||||
@Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false)
|
||||
var skipGoldStandard: Boolean = false
|
||||
@Argument(shortName="runGoldStandard", doc="run the pipeline with the goldstandard VCF files for comparison", required=false)
|
||||
var runGoldStandard: Boolean = false
|
||||
|
||||
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
|
||||
var noBAQ: Boolean = false
|
||||
|
||||
@Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false)
|
||||
var eval: Boolean = false
|
||||
|
||||
@Argument(shortName="indels", doc="calls indels with the Unified Genotyper", required=false)
|
||||
var callIndels: Boolean = false
|
||||
|
||||
|
|
@ -52,8 +38,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
@Argument(shortName="sample", doc="Samples to include in Variant Eval", required=false)
|
||||
var samples: List[String] = Nil
|
||||
|
||||
|
||||
|
||||
class Target(
|
||||
val baseName: String,
|
||||
val reference: File,
|
||||
|
|
@ -118,15 +102,15 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
|
||||
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.17, 99.0, !lowPass, !exome, 1),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg18.intervals", 2.14, 99.0, !lowPass, !exome, 1),
|
||||
"HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
|
||||
"GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.17, 99.0, !lowPass, !exome, 1),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
|
||||
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18_129, hapmap_hg18,
|
||||
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
|
||||
|
|
@ -177,9 +161,9 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
add(new snpCall(target))
|
||||
add(new VQSR(target, !goldStandard))
|
||||
add(new applyVQSR(target, !goldStandard))
|
||||
if (eval) add(new snpEvaluation(target))
|
||||
add(new snpEvaluation(target))
|
||||
}
|
||||
if ( !skipGoldStandard ) {
|
||||
if ( runGoldStandard ) {
|
||||
add(new VQSR(target, goldStandard))
|
||||
add(new applyVQSR(target, goldStandard))
|
||||
}
|
||||
|
|
@ -200,7 +184,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.memoryLimit = 3
|
||||
this.reference_sequence = t.reference
|
||||
this.intervalsString ++= List(t.intervals)
|
||||
this.scatterCount = 120
|
||||
this.scatterCount = 140
|
||||
this.nt = 2
|
||||
this.dcov = if ( t.isLowpass ) { 50 } else { 250 }
|
||||
this.stand_call_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
|
||||
this.stand_emit_conf = if ( t.isLowpass ) { 4.0 } else { 30.0 }
|
||||
|
|
@ -259,9 +244,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.training :+= new TaggedFile( omni_b37, "prior=12.0")
|
||||
this.truth :+= new TaggedFile( omni_b37, "prior=12.0")
|
||||
this.training :+= new TaggedFile( training_1000G, "prior=10.0" )
|
||||
this.badSites :+= new TaggedFile( badSites_1000G, "prior=2.0" )
|
||||
this.known :+= new TaggedFile( t.dbsnpFile, "prior=2.0" )
|
||||
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=10.0" )
|
||||
this.resource :+= new TaggedFile( projectConsensus_1000G, "prior=8.0" )
|
||||
this.use_annotation ++= List("QD", "HaplotypeScore", "MQRankSum", "ReadPosRankSum", "MQ", "FS")
|
||||
if(t.nSamples >= 10) {
|
||||
this.use_annotation ++= List("InbreedingCoeff")
|
||||
|
|
@ -275,12 +259,12 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.tranche ++= List("100.0", "99.9", "99.5", "99.3", "99.0", "98.9", "98.8", "98.5", "98.4", "98.3", "98.2", "98.1", "98.0", "97.9", "97.8", "97.5", "97.0", "95.0", "90.0")
|
||||
this.rscript_file = t.vqsrRscript
|
||||
this.analysisName = t.name + "_VQSR"
|
||||
this.jobName = queueLogDir + t.name + ".VQSR"
|
||||
this.jobName = queueLogDir + t.name + ".VQSR"
|
||||
}
|
||||
|
||||
// 4.) Apply the recalibration table to the appropriate tranches
|
||||
class applyVQSR (t: Target, goldStandard: Boolean) extends ApplyRecalibration with UNIVERSAL_GATK_ARGS {
|
||||
this.memoryLimit = 4
|
||||
this.memoryLimit = 6
|
||||
this.reference_sequence = t.reference
|
||||
this.intervalsString ++= List(t.intervals)
|
||||
this.input :+= ( if ( goldStandard ) { t.goldStandard_VCF } else { t.rawVCF } )
|
||||
|
|
@ -289,7 +273,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.ts_filter_level = t.trancheTarget
|
||||
this.out = t.recalibratedVCF
|
||||
this.analysisName = t.name + "_AVQSR"
|
||||
this.jobName = queueLogDir + t.name + ".applyVQSR"
|
||||
this.jobName = queueLogDir + t.name + ".applyVQSR"
|
||||
}
|
||||
|
||||
// 5.) Variant Evaluation Base(OPTIONAL)
|
||||
|
|
@ -308,7 +292,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.eval :+= t.recalibratedVCF
|
||||
this.out = t.evalFile
|
||||
this.analysisName = t.name + "_VEs"
|
||||
this.jobName = queueLogDir + t.name + ".snp.eval"
|
||||
this.jobName = queueLogDir + t.name + ".snp.eval"
|
||||
}
|
||||
|
||||
// 5b.) Indel Evaluation (OPTIONAL)
|
||||
|
|
@ -317,6 +301,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.evalModule :+= "IndelStatistics"
|
||||
this.out = t.evalIndelFile
|
||||
this.analysisName = t.name + "_VEi"
|
||||
this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval"
|
||||
this.jobName = queueLogDir + queueLogDir + t.name + ".indel.eval"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue