New hidden option in VQSR to not parse the genotypes of the incoming training data. Updated VQSR training in methods development pipeline to be more in line with best practices.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5340 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e7089f9870
commit
ce34a8a918
|
|
@ -96,6 +96,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
@Hidden
|
||||
@Argument(fullName = "NO_HEADER", shortName = "NO_HEADER", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required = false)
|
||||
protected Boolean NO_HEADER_LINE = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
|
|
@ -198,9 +201,9 @@ public class GenerateVariantClustersWalker extends RodWalker<ExpandingArrayList<
|
|||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
variantDatum.weight = WEIGHT_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (!vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
variantDatum.weight = WEIGHT_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (!vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
variantDatum.weight = WEIGHT_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
variantDatum.weight = WEIGHT_DBSNP;
|
||||
|
|
|
|||
|
|
@ -112,11 +112,12 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
@Hidden
|
||||
@Argument(fullName = "qual", shortName = "qual", doc = "Don't use sites with original quality scores below the qual threshold. FOR DEBUGGING PURPOSES ONLY.", required=false)
|
||||
private double QUAL_THRESHOLD = 0.0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "trustAllPolymorphic", shortName = "allPoly", doc = "Trust that all the input training sets' unfiltered records contain only polymorphic sites to drastically speed up the computation.", required = false)
|
||||
protected Boolean TRUST_ALL_POLYMORPHIC = false;
|
||||
@Hidden
|
||||
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
|
||||
private File DEBUG_FILE = null;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "selectionMetric", shortName = "sm", doc = "Selection metric to use", required=false)
|
||||
private SelectionMetricType SELECTION_METRIC_TYPE = SelectionMetricType.NOVEL_TITV;
|
||||
|
|
@ -273,9 +274,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
|
|||
|
||||
variantDatum.isKnown = ( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() );
|
||||
double knownPrior_qScore = PRIOR_NOVELS;
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (!vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
if( vcHapMap != null && vcHapMap.isVariant() && !vcHapMap.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vcHapMap.hasGenotypes() || vcHapMap.isPolymorphic()) ) {
|
||||
knownPrior_qScore = PRIOR_HAPMAP;
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (!vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
} else if( vc1KG != null && vc1KG.isVariant() && !vc1KG.isFiltered() && (TRUST_ALL_POLYMORPHIC || !vc1KG.hasGenotypes() || vc1KG.isPolymorphic()) ) {
|
||||
knownPrior_qScore = PRIOR_1KG;
|
||||
} else if( vcDbsnp != null && vcDbsnp.isVariant() && !vcDbsnp.isFiltered() ) {
|
||||
knownPrior_qScore = PRIOR_DBSNP;
|
||||
|
|
|
|||
|
|
@ -97,6 +97,8 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
|
||||
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
|
||||
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
|
||||
val training_hapmap_b37 = "/humgen/1kg/processing/pipeline_test_bams/hapmap3.3_training_chr20.vcf"
|
||||
val omni_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b36.vcf"
|
||||
val omni_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/1212samples.b37.vcf"
|
||||
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
|
||||
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
|
||||
|
|
@ -117,7 +119,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
"HiSeq19" -> new Target("NA12878.HiSeq19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 1.0, !lowPass),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 0.5, !lowPass),
|
||||
"GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37_129, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
|
||||
|
|
@ -143,10 +145,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
|
||||
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, 1.0, lowPass), // chunked interval list to use with Queue's scatter/gather functionality
|
||||
"LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large
|
||||
new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"),
|
||||
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"),
|
||||
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, 1.0, lowPass),
|
||||
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
|
||||
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
|
||||
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
|
||||
|
|
@ -178,7 +176,6 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
}
|
||||
if ( !skipGoldStandard ) {
|
||||
add(new GenerateClusters(target, goldStandard))
|
||||
add(new VariantRecalibratorTiTv(target, goldStandard))
|
||||
add(new VariantRecalibratorNRS(target, goldStandard))
|
||||
}
|
||||
}
|
||||
|
|
@ -260,14 +257,16 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.reference_sequence = t.reference
|
||||
this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile)
|
||||
if( t.hapmapFile.contains("b37") )
|
||||
this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf")
|
||||
this.rodBind :+= RodBind("1kg", "VCF", omni_b37)
|
||||
else if( t.hapmapFile.contains("b36") )
|
||||
this.rodBind :+= RodBind("1kg", "VCF", omni_b36)
|
||||
this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } )
|
||||
this.clusterFile = if ( goldStandard ) { t.goldStandardClusterFile } else { t.clusterFile }
|
||||
this.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
|
||||
this.intervalsString ++= List(t.intervals)
|
||||
this.qual = Some(350) // clustering parameters to be updated soon pending new experimentation results
|
||||
this.qual = Some(100) // clustering parameters to be updated soon pending new experimentation results
|
||||
this.std = Some(3.5)
|
||||
this.mG = Some(10)
|
||||
this.mG = Some(8)
|
||||
this.ignoreFilter ++= FiltersToIgnore
|
||||
this.analysisName = name + "_GVC"
|
||||
this.jobName = queueLogDir + t.name + ".cluster"
|
||||
|
|
@ -275,47 +274,38 @@ class MethodsDevelopmentCallingPipeline extends QScript {
|
|||
this.DBSNP = new File(t.dbsnpFile)
|
||||
else if (t.dbsnpFile.endsWith(".vcf"))
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
|
||||
this.trustAllPolymorphic = true
|
||||
}
|
||||
|
||||
// 4.) VQSR part2 Calculate new LOD for all input SNPs by evaluating the Gaussian clusters
|
||||
class VariantRecalibratorBase(t: Target, goldStandard: Boolean) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS {
|
||||
class VariantRecalibratorNRS(t: Target, goldStandard: Boolean) extends VariantRecalibrator with UNIVERSAL_GATK_ARGS {
|
||||
val name: String = if ( goldStandard ) { t.goldStandardName } else { t.name }
|
||||
this.reference_sequence = t.reference
|
||||
if( t.hapmapFile.contains("b37") )
|
||||
this.rodBind :+= RodBind("1kg", "VCF", "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg_pilot1_projectCalls/ALL.low_coverage.2010_07.hg19.vcf")
|
||||
if( t.hapmapFile.contains("b37") ) {
|
||||
this.rodBind :+= RodBind("1kg", "VCF", omni_b37)
|
||||
this.rodBind :+= RodBind("truthOmni", "VCF", omni_b37)
|
||||
} else if( t.hapmapFile.contains("b36") ) {
|
||||
this.rodBind :+= RodBind("1kg", "VCF", omni_b36)
|
||||
this.rodBind :+= RodBind("truthOmni", "VCF", omni_b36)
|
||||
}
|
||||
this.rodBind :+= RodBind("hapmap", "VCF", t.hapmapFile)
|
||||
this.rodBind :+= RodBind("truth", "VCF", t.hapmapFile)
|
||||
this.rodBind :+= RodBind("truthHapMap", "VCF", t.hapmapFile)
|
||||
this.rodBind :+= RodBind("input", "VCF", if ( goldStandard ) { t.goldStandard_VCF } else { t.filteredVCF } )
|
||||
this.clusterFile = if ( goldStandard ) { t.goldStandardClusterFile } else { t.clusterFile }
|
||||
this.analysisName = name + "_VR"
|
||||
this.intervalsString ++= List(t.intervals)
|
||||
this.ignoreFilter ++= FiltersToIgnore
|
||||
this.ignoreFilter ++= List("HARD_TO_VALIDATE")
|
||||
this.target_titv = Some(t.titvTarget)
|
||||
if (t.dbsnpFile.endsWith(".rod"))
|
||||
this.DBSNP = new File(t.dbsnpFile)
|
||||
else if (t.dbsnpFile.endsWith(".vcf"))
|
||||
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
|
||||
}
|
||||
|
||||
// 4a.) Choose VQSR tranches based on novel ti/tv
|
||||
class VariantRecalibratorTiTv(t: Target, goldStandard: Boolean) extends VariantRecalibratorBase(t, goldStandard) {
|
||||
this.tranche ++= List("0.1", "1.0", "3.0", "5.0", "10.0", "100.0")
|
||||
this.out = t.titvRecalibratedVCF
|
||||
this.tranchesFile = t.titvTranchesFile
|
||||
this.jobName = queueLogDir + t.name + ".titv"
|
||||
}
|
||||
|
||||
// 4b.) Choose VQSR tranches based on sensitivity to truth set
|
||||
class VariantRecalibratorNRS(t: Target, goldStandard: Boolean) extends VariantRecalibratorBase(t, goldStandard) {
|
||||
this.sm = Some(org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibrator.SelectionMetricType.TRUTH_SENSITIVITY)
|
||||
this.tranche ++= List("0.1", "1.0", "3.0", "5.0", "10.0", "100.0")
|
||||
this.tranche ++= List("0.1", "0.5", "0.7", "1.0", "3.0", "5.0", "10.0", "100.0")
|
||||
this.out = t.tsRecalibratedVCF
|
||||
this.priorDBSNP = Some(2.0)
|
||||
this.priorHapMap = Some(2.0)
|
||||
this.prior1KG = Some(2.0)
|
||||
this.tranchesFile = t.tsTranchesFile
|
||||
this.jobName = queueLogDir + t.name + ".nrs"
|
||||
this.trustAllPolymorphic = true
|
||||
}
|
||||
|
||||
// 5.) Variant Cut filter out the variants marked by recalibration to the 99% tranche
|
||||
|
|
|
|||
Loading…
Reference in New Issue