Full calling pipeline now calls indels through the indel genotyper, merges with combine variants, and filters on them. Since new genomic annotator is fast, it is no longer scatter-gathered.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4144 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-08-27 16:28:24 +00:00
parent 78946c4ffd
commit 7908237b90
1 changed files with 30 additions and 12 deletions

View File

@ -45,6 +45,9 @@ class fullCallingPipeline extends QScript {
@Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
var snpClusterWindow = 7
@Input(doc="dbSNP version",shortName="D")
var dbSNP: File = _
trait CommandLineGATKArgs extends CommandLineGATK {
@ -110,7 +113,6 @@ class fullCallingPipeline extends QScript {
snps.standard_min_confidence_threshold_for_emitting = Some(10)
snps.min_mapping_quality_score = Some(20)
snps.min_base_quality_score = Some(20)
snps.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
snps.downsample_to_coverage = Some(200)
snps.annotation :+= "QualByDepthV2"
@ -129,16 +131,32 @@ class fullCallingPipeline extends QScript {
snps.scatterCount = 50
val indels = new UnifiedGenotyper with CommandLineGATKArgs
indels.input_file = bamFiles
indels.downsampling_type = Some(DownsampleType.EXPERIMENTAL_BY_SAMPLE)
indels.downsample_to_coverage = Some(200)
indels.variants_out = base+".indels.vcf"
indels.genotype_model = Some(Model.INDELS)
indels.scatterCount = 50
// indel genotyper does one sample at a time
val indelCallFiles = List.empty[RodBind]
val loopNo = 0
val priority = ""
for ( bam <- bamFiles ) {
val indel = new IndelGenotyperV2 with CommandLineGATKArgs
indel.input_file :+= bam
indel.out = swapExt(bam,".bam",".indels.vcf")
indel.downsample_to_coverage = Some(500)
indelCallFiles :+= new RodBind("v"+loopNo.toString, "VCF", indel.out)
if ( loopNo == 0 ) {
priority = "v0"
} else {
priority += ",v"+loopNo.toString
}
loopNo += 1
}
val mergeIndels = new CombineVariants with CommandLineGATKArgs
mergeIndels.out = qscript.project+".indels.vcf"
mergeIndels.genotypemergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE
mergeIndels.priority = priority
mergeIndels.variantmergeoption = org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION
mergeIndels.rodBind = indelCallFiles
// 1b. genomically annotate SNPs -- slow, but scatter it
// 1b. genomically annotate SNPs -- no longer slow
val annotated = new GenomicAnnotator with CommandLineGATKArgs
annotated.rodBind :+= RodBind("variant", "VCF", new File(snps.variants_out))
annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.refseqTable)
@ -146,13 +164,12 @@ class fullCallingPipeline extends QScript {
annotated.vcfOutput = swapExt(new File(snps.variants_out),".vcf",".annotated.vcf").getAbsolutePath
annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
annotated.rodToIntervalTrackName = "variant"
annotated.scatterCount = 100
// 2.a filter on cluster and near indels
val masker = new VariantFiltration with CommandLineGATKArgs
masker.rodBind :+= RodBind("variant", "VCF", new File(annotated.vcfOutput))
masker.rodBind :+= RodBind("mask", "VCF", new File(indels.variants_out))
masker.rodBind :+= RodBind("mask", "VCF", new File(mergeIndels.out))
masker.maskName = "NearIndel"
masker.clusterWindowSize = Some(qscript.snpClusterWindow)
masker.clusterSize = Some(qscript.snpsInCluster)
@ -173,7 +190,8 @@ class fullCallingPipeline extends QScript {
// todo -- args for resources (properties file)
val clusters = new GenerateVariantClusters with CommandLineGATKArgs
clusters.rodBind :+= RodBind("input", "VCF", masker.out)
val clusters_clusterFile = swapExt(new File(snps.variants_out),".vcf",".cluster").getAbsolutePath
val clusters_clusterFile = swapExt(new File(snps.variants_out),".vcf",".cluster")
clusters.clusterFile = clusters_clusterFile
clusters.memoryLimit = Some(8)
clusters.jobQueue = "hugemem"