Removed unnecessary dbSNP big-table dependency. Ti/Tv is now required. Consistent downsampling level for all programs. Spelling corrections. VariantEval now generates R-style output.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4355 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2010-09-26 16:55:58 +00:00
parent 145fb0df8b
commit 9820a12fa5
1 changed files with 44 additions and 53 deletions

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript} import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._ import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class fullCallingPipeline extends QScript { class fullCallingPipeline extends QScript {
qscript => qscript =>
@ -29,33 +30,33 @@ class fullCallingPipeline extends QScript {
@Input(doc="refseqTable", shortName="refseqTable") @Input(doc="refseqTable", shortName="refseqTable")
var refseqTable: File = _ var refseqTable: File = _
@Input(doc="dbsnpTable", shortName="dbsnpTable") @Input(doc="Picard FixMateInformation.jar. At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar. Outside the Broad see http://picard.sourceforge.net/")
var dbsnpTable: File = _
@Input(doc="Picard FixMateInformation.jar. At the Broad this can be found at /seq/software/picard/current/bin/FixMateInformation.jar. Outside the broad see http://picard.sourceforge.net/")
var picardFixMatesJar: File = _ var picardFixMatesJar: File = _
@Input(doc="gatk jar") @Input(doc="gatk jar")
var gatkJar: File = _ var gatkJar: File = _
@Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false) @Input(doc="SNP cluster filter -- number SNPs",shortName="snpsInCluster",required=false)
var snpsInCluster = 4 var snpsInCluster = 3
@Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false) @Input(doc="SNP cluster filter -- window size",shortName="snpClusterWindow",required=false)
var snpClusterWindow = 7 var snpClusterWindow = 10
@Input(doc="target titv for recalibration",shortName="titv",required=false) @Input(doc="target titv for recalibration",shortName="titv",required=true)
var target_titv = 2.1 var target_titv: Float = _
@Input(doc="downsampling coverage",shortName="dcov",required=false) @Input(doc="downsampling coverage",shortName="dcov",required=false)
var downsampling_coverage = 200 var downsampling_coverage = 300
@Input(doc="Number of jobs to scatter unifeid genotyper",shortName="snpScatter",required=false) @Input(doc="Number of jobs to scatter UnifiedGenotyper",shortName="snpScatter",required=false)
var num_snp_scatter_jobs = 50 var num_snp_scatter_jobs = 50
@Input(doc="Number of jobs to scatter indel genotyper",shortName="indelScatter",required=false) @Input(doc="Number of jobs to scatter IndelGenotyperV2",shortName="indelScatter",required=false)
var num_indel_scatter_jobs = 5 var num_indel_scatter_jobs = 5
@Input(doc="Indel-clean BAM files",shortName="clean",required=false)
var clean_bam_files = 1
@Input(doc="ADPR script") @Input(doc="ADPR script")
var adprScript: File = _ var adprScript: File = _
@ -71,11 +72,10 @@ class fullCallingPipeline extends QScript {
this.intervals = qscript.pipeline.getProject.getIntervalList this.intervals = qscript.pipeline.getProject.getIntervalList
this.jarFile = qscript.gatkJar this.jarFile = qscript.gatkJar
this.reference_sequence = qscript.pipeline.getProject.getReferenceFile this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
this.memoryLimit = Some(2)
} }
// ------------ SETUP THE PIPELINE ----------- // // ------------ SETUP THE PIPELINE ----------- //
@ -84,27 +84,24 @@ class fullCallingPipeline extends QScript {
val projectBase: String = qscript.pipeline.getProject.getName val projectBase: String = qscript.pipeline.getProject.getName
val cleanedBase: String = projectBase + ".cleaned" val cleanedBase: String = projectBase + ".cleaned"
val uncleanedBase: String = projectBase + ".uncleaned" val uncleanedBase: String = projectBase + ".uncleaned"
// there are commands that use all the bam files // there are commands that use all the bam files
val recalibratedSamples = qscript.pipeline.getSamples val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
.filter(_.getBamFiles.contains("recalibrated"))
val adprRScript = qscript.adprScript val adprRScript = qscript.adprScript
val seq = qscript.machine val seq = qscript.machine
val expKind = qscript.protocol val expKind = qscript.protocol
for ( sample <- recalibratedSamples ) { for ( sample <- recalibratedSamples ) {
// put unclean bams in unclean genotypers in advance, create the extension files
// put unclean bams in unclean genotypers
// in advance, create the extension files
val bam = sample.getBamFiles.get("recalibrated") val bam = sample.getBamFiles.get("recalibrated")
if (!sample.getBamFiles.contains("cleaned")) if (!sample.getBamFiles.contains("cleaned")) {
sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam")) sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam"))
val cleaned_bam = sample.getBamFiles.get("cleaned") }
val cleaned_bam = sample.getBamFiles.get("cleaned")
val indel_targets = swapExt(bam,"bam","realigner_targets.interval_list") val indel_targets = swapExt(bam,"bam","realigner_targets.interval_list")
// create the cleaning commands // create the cleaning commands
val targetCreator = new RealignerTargetCreator with CommandLineGATKArgs val targetCreator = new RealignerTargetCreator with CommandLineGATKArgs
targetCreator.analysisName = "CreateTargets_"+bam.getName targetCreator.analysisName = "CreateTargets_"+bam.getName
targetCreator.input_file :+= bam targetCreator.input_file :+= bam
@ -126,8 +123,6 @@ class fullCallingPipeline extends QScript {
def outputBam = fixed def outputBam = fixed
} }
// realigner.out = cleaned_bam // realigner.out = cleaned_bam
// realigner.scatterClass = classOf[ContigScatterFunction] // realigner.scatterClass = classOf[ContigScatterFunction]
// realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar } // realigner.setupGatherFunction = { case (f: BamGatherFunction, _) => f.jarFile = qscript.picardFixMatesJar }
@ -163,27 +158,33 @@ class fullCallingPipeline extends QScript {
samtoolsindex.analysisName = "index_"+cleaned_bam.getName samtoolsindex.analysisName = "index_"+cleaned_bam.getName
// COMMENT THIS NEXT BLOCK TO SKIP CLEANING // COMMENT THIS NEXT BLOCK TO SKIP CLEANING
if ( realigner.scatterCount > 1 ) if (qscript.clean_bam_files == 1) {
add(targetCreator,realigner,samtoolsindex) if ( realigner.scatterCount > 1 ) {
else add(targetCreator,realigner,samtoolsindex)
add(targetCreator,realigner,fixMates,samtoolsindex) } else {
add(targetCreator,realigner,fixMates,samtoolsindex)
}
}
} }
val recalibratedBamFiles = recalibratedSamples val recalibratedBamFiles = recalibratedSamples
.map(_.getBamFiles.get("recalibrated")) .map(_.getBamFiles.get("recalibrated"))
.toList .toList
val cleanBamFiles = qscript.pipeline.getSamples val cleanBamFiles = qscript.pipeline.getSamples
.filter(_.getBamFiles.contains("cleaned")) .filter(_.getBamFiles.contains("cleaned"))
.map(_.getBamFiles.get("cleaned")) .map(_.getBamFiles.get("cleaned"))
.toList .toList
// actually make calls // actually make calls
//endToEnd(uncleanedBase,recalibratedBamFiles, adprRscript, seq, expKind) if (qscript.clean_bam_files == 1) {
endToEnd(uncleanedBase,recalibratedBamFiles, seq, expKind) // COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES
// COMMENT THIS NEXT LINE TO AVOID CALLING ON CLEANED FILES //endToEnd(cleanedBase, cleanBamFiles, adprRscript, seq, expKind)
//endToEnd(cleanedBase,cleanBamFiles, adprRscript, seq, expKind) endToEnd(cleanedBase, cleanBamFiles, seq, expKind)
endToEnd(cleanedBase,cleanBamFiles, seq, expKind) } else {
//endToEnd(uncleanedBase, recalibratedBamFiles, adprRscript, seq, expKind)
endToEnd(uncleanedBase, recalibratedBamFiles, seq, expKind)
}
} }
//def endToEnd(base: String, bamFiles: List[File], adprthing: File, seqinfo: String, exptype: String) = { //def endToEnd(base: String, bamFiles: List[File], adprthing: File, seqinfo: String, exptype: String) = {
@ -199,7 +200,7 @@ class fullCallingPipeline extends QScript {
snps.standard_min_confidence_threshold_for_emitting = Some(10) snps.standard_min_confidence_threshold_for_emitting = Some(10)
snps.min_mapping_quality_score = Some(20) snps.min_mapping_quality_score = Some(20)
snps.min_base_quality_score = Some(20) snps.min_base_quality_score = Some(20)
snps.downsample_to_coverage = Some(200) snps.downsample_to_coverage = Some(qscript.downsampling_coverage)
//snps.annotation :+= "QualByDepthV2" //snps.annotation :+= "QualByDepthV2"
if (qscript.trigger != null) { if (qscript.trigger != null) {
@ -216,7 +217,6 @@ class fullCallingPipeline extends QScript {
snps.scatterCount = qscript.num_snp_scatter_jobs snps.scatterCount = qscript.num_snp_scatter_jobs
// indel genotyper does one sample at a time // indel genotyper does one sample at a time
var indelCallFiles = List.empty[RodBind] var indelCallFiles = List.empty[RodBind]
var indelGenotypers = List.empty[IndelGenotyperV2 with CommandLineGATKArgs] var indelGenotypers = List.empty[IndelGenotyperV2 with CommandLineGATKArgs]
@ -227,9 +227,9 @@ class fullCallingPipeline extends QScript {
indel.analysisName = "IndelGenotyper_"+bam.getName indel.analysisName = "IndelGenotyper_"+bam.getName
indel.input_file :+= bam indel.input_file :+= bam
indel.out = swapExt(bam,".bam",".indels.vcf") indel.out = swapExt(bam,".bam",".indels.vcf")
indel.downsample_to_coverage = Some(500) indel.downsample_to_coverage = Some(qscript.downsampling_coverage)
indelCallFiles :+= RodBind("v"+loopNo.toString, "VCF", indel.out) indelCallFiles :+= RodBind("v"+loopNo.toString, "VCF", indel.out)
indel.scatterCount = qscript.num_indel_scatter_jobs //indel.scatterCount = qscript.num_indel_scatter_jobs
indelGenotypers :+= indel indelGenotypers :+= indel
@ -248,21 +248,19 @@ class fullCallingPipeline extends QScript {
mergeIndels.rodBind = indelCallFiles mergeIndels.rodBind = indelCallFiles
mergeIndels.analysisName = base+"_MergeIndels" mergeIndels.analysisName = base+"_MergeIndels"
// 1b. genomically annotate SNPs -- no longer slow // 1b. genomically annotate SNPs -- no longer slow
val annotated = new GenomicAnnotator with CommandLineGATKArgs val annotated = new GenomicAnnotator with CommandLineGATKArgs
annotated.rodBind :+= RodBind("variant", "VCF", snps.out) annotated.rodBind :+= RodBind("variant", "VCF", snps.out)
annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.refseqTable) annotated.rodBind :+= RodBind("refseq", "AnnotatorInputTable", qscript.refseqTable)
annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable) //annotated.rodBind :+= RodBind("dbsnp", "AnnotatorInputTable", qscript.dbsnpTable)
annotated.out = swapExt(snps.out,".vcf",".annotated.vcf") annotated.out = swapExt(snps.out,".vcf",".annotated.vcf")
annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet" //annotated.select :+= "dbsnp.name,dbsnp.refUCSC,dbsnp.strand,dbsnp.observed,dbsnp.avHet"
annotated.rodToIntervalTrackName = "variant" annotated.rodToIntervalTrackName = "variant"
annotated.analysisName = base+"_GenomicAnnotator" annotated.analysisName = base+"_GenomicAnnotator"
// 2.a filter on cluster and near indels // 2.a filter on cluster and near indels
val masker = new VariantFiltration with CommandLineGATKArgs val masker = new VariantFiltration with CommandLineGATKArgs
masker.variantVCF = annotated.out masker.variantVCF = annotated.out
masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out) masker.rodBind :+= RodBind("mask", "VCF", mergeIndels.out)
masker.maskName = "NearIndel" masker.maskName = "NearIndel"
masker.clusterWindowSize = Some(qscript.snpClusterWindow) masker.clusterWindowSize = Some(qscript.snpClusterWindow)
@ -270,7 +268,6 @@ class fullCallingPipeline extends QScript {
masker.out = swapExt(annotated.out,".vcf",".indel.masked.vcf") masker.out = swapExt(annotated.out,".vcf",".indel.masked.vcf")
masker.analysisName = base+"_Cluster_and_Indel_filter" masker.analysisName = base+"_Cluster_and_Indel_filter"
// 2.b hand filter with standard filter // 2.b hand filter with standard filter
val handFilter = new VariantFiltration with CommandLineGATKArgs val handFilter = new VariantFiltration with CommandLineGATKArgs
handFilter.variantVCF = masker.out handFilter.variantVCF = masker.out
@ -280,7 +277,6 @@ class fullCallingPipeline extends QScript {
handFilter.out = swapExt(annotated.out,".vcf",".handfiltered.vcf") handFilter.out = swapExt(annotated.out,".vcf",".handfiltered.vcf")
handFilter.analysisName = base+"_HandFilter" handFilter.analysisName = base+"_HandFilter"
// 3.i generate gaussian clusters on the masked vcf // 3.i generate gaussian clusters on the masked vcf
// todo -- args for annotations? // todo -- args for annotations?
// todo -- args for resources (properties file) // todo -- args for resources (properties file)
@ -295,7 +291,6 @@ class fullCallingPipeline extends QScript {
clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun") clusters.use_annotation ++= List("QD", "SB", "HaplotypeScore", "HRun")
clusters.analysisName = base+"_Cluster" clusters.analysisName = base+"_Cluster"
// 3.ii apply gaussian clusters to the masked vcf // 3.ii apply gaussian clusters to the masked vcf
val recalibrate = new VariantRecalibrator with CommandLineGATKArgs val recalibrate = new VariantRecalibrator with CommandLineGATKArgs
recalibrate.clusterFile = clusters.clusterFile recalibrate.clusterFile = clusters.clusterFile
@ -316,14 +311,13 @@ class fullCallingPipeline extends QScript {
cut.fdr_filter_level = Some(1) cut.fdr_filter_level = Some(1)
cut.analysisName = base+"_ApplyVariantCuts" cut.analysisName = base+"_ApplyVariantCuts"
// 4. Variant eval the cut and the hand-filtered vcf files // 4. Variant eval the cut and the hand-filtered vcf files
val eval = new VariantEval with CommandLineGATKArgs val eval = new VariantEval with CommandLineGATKArgs
eval.rodBind :+= RodBind("evalOptimized", "VCF", cut.out) eval.rodBind :+= RodBind("evalOptimized", "VCF", cut.out)
eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out) eval.rodBind :+= RodBind("evalHandFiltered", "VCF", handFilter.out)
eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator") eval.evalModule ++= List("CountFunctionalClasses", "CompOverlap", "CountVariants", "TiTvVariantEvaluator")
//eval.reportLocation = new File(base+".eval") eval.reportLocation = new File(base+".eval")
//eval.reportType = "R" eval.reportType = Option(org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType.R)
eval.analysisName = base+"_VariantEval" eval.analysisName = base+"_VariantEval"
add(snps) add(snps)
@ -353,14 +347,11 @@ class fullCallingPipeline extends QScript {
// to be in the working directory. When database access is ready, this and the protocol and sequencer parameters of // to be in the working directory. When database access is ready, this and the protocol and sequencer parameters of
//the r script will go away. //the r script will go away.
for ( igv2 <- indelGenotypers ) { for ( igv2 <- indelGenotypers ) {
add(igv2) add(igv2)
} }
// add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval,adpr) // add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval,adpr)
add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval) add(mergeIndels,annotated,masker,handFilter,clusters,recalibrate,cut,eval)
} }
} }