data processing pipeline now has on the fly bam indexing (powered by Matt) some new parameters, Indel Cleaning with constrain movement and fixMates is gone.

setting up methods development pipeline for some cosmetic changes.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5277 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-02-18 23:13:54 +00:00
parent 290afae047
commit c61dd2f09f
2 changed files with 67 additions and 59 deletions

View File

@ -1,7 +1,10 @@
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.gatk.CommandLineGATK
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
class MethodsDevelopmentCallingPipeline extends QScript {
qscript =>
@ -12,13 +15,13 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="outputDir", doc="output directory", required=true)
var outputDir: String = "./"
@Argument(shortName="skipCalling", doc="If true, skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
@Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
var skipCalling: Boolean = false
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil
@Argument(shortName="skipGoldStandard", doc="runs the pipeline with the goldstandard VCF files for comparison", required=false)
@Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false)
var skipGoldStandard: Boolean = false
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
@ -30,7 +33,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false)
var eval: Boolean = false
@Argument(shortName="noCut", doc="adds the ApplyVariantCut walker to the pipeline", required=false)
@Argument(shortName="noCut", doc="removes the ApplyVariantCut walker from the pipeline", required=false)
var noCut: Boolean = false
@Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false)
@ -183,10 +186,25 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.out = t.rawVCF
this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE})
this.analysisName = t.name + "_UG"
if (t.dbsnpFile.endsWith(".rod"))
this.DBSNP = new File(t.dbsnpFile)
else if (t.dbsnpFile.endsWith(".vcf"))
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
if (t.dbsnpFile.endsWith(".rod")) this.DBSNP = new File(t.dbsnpFile)
else if (t.dbsnpFile.endsWith(".vcf")) this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
/*
this.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("UG/ScatterGather")
scatter.jobOutputFile = new File(".queue/UG/ScatterGather/Scatter.out")
}
this.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
}
this.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("UG/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/UG/ScatterGather/Gather_%s.out".format(source.field.getName))
}
*/
}
// 2.) Filter SNPs

View File

@ -9,27 +9,31 @@ import scala.io.Source
class dataProcessing extends QScript {
qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true)
@Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
var ACJar: File = _
// todo -- we should support the standard GATK arguments -R, -D [dbsnp],
// todo -- and indel files. Those should be defaulted to hg19 but providable on command line
@Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
@Input(doc="path to Picard's MarkDuplicates.jar", shortName="dedup", required=true)
var dedupJar: File = _
@Input(doc="path to R resources folder inside the Sting repository", shortName="r", required=true)
var R: String = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", shortName="fixmates", required=false)
var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
@Input(doc="input BAM file", shortName="i", required=true)
@Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var input: String = _
@Input(doc="final output BAM file base name", shortName="p", required=false)
@Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) // todo -- accept any format. Not only VCF.
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
@Input(doc="extra VCF files to use as reference indels for Indel Realignment", shortName="indels", required=false) //todo -- once vcfs are merged, this will become the only indel vcf to be used and the merged file will be the default.
val indels: File = _
@Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", shortName="p", required=false)
var projectName: String = "combined"
@Input(doc="output path", shortName="outputDir", required=false)
@ -43,11 +47,7 @@ class dataProcessing extends QScript {
var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")
// Reference sequence, dbsnps and RODs used by the pipeline
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
// TODO -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
// todo -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
@ -60,7 +60,7 @@ class dataProcessing extends QScript {
// General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.GATKjar
this.reference_sequence = reference
this.reference_sequence = qscript.reference
this.memoryLimit = Some(4)
this.isIntermediate = true
}
@ -70,16 +70,14 @@ class dataProcessing extends QScript {
var perLaneBamList: List[String] = Nil
var recalibratedBamList: List[File] = Nil
var recalibratedBamIndexList: List[File] = Nil
// Populates the list of per lane bam files to process (single bam or list of bams).
if (input.endsWith("bam")) {
if (input.endsWith("bam"))
perLaneBamList :+= input
}
else {
else
for (bam <- Source.fromFile(input).getLines())
perLaneBamList :+= bam
}
@ -90,10 +88,9 @@ class dataProcessing extends QScript {
val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)
// BAM files generated by the pipeline
val cleanedBam: String = baseName + ".cleaned.bam"
val fixedBam: String = baseName + ".cleaned.fixed.bam"
val dedupedBam: String = baseName + ".cleaned.fixed.dedup.bam"
val recalBam: String = baseName + ".cleaned.fixed.dedup.recal.bam"
val cleanedBam: String = baseName + ".clean.bam"
val dedupedBam: String = baseName + ".clean.dedup.bam"
val recalBam: String = baseName + ".clean.dedup.recal.bam"
// Accessory files
val targetIntervals: String = baseName + ".indel.intervals"
@ -104,19 +101,15 @@ class dataProcessing extends QScript {
val postOutPath: String = baseName + ".post"
add(new target(perLaneBam, targetIntervals),
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly), // todo -- use constrained movement mode to skip this
new fixMates(cleanedBam, fixedBam, intermediate), // todo -- use constrained movement mode to skip this
new dedup(fixedBam, dedupedBam, metricsFile), // todo -- generate index on fly here
new index(dedupedBam), // todo -- remove for on the fly index
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly, intermediate),
new dedup(cleanedBam, dedupedBam, metricsFile),
new cov(dedupedBam, preRecalFile),
new recal(dedupedBam, preRecalFile, recalBam), // todo -- use GATK on the fly indexing?
new index(recalBam), // todo remove for on the fly indexing
new recal(dedupedBam, preRecalFile, recalBam),
new cov(recalBam, postRecalFile),
new analyzeCovariates(preRecalFile, preOutPath),
new analyzeCovariates(postRecalFile, postOutPath))
recalibratedBamList :+= new File(recalBam)
recalibratedBamIndexList :+= new File(recalBam + ".bai") // to hold next process by this dependency
}
// Helpful variables
@ -125,16 +118,15 @@ class dataProcessing extends QScript {
// BAM files generated by the pipeline
val bamList: String = outDir + outName + ".list"
val cleanedBam: String = outDir + outName + ".cleaned.bam"
val fixedBam: String = outDir + outName + ".final.bam"
val cleanedBam: String = outDir + outName + ".clean.bam"
val fixedBam: String = outDir + outName + ".processed.bam"
// Accessory files
val targetIntervals: String = outDir + outName + ".indel.intervals"
add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList),
new target(bamList, targetIntervals),
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly), // todo -- use constrained movement mode to skip fix mates
new fixMates(cleanedBam, fixedBam, !intermediate)) // todo -- use constrained movement mode to skip this
add(new writeList(recalibratedBamList, bamList),
new target(bamList, targetIntervals), // todo -- reuse previously generated intervals (see how to do that)
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly, !intermediate))
}
class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
@ -146,12 +138,13 @@ class dataProcessing extends QScript {
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
this.jobName = inBams + ".tgt"
if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
else this.intervals :+= qscript.intervals
}
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs {
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean, intermediate: Boolean) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.targetIntervals = new File(tIntervals)
this.out = new File(outBam)
@ -162,8 +155,11 @@ class dataProcessing extends QScript {
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
this.useOnlyKnownIndels = knownsOnly
this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
this.constrainMovement = true
this.isIntermediate = intermediate
this.jobName = inBams + ".clean"
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else this.intervals :+= qscript.intervals
@ -174,6 +170,7 @@ class dataProcessing extends QScript {
@Output(doc="fixed bam") var fixed: File = new File(outBam)
override def inputBams = List(cleaned)
override def outputBam = fixed
override def commandLine = super.commandLine + " CREATE_INDEX=true"
this.jarFile = qscript.fixMatesJar
this.isIntermediate = intermediate
this.memoryLimit = Some(6)
@ -185,21 +182,14 @@ class dataProcessing extends QScript {
@Output(doc="deduped bam") var deduped: File = new File(outBam)
override def inputBams = List(clean)
override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile
override def commandLine = super.commandLine + " M=" + metricsFile + " CREATE_INDEX=true"
sortOrder = null
this.memoryLimit = Some(6)
this.jarFile = qscript.dedupJar
this.isIntermediate = true
this.jobName = inBam + ".dedup"
}
// todo -- may we should use the picard version instead? What about telling all of the picard tools to
// todo -- generate BAM indices on the fly? That would be even better
class index (inBam: String) extends SamtoolsIndexFunction {
@Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
this.bamFile = new File(inBam)
this.analysisName = inBam + ".index"
}
class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
@ -211,6 +201,7 @@ class dataProcessing extends QScript {
this.input_file :+= new File (inBam)
this.recal_file = new File(inRecalFile)
this.out = new File(outBam)
this.index_output_bam_on_the_fly = Some(true)
}
class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
@ -220,8 +211,7 @@ class dataProcessing extends QScript {
this.output_dir = outPath
}
class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction {
@Input(doc="bam indexes") var indexes: List[File] = depIndices // I need this dependency to hold creation of the list until all indices are done
class writeList(inBams: List[File], outBamList: String) extends ListWriterFunction {
this.inputFiles = inBams
this.listFile = new File(outBamList)
this.jobName = "bamList"