data processing pipeline now has on the fly bam indexing (powered by Matt) some new parameters, Indel Cleaning with constrain movement and fixMates is gone.

setting up methods development pipeline for some cosmetic changes.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5277 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-02-18 23:13:54 +00:00
parent 290afae047
commit c61dd2f09f
2 changed files with 67 additions and 59 deletions

View File

@ -1,7 +1,10 @@
import org.broadinstitute.sting.commandline.ArgumentSource
import org.broadinstitute.sting.gatk.CommandLineGATK import org.broadinstitute.sting.gatk.CommandLineGATK
import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport import org.broadinstitute.sting.gatk.phonehome.GATKRunReport
import org.broadinstitute.sting.queue.function.scattergather.{GatherFunction, CloneFunction, ScatterFunction}
class MethodsDevelopmentCallingPipeline extends QScript { class MethodsDevelopmentCallingPipeline extends QScript {
qscript => qscript =>
@ -12,13 +15,13 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="outputDir", doc="output directory", required=true) @Argument(shortName="outputDir", doc="output directory", required=true)
var outputDir: String = "./" var outputDir: String = "./"
@Argument(shortName="skipCalling", doc="If true, skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false) @Argument(shortName="skipCalling", doc="skip the calling part of the pipeline and only run VQSR on preset, gold standard VCF files", required=false)
var skipCalling: Boolean = false var skipCalling: Boolean = false
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil var datasets: List[String] = Nil
@Argument(shortName="skipGoldStandard", doc="runs the pipeline with the goldstandard VCF files for comparison", required=false) @Argument(shortName="skipGoldStandard", doc="doesn't run the pipeline with the goldstandard VCF files for comparison", required=false)
var skipGoldStandard: Boolean = false var skipGoldStandard: Boolean = false
@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) @Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
@ -30,7 +33,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
@Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false) @Argument(shortName="eval", doc="adds the VariantEval walker to the pipeline", required=false)
var eval: Boolean = false var eval: Boolean = false
@Argument(shortName="noCut", doc="adds the ApplyVariantCut walker to the pipeline", required=false) @Argument(shortName="noCut", doc="removes the ApplyVariantCut walker from the pipeline", required=false)
var noCut: Boolean = false var noCut: Boolean = false
@Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false) @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false)
@ -183,10 +186,25 @@ class MethodsDevelopmentCallingPipeline extends QScript {
this.out = t.rawVCF this.out = t.rawVCF
this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE}) this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE})
this.analysisName = t.name + "_UG" this.analysisName = t.name + "_UG"
if (t.dbsnpFile.endsWith(".rod")) if (t.dbsnpFile.endsWith(".rod")) this.DBSNP = new File(t.dbsnpFile)
this.DBSNP = new File(t.dbsnpFile) else if (t.dbsnpFile.endsWith(".vcf")) this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
else if (t.dbsnpFile.endsWith(".vcf")) /*
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) this.setupScatterFunction = {
case scatter: ScatterFunction =>
scatter.commandDirectory = new File("UG/ScatterGather")
scatter.jobOutputFile = new File(".queue/UG/ScatterGather/Scatter.out")
}
this.setupCloneFunction = {
case (clone: CloneFunction, index: Int) =>
clone.commandDirectory = new File("SnpCalls/ScatterGather/Scatter_%s".format(index))
clone.jobOutputFile = new File(".queue/logs/SNPCalling/ScatterGather/Scatter_%s.out".format(index))
}
this.setupGatherFunction = {
case (gather: GatherFunction, source: ArgumentSource) =>
gather.commandDirectory = new File("UG/ScatterGather/Gather_%s".format(source.field.getName))
gather.jobOutputFile = new File(".queue/UG/ScatterGather/Gather_%s.out".format(source.field.getName))
}
*/
} }
// 2.) Filter SNPs // 2.) Filter SNPs

View File

@ -9,27 +9,31 @@ import scala.io.Source
class dataProcessing extends QScript { class dataProcessing extends QScript {
qscript => qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true) @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true)
var GATKjar: File = _ var GATKjar: File = _
@Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true) @Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
var ACJar: File = _ var ACJar: File = _
// todo -- we should support the standard GATK arguments -R, -D [dbsnp], @Input(doc="path to Picard's MarkDuplicates.jar", shortName="dedup", required=true)
// todo -- and indel files. Those should be defaulted to hg19 but providable on command line var dedupJar: File = _
@Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
@Input(doc="path to R resources folder inside the Sting repository", shortName="r", required=true)
var R: String = _ var R: String = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", shortName="fixmates", required=false) @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true)
var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
@Input(doc="input BAM file", shortName="i", required=true)
var input: String = _ var input: String = _
@Input(doc="final output BAM file base name", shortName="p", required=false) @Input(doc="Reference fasta file", shortName="R", required=false)
var reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) // todo -- accept any format. Not only VCF.
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
@Input(doc="extra VCF files to use as reference indels for Indel Realignment", shortName="indels", required=false) //todo -- once vcfs are merged, this will become the only indel vcf to be used and the merged file will be the default.
val indels: File = _
@Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", shortName="p", required=false)
var projectName: String = "combined" var projectName: String = "combined"
@Input(doc="output path", shortName="outputDir", required=false) @Input(doc="output path", shortName="outputDir", required=false)
@ -43,11 +47,7 @@ class dataProcessing extends QScript {
var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals") var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")
// Reference sequence, dbsnps and RODs used by the pipeline // todo -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
// TODO -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf" val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
@ -60,7 +60,7 @@ class dataProcessing extends QScript {
// General arguments to all programs // General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK { trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.GATKjar this.jarFile = qscript.GATKjar
this.reference_sequence = reference this.reference_sequence = qscript.reference
this.memoryLimit = Some(4) this.memoryLimit = Some(4)
this.isIntermediate = true this.isIntermediate = true
} }
@ -70,16 +70,14 @@ class dataProcessing extends QScript {
var perLaneBamList: List[String] = Nil var perLaneBamList: List[String] = Nil
var recalibratedBamList: List[File] = Nil var recalibratedBamList: List[File] = Nil
var recalibratedBamIndexList: List[File] = Nil
// Populates the list of per lane bam files to process (single bam or list of bams). // Populates the list of per lane bam files to process (single bam or list of bams).
if (input.endsWith("bam")) { if (input.endsWith("bam"))
perLaneBamList :+= input perLaneBamList :+= input
} else
else {
for (bam <- Source.fromFile(input).getLines()) for (bam <- Source.fromFile(input).getLines())
perLaneBamList :+= bam perLaneBamList :+= bam
}
@ -90,10 +88,9 @@ class dataProcessing extends QScript {
val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1) val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)
// BAM files generated by the pipeline // BAM files generated by the pipeline
val cleanedBam: String = baseName + ".cleaned.bam" val cleanedBam: String = baseName + ".clean.bam"
val fixedBam: String = baseName + ".cleaned.fixed.bam" val dedupedBam: String = baseName + ".clean.dedup.bam"
val dedupedBam: String = baseName + ".cleaned.fixed.dedup.bam" val recalBam: String = baseName + ".clean.dedup.recal.bam"
val recalBam: String = baseName + ".cleaned.fixed.dedup.recal.bam"
// Accessory files // Accessory files
val targetIntervals: String = baseName + ".indel.intervals" val targetIntervals: String = baseName + ".indel.intervals"
@ -104,19 +101,15 @@ class dataProcessing extends QScript {
val postOutPath: String = baseName + ".post" val postOutPath: String = baseName + ".post"
add(new target(perLaneBam, targetIntervals), add(new target(perLaneBam, targetIntervals),
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly), // todo -- use constrained movement mode to skip this new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly, intermediate),
new fixMates(cleanedBam, fixedBam, intermediate), // todo -- use constrained movement mode to skip this new dedup(cleanedBam, dedupedBam, metricsFile),
new dedup(fixedBam, dedupedBam, metricsFile), // todo -- generate index on fly here
new index(dedupedBam), // todo -- remove for on the fly index
new cov(dedupedBam, preRecalFile), new cov(dedupedBam, preRecalFile),
new recal(dedupedBam, preRecalFile, recalBam), // todo -- use GATK on the fly indexing? new recal(dedupedBam, preRecalFile, recalBam),
new index(recalBam), // todo remove for on the fly indexing
new cov(recalBam, postRecalFile), new cov(recalBam, postRecalFile),
new analyzeCovariates(preRecalFile, preOutPath), new analyzeCovariates(preRecalFile, preOutPath),
new analyzeCovariates(postRecalFile, postOutPath)) new analyzeCovariates(postRecalFile, postOutPath))
recalibratedBamList :+= new File(recalBam) recalibratedBamList :+= new File(recalBam)
recalibratedBamIndexList :+= new File(recalBam + ".bai") // to hold next process by this dependency
} }
// Helpful variables // Helpful variables
@ -125,16 +118,15 @@ class dataProcessing extends QScript {
// BAM files generated by the pipeline // BAM files generated by the pipeline
val bamList: String = outDir + outName + ".list" val bamList: String = outDir + outName + ".list"
val cleanedBam: String = outDir + outName + ".cleaned.bam" val cleanedBam: String = outDir + outName + ".clean.bam"
val fixedBam: String = outDir + outName + ".final.bam" val fixedBam: String = outDir + outName + ".processed.bam"
// Accessory files // Accessory files
val targetIntervals: String = outDir + outName + ".indel.intervals" val targetIntervals: String = outDir + outName + ".indel.intervals"
add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList), add(new writeList(recalibratedBamList, bamList),
new target(bamList, targetIntervals), new target(bamList, targetIntervals), // todo -- reuse previously generated intervals (see how to do that)
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly), // todo -- use constrained movement mode to skip fix mates new clean(bamList, targetIntervals, cleanedBam, !knownsOnly, !intermediate))
new fixMates(cleanedBam, fixedBam, !intermediate)) // todo -- use constrained movement mode to skip this
} }
class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs { class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
@ -146,12 +138,13 @@ class dataProcessing extends QScript {
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
this.jobName = inBams + ".tgt" this.jobName = inBams + ".tgt"
if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
else this.intervals :+= qscript.intervals else this.intervals :+= qscript.intervals
} }
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs { class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean, intermediate: Boolean) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= new File(inBams) this.input_file :+= new File(inBams)
this.targetIntervals = new File(tIntervals) this.targetIntervals = new File(tIntervals)
this.out = new File(outBam) this.out = new File(outBam)
@ -162,8 +155,11 @@ class dataProcessing extends QScript {
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls) this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls) this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls) this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
if (qscript.indels != null) this.rodBind :+= RodBind("indels5", "VCF", qscript.indels)
this.useOnlyKnownIndels = knownsOnly this.useOnlyKnownIndels = knownsOnly
this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
this.constrainMovement = true
this.isIntermediate = intermediate
this.jobName = inBams + ".clean" this.jobName = inBams + ".clean"
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString) if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else this.intervals :+= qscript.intervals else this.intervals :+= qscript.intervals
@ -174,6 +170,7 @@ class dataProcessing extends QScript {
@Output(doc="fixed bam") var fixed: File = new File(outBam) @Output(doc="fixed bam") var fixed: File = new File(outBam)
override def inputBams = List(cleaned) override def inputBams = List(cleaned)
override def outputBam = fixed override def outputBam = fixed
override def commandLine = super.commandLine + " CREATE_INDEX=true"
this.jarFile = qscript.fixMatesJar this.jarFile = qscript.fixMatesJar
this.isIntermediate = intermediate this.isIntermediate = intermediate
this.memoryLimit = Some(6) this.memoryLimit = Some(6)
@ -185,21 +182,14 @@ class dataProcessing extends QScript {
@Output(doc="deduped bam") var deduped: File = new File(outBam) @Output(doc="deduped bam") var deduped: File = new File(outBam)
override def inputBams = List(clean) override def inputBams = List(clean)
override def outputBam = deduped override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile override def commandLine = super.commandLine + " M=" + metricsFile + " CREATE_INDEX=true"
sortOrder = null sortOrder = null
this.memoryLimit = Some(6) this.memoryLimit = Some(6)
this.jarFile = qscript.dedupJar this.jarFile = qscript.dedupJar
this.isIntermediate = true
this.jobName = inBam + ".dedup" this.jobName = inBam + ".dedup"
} }
// todo -- may we should use the picard version instead? What about telling all of the picard tools to
// todo -- generate BAM indices on the fly? That would be even better
class index (inBam: String) extends SamtoolsIndexFunction {
@Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
this.bamFile = new File(inBam)
this.analysisName = inBam + ".index"
}
class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs { class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
@ -211,6 +201,7 @@ class dataProcessing extends QScript {
this.input_file :+= new File (inBam) this.input_file :+= new File (inBam)
this.recal_file = new File(inRecalFile) this.recal_file = new File(inRecalFile)
this.out = new File(outBam) this.out = new File(outBam)
this.index_output_bam_on_the_fly = Some(true)
} }
class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates { class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
@ -220,8 +211,7 @@ class dataProcessing extends QScript {
this.output_dir = outPath this.output_dir = outPath
} }
class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction { class writeList(inBams: List[File], outBamList: String) extends ListWriterFunction {
@Input(doc="bam indexes") var indexes: List[File] = depIndices // I need this dependency to hold creation of the list until all indices are done
this.inputFiles = inBams this.inputFiles = inBams
this.listFile = new File(outBamList) this.listFile = new File(outBamList)
this.jobName = "bamList" this.jobName = "bamList"