full data processing pipeline, now deleting intermediate files and performing both phases (per lane and combined) of the processing.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5269 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-02-17 23:34:00 +00:00
parent 4f83151c4e
commit 7f9ca6b28a
1 changed files with 187 additions and 99 deletions

View File

@ -2,6 +2,7 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source import scala.io.Source
@ -9,50 +10,36 @@ class dataProcessing extends QScript {
qscript => qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true) @Input(doc="path to GATK jar", shortName="gatk", required=true)
var gatkJar: File = _ var GATKjar: File = _
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=true) @Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
var dedupJar: File = _ var ACJar: File = _
@Input(doc="input BAM file", shortName="input", required=true) @Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
var inputBam: String = _ var R: String = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", shortName="fixmates", required=false)
var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
@Input(doc="input BAM file", shortName="i", required=true)
var input: String = _
@Input(doc="final output BAM file base name", shortName="p", required=false)
var projectName: String = "combined"
@Input(doc="output path", shortName="outputDir", required=false) @Input(doc="output path", shortName="outputDir", required=false)
var outputDir: String = "" var outputDir: String = ""
@Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false)
var outputTmpDir: File = _
@Input(doc="the -L interval string to be used by GATK", shortName="L", required=false) @Input(doc="the -L interval string to be used by GATK", shortName="L", required=false)
var intervalString: String = "" var intervalString: String = ""
@Input(doc="provide a .intervals file with the list of target intervals", shortName="intervals", required=false) @Input(doc="provide a .intervals file with the list of target intervals", shortName="intervals", required=false)
var intervals: File = _ var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")
def script = {
// Populates the list of files to process either from a single bam or from a list of bams.
var bamList: List[String] = Nil
if (inputBam.endsWith("bam")) {
bamList :+= inputBam
}
else { // todo: maybe check if the filename ends with .list?
for (bam <- Source.fromFile(inputBam).getLines())
bamList :+= bam
}
bamList.foreach { bam =>
// Files generated by the pipeline
val baseName: String = swapExt(new File(bam.substring(bam.lastIndexOf("/")+1)), ".bam", "").toString()
val cleanedBam: String = baseName + ".cleaned.bam"
val dedupedBam: String = baseName + ".cleaned.dedup.bam"
val metricsFile: String = baseName + ".metrics"
val targetIntervals: String = baseName + ".indel.intervals"
val recalFile: String = baseName + ".recal.csv"
val recalBam: String = baseName + ".cleaned.dedup.recal.bam"
// Reference sequence, dbsnps and RODs used by the pipeline // Reference sequence, dbsnps and RODs used by the pipeline
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta") val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
@ -61,75 +48,176 @@ class dataProcessing extends QScript {
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz" val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
// Simple boolean definitions for code clarity
val knownsOnly: Boolean = true
val intermediate: Boolean = true
// General arguments to all programs // General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK { trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.gatkJar this.jarFile = qscript.GATKjar
this.reference_sequence = reference this.reference_sequence = reference
this.memoryLimit = Some(4) this.memoryLimit = Some(4)
this.jobTempDir = qscript.outputTmpDir this.isIntermediate = true
} }
val target = new RealignerTargetCreator with CommandLineGATKArgs def script = {
target.input_file :+= new File(bam)
target.out = new File(targetIntervals)
target.mismatchFraction = Some(0.0)
target.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
target.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
target.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
target.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
target.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
target.jobName = baseName + ".target"
if (!qscript.intervalString.isEmpty()) target.intervalsString ++= List(qscript.intervalString)
if (qscript.intervals != Nil) target.intervals ++= List(qscript.intervals)
// 2.) Clean without SW var perLaneBamList: List[String] = Nil
var recalibratedBamList: List[File] = Nil
var recalibratedBamIndexList: List[File] = Nil
val clean = new IndelRealigner with CommandLineGATKArgs // Populates the list of per lane bam files to process (single bam or list of bams).
clean.input_file :+= new File(bam) if (input.endsWith("bam")) {
clean.targetIntervals = new File(targetIntervals) perLaneBamList :+= input
clean.out = new File(cleanedBam) }
clean.doNotUseSW = true else {
clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE) for (bam <- Source.fromFile(input).getLines())
clean.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) perLaneBamList :+= bam
clean.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls) }
clean.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
clean.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
clean.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
clean.jobName = baseName + ".clean"
if (!qscript.intervalString.isEmpty()) clean.intervalsString ++= List(qscript.intervalString)
if (qscript.intervals != Nil) clean.intervals ++= List(qscript.intervals)
// 3.) Mark Duplicates
val dedup = new PicardBamJarFunction{
@Input(doc="cleaned bam") var clean: File = new File(cleanedBam) perLaneBamList.foreach { perLaneBam =>
@Output(doc="deduped bam") var deduped: File = new File(dedupedBam)
// Helpful variables
val baseName: String = swapExt(new File(perLaneBam.substring(perLaneBam.lastIndexOf("/")+1)), ".bam", "").toString()
val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)
// BAM files generated by the pipeline
val cleanedBam: String = baseName + ".cleaned.bam"
val fixedBam: String = baseName + ".cleaned.fixed.bam"
val dedupedBam: String = baseName + ".cleaned.fixed.dedup.bam"
val recalBam: String = baseName + ".cleaned.fixed.dedup.recal.bam"
// Accessory files
val targetIntervals: String = baseName + ".indel.intervals"
val metricsFile: String = baseName + ".metrics"
val preRecalFile: String = baseName + ".pre_recal.csv"
val postRecalFile: String = baseName + ".post_recal.csv"
val preOutPath: String = baseName + ".pre"
val postOutPath: String = baseName + ".post"
add(new target(perLaneBam, targetIntervals),
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly),
new fixMates(cleanedBam, fixedBam, intermediate),
new dedup(fixedBam, dedupedBam, metricsFile),
new index(dedupedBam),
new cov(dedupedBam, preRecalFile),
new recal(dedupedBam, preRecalFile, recalBam),
new index(recalBam),
new cov(recalBam, postRecalFile),
new analyzeCovariates(preRecalFile, preOutPath),
new analyzeCovariates(postRecalFile, postOutPath))
recalibratedBamList :+= new File(recalBam)
recalibratedBamIndexList :+= new File(recalBam + ".bai") // to hold next process by this dependency
}
// Helpful variables
val outName: String = qscript.projectName
val outDir: String = qscript.outputDir
// BAM files generated by the pipeline
val bamList: String = outDir + outName + ".list"
val cleanedBam: String = outDir + outName + ".cleaned.bam"
val fixedBam: String = outDir + outName + ".final.bam"
// Accessory files
val targetIntervals: String = outDir + outName + ".indel.intervals"
add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList),
new target(bamList, targetIntervals),
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly),
new fixMates(cleanedBam, fixedBam, !intermediate))
}
class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.out = new File(outIntervals)
this.mismatchFraction = Some(0.0)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.jobName = inBams + ".tgt"
if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
else this.intervals :+= qscript.intervals
}
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.targetIntervals = new File(tIntervals)
this.out = new File(outBam)
this.doNotUseSW = true
this.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.useOnlyKnownIndels = knownsOnly
this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
this.jobName = inBams + ".clean"
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else this.intervals :+= qscript.intervals
}
class fixMates (inBam: String, outBam: String, intermediate: Boolean) extends PicardBamJarFunction {
@Input(doc="cleaned bam") var cleaned: File = new File(inBam)
@Output(doc="fixed bam") var fixed: File = new File(outBam)
override def inputBams = List(cleaned)
override def outputBam = fixed
this.jarFile = qscript.fixMatesJar
this.isIntermediate = intermediate
this.memoryLimit = Some(6)
this.jobName = inBam + ".fix"
}
class dedup (inBam: String, outBam: String, metricsFile: String) extends PicardBamJarFunction {
@Input(doc="fixed bam") var clean: File = new File(inBam)
@Output(doc="deduped bam") var deduped: File = new File(outBam)
override def inputBams = List(clean) override def inputBams = List(clean)
override def outputBam = deduped override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile override def commandLine = super.commandLine + " M=" + metricsFile
sortOrder = null sortOrder = null
this.memoryLimit = Some(6)
this.jarFile = qscript.dedupJar
this.jobName = inBam + ".dedup"
} }
dedup.memoryLimit = Some(8)
dedup.jarFile = qscript.dedupJar
dedup.jobName = baseName + ".dedup"
val index = new SamtoolsIndexFunction class index (inBam: String) extends SamtoolsIndexFunction {
index.bamFile = new File(dedupedBam) @Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
index.analysisName = baseName + ".index" this.bamFile = new File(inBam)
this.analysisName = inBam + ".index"
val cov = new CountCovariates with CommandLineGATKArgs
cov.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
cov.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
cov.input_file :+= new File(dedupedBam)
cov.recal_file = new File(recalFile)
val recal = new TableRecalibration with CommandLineGATKArgs
recal.input_file :+= new File (dedupedBam)
recal.recal_file = new File(recalFile)
recal.out = new File(recalBam)
add(target, clean, dedup, index, cov, recal)
} }
class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= new File(inBam)
this.recal_file = new File(outRecalFile)
}
class recal (inBam: String, inRecalFile: String, outBam: String) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= new File (inBam)
this.recal_file = new File(inRecalFile)
this.out = new File(outBam)
}
class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
this.jarFile = qscript.ACJar
this.resources = qscript.R
this.recal_file = new File(inRecalFile)
this.output_dir = outPath
}
class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction {
@Input(doc="bam indexes") var indexes: List[File] = depIndices // I need this dependency to hold creation of the list until all indices are done
this.inputFiles = inBams
this.listFile = new File(outBamList)
this.jobName = "bamList"
} }
} }