full data processing pipeline, now deleting intermediate files and performing both phases (per lane and combined) of the processing.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5269 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
carneiro 2011-02-17 23:34:00 +00:00
parent 4f83151c4e
commit 7f9ca6b28a
1 changed files with 187 additions and 99 deletions

View File

@ -2,6 +2,7 @@ import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source
@ -9,127 +10,214 @@ class dataProcessing extends QScript {
qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true)
var gatkJar: File = _
var GATKjar: File = _
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=true)
var dedupJar: File = _
@Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
var ACJar: File = _
@Input(doc="input BAM file", shortName="input", required=true)
var inputBam: String = _
@Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
var R: String = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", shortName="fixmates", required=false)
var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
@Input(doc="input BAM file", shortName="i", required=true)
var input: String = _
@Input(doc="final output BAM file base name", shortName="p", required=false)
var projectName: String = "combined"
@Input(doc="output path", shortName="outputDir", required=false)
var outputDir: String = ""
@Input(doc="path to tmp space for storing intermediate bam files", shortName="outputTmpDir", required=false)
var outputTmpDir: File = _
@Input(doc="the -L interval string to be used by GATK", shortName="L", required=false)
var intervalString: String = ""
@Input(doc="provide a .intervals file with the list of target intervals", shortName="intervals", required=false)
var intervals: File = _
var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")
// Reference sequence, dbsnps and RODs used by the pipeline
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
// Simple boolean definitions for code clarity
val knownsOnly: Boolean = true
val intermediate: Boolean = true
// General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.GATKjar
this.reference_sequence = reference
this.memoryLimit = Some(4)
this.isIntermediate = true
}
def script = {
// Populates the list of files to process either from a single bam or from a list of bams.
var bamList: List[String] = Nil
if (inputBam.endsWith("bam")) {
bamList :+= inputBam
var perLaneBamList: List[String] = Nil
var recalibratedBamList: List[File] = Nil
var recalibratedBamIndexList: List[File] = Nil
// Populates the list of per lane bam files to process (single bam or list of bams).
if (input.endsWith("bam")) {
perLaneBamList :+= input
}
else { // todo: maybe check if the filename ends with .list?
for (bam <- Source.fromFile(inputBam).getLines())
bamList :+= bam
else {
for (bam <- Source.fromFile(input).getLines())
perLaneBamList :+= bam
}
bamList.foreach { bam =>
// Files generated by the pipeline
val baseName: String = swapExt(new File(bam.substring(bam.lastIndexOf("/")+1)), ".bam", "").toString()
perLaneBamList.foreach { perLaneBam =>
// Helpful variables
val baseName: String = swapExt(new File(perLaneBam.substring(perLaneBam.lastIndexOf("/")+1)), ".bam", "").toString()
val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)
// BAM files generated by the pipeline
val cleanedBam: String = baseName + ".cleaned.bam"
val dedupedBam: String = baseName + ".cleaned.dedup.bam"
val metricsFile: String = baseName + ".metrics"
val fixedBam: String = baseName + ".cleaned.fixed.bam"
val dedupedBam: String = baseName + ".cleaned.fixed.dedup.bam"
val recalBam: String = baseName + ".cleaned.fixed.dedup.recal.bam"
// Accessory files
val targetIntervals: String = baseName + ".indel.intervals"
val recalFile: String = baseName + ".recal.csv"
val recalBam: String = baseName + ".cleaned.dedup.recal.bam"
val metricsFile: String = baseName + ".metrics"
val preRecalFile: String = baseName + ".pre_recal.csv"
val postRecalFile: String = baseName + ".post_recal.csv"
val preOutPath: String = baseName + ".pre"
val postOutPath: String = baseName + ".post"
// Reference sequence, dbsnps and RODs used by the pipeline
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
add(new target(perLaneBam, targetIntervals),
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly),
new fixMates(cleanedBam, fixedBam, intermediate),
new dedup(fixedBam, dedupedBam, metricsFile),
new index(dedupedBam),
new cov(dedupedBam, preRecalFile),
new recal(dedupedBam, preRecalFile, recalBam),
new index(recalBam),
new cov(recalBam, postRecalFile),
new analyzeCovariates(preRecalFile, preOutPath),
new analyzeCovariates(postRecalFile, postOutPath))
// General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.gatkJar
this.reference_sequence = reference
this.memoryLimit = Some(4)
this.jobTempDir = qscript.outputTmpDir
}
val target = new RealignerTargetCreator with CommandLineGATKArgs
target.input_file :+= new File(bam)
target.out = new File(targetIntervals)
target.mismatchFraction = Some(0.0)
target.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
target.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
target.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
target.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
target.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
target.jobName = baseName + ".target"
if (!qscript.intervalString.isEmpty()) target.intervalsString ++= List(qscript.intervalString)
if (qscript.intervals != Nil) target.intervals ++= List(qscript.intervals)
// 2.) Clean without SW
val clean = new IndelRealigner with CommandLineGATKArgs
clean.input_file :+= new File(bam)
clean.targetIntervals = new File(targetIntervals)
clean.out = new File(cleanedBam)
clean.doNotUseSW = true
clean.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE)
clean.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
clean.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
clean.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
clean.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
clean.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
clean.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
clean.jobName = baseName + ".clean"
if (!qscript.intervalString.isEmpty()) clean.intervalsString ++= List(qscript.intervalString)
if (qscript.intervals != Nil) clean.intervals ++= List(qscript.intervals)
// 3.) Mark Duplicates
val dedup = new PicardBamJarFunction{
@Input(doc="cleaned bam") var clean: File = new File(cleanedBam)
@Output(doc="deduped bam") var deduped: File = new File(dedupedBam)
override def inputBams = List(clean)
override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile
sortOrder = null
}
dedup.memoryLimit = Some(8)
dedup.jarFile = qscript.dedupJar
dedup.jobName = baseName + ".dedup"
val index = new SamtoolsIndexFunction
index.bamFile = new File(dedupedBam)
index.analysisName = baseName + ".index"
val cov = new CountCovariates with CommandLineGATKArgs
cov.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
cov.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
cov.input_file :+= new File(dedupedBam)
cov.recal_file = new File(recalFile)
val recal = new TableRecalibration with CommandLineGATKArgs
recal.input_file :+= new File (dedupedBam)
recal.recal_file = new File(recalFile)
recal.out = new File(recalBam)
add(target, clean, dedup, index, cov, recal)
recalibratedBamList :+= new File(recalBam)
recalibratedBamIndexList :+= new File(recalBam + ".bai") // to hold next process by this dependency
}
// Helpful variables
val outName: String = qscript.projectName
val outDir: String = qscript.outputDir
// BAM files generated by the pipeline
val bamList: String = outDir + outName + ".list"
val cleanedBam: String = outDir + outName + ".cleaned.bam"
val fixedBam: String = outDir + outName + ".final.bam"
// Accessory files
val targetIntervals: String = outDir + outName + ".indel.intervals"
add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList),
new target(bamList, targetIntervals),
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly),
new fixMates(cleanedBam, fixedBam, !intermediate))
}
class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.out = new File(outIntervals)
this.mismatchFraction = Some(0.0)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.jobName = inBams + ".tgt"
if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
else this.intervals :+= qscript.intervals
}
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.targetIntervals = new File(tIntervals)
this.out = new File(outBam)
this.doNotUseSW = true
this.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.useOnlyKnownIndels = knownsOnly
this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
this.jobName = inBams + ".clean"
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else this.intervals :+= qscript.intervals
}
class fixMates (inBam: String, outBam: String, intermediate: Boolean) extends PicardBamJarFunction {
@Input(doc="cleaned bam") var cleaned: File = new File(inBam)
@Output(doc="fixed bam") var fixed: File = new File(outBam)
override def inputBams = List(cleaned)
override def outputBam = fixed
this.jarFile = qscript.fixMatesJar
this.isIntermediate = intermediate
this.memoryLimit = Some(6)
this.jobName = inBam + ".fix"
}
class dedup (inBam: String, outBam: String, metricsFile: String) extends PicardBamJarFunction {
@Input(doc="fixed bam") var clean: File = new File(inBam)
@Output(doc="deduped bam") var deduped: File = new File(outBam)
override def inputBams = List(clean)
override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile
sortOrder = null
this.memoryLimit = Some(6)
this.jarFile = qscript.dedupJar
this.jobName = inBam + ".dedup"
}
class index (inBam: String) extends SamtoolsIndexFunction {
@Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
this.bamFile = new File(inBam)
this.analysisName = inBam + ".index"
}
class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= new File(inBam)
this.recal_file = new File(outRecalFile)
}
class recal (inBam: String, inRecalFile: String, outBam: String) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= new File (inBam)
this.recal_file = new File(inRecalFile)
this.out = new File(outBam)
}
class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
this.jarFile = qscript.ACJar
this.resources = qscript.R
this.recal_file = new File(inRecalFile)
this.output_dir = outPath
}
class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction {
@Input(doc="bam indexes") var indexes: List[File] = depIndices // I need this dependency to hold creation of the list until all indices are done
this.inputFiles = inBams
this.listFile = new File(outBamList)
this.jobName = "bamList"
}
}