gatk-3.8/scala/qscript/oneoffs/carneiro/dataProcessing.scala

223 lines
9.8 KiB
Scala
Executable File

import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.function.ListWriterFunction
import scala.io.Source
class dataProcessing extends QScript {
qscript =>
@Input(doc="path to GATK jar", shortName="gatk", required=true)
var GATKjar: File = _
@Input(doc="path to AnalyzeCovariates.jar", shortName="ac", required=true)
var ACJar: File = _
@Input(doc="path to R resources folder inside Sting", shortName="r", required=true)
var R: String = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", shortName="fixmates", required=false)
var fixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to MarkDuplicates jar", shortName="dedup", required=false)
var dedupJar: File = new java.io.File("/seq/software/picard/current/bin/MarkDuplicates.jar")
@Input(doc="input BAM file", shortName="i", required=true)
var input: String = _
@Input(doc="final output BAM file base name", shortName="p", required=false)
var projectName: String = "combined"
@Input(doc="output path", shortName="outputDir", required=false)
var outputDir: String = ""
@Input(doc="the -L interval string to be used by GATK", shortName="L", required=false)
var intervalString: String = ""
@Input(doc="provide a .intervals file with the list of target intervals", shortName="intervals", required=false)
var intervals: File = new File("/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals")
// Reference sequence, dbsnps and RODs used by the pipeline
val reference: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")
val dindelPilotCalls: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelEURCalls: String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
// Simple boolean definitions for code clarity
val knownsOnly: Boolean = true
val intermediate: Boolean = true
// General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK {
this.jarFile = qscript.GATKjar
this.reference_sequence = reference
this.memoryLimit = Some(4)
this.isIntermediate = true
}
def script = {
var perLaneBamList: List[String] = Nil
var recalibratedBamList: List[File] = Nil
var recalibratedBamIndexList: List[File] = Nil
// Populates the list of per lane bam files to process (single bam or list of bams).
if (input.endsWith("bam")) {
perLaneBamList :+= input
}
else {
for (bam <- Source.fromFile(input).getLines())
perLaneBamList :+= bam
}
perLaneBamList.foreach { perLaneBam =>
// Helpful variables
val baseName: String = swapExt(new File(perLaneBam.substring(perLaneBam.lastIndexOf("/")+1)), ".bam", "").toString()
val baseDir: String = perLaneBam.substring(0, perLaneBam.lastIndexOf("/")+1)
// BAM files generated by the pipeline
val cleanedBam: String = baseName + ".cleaned.bam"
val fixedBam: String = baseName + ".cleaned.fixed.bam"
val dedupedBam: String = baseName + ".cleaned.fixed.dedup.bam"
val recalBam: String = baseName + ".cleaned.fixed.dedup.recal.bam"
// Accessory files
val targetIntervals: String = baseName + ".indel.intervals"
val metricsFile: String = baseName + ".metrics"
val preRecalFile: String = baseName + ".pre_recal.csv"
val postRecalFile: String = baseName + ".post_recal.csv"
val preOutPath: String = baseName + ".pre"
val postOutPath: String = baseName + ".post"
add(new target(perLaneBam, targetIntervals),
new clean(perLaneBam, targetIntervals, cleanedBam, knownsOnly),
new fixMates(cleanedBam, fixedBam, intermediate),
new dedup(fixedBam, dedupedBam, metricsFile),
new index(dedupedBam),
new cov(dedupedBam, preRecalFile),
new recal(dedupedBam, preRecalFile, recalBam),
new index(recalBam),
new cov(recalBam, postRecalFile),
new analyzeCovariates(preRecalFile, preOutPath),
new analyzeCovariates(postRecalFile, postOutPath))
recalibratedBamList :+= new File(recalBam)
recalibratedBamIndexList :+= new File(recalBam + ".bai") // to hold next process by this dependency
}
// Helpful variables
val outName: String = qscript.projectName
val outDir: String = qscript.outputDir
// BAM files generated by the pipeline
val bamList: String = outDir + outName + ".list"
val cleanedBam: String = outDir + outName + ".cleaned.bam"
val fixedBam: String = outDir + outName + ".final.bam"
// Accessory files
val targetIntervals: String = outDir + outName + ".indel.intervals"
add(new writeList(recalibratedBamList, bamList, recalibratedBamIndexList),
new target(bamList, targetIntervals),
new clean(bamList, targetIntervals, cleanedBam, !knownsOnly),
new fixMates(cleanedBam, fixedBam, !intermediate))
}
class target (inBams: String, outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.out = new File(outIntervals)
this.mismatchFraction = Some(0.0)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.jobName = inBams + ".tgt"
if (!qscript.intervalString.isEmpty()) this.intervalsString :+= qscript.intervalString
else this.intervals :+= qscript.intervals
}
class clean (inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= new File(inBams)
this.targetIntervals = new File(tIntervals)
this.out = new File(outBam)
this.doNotUseSW = true
this.baq = Some(org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY)
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.rodBind :+= RodBind("indels1", "VCF", dindelPilotCalls)
this.rodBind :+= RodBind("indels2", "VCF", dindelAFRCalls)
this.rodBind :+= RodBind("indels3", "VCF", dindelEURCalls)
this.rodBind :+= RodBind("indels4", "VCF", dindelASNCalls)
this.useOnlyKnownIndels = knownsOnly
this.sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
this.jobName = inBams + ".clean"
if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
else this.intervals :+= qscript.intervals
}
class fixMates (inBam: String, outBam: String, intermediate: Boolean) extends PicardBamJarFunction {
@Input(doc="cleaned bam") var cleaned: File = new File(inBam)
@Output(doc="fixed bam") var fixed: File = new File(outBam)
override def inputBams = List(cleaned)
override def outputBam = fixed
this.jarFile = qscript.fixMatesJar
this.isIntermediate = intermediate
this.memoryLimit = Some(6)
this.jobName = inBam + ".fix"
}
class dedup (inBam: String, outBam: String, metricsFile: String) extends PicardBamJarFunction {
@Input(doc="fixed bam") var clean: File = new File(inBam)
@Output(doc="deduped bam") var deduped: File = new File(outBam)
override def inputBams = List(clean)
override def outputBam = deduped
override def commandLine = super.commandLine + " M=" + metricsFile
sortOrder = null
this.memoryLimit = Some(6)
this.jarFile = qscript.dedupJar
this.jobName = inBam + ".dedup"
}
class index (inBam: String) extends SamtoolsIndexFunction {
@Output(doc="bam index file") var outIndex: File = new File(inBam + ".bai")
this.bamFile = new File(inBam)
this.analysisName = inBam + ".index"
}
class cov (inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= new File(inBam)
this.recal_file = new File(outRecalFile)
}
class recal (inBam: String, inRecalFile: String, outBam: String) extends TableRecalibration with CommandLineGATKArgs {
this.input_file :+= new File (inBam)
this.recal_file = new File(inRecalFile)
this.out = new File(outBam)
}
class analyzeCovariates (inRecalFile: String, outPath: String) extends AnalyzeCovariates {
this.jarFile = qscript.ACJar
this.resources = qscript.R
this.recal_file = new File(inRecalFile)
this.output_dir = outPath
}
class writeList(inBams: List[File], outBamList: String, depIndices: List[File]) extends ListWriterFunction {
@Input(doc="bam indexes") var indexes: List[File] = depIndices // I need this dependency to hold creation of the list until all indices are done
this.inputFiles = inBams
this.listFile = new File(outBamList)
this.jobName = "bamList"
}
}