2011-02-16 05:49:05 +08:00
import org.broadinstitute.sting.queue.extensions.gatk._
2011-03-24 22:03:51 +08:00
import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction
2011-02-16 05:49:05 +08:00
import org.broadinstitute.sting.queue.QScript
2011-02-18 07:34:00 +08:00
import org.broadinstitute.sting.queue.function.ListWriterFunction
2011-02-17 10:07:22 +08:00
import scala.io.Source
2011-02-16 05:49:05 +08:00
class dataProcessing extends QScript {
qscript =>
2011-02-19 07:13:54 +08:00
// ---------------------------------------------------------------------------
// Command-line arguments of the script
// ---------------------------------------------------------------------------

// Jar handed to every GATK walker job (see CommandLineGATKArgs).
@Input(doc = "path to GenomeAnalysisTK.jar", shortName = "gatk", required = true)
var GATKjar: File = _

// Jar used by the analyzeCovariates jobs.
@Input(doc = "path to AnalyzeCovariates.jar", shortName = "ac", required = true)
var ACJar: File = _

// Jar used by the dedup jobs.
@Input(doc = "path to Picard's MarkDuplicates.jar", shortName = "dedup", required = true)
var dedupJar: File = _

// Assigned to the `resources` field of the analyzeCovariates jobs.
@Input(doc = "path to R resources folder inside the Sting repository", shortName = "r", required = true)
var R: String = _

// A value ending in "bam" is processed as a single BAM; anything else is read
// as a text file with one BAM path per line (see script).
@Input(doc = "input BAM file - or list of BAM files", shortName = "i", required = true)
var input: String = _

// Reference passed to every GATK walker job (see CommandLineGATKArgs).
@Input(doc = "Reference fasta file", shortName = "R", required = false)
var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta")

// Bound as a ROD by the target creation, realignment and covariate counting jobs.
@Input(doc = "dbsnp ROD to use (VCF)", shortName = "D", required = false)
var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf")

// Bound as a ROD by the target creation and realignment jobs.
@Input(doc = "extra VCF files to use as reference indels for Indel Realignment", shortName = "indels", required = false)
var indels: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf")

// Base name of the project-level files created by script.
@Input(doc = "the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", shortName = "p", required = false)
var projectName: String = "combined"

// Prepended verbatim (no separator is inserted) to the project-level file names.
@Input(doc = "output path", shortName = "outputDir", required = false)
var outputDir: String = ""

// Only applied by the non-intermediate clean job, and only when non-empty.
@Input(doc = "the -L interval string to be used by GATK - output bams at interval only", shortName = "L", required = false)
var intervalString: String = ""

// Only applied by the non-intermediate clean job, and only when set.
@Input(doc = "output bams at intervals only", shortName = "intervals", required = false)
var intervals: File = _

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

// Prefix of every job name assigned in this script.
val queueLogDir: String = ".qlog/"

// Named booleans so the calls to `clean` read clearly at the call site.
val knownsOnly: Boolean = true
val intermediate: Boolean = true
/**
 * Settings common to the GATK walker jobs of this script: the GATK jar and
 * reference given on the command line, a memory limit of 4, and the
 * intermediate flag switched on (individual jobs may override these).
 */
trait CommandLineGATKArgs extends CommandLineGATK {
  isIntermediate = true
  memoryLimit = 4
  reference_sequence = qscript.reference
  jarFile = qscript.GATKjar
}
2011-02-16 05:49:05 +08:00
/**
 * Assembles the pipeline.
 *
 * One job creates realignment targets from the known sites only. Then, for
 * every input BAM: clean (known indels only) -> dedup -> count covariates ->
 * recalibrate -> count covariates again -> analyze both covariate tables.
 * Finally the recalibrated BAMs are written to a list file, joined, targets
 * are created from the joined BAM, and a last, non-intermediate clean job
 * produces the project-level BAM.
 *
 * Per-lane files are named after the input BAM's base name without its
 * directory; project-level files are named outputDir + projectName + suffix.
 */
def script = {
  // Helpful variables
  val outName: String = qscript.projectName
  val outDir: String = qscript.outputDir

  // BAM files generated by the pipeline
  val bamList: String = outDir + outName + ".list"
  val joinedBams: String = outDir + outName + ".join.bam"
  // NOTE(review): the help text of -p advertises "<project>.processed.bam",
  // but the last job below writes "<project>.clean.bam" -- confirm which is intended.
  val cleanedBam: String = outDir + outName + ".clean.bam"

  // Accessory files
  val knownTargetIntervals: String = outDir + outName + ".known_indels.intervals"
  val allTargetIntervals: String = outDir + outName + ".all_indels.intervals"

  add(new knownTargets(knownTargetIntervals))

  // Populates the list of per lane bam files to process (single bam or list of bams).
  val perLaneBamList: List[String] =
    if (input.endsWith("bam"))
      List(input)
    else {
      val listSource = Source.fromFile(input)
      // Materialize the lines before closing the file; blank lines and
      // surrounding whitespace are dropped so they never become bogus BAM paths.
      try listSource.getLines().map(_.trim).filter(line => !line.isEmpty).toList
      finally listSource.close()
    }

  val recalibratedBamList: List[File] = perLaneBamList.map { perLaneBam =>
    // File name of the lane BAM with the directory removed and ".bam" swapped for "".
    val baseName: String = swapExt(new File(perLaneBam.substring(perLaneBam.lastIndexOf("/") + 1)), ".bam", "").toString()

    // BAM files generated by the pipeline
    val laneCleanedBam: String = baseName + ".clean.bam"
    val dedupedBam: String = baseName + ".clean.dedup.bam"
    val recalBam: String = baseName + ".clean.dedup.recal.bam"

    // Accessory files
    val metricsFile: String = baseName + ".metrics"
    val preRecalFile: String = baseName + ".pre_recal.csv"
    val postRecalFile: String = baseName + ".post_recal.csv"
    val preOutPath: String = baseName + ".pre"
    val postOutPath: String = baseName + ".post"

    add(new clean(perLaneBam, knownTargetIntervals, laneCleanedBam, knownsOnly, intermediate),
        new dedup(laneCleanedBam, dedupedBam, metricsFile),
        new cov(dedupedBam, preRecalFile),
        new recal(dedupedBam, preRecalFile, recalBam),
        new cov(recalBam, postRecalFile),
        new analyzeCovariates(preRecalFile, preOutPath),
        new analyzeCovariates(postRecalFile, postOutPath))

    new File(recalBam)
  }

  add(new writeList(recalibratedBamList, bamList),
      new joinBams(bamList, joinedBams),
      new allTargets(joinedBams, allTargetIntervals),
      new clean(joinedBams, allTargetIntervals, cleanedBam, !knownsOnly, !intermediate))
}
2011-02-26 05:56:35 +08:00
/**
 * Common setup for the realigner target creation jobs.
 *
 * Writes the targets to `outIntervals` and binds the script's dbSNP and
 * known-indel VCFs as RODs. Subclasses decide whether reads are supplied.
 *
 * @param outIntervals path of the intervals file to create
 */
class TargetBase(outIntervals: String) extends RealignerTargetCreator with CommandLineGATKArgs {
  this.out = new File(outIntervals)
  this.mismatchFraction = 0.0
  // ROD name and type must be given without surrounding whitespace,
  // otherwise the type name ("VCF") cannot be matched.
  this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP)
  this.rodBind :+= RodBind("indels", "VCF", qscript.indels)
}
2011-03-01 03:04:58 +08:00
/**
 * Target creation without any input BAM: only the RODs bound by TargetBase
 * are given to the walker.
 *
 * @param outIntervals path of the intervals file to create
 */
class knownTargets(outIntervals: String) extends TargetBase(outIntervals) {
  jobName = List(queueLogDir, outIntervals, ".ktarget").mkString
}
2011-02-26 05:56:35 +08:00
/**
 * Target creation that additionally reads a BAM, with the memory limit
 * raised to 6.
 *
 * @param inBams       path of the BAM to read
 * @param outIntervals path of the intervals file to create
 */
class allTargets(inBams: String, outIntervals: String) extends TargetBase(outIntervals) {
  jobName = queueLogDir + outIntervals + ".atarget"
  memoryLimit = 6
  input_file = input_file :+ new File(inBams)
}
2011-02-19 07:13:54 +08:00
/**
 * Indel realignment job.
 *
 * @param inBams       path of the BAM to realign
 * @param tIntervals   path of the target intervals file
 * @param outBam       path of the realigned BAM to write
 * @param knownsOnly   forwarded to `useOnlyKnownIndels`
 * @param intermediate forwarded to `isIntermediate`; the script-level interval
 *                     restrictions are applied only when this is false
 */
class clean(inBams: String, tIntervals: String, outBam: String, knownsOnly: Boolean, intermediate: Boolean) extends IndelRealigner with CommandLineGATKArgs {
  this.input_file :+= new File(inBams)
  this.targetIntervals = new File(tIntervals)
  this.out = new File(outBam)
  // ROD name and type must be given without surrounding whitespace,
  // otherwise the type name ("VCF") cannot be matched.
  this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP)
  this.rodBind :+= RodBind("indels", "VCF", qscript.indels)
  this.useOnlyKnownIndels = knownsOnly
  this.doNotUseSW = true
  this.baq = org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.CALCULATE_AS_NECESSARY
  this.compress = 0
  this.U = org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION // todo -- update this with the last consensus between Tim, Matt and Eric. This is ugly!
  // Overrides the default set by CommandLineGATKArgs.
  this.isIntermediate = intermediate
  this.jobName = queueLogDir + outBam + ".clean"
  // Interval restrictions from the command line apply to the final output only.
  if (!intermediate && !qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
  if (!intermediate && qscript.intervals != null) this.intervals :+= qscript.intervals
}
2011-03-24 22:03:51 +08:00
/**
 * Duplicate-marking job run from the jar given with -dedup.
 *
 * Declares the deduplicated BAM, its ".bai" index and the metrics file as
 * outputs, and appends the metrics path and CREATE_INDEX=true to the command
 * line built by the parent class.
 *
 * @param inBam       path of the BAM to read
 * @param outBam      path of the deduplicated BAM to write
 * @param metricsFile path of the metrics file to write
 */
class dedup(inBam: String, outBam: String, metricsFile: String) extends PicardBamFunction {
  @Input(doc = "fixed bam")
  var clean: File = new File(inBam)

  @Output(doc = "deduped bam")
  var deduped: File = new File(outBam)

  @Output(doc = "deduped bam index")
  var dedupedIndex: File = new File(outBam + ".bai")

  @Output(doc = "metrics file")
  var metrics: File = new File(metricsFile)

  override def inputBams = List(clean)

  override def outputBam = deduped

  override def commandLine = {
    val picardBase = super.commandLine
    picardBase + " M=" + metricsFile + " CREATE_INDEX=true"
  }

  sortOrder = null
  memoryLimit = 6
  jarFile = qscript.dedupJar
  isIntermediate = true
  jobName = queueLogDir + outBam + ".dedup"
}
/**
 * Covariate counting job using the read group, quality score, cycle and
 * dinucleotide covariates, with the script's dbSNP VCF bound as a ROD.
 *
 * @param inBam        path of the BAM to read
 * @param outRecalFile path of the recalibration table to write
 */
class cov(inBam: String, outRecalFile: String) extends CountCovariates with CommandLineGATKArgs {
  // ROD name and type must be given without surrounding whitespace,
  // otherwise the type name ("VCF") cannot be matched.
  this.rodBind :+= RodBind("dbsnp", "VCF", qscript.dbSNP)
  this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
  this.input_file :+= new File(inBam)
  this.recal_file = new File(outRecalFile)
  this.jobName = queueLogDir + outRecalFile + ".covariates"
}
/**
 * Table recalibration job. Indexes the output BAM on the fly and declares
 * the resulting ".bai" file as an additional output.
 *
 * @param inBam       path of the BAM to recalibrate
 * @param inRecalFile path of the recalibration table to apply
 * @param outBam      path of the recalibrated BAM to write
 */
class recal(inBam: String, inRecalFile: String, outBam: String) extends TableRecalibration with CommandLineGATKArgs {
  @Output(doc = "recalibrated bam index")
  var recalIndex: File = new File(outBam + ".bai")

  // Files read and written by the walker
  input_file = input_file :+ new File(inBam)
  recal_file = new File(inRecalFile)
  out = new File(outBam)

  // Walker options
  U = org.broadinstitute.sting.gatk.arguments.ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION // todo -- update this with the last consensus between Tim, Matt and Eric. This is ugly!
  index_output_bam_on_the_fly = true

  jobName = queueLogDir + outBam + ".recalibration"
}
/**
 * AnalyzeCovariates job run from the jar given with -ac, using the R
 * resources folder given with -r.
 *
 * @param inRecalFile path of the recalibration table to analyze
 * @param outPath     value assigned to the job's output directory
 */
class analyzeCovariates(inRecalFile: String, outPath: String) extends AnalyzeCovariates {
  // Tool location and resources come from the script arguments
  jarFile = qscript.ACJar
  resources = qscript.R

  // Per-job input and output
  recal_file = new File(inRecalFile)
  output_dir = outPath

  jobName = queueLogDir + inRecalFile + ".analyze_covariates"
}
2011-02-19 07:13:54 +08:00
/**
 * Job that writes a list file from the given files.
 *
 * @param inBams     files to list
 * @param outBamList path of the list file to create
 */
class writeList(inBams: List[File], outBamList: String) extends ListWriterFunction {
  jobName = queueLogDir + outBamList + ".bamList"
  listFile = new File(outBamList)
  inputFiles = inBams
}
2011-03-19 06:06:52 +08:00
/**
 * PrintReads job that reads `inBams` and writes a single BAM.
 *
 * Mixes in CommandLineGATKArgs like every other GATK walker job of this
 * script, so it receives the GATK jar and the reference sequence; without
 * the mix-in neither was ever set for this job. The mix-in also applies the
 * shared memory limit and marks the joined BAM as intermediate.
 *
 * @param inBams path handed to the walker as input (the script passes the
 *               list file produced by writeList)
 * @param outBam path of the joined BAM to write
 */
class joinBams(inBams: String, outBam: String) extends PrintReads with CommandLineGATKArgs {
  this.input_file :+= new File(inBams)
  this.out = new File(outBam)
  this.jobName = queueLogDir + inBams + ".joinBams"
}
2011-02-24 00:49:58 +08:00
}