2011-02-16 05:49:05 +08:00
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.QScript
2011-02-17 01:22:30 +08:00
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
2011-02-18 07:34:00 +08:00
import org.broadinstitute.sting.queue.function.ListWriterFunction
2011-02-17 10:07:22 +08:00
import scala.io.Source
2011-02-16 05:49:05 +08:00
class dataProcessing extends QScript {
qscript =>
2011-02-19 07:13:54 +08:00
@Input ( doc = "path to GenomeAnalysisTK.jar" , shortName = "gatk" , required = true )
2011-02-18 07:34:00 +08:00
var GATKjar : File = _
2011-02-16 05:49:05 +08:00
2011-02-18 07:34:00 +08:00
@Input ( doc = "path to AnalyzeCovariates.jar" , shortName = "ac" , required = true )
var ACJar : File = _
2011-02-16 05:49:05 +08:00
2011-02-19 07:13:54 +08:00
@Input ( doc = "path to Picard's MarkDuplicates.jar" , shortName = "dedup" , required = true )
var dedupJar : File = _
@Input ( doc = "path to R resources folder inside the Sting repository" , shortName = "r" , required = true )
2011-02-18 07:34:00 +08:00
var R : String = _
2011-02-19 07:13:54 +08:00
@Input ( doc = "input BAM file - or list of BAM files" , shortName = "i" , required = true )
var input : String = _
2011-02-18 07:34:00 +08:00
2011-02-19 07:13:54 +08:00
@Input ( doc = "Reference fasta file" , shortName = "R" , required = false )
var reference : File = new File ( "/humgen/1kg/reference/human_g1k_v37.fasta" )
2011-02-18 07:34:00 +08:00
2011-02-19 07:13:54 +08:00
@Input ( doc = "dbsnp ROD to use (VCF)" , shortName = "D" , required = false ) // todo -- accept any format. Not only VCF.
val dbSNP : File = new File ( "/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf" )
2011-02-18 07:34:00 +08:00
2011-02-19 07:13:54 +08:00
@Input ( doc = "extra VCF files to use as reference indels for Indel Realignment" , shortName = "indels" , required = false ) //todo -- once vcfs are merged, this will become the only indel vcf to be used and the merged file will be the default.
val indels : File = _
@Input ( doc = "the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam" , shortName = "p" , required = false )
2011-02-18 07:34:00 +08:00
var projectName : String = "combined"
2011-02-16 05:49:05 +08:00
@Input ( doc = "output path" , shortName = "outputDir" , required = false )
var outputDir : String = ""
@Input ( doc = "the -L interval string to be used by GATK" , shortName = "L" , required = false )
var intervalString : String = ""
2011-02-19 00:58:34 +08:00
// todo -- this shouldn't be allowed. We want a flag that says "output bams at intervals only" or not
2011-02-16 05:49:05 +08:00
@Input ( doc = "provide a .intervals file with the list of target intervals" , shortName = "intervals" , required = false )
2011-02-18 07:34:00 +08:00
var intervals : File = new File ( "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals" )
2011-02-19 07:13:54 +08:00
// todo -- let's create a pre-merged single VCF and put it into /humgen/gsa-hpprojects/GATK/data please
2011-02-18 07:34:00 +08:00
val dindelPilotCalls : String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
val dindelAFRCalls : String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/AFR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelASNCalls : String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/ASN.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
val dindelEURCalls : String = "/humgen/1kg/DCC/ftp/technical/working/20110126_dindel_august/EUR.dindel_august_release_merged_pilot1.20110126.sites.vcf.gz"
// Simple boolean definitions for code clarity
val knownsOnly : Boolean = true
val intermediate : Boolean = true
// General arguments to all programs
trait CommandLineGATKArgs extends CommandLineGATK {
this . jarFile = qscript . GATKjar
2011-02-19 07:13:54 +08:00
this . reference_sequence = qscript . reference
2011-02-18 07:34:00 +08:00
this . memoryLimit = Some ( 4 )
this . isIntermediate = true
}
2011-02-16 05:49:05 +08:00
def script = {
2011-02-18 07:34:00 +08:00
var perLaneBamList : List [ String ] = Nil
var recalibratedBamList : List [ File ] = Nil
// Populates the list of per lane bam files to process (single bam or list of bams).
2011-02-19 07:13:54 +08:00
if ( input . endsWith ( "bam" ) )
2011-02-18 07:34:00 +08:00
perLaneBamList : += input
2011-02-19 07:13:54 +08:00
else
2011-02-18 07:34:00 +08:00
for ( bam <- Source . fromFile ( input ) . getLines ( ) )
perLaneBamList : += bam
2011-02-19 07:13:54 +08:00
2011-02-16 05:49:05 +08:00
2011-02-17 10:07:22 +08:00
2011-02-18 07:34:00 +08:00
perLaneBamList . foreach { perLaneBam =>
// Helpful variables
val baseName : String = swapExt ( new File ( perLaneBam . substring ( perLaneBam . lastIndexOf ( "/" ) + 1 ) ) , ".bam" , "" ) . toString ( )
val baseDir : String = perLaneBam . substring ( 0 , perLaneBam . lastIndexOf ( "/" ) + 1 )
// BAM files generated by the pipeline
2011-02-19 07:13:54 +08:00
val cleanedBam : String = baseName + ".clean.bam"
val dedupedBam : String = baseName + ".clean.dedup.bam"
val recalBam : String = baseName + ".clean.dedup.recal.bam"
2011-02-18 07:34:00 +08:00
// Accessory files
2011-02-17 10:07:22 +08:00
val targetIntervals : String = baseName + ".indel.intervals"
2011-02-18 07:34:00 +08:00
val metricsFile : String = baseName + ".metrics"
val preRecalFile : String = baseName + ".pre_recal.csv"
val postRecalFile : String = baseName + ".post_recal.csv"
val preOutPath : String = baseName + ".pre"
val postOutPath : String = baseName + ".post"
add ( new target ( perLaneBam , targetIntervals ) ,
2011-02-19 07:13:54 +08:00
new clean ( perLaneBam , targetIntervals , cleanedBam , knownsOnly , intermediate ) ,
new dedup ( cleanedBam , dedupedBam , metricsFile ) ,
2011-02-18 07:34:00 +08:00
new cov ( dedupedBam , preRecalFile ) ,
2011-02-19 07:13:54 +08:00
new recal ( dedupedBam , preRecalFile , recalBam ) ,
2011-02-18 07:34:00 +08:00
new cov ( recalBam , postRecalFile ) ,
new analyzeCovariates ( preRecalFile , preOutPath ) ,
new analyzeCovariates ( postRecalFile , postOutPath ) )
recalibratedBamList : += new File ( recalBam )
2011-02-16 05:49:05 +08:00
}
2011-02-18 07:34:00 +08:00
// Helpful variables
val outName : String = qscript . projectName
val outDir : String = qscript . outputDir
// BAM files generated by the pipeline
val bamList : String = outDir + outName + ".list"
2011-02-19 07:13:54 +08:00
val cleanedBam : String = outDir + outName + ".clean.bam"
val fixedBam : String = outDir + outName + ".processed.bam"
2011-02-18 07:34:00 +08:00
// Accessory files
val targetIntervals : String = outDir + outName + ".indel.intervals"
2011-02-19 07:13:54 +08:00
add ( new writeList ( recalibratedBamList , bamList ) ,
new target ( bamList , targetIntervals ) , // todo -- reuse previously generated intervals (see how to do that)
new clean ( bamList , targetIntervals , cleanedBam , ! knownsOnly , ! intermediate ) )
2011-02-18 07:34:00 +08:00
}
class target ( inBams : String , outIntervals : String ) extends RealignerTargetCreator with CommandLineGATKArgs {
this . input_file : += new File ( inBams )
this . out = new File ( outIntervals )
this . mismatchFraction = Some ( 0.0 )
this . rodBind : += RodBind ( " dbsnp " , " VCF " , dbSNP )
this . rodBind : += RodBind ( " indels1 " , " VCF " , dindelPilotCalls )
this . rodBind : += RodBind ( " indels2 " , " VCF " , dindelAFRCalls )
this . rodBind : += RodBind ( " indels3 " , " VCF " , dindelEURCalls )
this . rodBind : += RodBind ( " indels4 " , " VCF " , dindelASNCalls )
2011-02-19 07:13:54 +08:00
if ( qscript . indels != null ) this . rodBind : += RodBind ( " indels5 " , " VCF " , qscript . indels )
2011-02-18 07:34:00 +08:00
this . jobName = inBams + ".tgt"
if ( ! qscript . intervalString . isEmpty ( ) ) this . intervalsString : += qscript.intervalString
else this . intervals : += qscript.intervals
}
2011-02-19 07:13:54 +08:00
class clean ( inBams : String , tIntervals : String , outBam : String , knownsOnly : Boolean , intermediate : Boolean ) extends IndelRealigner with CommandLineGATKArgs {
2011-02-18 07:34:00 +08:00
this . input_file : += new File ( inBams )
this . targetIntervals = new File ( tIntervals )
this . out = new File ( outBam )
this . doNotUseSW = true
2011-02-18 07:50:46 +08:00
this . baq = Some ( org . broadinstitute . sting . utils . baq . BAQ . CalculationMode . CALCULATE_AS_NECESSARY )
2011-02-18 07:34:00 +08:00
this . rodBind : += RodBind ( " dbsnp " , " VCF " , dbSNP )
this . rodBind : += RodBind ( " indels1 " , " VCF " , dindelPilotCalls )
this . rodBind : += RodBind ( " indels2 " , " VCF " , dindelAFRCalls )
this . rodBind : += RodBind ( " indels3 " , " VCF " , dindelEURCalls )
this . rodBind : += RodBind ( " indels4 " , " VCF " , dindelASNCalls )
2011-02-19 07:13:54 +08:00
if ( qscript . indels != null ) this . rodBind : += RodBind ( " indels5 " , " VCF " , qscript . indels )
2011-02-18 07:34:00 +08:00
this . useOnlyKnownIndels = knownsOnly
this . sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
2011-02-19 07:13:54 +08:00
this . constrainMovement = true
this . isIntermediate = intermediate
2011-02-18 07:34:00 +08:00
this . jobName = inBams + ".clean"
if ( ! qscript . intervalString . isEmpty ( ) ) this . intervalsString ++= List ( qscript . intervalString )
else this . intervals : += qscript.intervals
}
class fixMates ( inBam : String , outBam : String , intermediate : Boolean ) extends PicardBamJarFunction {
@Input ( doc = "cleaned bam" ) var cleaned : File = new File ( inBam )
@Output ( doc = "fixed bam" ) var fixed : File = new File ( outBam )
override def inputBams = List ( cleaned )
override def outputBam = fixed
2011-02-19 07:13:54 +08:00
override def commandLine = super . commandLine + " CREATE_INDEX=true"
2011-02-18 07:34:00 +08:00
this . jarFile = qscript . fixMatesJar
this . isIntermediate = intermediate
this . memoryLimit = Some ( 6 )
this . jobName = inBam + ".fix"
}
class dedup ( inBam : String , outBam : String , metricsFile : String ) extends PicardBamJarFunction {
@Input ( doc = "fixed bam" ) var clean : File = new File ( inBam )
@Output ( doc = "deduped bam" ) var deduped : File = new File ( outBam )
override def inputBams = List ( clean )
override def outputBam = deduped
2011-02-19 07:13:54 +08:00
override def commandLine = super . commandLine + " M=" + metricsFile + " CREATE_INDEX=true"
2011-02-18 07:34:00 +08:00
sortOrder = null
this . memoryLimit = Some ( 6 )
this . jarFile = qscript . dedupJar
2011-02-19 07:13:54 +08:00
this . isIntermediate = true
2011-02-18 07:34:00 +08:00
this . jobName = inBam + ".dedup"
}
class cov ( inBam : String , outRecalFile : String ) extends CountCovariates with CommandLineGATKArgs {
this . rodBind : += RodBind ( " dbsnp " , " VCF " , dbSNP )
this . covariate ++= List ( "ReadGroupCovariate" , "QualityScoreCovariate" , "CycleCovariate" , "DinucCovariate" )
this . input_file : += new File ( inBam )
this . recal_file = new File ( outRecalFile )
}
class recal ( inBam : String , inRecalFile : String , outBam : String ) extends TableRecalibration with CommandLineGATKArgs {
this . input_file : += new File ( inBam )
this . recal_file = new File ( inRecalFile )
this . out = new File ( outBam )
2011-02-19 07:13:54 +08:00
this . index_output_bam_on_the_fly = Some ( true )
2011-02-18 07:34:00 +08:00
}
class analyzeCovariates ( inRecalFile : String , outPath : String ) extends AnalyzeCovariates {
this . jarFile = qscript . ACJar
this . resources = qscript . R
this . recal_file = new File ( inRecalFile )
this . output_dir = outPath
}
2011-02-19 07:13:54 +08:00
class writeList ( inBams : List [ File ] , outBamList : String ) extends ListWriterFunction {
2011-02-18 07:34:00 +08:00
this . inputFiles = inBams
this . listFile = new File ( outBamList )
this . jobName = "bamList"
2011-02-16 05:49:05 +08:00
}
}