2010-12-18 02:49:03 +08:00
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue. { QException , QScript }
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class ASHGcalling extends QScript {
qscript =>
@Input ( doc = "path to GATK jar" , shortName = "gatk" , required = true )
var gatkJar : File = _
@Input ( doc = "the chromosome to process" , shortName = "chr" , required = true )
var chr : Int = _
@Input ( doc = "output path" , shortName = "outputDir" , required = false )
var outputDir : String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/calls/"
@Input ( doc = "base output filename" , shortName = "baseName" , required = false )
var baseName : String = "ALL.august"
@Input ( doc = "path to tmp space for storing intermediate bam files" , shortName = "outputTmpDir" , required = false )
var outputTmpDir : String = "/humgen/gsa-hpprojects/august_cleaned_bams"
private val tmpDir : File = new File ( "/broad/shptmp/rpoplin/" )
private val reference : File = new File ( "/humgen/1kg/reference/human_g1k_v37.fasta" )
private val dbSNP : File = new File ( "/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod" )
private val targetIntervals : File = new File ( "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/knownIndels.intervals" )
private val dindelCalls : String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/1kg.pilot_release.merged.indels.sites.hg19.vcf"
private val dindelMask : String = "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/pilot1.dindel.mask.bed"
val chromosomeLength = List ( 249250621 , 243199373 , 198022430 , 191154276 , 180915260 , 171115067 , 159138663 , 146364022 , 141213431 , 135534747 , 135006516 , 133851895 , 115169878 , 107349540 , 102531392 , 90354753 , 81195210 , 78077248 , 59128983 , 63025520 , 48129895 , 51304566 )
val populations = List ( "YRI" , "LWK" , "ASW" , "PUR" , "CEU" , "TSI" , "GBR" , "FIN" , "MXL" , "CHB" , "CHS" , "JPT" )
private var pipeline : Pipeline = _
trait CommandLineGATKArgs extends CommandLineGATK {
this . jarFile = qscript . gatkJar
this . reference_sequence = qscript . reference
2011-03-24 22:03:51 +08:00
this . memoryLimit = 2
2010-12-18 02:49:03 +08:00
this . DBSNP = qscript . dbSNP
this . jobTempDir = qscript . tmpDir
}
class SamtoolsBaqFunction extends CommandLineFunction {
@Input ( doc = "The input BAM file" ) var in_bam : File = _
@Output ( doc = "The output BAM file" ) var out_bam : File = _
def commandLine = "/humgen/gsa-scr1/rpoplin/samtools/samtools calmd -br %s %s > %s" . format ( in_bam . getAbsolutePath , qscript . reference , out_bam . getAbsolutePath )
}
class DeleteMeFunction extends CommandLineFunction {
@Input ( doc = "The file to be deleted" ) var me : File = _
@Input ( doc = "The file which must exist before we are allowed to delete" ) var trigger : File = _
def commandLine = "rm -f %s" . format ( me . getAbsolutePath )
}
class DeleteMeAllFunction extends CommandLineFunction {
@Input ( doc = "The file to be deleted" ) var me : File = _
@Input ( doc = "The file which must exist before we are allowed to delete" ) var trigger : File = _
def commandLine = "rm -f %s*" . format ( me . getAbsolutePath )
}
def script = {
val basesPerJob : Int = 3000000
val lastBase : Int = qscript . chromosomeLength ( qscript . chr - 1 )
var start : Int = 1
var stop : Int = start - 1 + basesPerJob
if ( stop > lastBase ) { stop = lastBase }
var jobNumber : Int = 1
while ( jobNumber < ( lastBase . toFloat / basesPerJob . toFloat ) + 1.0 ) {
callThisChunk ( "%d:%d-%d" . format ( qscript . chr , start , stop ) , jobNumber )
start += basesPerJob
stop += basesPerJob
if ( stop > lastBase ) { stop = lastBase }
jobNumber += 1
}
/* CombineVariants parses the 800+ genotypes per record and is way too slow. Combine the vcf files together using grep, cat, and sortByRef.pl outside of Queue
combineVariants = new CombineVariants with CommandLineGATKArgs
combineVariants . rodBind = vcfChunks
combineVariants . out = new TaggedFile ( qscript . baseName + ".chr" + qscript . chr . toString + ".filtered.vcf" , "vcf" )
2011-03-24 22:03:51 +08:00
combineVariants . variantmergeoption = org . broadinstitute . sting . gatk . contexts . variantcontext . VariantContextUtils . VariantMergeType . UNION
combineVariants . genotypemergeoption = org . broadinstitute . sting . gatk . contexts . variantcontext . VariantContextUtils . GenotypeMergeType . UNSORTED
2010-12-18 02:49:03 +08:00
combineVariants . setKey = "null"
add ( combineVariants )
*/
}
def callThisChunk ( interval : String , jobNumber : Int ) = {
val baseName : String = qscript . outputDir + "/chr" + qscript . chr . toString + "/" + qscript . baseName + ".chr" + qscript . chr . toString + "." + jobNumber . toString + "."
var call = new UnifiedGenotyperV2 with CommandLineGATKArgs
val rawCalls = new File ( baseName + "raw.vcf" )
for ( population <- qscript . populations ) {
val baseTmpName : String = qscript . outputTmpDir + "/chr" + qscript . chr . toString + "/" + population + ".august.chr" + qscript . chr . toString + "." + jobNumber . toString + "."
val bamList : File = new File ( "/humgen/1kg/processing/allPopulations_wholeGenome_august_release/bamLists/%s.chr%d.bam.list" . format ( population , qscript . chr ) )
// 1.) Clean at known indels
var clean = new IndelRealigner with CommandLineGATKArgs
val cleanedBam = new File ( baseTmpName + "cleaned.bam" )
2011-03-24 22:03:51 +08:00
clean . memoryLimit = 4
2010-12-18 02:49:03 +08:00
clean . input_file : += bamList
clean . intervalsString : += interval
clean . targetIntervals = qscript . targetIntervals
clean . out = cleanedBam
clean . rodBind : += RodBind ( " indels " , " VCF " , qscript . dindelCalls )
clean . knownsOnly = true
2011-03-24 22:03:51 +08:00
clean . LOD = 1.0
2010-12-18 02:49:03 +08:00
clean . sortInCoordinateOrderEvenThoughItIsHighlyUnsafe = true
2011-03-24 22:03:51 +08:00
clean . compress = 2
2010-12-18 02:49:03 +08:00
clean . jobName = baseName + population + ".clean"
//clean.stripBam = true
//clean.fileSystemUsage = "indium"
// 2.) Apply BAQ calculation
var baq = new SamtoolsBaqFunction
val baqedBam = new File ( baseTmpName + "cleaned.baq.bam" )
2011-03-24 22:03:51 +08:00
baq . memoryLimit = 4
2010-12-18 02:49:03 +08:00
baq . in_bam = cleanedBam
baq . out_bam = baqedBam
baq . jobName = baseName + population + ".baq"
//baq.fileSystemUsage = "iodine"
// 3a.) Delete cleaned bam
var deleteClean = new DeleteMeFunction
deleteClean . me = cleanedBam
deleteClean . trigger = baqedBam
deleteClean . jobName = baseName + population + ".deleteClean"
//deleteClean.fileSystemUsage = "iodine"
// 3b.) Index BAQ'ed bam
var index = new SamtoolsIndexFunction
index . bamFile = baqedBam
index . jobName = baseName + population + ".index"
//index.fileSystemUsage = "iodine"
// 5a.) Delete BAQ'ed bam and index
//var deleteBaq = new DeleteMeAllFunction
//deleteBaq.me = baqedBam
//deleteBaq.trigger = rawCalls
//deleteBaq.jobName = baseName + population + ".deleteBaq"
//deleteBaq.fileSystemUsage = "iodine"
call . input_file : += baqedBam
//add(clean, baq, deleteClean, index, deleteBaq)
add ( clean , baq , deleteClean , index )
}
// 4.) Call with UGv2
2011-03-24 22:03:51 +08:00
call . memoryLimit = 4
2010-12-18 02:49:03 +08:00
call . intervalsString : += interval
call . out = rawCalls
2011-03-24 22:03:51 +08:00
call . dcov = 50
call . standard_min_confidence_threshold_for_calling = 50
call . standard_min_confidence_threshold_for_emitting = 30
call . min_mapping_quality_score = 20
call . min_base_quality_score = 20
call . pnrm = org . broadinstitute . sting . playground . gatk . walkers . genotyper . AlleleFrequencyCalculationModel . Model . GRID_SEARCH
2010-12-18 02:49:03 +08:00
call . jobName = baseName + "call"
//call.fileSystemUsage = "iodine"
// 5b.) Filter near indels and HARD_TO_VALIDATE
var filter = new VariantFiltration with CommandLineGATKArgs
val filteredCalls = new File ( baseName + "filtered.vcf" )
2011-03-24 22:03:51 +08:00
filter . memoryLimit = 1
2010-12-18 02:49:03 +08:00
filter . out = filteredCalls
filter . intervalsString : += interval
filter . variantVCF = rawCalls
filter . rodBind : += RodBind ( " mask " , " Bed " , qscript . dindelMask )
filter . maskName = "InDel"
filter . filterName ++= List ( "HARD_TO_VALIDATE" )
filter . filterExpression ++= List ( "\"MQ0 >= 4 && (MQ0 / (1.0 * DP)) > 0.1\"" )
filter . jobName = baseName + "filter"
//filter.fileSystemUsage = "indium"
// 6.) Delete raw calls and index
var deleteRawCalls = new DeleteMeAllFunction
deleteRawCalls . me = rawCalls
deleteRawCalls . trigger = filteredCalls
deleteRawCalls . jobName = baseName + "deleteRawCalls"
//deleteRawCalls.fileSystemUsage = "indium"
add ( call , filter , deleteRawCalls )
}
}