from farm_commands2 import *
import os.path
import sys
from optparse import OptionParser
from datetime import date
import glob
import operator
import faiReader
import math
import shutil
import string
from madPipelineUtils import *

# Chromosomes dropped from every stage of the pipeline.
EXCLUDE_CHRS = ['chrM', 'chrY']

# Extra arguments appended to every GATK command line.
EXTRA_GATK_ARGS = '-XL chrM -XL chrY'  # -XL chrX -XL chrY

# Root directories for validation comparisons and analysis-ready BAMs.
VALIDATION_DIR = '/humgen/gsa-hpprojects/GATK/data/Comparisons'
BAM_ROOT = '/humgen/1kg/analysis/bamsForDataProcessingPapers/'

# Interval list covering the whole-exome capture targets.
WE_LIST = '/seq/references/HybSelOligos/whole_exome_agilent_designed_120/whole_exome_agilent_designed_120.targets.interval_list'

# Hard filters as [name, JEXL expression] pairs for WGS and whole-exome calls.
WGS_FILTER = [['ABFilter', 'AB > 0.75 && DP > 40'], ['DPFilter', 'DP > 120 || SB > -0.10']]  # , ['QDFilter', 'QD < 5.0 && DP > 40']
WE_FILTER = [['ESPStandard', 'AB > 0.75 || QD < 5.0 || HRun > 3 || SB > -0.10']]

# Baseline UnifiedGenotyper arguments shared by every SNP-calling target.
UG_ARGS = '-mbq 20 -mmq 20 -stand_call_conf 50 -stand_emit_conf 10 -hets 0.78e-3 -dcov 10000 -pnrm GRID_SEARCH'
class CallTarget:
    """One variant-calling target: a BAM (or BAM list) plus the calling,
    filtering, and recalibration parameters used for it."""

    def __init__(self, name, bam, interval='', callArgs='', b36=False, optimize=True,
                 filters=None, targetTiTv=2.07, maxClusters=16, minQual=300, tranchToTake=1):
        self.name = name                  # label used for job and file names
        self.bam = bam                    # path to the BAM (or .list of BAMs)
        self.interval = interval          # optional -L interval restriction
        self.callArgs = callArgs          # extra arguments for the caller
        self.vcfs = []                    # list of processed vcf
        self.b36 = b36                    # True when the BAM is on the b36 reference
        # None default avoids the shared-mutable-default pitfall; each target
        # gets its own filter list
        self.filters = [] if filters is None else filters
        self.optimize = optimize          # run the recalibration stage on this target?
        self.targetTiTv = targetTiTv      # expected Ti/Tv for VariantRecalibrator
        self.maxClusters = maxClusters    # -mG for GenerateVariantClusters
        self.tranchToTake = tranchToTake  # FDR level passed to ApplyVariantCuts
        self.minQual = minQual            # -qual for GenerateVariantClusters

    def getCallArgs(self):
        return self.callArgs  # + self.getIntervalArg()

    def getIntervalArg(self):
        # GATK -L argument, or '' when this target is genome-wide
        if self.hasInterval():
            return '-L ' + self.interval
        else:
            return ''

    def hasInterval(self):
        return self.interval != '' and self.interval is not None

    def getVcf(self):
        # root VCF name for this target, under the output directory
        return os.path.join(OPTIONS.dir, self.name + '.vcf')

    def getVcfs(self):
        return self.vcfs

    def addVcf(self, vcf):
        self.vcfs.append(vcf)

    def getBam(self):
        return self.bam
# Reference call sets used for comparison bindings.
KG_PATH = '/humgen/gsa-hpprojects/1kg/1kg_pilot2/currentBestProjectCalls'
TECH_COMP = '/humgen/gsa-hphome1/kiran/one_off_projects/multiTechComparisons/results/v7/NA12878'
#WGS_INTERVAL = 'chr1'
#WGS_INTERVAL = '-L chr1:1-50,000,000'
def weTarget(name, bam, ignore='', args='', filters=None):
    """Build a whole-exome CallTarget with exome-tuned recalibration settings.

    ignore and filters are accepted for signature compatibility with
    CallTarget but are overridden by the exome interval list and WE_FILTER.
    """
    # 3.0 was old target, new is 2.8
    return CallTarget(name, bam, interval=WE_LIST, callArgs=args, filters=WE_FILTER,
                      targetTiTv=2.8, maxClusters=12, minQual=2800, tranchToTake=10)
# [name suffix, extra caller args] pairs defining each calling strategy.
#TARGETS_BY_STRATEGY = [['', ''], ['.OQ', '-OQ'], ['.OQ.noCM', '-OQ -bm THREE_STATE'], ['.noCM', '-bm THREE_STATE']]
TARGETS_BY_STRATEGY = [['', ''], ['.OQ', '-OQ']]
#TARGETS_BY_STRATEGY = [['', '']]

def targetsByStrategy(func, rootName, bam, interval='', args='', filters=None):
    """Instantiate one target per applicable strategy via func(name, bam, ...).

    Cleaned and CG bams use the plain strategy; everything else uses -OQ
    (original base qualities)."""
    if filters is None:
        filters = []

    def makeTarget(ext, moreArgs):
        name = rootName + ext
        return func(name, bam, interval, args + ' ' + moreArgs, filters=filters)

    if 'cleaned' in rootName or 'CG' in rootName:
        strats = [TARGETS_BY_STRATEGY[0]]
    else:
        strats = [TARGETS_BY_STRATEGY[1]]
    # return a real list (py2's map did) so callers can use += on it
    return [makeTarget(ext, moreArgs) for ext, moreArgs in strats]
# Global list of CallTargets, populated in main().
targets = []

def findTargets(names):
    """Return the target for each name, or None when no target matches."""
    def find1(name):
        for target in targets:
            if target.name == name:
                return target
        return None
    # return a real list (py2's map did) rather than a lazy iterator
    return [find1(name) for name in names]
def matches(string, pattern):
    """True when pattern occurs anywhere in string (substring match)."""
    return pattern in string
def main():
    """Drive the NA12878 calling pipeline.

    Parses the command line, builds the global list of call targets, then runs
    the requested comma-separated stages (callsnps, callindels, indelmask,
    snpfilter, indelfilter, to_hg18, optimize, eval, confusion_matrix),
    submitting the resulting jobs to the farm."""
    global OPTIONS, targets

    usage = "usage: %prog stage [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("--dry", dest="dry",
                      action='store_true', default=False,
                      help="If provided, nothing actually gets run, just a dry run")
    parser.add_option("-v", "--verbose", dest="verbose",
                      action='store_true', default=False,
                      help="If provided, print out a lot of information")
    parser.add_option("--byQEval", dest="byQEval",
                      action='store_true', default=False,
                      help="If provided, variant eval will be run by Q threshold")
    parser.add_option("-s", "--splitByChr", dest="splitByChr",
                      action='store_true', default=False,
                      help="If provided, we'll parallelize by chromosome over the farm")
    parser.add_option("--dev", dest="dev",
                      action='store_true', default=False,
                      help="If provided, we'll use the GATK dev build")
    parser.add_option("-d", "--dir", dest="dir",
                      type='string', default="",
                      help="If provided, this is the root where files are read and written")
    parser.add_option("-L", dest="WGSIntervals",
                      type='string', default=None,
                      help="If provided, these are the interval files we will process for WGS")
    parser.add_option("-q", "--farm", dest="farmQueue",
                      type="string", default=None,
                      help="Farm queue to send processing jobs to")
    parser.add_option("-p", "--parallel", dest="parallel",
                      type="int", default=None,
                      help="Number of parallel shared memory threads")
    parser.add_option("-t", "--target", dest="target",
                      type="string", default=None,
                      help="Only run jobs with names containing this string")
    (OPTIONS, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")

    # set up targets
    # WGS
    WGS_INTERVAL = OPTIONS.WGSIntervals
    #targets += targetsByStrategy(CallTarget, 'GA2.WGS.cleaned', BAM_ROOT + '/NA12878.GA2.WGS.bwa.cleaned.bam', WGS_INTERVAL, filters=WGS_FILTER)
    #targets.append(CallTarget('GA2.WGS.raw', '/seq/dirseq/pem/seq/picard_aggregation/G2946gaII/NA12878/v1/NA12878.bam', WGS_INTERVAL, filters=WGS_FILTER))
    # HiSeq
    targets += targetsByStrategy(CallTarget, 'HiSeq.WGS.raw', '/seq/dirseq/pem/seq/picard_aggregation/G2946/NA12878/v1/NA12878.bam', WGS_INTERVAL, filters=WGS_FILTER)
    targets += targetsByStrategy(CallTarget, 'HiSeq.WGS.cleaned', BAM_ROOT + '/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam', WGS_INTERVAL, filters=WGS_FILTER)
    # WE
    targets += targetsByStrategy(weTarget, 'GA2.WEx.cleaned', BAM_ROOT + '/NA12878.WEx.cleaned.recal.bam')
    targets += targetsByStrategy(weTarget, 'GA2.WEx.raw', '/seq/picard_aggregation/C308/NA12878/v3/NA12878.bam')
    # CG
    # todo -- fixme -- needs genome-wide bams on hg18
    #targets += targetsByStrategy(CallTarget, 'CG.WGS.raw', '/seq/complete_genomics/GS00106-DNA_E01-180_NA12878/SAM0/merge/NA12878.bam', WGS_INTERVAL, filters=WGS_FILTER)
    # low-pass / 1KG SLX / MCKD1 special-case targets currently disabled

    stages = [stage.lower() for stage in args[0].split(',')]
    STAGES = ['callsnps', 'callindels', 'indelmask', 'snpfilter', 'indelfilter', 'to_hg18', 'optimize', 'eval', 'confusion_matrix']
    for stage in stages:
        if stage not in STAGES:
            sys.exit('unknown stage ' + stage)

    if OPTIONS.dir != "" and not os.path.exists(OPTIONS.dir):
        os.makedirs(OPTIONS.dir)

    allJobs = []

    def includeStage(name):
        # a stage only submits its jobs when the user asked for it
        return name in stages

    for callTarget in targets:
        # setup pipeline args
        GATK_JAR = GATK_STABLE_JAR
        if OPTIONS.dev:
            GATK_JAR = GATK_DEV_JAR
        myPipelineArgs = PipelineArgs(GATK_JAR=GATK_JAR, name=callTarget.name, excludeChrs=EXCLUDE_CHRS)
        myPipelineArgs.addGATKArg(EXTRA_GATK_ARGS)
        myPipelineArgs.addGATKArg(callTarget.getIntervalArg())
        if OPTIONS.parallel is not None:
            # OPTIONS.parallel is an int (type="int"); format it instead of
            # concatenating, which raised a TypeError
            myPipelineArgs.addGATKArg('-nt %d' % OPTIONS.parallel)
        if callTarget.b36:
            myPipelineArgs.ref = 'b36'
        print('%s %s %s' % (callTarget.name, callTarget.b36, myPipelineArgs.ref))

        lastJobs = None
        if OPTIONS.target is not None and not matches(callTarget.name, OPTIONS.target):
            print('Skipping target %s' % callTarget)
            continue

        def updateNewJobs(newjobs, lastJobs):
            # record the jobs for execution and carry forward the dependency chain
            if OPTIONS.verbose:
                print('New jobs %s' % newjobs)
            allJobs.append(newjobs)
            if newjobs != []:
                lastJobs = newjobs
            return [], lastJobs

        newJobs = []

        def execStage(name, func, vcf=None, lastJobs=None):
            # Run one stage function; returns (newJobs, lastJobs, vcf).  The
            # stage's jobs are only scheduled when the stage was requested.
            if lastJobs is None:
                lastJobs = []
            if OPTIONS.verbose:
                print('Name is %s' % name)
            newJobs, newVcf = func(myPipelineArgs, callTarget, vcf, lastJobs)
            if newVcf is not None:
                vcf = newVcf
            if OPTIONS.verbose:
                print('VCF is %s' % vcf)
            callTarget.addVcf(vcf)
            if includeStage(name):
                newJobs, lastJobs = updateNewJobs(newJobs, lastJobs)
            if OPTIONS.verbose:
                print('execStage: %s %s %s' % (newJobs, lastJobs, vcf))
            return newJobs, lastJobs, vcf

        newJobs, callSNPJobs, vcf = execStage('callsnps', callSNPs)
        newJobs, lastJobs, vcf = execStage('to_hg18', convertToHg18, vcf, callSNPJobs)
        newJobs, filterSNPJobs, vcf = execStage('snpfilter', filterSNPs, vcf, callSNPJobs)
        # indel jobs
        newJobs, callIndelJobs, vcf = execStage('callindels', callIndels, vcf)
        newJobs, indelMaskJobs, vcf = execStage('indelmask', createIndelMask, vcf, callIndelJobs)
        newJobs, filterIndelJobs, vcf = execStage('indelfilter', filterIndels, vcf, indelMaskJobs + filterSNPJobs)
        # optimization
        newJobs, optimizeJobs, vcf = execStage('optimize', VariantOptimizer, vcf, filterIndelJobs)
        # eval
        newJobs, evalJobs, vcf = execStage('eval', evalSNPs, vcf, optimizeJobs)
        # newJobs, lastJobs, ignore = execStage('confusion_matrix', computeConfusionMatrix, vcf)

    print('EXECUTING JOBS')
    executeJobs(allJobs, farm_queue=OPTIONS.farmQueue, just_print_commands=OPTIONS.dry)
#
# Actual commands
#
def convertToHg18(myPipelineArgs, callTarget, vcf, lastJobs):
    """Lift a b36 call set over to hg18; a no-op for targets already on hg18.

    Returns (jobs, outputVCF)."""
    if callTarget.b36:
        # drop the .b36 tag from the file name for the lifted-over output
        outputVCF = vcf.replace('.b36', '')
        cmd = 'python /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/vcf_b36_to_hg18.py ' + vcf + ' ' + outputVCF
        jobs = [FarmJob(cmd, jobName=callTarget.name + '.' + 'b36ToHg18', dependencies=lastJobs)]
        return jobs, outputVCF
    else:
        return [], vcf
def callSNPs(myPipelineArgs, callTarget, ignore, lastJobs):
    """Build the UnifiedGenotyper SNP-calling job(s) for this target.

    Returns (farmCmds, outputVCF); b36 targets get a .b36-tagged output so
    the to_hg18 stage can lift them over later."""
    outputVCF = appendExtension(callTarget.getVcf(), 'ug')
    if callTarget.b36:
        outputVCF = appendExtension(outputVCF, 'b36')
    print('outputVCF %s' % outputVCF)
    ugArgs = '-T UnifiedGenotyperV2 -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -I %s %s -o %s %s' % (callTarget.getBam(), UG_ARGS, outputVCF, callTarget.getCallArgs())
    farmCmds = simpleGATKCommand(myPipelineArgs, 'call.UG', ugArgs, lastJobs)
    # genome-wide targets can be sharded per chromosome and merged afterwards
    if OPTIONS.splitByChr and not callTarget.hasInterval():
        farmCmds = splitGATKCommandByChr(myPipelineArgs, farmCmds[0], [outputVCF], [mergeVCFs])
    return farmCmds, outputVCF
# Window size (bases) around each indel call masked out of the SNP set.
INDEL_MASK_SIZE = 10

def getIndelCallFiles(callTarget):
    """Return the (bed, verbose, vcf, mask) output names for indel calls on
    this target, all derived from the target's root VCF name."""
    outputBed = appendExtension(callTarget.getVcf(), 'indels.bed', False)
    outputVerbose = appendExtension(callTarget.getVcf(), 'indels.verbose.txt', False)
    outputVCF = appendExtension(callTarget.getVcf(), 'indels.vcf', False)
    outputMask = appendExtension(callTarget.getVcf(), 'indels.%d.mask' % INDEL_MASK_SIZE, False)
    return outputBed, outputVerbose, outputVCF, outputMask
def callIndels(myPipelineArgs, callTarget, ignore, lastJobs):
    """Build the IndelGenotyperV2 job(s) for this target.

    Returns (farmCmds, None) -- indel calling does not advance the SNP VCF."""
    outputBed, outputVerbose, indelsVCF, outputMask = getIndelCallFiles(callTarget)
    IGV2_ARGS = '-T IndelGenotyperV2 -ws 500 -I %s -bed %s -verbose %s -o %s -rf Platform454' % (callTarget.getBam(), outputBed, outputVerbose, indelsVCF)
    farmCmds = simpleGATKCommand(myPipelineArgs, 'call.CallIndels', IGV2_ARGS, lastJobs)
    # genome-wide targets can be sharded per chromosome and merged afterwards
    if OPTIONS.splitByChr and not callTarget.hasInterval():
        farmCmds = splitGATKCommandByChr(myPipelineArgs, farmCmds[0], [outputBed, outputVerbose], [mergeByCat, mergeByCat])
    return farmCmds, None
def createIndelMask(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build the job that turns the indel BED calls into a mask file used to
    filter nearby SNPs.  Returns (jobs, None)."""
    outputBed, outputVerbose, outputVCF, outputMask = getIndelCallFiles(callTarget)
    cmd = 'python /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/makeIndelMask.py %s %d %s' % (outputBed, INDEL_MASK_SIZE, outputMask)
    jobs = [FarmJob(cmd, jobName=callTarget.name + '.' + 'makeIndelMask', dependencies=lastJobs)]
    return jobs, None
def filterSNPs(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build the VariantFiltration job applying the target's hard filters plus
    the standard LowQual and HARD_TO_VALIDATE filters.

    Returns (jobs, filteredVCF)."""
    out = appendExtension(vcf, 'snpfiltered')
    # expand the target's [name, expression] filter pairs into CLI arguments
    filterString = ' '.join('--filterName %s --filterExpression "%s"' % (name, expr)
                            for name, expr in callTarget.filters)
    return simpleGATKCommand(myPipelineArgs, 'call.filterSNPs', '-T VariantFiltration -B:variant,VCF %s -o %s --filterName LowQual --filterExpression "QUAL < 50.0" --clusterWindowSize 10 --filterName HARD_TO_VALIDATE --filterExpression "MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1)" %s' % (vcf, out, filterString), lastJobs), out
def filterIndels(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build the VariantFiltration job that flags SNPs inside the indel mask.

    Returns (jobs, filteredVCF)."""
    out = appendExtension(vcf, 'indelfiltered')
    outputBed, outputVerbose, outputVCF, outputMask = getIndelCallFiles(callTarget)
    return simpleGATKCommand(myPipelineArgs, 'call.filterIndels', '-T VariantFiltration -B:variant,VCF %s -o %s --maskName Indel -B:mask,Bed %s' % (vcf, out, outputMask), lastJobs), out
def VariantOptimizer(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build the variant-quality recalibration chain for this target:
    GenerateVariantClusters -> VariantRecalibrator (+ report jobs) ->
    ApplyVariantCuts.  Returns (allJobs, cutVCF), or ([], vcf) when the
    target opts out of optimization."""
    if callTarget.optimize:
        clusterFile = appendExtension(vcf, 'optimized.clusters', False)
        tranchesFile = appendExtension(vcf, 'optimized.tranches', False)
        optOutVCF = appendExtension(vcf, 'optimized')
        out = appendExtension(vcf, 'optimized.cut')
        table = appendExtension(vcf, 'optimized.table', False)
        #NCLUSTERS = 4
        #ITERATIONS = 3
        DBSNP_PRIOR = 2.0  # dbSNP seems dodger and dodger
        IGNORE_FILTERS_CLUSTERING = '-ignoreFilter DPFilter -ignoreFilter ABFilter -ignoreFilter ESPStandard'
        IGNORE_FILTERS_SCORING = IGNORE_FILTERS_CLUSTERING + ' -ignoreFilter HARD_TO_VALIDATE'
        annotationsToOptimize = ['SB', 'HaplotypeScore', 'QD', 'HRun']
        annotationsToOptimizeArg = ' '.join('-an ' + an for an in annotationsToOptimize)
        tranches = ' '.join('-tranche ' + str(t) for t in [0.1, 1, 10])
        maxVariantsToShow = 2500
        #hapmapVCF = '/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/genotypes_r27_nr.hg18_fwd.vcf'
        hapmapVCF = 'hapmap_analysis/sitesr27_nr.hg18_fwd.vcf'

        REGENERATE_VARIANT_CLUSTERS = True
        if REGENERATE_VARIANT_CLUSTERS:
            jobs1 = simpleGATKCommand(myPipelineArgs, 'call.GenerateVariantClusters', '-T GenerateVariantClusters -qual %d -std 3.5 -mG %d -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:hapmap,vcf %s -B:input,VCF %s -clusterFile %s %s %s --NoByHapMapValidationStatus' % (callTarget.minQual, callTarget.maxClusters, hapmapVCF, vcf, clusterFile, annotationsToOptimizeArg, IGNORE_FILTERS_CLUSTERING), lastJobs)
        else:
            # reuse an existing cluster file; downstream jobs chain off lastJobs
            jobs1 = lastJobs

        jobs2 = simpleGATKCommand(myPipelineArgs, 'call.VariantRecalibrator', '-T VariantRecalibrator -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:input,VCF %s -clusterFile %s -o %s --target_titv %f %s -resources ~/dev/GenomeAnalysisTK/trunk/R/ %s -priorDBSNP %.2f -tranchesFile %s' % (vcf, clusterFile, optOutVCF, callTarget.targetTiTv, IGNORE_FILTERS_SCORING, tranches, DBSNP_PRIOR, tranchesFile), jobs1)

        # report jobs: dump the annotations to a table, then render the R report
        cmd21 = 'python /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/python/vcf2table.py -f CHROM,POS,ID,AC,AF,AN,DB,' + ','.join(annotationsToOptimize) + ' ' + vcf + ' -o ' + table
        jobs21 = [FarmJob(cmd21, jobName=callTarget.name + '.call.' + 'VariantRecalibrationReport.vcf2table', dependencies=jobs2)]
        cmd22 = 'Rscript /humgen/gsa-scr1/depristo/dev/GenomeAnalysisTK/trunk/R/VariantRecalibratorReport/VariantRecalibratorReport.R %s %s %s NA %d' % (clusterFile, clusterFile, table, maxVariantsToShow)
        jobs22 = [FarmJob(cmd22, jobName=callTarget.name + '.call.' + 'VariantRecalibrationReport.RScript', dependencies=jobs21)]

        jobs3 = simpleGATKCommand(myPipelineArgs, 'call.ApplyVariantCuts', '-T ApplyVariantCuts -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:input,VCF %s -o %s -tranchesFile %s --fdr_filter_level %f' % (optOutVCF, out, tranchesFile, callTarget.tranchToTake), jobs2)

        return jobs1 + jobs2 + jobs21 + jobs22 + jobs3, out
    else:
        return [], vcf
def computeConfusionMatrix(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build the ComputeConfusionMatrix job for this target's BAM.

    Returns (farmCmds, None)."""
    out = appendExtension(vcf, 'confusionmatrix', addExtension=False)
    CM_ARGS = '-T ComputeConfusionMatrix -I %s -o %s' % (callTarget.getBam(), out)
    farmCmds = simpleGATKCommand(myPipelineArgs, 'ConfusionMatrix', CM_ARGS, lastJobs)
    return farmCmds, None
def evalSNPs(myPipelineArgs, callTarget, vcf, lastJobs):
    """Build VariantEval jobs for every VCF this target produced, comparing
    against the standard NA12878 validation call sets.

    Returns (jobs, None).  Temporarily bumps the pipeline memory to 3g for
    the eval jobs and restores it afterwards."""
    oldMemory = myPipelineArgs.memory
    myPipelineArgs.memory = '3g'

    def eval1(vcf, namePostfix='', args=''):
        out = os.path.join(OPTIONS.dir, os.path.basename(vcf) + namePostfix + '.ve2')
        maybeHiSeqBindings = ''
        hiSeqComp = os.path.join(OPTIONS.dir, 'HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.optimized.cut.vcf')
        omni = '-B:compOmni,VCF Omni.NA12878.hg18.vcf'
        # only bind the HiSeq comparison track once that call set exists
        if os.path.exists(hiSeqComp):
            maybeHiSeqBindings = '-B:comp_HiSeq,VCF ' + hiSeqComp + ' '
        validation_bindings = maybeHiSeqBindings + '-B:comp_p2_val,VCF 1kg_pilot2_snps.hg18.vcf -B:comp_CG,VCF /humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/NA12878/CG.hg18.vcf -B:compTrio,VCF CEU.trio.2010_03.genotypes.vcf -B:compTrioNovel,VCF CEU.trio.novels.2010_03.genotypes.vcf -B:comp_hm3,VCF /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.2/by_population/genotypes_CEU_phase3.2_consensus.hg18_fwd.vcf ' + omni  # not in hg18 space :-(
        if vcf.find('optimized') != -1:
            # for recalibrated call sets, evaluate the full (pre-cut) VCF and
            # hand VariantEval the matching tranches file
            args += ' -tf ' + appendExtension(vcf.replace('.cut', ''), 'tranches', False)
            vcf = appendExtension(vcf.replace('.cut', ''), 'vcf', False)
        gatk_args = ('-T VariantEval -reportType Grep -D /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod -B:eval,VCF %s ' + validation_bindings + ' -sample NA12878 -o %s -E CompOverlap -E GenotypeConcordance -E TiTvVariantEvaluator -E CountVariants %s') % (vcf, out, args)
        name = 'EVAL_%s_%s' % (callTarget.name, namePostfix)
        return simpleGATKCommand(myPipelineArgs, name, gatk_args, lastJobs)[0]

    jobs = []
    for vcf in callTarget.getVcfs():
        jobs.append(eval1(vcf))
        if OPTIONS.byQEval and vcf.find('optimized') != -1:
            # NOTE(review): 0.03 appears twice in the original; probably meant
            # 0.04 -- preserved as-is, confirm before changing
            for Q in [0.01, 0.02, 0.03, 0.03]:
                jobs.append(eval1(vcf, '.Q' + str(Q), '-Q ' + str(Q)))
    myPipelineArgs.memory = oldMemory
    return jobs, None
if __name__ == "__main__":
    main()