// 2011-03-08 02:03:29 +08:00
package core
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
import org.broadinstitute.sting.queue.extensions.gatk._
class StandardVariantEvaluation extends QScript {
// todo -- update to released version when things stabilize
@Argument ( doc = "gatkJarFile" , required = false )
var gatkJarFile : File = new File ( "/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar" )
@Argument ( shortName = "R" , doc = "B37 reference sequence: defaults to broad standard location" , required = false )
var referenceFile : File = new File ( "/humgen/1kg/reference/human_g1k_v37.fasta" )
@Argument ( shortName = "intervals" , doc = "intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there" , required = false )
val TARGET_INTERVAL : String = "20"
@Argument ( shortName = "includeUnion" , doc = "If provided, we'll create a union of the evaluation data sets for evaluation" , required = false )
val CREATE_UNION : Boolean = false
@Argument ( shortName = "dataDir" , doc = "Path to the standard evaluation data files" , required = false )
val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/"
val COMPS_DIR = DATA_DIR + "/comps/"
val EVALS_DIR = DATA_DIR + "/evals/"
@Argument ( shortName = "moreSNPsToEval" , doc = "Path to additional SNP call sets for evaluation" , required = false )
val moreSNPsToEval : List [ File ] = Nil
@Argument ( shortName = "moreIndelsToEval" , doc = "Path to additional Indel call sets for evaluation" , required = false )
val moreIndelsToEval : List [ File ] = Nil
val VARIANT_TYPES : List [ String ] = List ( "indels" , "snps" )
val VARIANT_TYPE_VT : Map [ String , List [ org . broad . tribble . util . variantcontext . VariantContext . Type ] ] = Map (
"indels" -> List ( org . broad . tribble . util . variantcontext . VariantContext . Type . INDEL , org . broad . tribble . util . variantcontext . VariantContext . Type . MIXED , org . broad . tribble . util . variantcontext . VariantContext . Type . NO_VARIATION ) ,
"snps" -> List ( org . broad . tribble . util . variantcontext . VariantContext . Type . SNP , org . broad . tribble . util . variantcontext . VariantContext . Type . NO_VARIATION )
)
val SITES_DIR : String = "sitesFiles"
// path to b37 DBSNP
@Argument ( shortName = "dbsnp" , doc = "Path to DBSNP **VCF** for evaluation" , required = false )
val MY_DBSNP : File = new File ( "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf" )
//val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
class Comp ( val name : String , val evalType : String , val filename : String , val MakeHomVar : Boolean = false ) {
val originalFile = new File ( COMPS_DIR + filename )
val file : File = if ( MakeHomVar ) swapExt ( originalFile , ".vcf" , ".homvar.vcf" ) else originalFile
val sitesFile = new File ( SITES_DIR + "/" + swapExt ( file , ".vcf" , ".sites.vcf" ) . getName )
}
class Eval ( val name : String , val evalType : String , val filename : String , val overrideFile : File = null ) {
val file : File = if ( overrideFile != null ) overrideFile else new File ( EVALS_DIR + "/" + filename )
}
var COMPS : List [ Comp ] = Nil
def addComp ( comp : Comp ) { COMPS = comp : : COMPS }
var EVALS : List [ Eval ] = Nil
def addEval ( eval : Eval ) { EVALS = eval : : EVALS }
def addEvalFromCMD ( file : File , t : String ) { addEval ( new Eval ( file . getName , t , null , file ) ) }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this . logging_level = "INFO" ;
this . jarFile = gatkJarFile ;
this . intervalsString = List ( TARGET_INTERVAL ) ;
this . reference_sequence = referenceFile ;
2011-03-24 22:03:51 +08:00
this . memoryLimit = 2
2011-03-08 02:03:29 +08:00
}
def initializeStandardDataFiles ( ) = {
//
// Standard evaluation files for indels
//
addComp ( new Comp ( "NA12878.homvar.GATK" , "indels" , "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf" , true ) )
addComp ( new Comp ( "CG.38samples" , "indels" , "CG.Indels.leftAligned.b37.vcf" ) )
addComp ( new Comp ( "NA12878.homvar.CG" , "indels" , "NA12878.CG.b37.indels.vcf" , true ) )
addComp ( new Comp ( "g1k.pilot1.validation" , "indels" , "pilot1_indel_validation_2009.b37.vcf" ) )
addComp ( new Comp ( "NA12878.hand_curated" , "indels" , "NA12878.validated.curated.polymorphic.indels.vcf" ) )
//
// INDEL call sets
//
addEval ( new Eval ( "dindel" , "indels" , "20110208.chr20.dindel2.EUR.sites.vcf" ) )
addEval ( new Eval ( "si" , "indels" , "20101123.chr20.si.v2.EUR.sites.vcf" ) )
addEval ( new Eval ( "gatk" , "indels" , "EUR.phase1.chr20.broad.filtered.indels.sites.vcf" ) )
//
// Standard evaluation files for SNPs
//
addComp ( new Comp ( "NA12878.homvar.GATK" , "snps" , "NA12878.HiSeq19.cut.vcf" , true ) )
addComp ( new Comp ( "CG.38samples" , "snps" , "CG.38samples.b37.vcf" ) )
addComp ( new Comp ( "NA12878.homvar.CG" , "snps" , "NA12878.CG.b37.snps.vcf" , true ) )
addComp ( new Comp ( "HapMap3.3" , "snps" , "hapmap3.3.sites_r27_nr.b37_fwd.vcf" ) )
addComp ( new Comp ( "OMNI.2.5M" , "snps" , "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf" ) )
addComp ( new Comp ( "g1k.pilot1.validation" , "snps" , "1000G.snp.validation.b37.vcf" ) )
//
// SNP call sets
//
2011-03-12 10:06:28 +08:00
addEval ( new Eval ( "1000G.gatk.eurPlus.phase1" , "snps" , "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf" ) )
addEval ( new Eval ( "1000G.high_specificity.phase1" , "snps" , "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf" ) )
2011-03-08 02:03:29 +08:00
// todo -- are there other good call sets for evaluation?
// todo -- add hg19 na12878 64x
}
def script = {
val sitesDir = new File ( SITES_DIR )
if ( ! sitesDir . exists ) sitesDir . mkdirs ( )
initializeStandardDataFiles ( ) ;
// add additional files for evaluation, if necessary
moreSNPsToEval . foreach ( addEvalFromCMD ( _ , "snps" ) )
moreIndelsToEval . foreach ( addEvalFromCMD ( _ , "indels" ) )
//
// create hom-var versions of key files
//
for ( comp <- COMPS )
if ( comp . MakeHomVar )
add ( new SelectHomVars ( comp . originalFile , comp . file ) )
for ( comp <- COMPS )
add ( new JustSites ( comp . file , comp . sitesFile ) )
//
// Loop over evaluation types
//
for ( evalType <- VARIANT_TYPES ) {
var evalsOfType = EVALS . filter ( _ . evalType == evalType )
val compsOfType = COMPS . filter ( _ . evalType == evalType )
// if desired and possible, create a union.X.vcf file
if ( CREATE_UNION && evalsOfType . size > 1 ) {
val union : File = new File ( "union.%s.vcf" . format ( evalType ) )
add ( new MyCombine ( evalsOfType . map ( _ . file ) , union ) ) ;
evalsOfType = new Eval ( "union" , evalType , null , union ) : : evalsOfType
}
// our root VE
val VE = new MyEval ( )
VE . VT = VARIANT_TYPE_VT ( evalType )
VE . o = new File ( evalType + ".eval" )
// add evals
for ( calls <- evalsOfType )
VE . rodBind : += RodBind ( " eval_ " + calls . name , " VCF " , calls . file )
// add comps
//VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
for ( comp <- compsOfType )
VE . rodBind : += RodBind ( " comp_ " + comp . name , " VCF " , comp . sitesFile )
add ( VE )
}
}
/* *
* Select homozygous non - reference sites from a single deep data set
*/
class SelectHomVars ( @Input ( doc = "foo" ) vcf : File , @Output ( doc = "foo" ) out : File ) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this . rodBind : += RodBind ( " variant " , " VCF " , vcf )
this . o = out
this . select ++= List ( "\"AC == 2\"" )
}
/* *
* A simple union
*/
class MyCombine ( @Input ( doc = "foo" ) vcfs : List [ File ] , @Output ( doc = "foo" ) out : File ) extends CombineVariants with UNIVERSAL_GATK_ARGS {
for ( vcf <- vcfs )
this . rodBind : += RodBind ( vcf . getName , " VCF " , vcf )
this . o = out
}
/* *
* A command line ( cut ) that removes all genotyping information from a file
*/
class JustSites ( @Input ( doc = "foo" ) in : File , @Output ( doc = "foo" ) out : File ) extends CommandLineFunction {
def commandLine = "cut -f 1-8 %s > %s" . format ( in , out )
}
/* *
* Base class for VariantEval used here
*/
class MyEval ( ) extends VariantEval with UNIVERSAL_GATK_ARGS {
this . noST = true
this . evalModule : += " ValidationReport "
}
}