// gatk-3.8/scala/qscript/core/StandardVariantEvaluation.s...
//
// 192 lines
// 7.8 KiB
// Scala
// Executable File

package core
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
import org.broadinstitute.sting.queue.extensions.gatk._
class StandardVariantEvaluation extends QScript {
// todo -- update to released version when things stabilize
/** GATK jar assigned to every GATK job built by this script (see UNIVERSAL_GATK_ARGS). */
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")

/** Reference sequence assigned to every GATK job built by this script. */
@Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

// The command-line arguments below are declared as `var`, like gatkJarFile and
// referenceFile above, so that an overriding value is assigned to a mutable field.

/** Interval string handed to every GATK job. */
@Argument(shortName = "intervals", doc="intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false)
var TARGET_INTERVAL: String = "20"

/** When true (and more than one call set of a type exists) a union call set is built and evaluated too. */
@Argument(shortName = "includeUnion", doc="If provided, we'll create a union of the evaluation data sets for evaluation", required=false)
var CREATE_UNION: Boolean = false

/** Root directory of the standard evaluation data. */
@Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false)
var DATA_DIR: String = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/"

// These are `def`s, not `val`s, on purpose: a `val` would be computed once while
// the script object is constructed, i.e. from the default DATA_DIR, and any value
// assigned to DATA_DIR afterwards (such as a -dataDir override) would be ignored.
/** Directory holding the comparison (truth/validation) VCFs, derived from the current DATA_DIR. */
def COMPS_DIR: String = DATA_DIR + "/comps/"
/** Directory holding the standard evaluation call sets, derived from the current DATA_DIR. */
def EVALS_DIR: String = DATA_DIR + "/evals/"

/** Extra SNP call sets to evaluate in addition to the standard ones. */
@Argument(shortName = "moreSNPsToEval", doc="Path to additional SNP call sets for evaluation", required=false)
var moreSNPsToEval: List[File] = Nil

/** Extra indel call sets to evaluate in addition to the standard ones. */
@Argument(shortName = "moreIndelsToEval", doc="Path to additional Indel call sets for evaluation", required=false)
var moreIndelsToEval: List[File] = Nil

/** Evaluation types processed by script(), in this order. */
val VARIANT_TYPES: List[String] = List("indels", "snps")

/** Variant-context types assigned to the evaluation job (its VT field) for each evaluation type. */
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map(
  "indels" -> List(
    org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL,
    org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED,
    org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION),
  "snps" -> List(
    org.broad.tribble.util.variantcontext.VariantContext.Type.SNP,
    org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION)
)

/** Directory (relative path) that receives the sites-only versions of the comparison files. */
val SITES_DIR: String = "sitesFiles"

// path to b37 DBSNP
// NOTE(review): within this file MY_DBSNP is only referenced from a commented-out
// rod binding in script(); confirm whether it is still needed.
@Argument(shortName = "dbsnp", doc="Path to DBSNP **VCF** for evaluation", required=false)
var MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf")
//val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
/**
 * A comparison track.
 *
 * @param name       label used when binding the track to the evaluation job
 * @param evalType   evaluation type this track belongs to (matched against VARIANT_TYPES in script())
 * @param filename   file name appended to COMPS_DIR
 * @param MakeHomVar when true, `file` is the ".homvar.vcf" variant of the original file
 *                   (script() schedules a SelectHomVars job to produce it)
 */
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
  /** The file as shipped in the comps directory. */
  val originalFile = new File(COMPS_DIR + filename)

  /** The file actually used downstream: the hom-var derivative if requested, otherwise the original. */
  val file: File = MakeHomVar match {
    case true  => swapExt(originalFile, ".vcf",".homvar.vcf")
    case false => originalFile
  }

  /** Sites-only counterpart of `file`, placed under SITES_DIR. */
  val sitesFile = {
    val sitesName = swapExt(file, ".vcf", ".sites.vcf").getName
    new File(SITES_DIR + "/" + sitesName)
  }
}
/**
 * A call set to be evaluated.
 *
 * @param name         label used when binding the call set to the evaluation job
 * @param evalType     evaluation type this call set belongs to
 * @param filename     file name resolved under EVALS_DIR; ignored when `overrideFile` is given
 * @param overrideFile explicit file to use instead of the EVALS_DIR lookup (null means "not provided")
 */
class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) {
  // Option(...) maps a null override to None, in which case the standard location is used.
  val file: File = Option(overrideFile).getOrElse(new File(EVALS_DIR + "/" + filename))
}
/** Registered comparison tracks; entries are prepended, so the most recently added comes first. */
var COMPS: List[Comp] = Nil

/** Registers a comparison track by prepending it to COMPS. */
def addComp(comp: Comp): Unit = {
  COMPS ::= comp
}

/** Registered evaluation call sets; entries are prepended, so the most recently added comes first. */
var EVALS: List[Eval] = Nil

/** Registers an evaluation call set by prepending it to EVALS. */
def addEval(eval: Eval): Unit = {
  EVALS ::= eval
}

/**
 * Registers a call set given as a file: the file's name becomes the
 * label and the file itself is used as the override location.
 */
def addEvalFromCMD(file: File, t: String): Unit = {
  val fromCommandLine = new Eval(name = file.getName, evalType = t, filename = null, overrideFile = file)
  addEval(fromCommandLine)
}
/**
 * Settings shared by every GATK job in this script: jar, reference,
 * interval, logging level and memory limit.
 */
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
  // which GATK to run and against which reference
  this.jarFile = gatkJarFile
  this.reference_sequence = referenceFile
  // restrict every job to the configured interval
  this.intervalsString = TARGET_INTERVAL :: Nil
  this.logging_level = "INFO"
  // NOTE(review): the unit of this limit is not visible here (presumably gigabytes) -- confirm
  this.memoryLimit = Some(2)
}
/**
 * Registers the standard comparison tracks and evaluation call sets for
 * indels and SNPs. Registration order within each list is significant
 * only in that addComp/addEval prepend, so it is kept stable here.
 */
def initializeStandardDataFiles(): Unit = {
  // Each comp row is (label, file name under COMPS_DIR, make hom-var version?).
  def registerComps(kind: String, rows: List[(String, String, Boolean)]) {
    rows.foreach { case (label, vcf, homVar) => addComp(new Comp(label, kind, vcf, homVar)) }
  }
  // Each eval row is (label, file name under EVALS_DIR).
  def registerEvals(kind: String, rows: List[(String, String)]) {
    rows.foreach { case (label, vcf) => addEval(new Eval(label, kind, vcf)) }
  }

  //
  // Standard evaluation files for indels
  //
  registerComps("indels", List(
    ("NA12878.homvar.GATK", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true),
    ("CG.38samples", "CG.Indels.leftAligned.b37.vcf", false),
    ("NA12878.homvar.CG", "NA12878.CG.b37.indels.vcf", true),
    ("g1k.pilot1.validation", "pilot1_indel_validation_2009.b37.vcf", false),
    ("NA12878.hand_curated", "NA12878.validated.curated.polymorphic.indels.vcf", false)
  ))

  //
  // INDEL call sets
  //
  registerEvals("indels", List(
    ("dindel", "20110208.chr20.dindel2.EUR.sites.vcf"),
    ("si", "20101123.chr20.si.v2.EUR.sites.vcf"),
    ("gatk", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf")
  ))

  //
  // Standard evaluation files for SNPs
  //
  registerComps("snps", List(
    ("NA12878.homvar.GATK", "NA12878.HiSeq19.cut.vcf", true),
    ("CG.38samples", "CG.38samples.b37.vcf", false),
    ("NA12878.homvar.CG", "NA12878.CG.b37.snps.vcf", true),
    ("HapMap3.3", "hapmap3.3.sites_r27_nr.b37_fwd.vcf", false),
    ("OMNI.2.5M", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf", false),
    ("g1k.pilot1.validation", "1000G.snp.validation.b37.vcf", false)
  ))

  //
  // SNP call sets
  //
  registerEvals("snps", List(
    ("gatk", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")
  ))
  // todo -- are there other good call sets for evaluation?
  // todo -- add hg19 na12878 64x
}
/**
 * Builds the job graph: hom-var selection and sites-only extraction for the
 * comparison tracks, an optional union of the call sets, and one evaluation
 * job per entry of VARIANT_TYPES writing "<type>.eval".
 */
def script = {
  // make sure the directory for the sites-only files exists
  val sitesOutputDir = new File(SITES_DIR)
  if ( ! sitesOutputDir.exists ) sitesOutputDir.mkdirs()

  initializeStandardDataFiles()

  // add additional files for evaluation, if necessary
  moreSNPsToEval.foreach(snpCalls => addEvalFromCMD(snpCalls, "snps"))
  moreIndelsToEval.foreach(indelCalls => addEvalFromCMD(indelCalls, "indels"))

  //
  // create hom-var versions of key files
  //
  COMPS.filter(_.MakeHomVar).foreach { homVarComp =>
    add(new SelectHomVars(homVarComp.originalFile, homVarComp.file))
  }

  // every comparison track is reduced to a sites-only file
  COMPS.foreach { comp =>
    add(new JustSites(comp.file, comp.sitesFile))
  }

  //
  // Loop over evaluation types
  //
  VARIANT_TYPES.foreach { evalType =>
    val compsOfType = COMPS.filter(_.evalType == evalType)
    val callSetsOfType = EVALS.filter(_.evalType == evalType)

    // if desired and possible, create a union.X.vcf file and evaluate it first
    val evalsOfType =
      if ( CREATE_UNION && callSetsOfType.size > 1 ) {
        val union: File = new File("union.%s.vcf".format(evalType))
        add(new MyCombine(callSetsOfType.map(_.file), union))
        new Eval("union", evalType, null, union) :: callSetsOfType
      } else {
        callSetsOfType
      }

    // our root VE
    val VE = new MyEval()
    VE.VT = VARIANT_TYPE_VT(evalType)
    VE.o = new File(evalType + ".eval")

    // add evals
    evalsOfType.foreach { calls =>
      VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
    }

    // add comps
    //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
    compsOfType.foreach { comp =>
      VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
    }

    add(VE)
  }
}
/**
 * Selects homozygous non-reference sites from a single deep data set.
 *
 * Binds `vcf` as the "variant" track, writes to `out`, and adds the
 * selection expression "AC == 2" (passed with literal surrounding quotes).
 */
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
  this.o = out
  this.rodBind :+= RodBind("variant", "VCF", vcf)
  this.select :+= "\"AC == 2\""
}
/**
 * A simple union of several VCFs.
 *
 * Every input is bound as a VCF track named after its file name; the
 * combined result is written to `out`.
 */
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS {
  this.o = out
  vcfs.foreach { callSet =>
    this.rodBind :+= RodBind(callSet.getName, "VCF", callSet)
  }
}
/**
 * A shell command (cut) that keeps only the first eight tab-separated
 * columns of `in`, which removes all genotyping information, and
 * redirects the result to `out`.
 */
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
  // NOTE(review): the paths are inserted unquoted; confirm they never contain spaces
  def commandLine = "cut -f 1-8 " + in + " > " + out
}
/**
 * Base VariantEval configuration used by this script: adds the
 * "ValidationReport" evaluation module and sets noST.
 */
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
  this.evalModule :+= "ValidationReport"
  // NOTE(review): noST presumably disables the standard stratifications -- confirm against VariantEval
  this.noST = true
}
}