203 lines
8.3 KiB
Scala
Executable File
203 lines
8.3 KiB
Scala
Executable File
package core
|
|
|
|
import org.broadinstitute.sting.queue.QScript
|
|
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
|
|
import org.broadinstitute.sting.queue.extensions.gatk._
|
|
|
|
class StandardVariantEvaluation extends QScript {
|
|
// todo -- update to released version when things stabilize
// GATK jar handed to every GATK job through UNIVERSAL_GATK_ARGS (jarFile).
@Argument(doc = "gatkJarFile", required = false)
var gatkJarFile: File =
  new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")

// Reference FASTA used as reference_sequence by every GATK job.
@Argument(
  shortName = "R",
  doc = "B37 reference sequence: defaults to broad standard location",
  required = false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

// Interval string passed to every GATK job (intervalsString).
@Argument(
  shortName = "intervals",
  doc = "intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there",
  required = false)
val TARGET_INTERVAL: String = "20"

// When true (and more than one eval call set of a type exists), script() also
// evaluates a combined "union" call set.
@Argument(
  shortName = "includeUnion",
  doc = "If provided, we'll create a union of the evaluation data sets for evaluation",
  required = false)
val CREATE_UNION: Boolean = false

// Root directory of the standard data; COMPS_DIR and EVALS_DIR are derived from it.
@Argument(
  shortName = "dataDir",
  doc = "Path to the standard evaluation data files",
  required = false)
val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/"

// When true, initializeStandardDataFiles() registers the built-in 1000G call sets as evals.
@Argument(
  shortName = "evalStandard1000GCalls",
  doc = "If provided, we'll include some standard 1000G data for evaluation",
  required = false)
val EVAL_STANDARD_1000G_CALLS: Boolean = false
|
|
|
|
// Sub-directories of DATA_DIR holding the comparison ("comps") and evaluation ("evals") VCFs.
//
// These are lazy on purpose: DATA_DIR is a command-line argument (dataDir). An eager
// `val` is computed while this object is being constructed, i.e. from DATA_DIR's
// default value, so a dataDir supplied on the command line would never reach these
// paths. Deferring evaluation to first use (inside Comp / Eval, which are only
// created from script()) makes them follow the effective DATA_DIR.
// NOTE(review): relies on Queue populating @Argument fields after construction and
// before script() runs -- confirm against the Queue command-line driver.
lazy val COMPS_DIR = DATA_DIR + "/comps/"
lazy val EVALS_DIR = DATA_DIR + "/evals/"
|
|
|
|
// Extra SNP call sets from the command line; script() registers each one as a "snps" eval.
@Argument(
  shortName = "moreSNPsToEval",
  doc = "Path to additional SNP call sets for evaluation",
  required = false)
val moreSNPsToEval: List[File] = Nil

// Extra indel call sets from the command line; script() registers each one as an "indels" eval.
@Argument(
  shortName = "moreIndelsToEval",
  doc = "Path to additional Indel call sets for evaluation",
  required = false)
val moreIndelsToEval: List[File] = Nil

// Evaluation types, in the order script() processes them.
val VARIANT_TYPES: List[String] = List("indels", "snps")

// Variant-context types assigned to the evaluation job (VE.VT) for each evaluation type.
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = {
  // Alias scoped to this initializer so it cannot shadow anything else in the class.
  import org.broad.tribble.util.variantcontext.VariantContext.{Type => VCType}
  Map(
    "indels" -> List(VCType.INDEL, VCType.MIXED, VCType.NO_VARIATION),
    "snps" -> List(VCType.SNP, VCType.NO_VARIATION)
  )
}

// Directory (relative path) that receives the sites-only versions of the comp files.
val SITES_DIR: String = "sitesFiles"

// path to b37 DBSNP
@Argument(
  shortName = "dbsnp",
  doc = "Path to DBSNP **VCF** for evaluation",
  required = false)
val MY_DBSNP: File =
  new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf")
//val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
|
|
|
|
/**
 * A comparison ("comp") data set living under COMPS_DIR.
 *
 * @param name       label used to build the comp rod name ("comp_" + name) in script()
 * @param evalType   evaluation type this comp belongs to ("indels" or "snps")
 * @param filename   file name relative to COMPS_DIR
 * @param MakeHomVar when true, `file` points at a ".homvar.vcf" variant of the original,
 *                   which script() produces with SelectHomVars
 */
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
  // The file exactly as shipped in the comps directory.
  val originalFile = new File(COMPS_DIR + filename)

  // The file actually used downstream: the hom-var derivative when requested, else the original.
  val file: File = MakeHomVar match {
    case true  => swapExt(originalFile, ".vcf", ".homvar.vcf")
    case false => originalFile
  }

  // Sites-only copy (written by JustSites), placed in SITES_DIR under the ".sites.vcf" name.
  val sitesFile = {
    val sitesName = swapExt(file, ".vcf", ".sites.vcf").getName
    new File(SITES_DIR + "/" + sitesName)
  }
}
|
|
|
|
/**
 * A call set to be evaluated.
 *
 * @param name         label used to build the eval rod name ("eval_" + name) in script()
 * @param evalType     evaluation type this call set belongs to ("indels" or "snps")
 * @param filename     file name relative to EVALS_DIR; only consulted when no override is given
 * @param overrideFile explicit file to use instead of EVALS_DIR/filename (null means "none")
 */
class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) {
  // Option(null) is None, so the EVALS_DIR path is only built when no override was supplied.
  val file: File = Option(overrideFile).getOrElse(new File(EVALS_DIR + "/" + filename))
}
|
|
|
|
// Registered comparison sets; the most recently added comp is at the head.
var COMPS: List[Comp] = Nil

/** Registers `comp` by prepending it to COMPS. */
def addComp(comp: Comp): Unit = {
  COMPS ::= comp
}
|
|
|
|
// Registered call sets to evaluate; the most recently added eval is at the head.
var EVALS: List[Eval] = Nil

/** Registers `eval` by prepending it to EVALS. */
def addEval(eval: Eval): Unit = {
  EVALS ::= eval
}

/**
 * Registers a call set given directly as a file (e.g. from the command line).
 * The eval is named after the file's name and uses `file` as its override,
 * so no EVALS_DIR-relative filename is needed.
 *
 * @param file the call set to evaluate
 * @param t    evaluation type ("indels" or "snps")
 */
def addEvalFromCMD(file: File, t: String): Unit = {
  val fromCommandLine = new Eval(file.getName, t, null, file)
  addEval(fromCommandLine)
}
|
|
|
|
/**
 * Settings shared by every GATK job in this script: logging level, the GATK jar,
 * the target interval, the reference sequence and the memory limit.
 */
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
  this.logging_level = "INFO"
  this.jarFile = gatkJarFile
  this.intervalsString = List(TARGET_INTERVAL)
  this.reference_sequence = referenceFile
  // NOTE(review): unit of the limit is defined by CommandLineGATK, not visible here -- presumably GB.
  this.memoryLimit = 2
}
|
|
|
|
/**
 * Registers the built-in comparison sets for indels and SNPs and, when
 * EVAL_STANDARD_1000G_CALLS is set, the built-in 1000G call sets to evaluate.
 * Entries are registered in the order listed (addComp / addEval prepend, so the
 * resulting COMPS / EVALS lists hold them in reverse).
 */
def initializeStandardDataFiles(): Unit = {
  //
  // Standard evaluation files for indels
  //
  List(
    new Comp("NA12878.homvar.GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true),
    new Comp("CG.38samples", "indels", "CG.Indels.leftAligned.b37.vcf"),
    new Comp("NA12878.homvar.CG", "indels", "NA12878.CG.b37.indels.vcf", true),
    new Comp("g1k.pilot1.validation", "indels", "pilot1_indel_validation_2009.b37.vcf"),
    new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf"),
    new Comp("NA12878.Mullikin", "indels", "NA12878.DIPline.NQScm.expanded.chr20.b37.minReads_2_or_gt2bp.vcf")
  ).foreach(addComp _)

  //
  // INDEL call sets
  //
  if ( EVAL_STANDARD_1000G_CALLS ) {
    List(
      new Eval("dindel", "indels", "20110208.chr20.dindel2.EUR.sites.vcf"),
      new Eval("si", "indels", "20101123.chr20.si.v2.EUR.sites.vcf"),
      new Eval("gatk", "indels", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf")
    ).foreach(addEval _)
  }

  //
  // Standard evaluation files for SNPs
  //
  List(
    new Comp("NA12878.homvar.GATK", "snps", "NA12878.HiSeq19.cut.vcf", true),
    new Comp("CG.38samples", "snps", "CG.38samples.b37.vcf"),
    new Comp("NA12878.homvar.CG", "snps", "NA12878.CG.b37.snps.vcf", true),
    new Comp("HapMap3.3", "snps", "hapmap3.3.sites_r27_nr.b37_fwd.vcf"),
    new Comp("OMNI.2.5M", "snps", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf"),
    new Comp("g1k.pilot1.validation", "snps", "1000G.snp.validation.b37.vcf")
  ).foreach(addComp _)

  //
  // SNP call sets
  //
  if ( EVAL_STANDARD_1000G_CALLS ) {
    List(
      new Eval("1000G.gatk.eurPlus.phase1", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"),
      new Eval("1000G.high_specificity.phase1", "snps", "ALL.phase1.chr20.projectConsensus.highSpecificity.snps.genotypes.sites.vcf")
    ).foreach(addEval _)
  }
}
|
|
|
|
/**
 * Builds the job graph:
 *   1. registers the standard and command-line supplied data sets,
 *   2. adds a SelectHomVars job for every comp flagged MakeHomVar,
 *   3. adds a JustSites job producing the sites-only file of every comp,
 *   4. for each variant type that has at least one eval call set, adds one
 *      evaluation job binding all evals and all comps of that type
 *      (plus a combined "union" call set when CREATE_UNION is set).
 */
def script = {
  // The sites-only comp files are written here, so make sure the directory exists.
  val sitesOutputDir = new File(SITES_DIR)
  if ( ! sitesOutputDir.exists ) sitesOutputDir.mkdirs()

  initializeStandardDataFiles()

  // add additional files for evaluation, if necessary
  moreSNPsToEval.foreach { snpCalls => addEvalFromCMD(snpCalls, "snps") }
  moreIndelsToEval.foreach { indelCalls => addEvalFromCMD(indelCalls, "indels") }

  //
  // create hom-var versions of key files
  //
  COMPS.filter(_.MakeHomVar).foreach { comp =>
    add(new SelectHomVars(comp.originalFile, comp.file))
  }

  // every comp is reduced to a sites-only file before being used as a comp track
  COMPS.foreach { comp =>
    add(new JustSites(comp.file, comp.sitesFile))
  }

  //
  // Loop over evaluation types
  //
  VARIANT_TYPES.foreach { evalType =>
    val requestedEvals = EVALS.filter(_.evalType == evalType)
    val compsOfType = COMPS.filter(_.evalType == evalType)

    if ( requestedEvals.nonEmpty ) {
      // if desired and possible, create a union.X.vcf file and evaluate it first
      val evalsOfType =
        if ( CREATE_UNION && requestedEvals.size > 1 ) {
          val union: File = new File("union.%s.vcf".format(evalType))
          add(new MyCombine(requestedEvals.map(_.file), union))
          new Eval("union", evalType, null, union) :: requestedEvals
        } else {
          requestedEvals
        }

      // our root VE
      val VE = new MyEval()
      VE.VT = VARIANT_TYPE_VT(evalType)
      VE.o = new File(evalType + ".eval")

      // add evals
      evalsOfType.foreach { calls =>
        VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
      }

      // add comps
      //VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
      compsOfType.foreach { comp =>
        VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
      }

      add(VE)
    }
  }
}
|
|
|
|
/**
 * Select homozygous non-reference sites from a single deep data set.
 *
 * Binds `vcf` as the "variant" track, writes to `out`, and selects records
 * matching the expression "AC == 2".
 *
 * @param vcf input call set
 * @param out output file receiving the selected records
 */
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File)
  extends SelectVariants with UNIVERSAL_GATK_ARGS {
  this.o = out
  this.rodBind :+= RodBind("variant", "VCF", vcf)
  // The inner quotes are part of the argument value so the expression stays one token.
  this.select :+= "\"AC == 2\""
}
|
|
|
|
/**
 * A simple union: combines all `vcfs` into `out`.
 * Each input is bound as a VCF track named after its file name.
 *
 * @param vcfs call sets to combine
 * @param out  combined output file
 */
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File)
  extends CombineVariants with UNIVERSAL_GATK_ARGS {
  vcfs.foreach { input =>
    this.rodBind :+= RodBind(input.getName, "VCF", input)
  }
  this.o = out
}
|
|
|
|
/**
 * A command line (cut) that removes all genotyping information from a file
 * by keeping only the first eight tab-separated columns of `in`.
 *
 * @param in  file to strip
 * @param out destination of the sites-only copy
 */
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
  def commandLine: String = {
    val template = "cut -f 1-8 %s > %s"
    template.format(in, out)
  }
}
|
|
|
|
/**
 * Base class for VariantEval used here: sets the noST flag and adds the
 * "ValidationReport" evaluation module on top of the shared GATK settings.
 */
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
  this.noST = true
  this.evalModule = this.evalModule :+ "ValidationReport"
}
|
|
}
|
|
|