Generic, easy-to-use variant evaluation Queue script that tests indel and SNP call sets against standard evaluation data sets for sensitivity and specificity

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5391 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-07 18:03:29 +00:00
parent dc33fbed7c
commit bf2e02f472
3 changed files with 224 additions and 158 deletions

View File

@ -0,0 +1,191 @@
package core
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
import org.broadinstitute.sting.queue.extensions.gatk._
class StandardVariantEvaluation extends QScript {
// todo -- update to released version when things stabilize
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")
@Argument(shortName = "R", doc="B37 reference sequence: defaults to broad standard location", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Argument(shortName = "intervals", doc="intervals to evaluate. Only supports evaluation on chromosome 20 now, as most evaluation data is there", required=false)
val TARGET_INTERVAL: String = "20"
@Argument(shortName = "includeUnion", doc="If provided, we'll create a union of the evaluation data sets for evaluation", required=false)
val CREATE_UNION: Boolean = false
@Argument(shortName = "dataDir", doc="Path to the standard evaluation data files", required=false)
val DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Comparisons/StandardForEvaluation/b37/"
val COMPS_DIR = DATA_DIR + "/comps/"
val EVALS_DIR = DATA_DIR + "/evals/"
@Argument(shortName = "moreSNPsToEval", doc="Path to additional SNP call sets for evaluation", required=false)
val moreSNPsToEval: List[File] = Nil
@Argument(shortName = "moreIndelsToEval", doc="Path to additional Indel call sets for evaluation", required=false)
val moreIndelsToEval: List[File] = Nil
val VARIANT_TYPES: List[String] = List("indels", "snps")
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map(
"indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION),
"snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION)
)
val SITES_DIR: String = "sitesFiles"
// path to b37 DBSNP
@Argument(shortName = "dbsnp", doc="Path to DBSNP **VCF** for evaluation", required=false)
val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_129_b37.leftAligned.vcf")
//val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
val originalFile = new File(COMPS_DIR + filename)
val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile
val sitesFile = new File(SITES_DIR + "/" + swapExt(file, ".vcf", ".sites.vcf").getName)
}
class Eval(val name: String, val evalType: String, val filename: String, val overrideFile: File = null ) {
val file: File = if ( overrideFile != null ) overrideFile else new File(EVALS_DIR + "/" + filename)
}
var COMPS: List[Comp] = Nil
def addComp(comp: Comp) { COMPS = comp :: COMPS }
var EVALS: List[Eval] = Nil
def addEval(eval: Eval) { EVALS = eval :: EVALS }
def addEvalFromCMD(file: File, t: String) { addEval(new Eval(file.getName, t, null, file)) }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
this.intervalsString = List(TARGET_INTERVAL);
this.reference_sequence = referenceFile;
this.memoryLimit = Some(2)
}
def initializeStandardDataFiles() = {
//
// Standard evaluation files for indels
//
addComp(new Comp("NA12878.homvar.GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true))
addComp(new Comp("CG.38samples", "indels", "CG.Indels.leftAligned.b37.vcf"))
addComp(new Comp("NA12878.homvar.CG", "indels", "NA12878.CG.b37.indels.vcf", true))
addComp(new Comp("g1k.pilot1.validation", "indels", "pilot1_indel_validation_2009.b37.vcf"))
addComp(new Comp("NA12878.hand_curated", "indels", "NA12878.validated.curated.polymorphic.indels.vcf"))
//
// INDEL call sets
//
addEval(new Eval("dindel", "indels", "20110208.chr20.dindel2.EUR.sites.vcf"))
addEval(new Eval("si", "indels", "20101123.chr20.si.v2.EUR.sites.vcf"))
addEval(new Eval("gatk", "indels", "EUR.phase1.chr20.broad.filtered.indels.sites.vcf"))
//
// Standard evaluation files for SNPs
//
addComp(new Comp("NA12878.homvar.GATK", "snps", "NA12878.HiSeq19.cut.vcf", true))
addComp(new Comp("CG.38samples", "snps", "CG.38samples.b37.vcf"))
addComp(new Comp("NA12878.homvar.CG", "snps", "NA12878.CG.b37.snps.vcf", true))
addComp(new Comp("HapMap3.3", "snps", "hapmap3.3.sites_r27_nr.b37_fwd.vcf"))
addComp(new Comp("OMNI.2.5M", "snps", "omni2.5.1212samples.b37.sites.chr20.monoAreAC0.vcf"))
addComp(new Comp("g1k.pilot1.validation", "snps", "1000G.snp.validation.b37.vcf"))
//
// SNP call sets
//
addEval(new Eval("gatk", "snps", "EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf"))
// todo -- are there other good call sets for evaluation?
// todo -- add hg19 na12878 64x
}
def script = {
val sitesDir = new File(SITES_DIR)
if ( ! sitesDir.exists ) sitesDir.mkdirs()
initializeStandardDataFiles();
// add additional files for evaluation, if necessary
moreSNPsToEval.foreach(addEvalFromCMD(_, "snps"))
moreIndelsToEval.foreach(addEvalFromCMD(_, "indels"))
//
// create hom-var versions of key files
//
for ( comp <- COMPS )
if ( comp.MakeHomVar )
add(new SelectHomVars(comp.originalFile, comp.file))
for ( comp <- COMPS )
add(new JustSites(comp.file, comp.sitesFile))
//
// Loop over evaluation types
//
for ( evalType <- VARIANT_TYPES ) {
var evalsOfType = EVALS.filter(_.evalType == evalType)
val compsOfType = COMPS.filter(_.evalType == evalType)
// if desired and possible, create a union.X.vcf file
if ( CREATE_UNION && evalsOfType.size > 1 ) {
val union: File = new File("union.%s.vcf".format(evalType))
add(new MyCombine(evalsOfType.map(_.file), union));
evalsOfType = new Eval("union", evalType, null, union) :: evalsOfType
}
// our root VE
val VE = new MyEval()
VE.VT = VARIANT_TYPE_VT(evalType)
VE.o = new File(evalType + ".eval")
// add evals
for ( calls <- evalsOfType )
VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
// add comps
//VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
for ( comp <- compsOfType )
VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
add(VE)
}
}
/**
* Select homozygous non-reference sites from a single deep data set
*/
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("variant", "VCF", vcf)
this.o = out
this.select ++= List("\"AC == 2\"")
}
/**
* A simple union
*/
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS {
for ( vcf <- vcfs )
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
this.o = out
}
/**
* A command line (cut) that removes all genotyping information from a file
*/
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
def commandLine = "cut -f 1-8 %s > %s".format(in, out)
}
/**
* Base class for VariantEval used here
*/
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
this.noST = true
this.evalModule :+= "ValidationReport"
}
}

View File

@ -1,147 +0,0 @@
package oneoffs.depristo
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
import org.broadinstitute.sting.queue.extensions.gatk._
class StandardVariantEvalation extends QScript {
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")
@Argument(shortName = "R", doc="ref", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Argument(shortName = "intervals", doc="intervals", required=false)
val TARGET_INTERVAL: String = "20";
val DATA_DIR = "data/" // todo -- make into an argument
val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
val JOB_QUEUE = "hour"
// todo -- add arguments to include other files for SNPs and indels
val VARIANT_TYPES: List[String] = List("indels", "snps")
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map(
"indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION),
"snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION)
)
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
val originalFile = new File(DATA_DIR + filename)
val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile
val sitesFile = swapExt(file, ".vcf", ".sites.vcf")
}
class Eval(val name: String, val evalType: String, val file: File) {}
var COMPS: List[Comp] = Nil
def addComp(comp: Comp) { COMPS = comp :: COMPS }
var EVALS: List[Eval] = Nil
def addEval(eval: Eval) { EVALS = eval :: EVALS }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
this.intervalsString = List(TARGET_INTERVAL);
this.reference_sequence = referenceFile;
this.jobQueue = JOB_QUEUE;
this.memoryLimit = Some(2)
}
def script = {
//
// Standard evaluation files for indels
//
addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true))
addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf"))
addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true))
addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf"))
addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf"))
//
// INDEL call sets
//
addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf")))
addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf")))
addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf")))
//
// Standard evaluation files for SNPs
//
addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true))
addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf"))
addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true))
//
// SNP call sets
//
addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")))
//
// create hom-var versions of key files
//
for ( comp <- COMPS )
if ( comp.MakeHomVar )
add(new SelectHomVars(comp.originalFile, comp.file))
for ( comp <- COMPS )
add(new JustSites(comp.file, comp.sitesFile))
//
// Loop over evaluation types
//
for ( evalType <- VARIANT_TYPES ) {
var evalsOfType = EVALS.filter(_.evalType == evalType)
val compsOfType = COMPS.filter(_.evalType == evalType)
if ( evalsOfType.size > 1 ) {
val union: File = new File("union.%s.vcf".format(evalType))
add(new MyCombine(evalsOfType.map(_.file), union));
evalsOfType = new Eval("union", evalType, union) :: evalsOfType
}
val VE = new MyEval()
VE.VT = VARIANT_TYPE_VT(evalType)
VE.o = new File(evalType + ".eval")
// add evals
for ( calls <- evalsOfType )
VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
// add comps
//VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
for ( comp <- compsOfType )
VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
add(VE)
}
}
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("variant", "VCF", vcf)
this.o = out
this.select ++= List("\"AC == 2\"")
}
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS {
for ( vcf <- vcfs )
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
this.o = out
}
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
def commandLine = "cut -f 1-8 %s > %s".format(in, out)
this.jobQueue = JOB_QUEUE;
}
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
this.noST = true
this.evalModule :+= "ValidationReport"
}
}

View File

@ -9,8 +9,18 @@ class ManySampleUGPerformanceTesting extends QScript {
@Argument(shortName = "R", doc="ref", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val TARGET_INTERVAL = "my.intervals"
val FULL_BAM_LIST = new File("allPopulations_phase1_release.no_solid.list")
@Argument(shortName = "bams", doc="BAMs", required=true)
val FULL_BAM_LIST: File = null;
@Argument(shortName = "intervals", doc="intervals", required=true)
val TARGET_INTERVAL: String = null;
@Argument(shortName = "preMerge", doc="preMerge", required=false)
val PRE_MERGE: Boolean = false;
@Argument(shortName = "dcov", doc="dcov", required=false)
val DCOV: Int = 50;
val MERGED_DIR = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/manySampleUGPerformance/")
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
@ -24,26 +34,37 @@ class ManySampleUGPerformanceTesting extends QScript {
}
def script = {
for (nSamples <- List(1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900)) {
for (nSamples <- List(1, 2, 4, 8, 16, 32, 60)) {
// for (nSamples <- List(1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900)) {
// for (nSamples <- List(10)) {
val sublist = new SliceList(nSamples)
val mergeSublist = new MergeBAMs(sublist.list)
val name: String = if ( PRE_MERGE ) "pre_merge" else "dynamic_merge"
val bams: File = if ( PRE_MERGE ) mergeSublist.o else sublist.list
add(sublist)
add(mergeSublist)
add(new Index(mergeSublist.o) )
if ( PRE_MERGE ) {
add(mergeSublist)
add(new Index(mergeSublist.o) )
}
// SNP calling
//add(new Call(sublist.list, nSamples, "dynamic_merge"))
add(new Call(mergeSublist.o, nSamples, "pre_merge"))
val gt = new Call(bams, nSamples, name);
gt.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.N2_GOLD_STANDARD)
add(gt)
val gtLinear = new Call(bams, nSamples, name + "_linear");
gtLinear.exactCalculation = Some(org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL)
add(gtLinear)
// SNP calling -- no annotations
//add(new Call(sublist.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; })
add(new Call(mergeSublist.o, nSamples, "pre_merge_no_annotations") { this.G :+= "none"; })
//add(new Call(bams.list, nSamples, "dynamic_merge_no_annotations") { this.G :+= "None"; })
// CountLoci
//add(new MyCountLoci(sublist.list, nSamples, "dynamic_merge"))
add(new MyCountLoci(mergeSublist.o, nSamples, "pre_merge"))
add(new MyCountLoci(bams, nSamples, name))
}
}
@ -63,20 +84,21 @@ class ManySampleUGPerformanceTesting extends QScript {
@Output(doc="foo") var outVCF: File = new File("%s.%d.%s.vcf".format(bamList.getName, n, name))
this.input_file :+= bamList
this.stand_call_conf = Option(10.0)
this.dcov = Option(50);
this.dcov = Option(DCOV);
this.o = outVCF
}
class MyCountLoci(@Input(doc="foo") bamList: File, n: Int, name: String) extends CountLoci with UNIVERSAL_GATK_ARGS {
@Output(doc="foo") var outFile: File = new File("%s.%d.%s.txt".format(bamList.getName, n, name))
this.input_file :+= bamList
this.dcov = Option(50);
this.dcov = Option(DCOV);
this.o = outFile
}
class SliceList(n: Int) extends CommandLineFunction {
@Output(doc="foo") var list: File = new File("bams.%d.list".format(n))
def commandLine = "head -n %d %s > %s".format(n, FULL_BAM_LIST, list)
this.jobQueue = "gsa";
}
}