Due to a problem in the way that dynamic type selection works, I've added an explicit (temporary) ability to restrict VE to specific variant types (SNPs, INDELs, etc), so that calculations will work when a site has a SNP in dbSNP but is called as an indel, causing the SNP site to mysteriously disappear from the comp track, a huge problem for validation report. VEU updated to allow both dynamic type (old) and just returning everything in the track.

Also, created a standard Queue script that calculates a suite of standard indel and SNP assessment results.  Will be the basis for a general evaluation Queue script with standardized data files for SNPs and Indels.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5385 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-06 19:31:12 +00:00
parent 2f1e249aed
commit 5c979633f0
3 changed files with 188 additions and 25 deletions

View File

@ -68,6 +68,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)")
protected Boolean NO_STANDARD_STRATIFICATIONS = false;
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
protected Set<VariantContext.Type> typesToUse = null;
// Evaluator arguments
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@ -201,7 +204,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if (tracker != null) {
// track sample vc
HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames);
HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null);
for ( String compName : compNames ) {
VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null;
@ -210,6 +213,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
for ( String sampleName : sampleNamesForStratification ) {
VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null;
if ( typesToUse != null ) {
if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null;
if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null;
// if ( eval != null ) logger.info("Keeping " + eval);
}
HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
for ( VariantStratifier vs : stratificationObjects ) {
ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);

View File

@ -267,27 +267,35 @@ public class VariantEvalUtils {
* @param evalNames the evaluation track names
* @return the set of allowable variation types
*/
public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION);
public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker,
ReferenceContext ref,
Set<String> compNames,
Set<String> evalNames,
boolean dynamicSelectTypes ) {
if ( dynamicSelectTypes ) { // todo -- this code is really conceptually broken
EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION);
if (tracker != null) {
Collection<VariantContext> evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);
if (tracker != null) {
Collection<VariantContext> evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);
for (VariantContext vc : evalvcs) {
allowableTypes.add(vc.getType());
}
if (allowableTypes.size() == 1) {
// We didn't find any variation in the eval track, so now let's look at the comp track for allowable types
Collection<VariantContext> compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false);
for (VariantContext vc : compvcs) {
for (VariantContext vc : evalvcs) {
allowableTypes.add(vc.getType());
}
}
}
return allowableTypes;
if (allowableTypes.size() == 1) {
// We didn't find any variation in the eval track, so now let's look at the comp track for allowable types
Collection<VariantContext> compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false);
for (VariantContext vc : compvcs) {
allowableTypes.add(vc.getType());
}
}
}
return allowableTypes;
} else {
return EnumSet.allOf(VariantContext.Type.class);
}
}
/**
@ -345,10 +353,9 @@ public class VariantEvalUtils {
* @param subsetBySample if false, do not separate the track into per-sample VCs
* @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need
* to do this)
* @param allowNoCalls if false, don't accept no-call loci from a variant track
* @return a mapping of track names to a list of VariantContext objects
*/
public HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample, boolean allowNoCalls) {
public HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
HashMap<String, HashMap<String, VariantContext>> bindings = new HashMap<String, HashMap<String, VariantContext>>();
for (String trackName : trackNames) {
@ -365,7 +372,7 @@ public class VariantEvalUtils {
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
}
if ((byFilter || !vcsub.isFiltered()) && (allowNoCalls || vcsub.getType() != VariantContext.Type.NO_VARIATION)) {
if ((byFilter || !vcsub.isFiltered())) {
vcs.put(VariantEvalWalker.getAllSampleName(), vcsub);
}
@ -374,7 +381,7 @@ public class VariantEvalUtils {
for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) {
VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName);
if ((byFilter || !samplevc.isFiltered()) && (allowNoCalls || samplevc.getType() != VariantContext.Type.NO_VARIATION)) {
if ((byFilter || !samplevc.isFiltered())) {
vcs.put(sampleName, samplevc);
}
}
@ -397,10 +404,10 @@ public class VariantEvalUtils {
* @param evalNames the list of eval names to process
* @return a mapping of track names to a list of VariantContext objects
*/
public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames, boolean dynamicSelectTypes) {
HashMap<String, HashMap<String, VariantContext>> vcs = new HashMap<String, HashMap<String, VariantContext>>();
EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames);
EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes);
boolean byFilter = false;
boolean perSampleIsEnabled = false;
@ -412,8 +419,8 @@ public class VariantEvalUtils {
}
}
HashMap<String, HashMap<String, VariantContext>> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled, true);
HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false, false);
HashMap<String, HashMap<String, VariantContext>> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled);
HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false);
vcs.putAll(compBindings);
vcs.putAll(evalBindings);

View File

@ -0,0 +1,147 @@
package oneoffs.depristo
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
import org.broadinstitute.sting.queue.extensions.gatk._
class StandardVariantEvalation extends QScript {
@Argument(doc="gatkJarFile", required=false)
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")
@Argument(shortName = "R", doc="ref", required=false)
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
@Argument(shortName = "intervals", doc="intervals", required=false)
val TARGET_INTERVAL: String = "20";
val DATA_DIR = "data/" // todo -- make into an argument
val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
val JOB_QUEUE = "hour"
// todo -- add arguments to include other files for SNPs and indels
val VARIANT_TYPES: List[String] = List("indels", "snps")
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map(
"indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION),
"snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION)
)
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
val originalFile = new File(DATA_DIR + filename)
val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile
val sitesFile = swapExt(file, ".vcf", ".sites.vcf")
}
class Eval(val name: String, val evalType: String, val file: File) {}
var COMPS: List[Comp] = Nil
def addComp(comp: Comp) { COMPS = comp :: COMPS }
var EVALS: List[Eval] = Nil
def addEval(eval: Eval) { EVALS = eval :: EVALS }
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
this.logging_level = "INFO";
this.jarFile = gatkJarFile;
this.intervalsString = List(TARGET_INTERVAL);
this.reference_sequence = referenceFile;
this.jobQueue = JOB_QUEUE;
this.memoryLimit = Some(2)
}
def script = {
//
// Standard evaluation files for indels
//
addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true))
addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf"))
addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true))
addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf"))
addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf"))
//
// INDEL call sets
//
addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf")))
addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf")))
addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf")))
//
// Standard evaluation files for SNPs
//
addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true))
addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf"))
addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true))
//
// SNP call sets
//
addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")))
//
// create hom-var versions of key files
//
for ( comp <- COMPS )
if ( comp.MakeHomVar )
add(new SelectHomVars(comp.originalFile, comp.file))
for ( comp <- COMPS )
add(new JustSites(comp.file, comp.sitesFile))
//
// Loop over evaluation types
//
for ( evalType <- VARIANT_TYPES ) {
var evalsOfType = EVALS.filter(_.evalType == evalType)
val compsOfType = COMPS.filter(_.evalType == evalType)
if ( evalsOfType.size > 1 ) {
val union: File = new File("union.%s.vcf".format(evalType))
add(new MyCombine(evalsOfType.map(_.file), union));
evalsOfType = new Eval("union", evalType, union) :: evalsOfType
}
val VE = new MyEval()
VE.VT = VARIANT_TYPE_VT(evalType)
VE.o = new File(evalType + ".eval")
// add evals
for ( calls <- evalsOfType )
VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
// add comps
//VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
for ( comp <- compsOfType )
VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
add(VE)
}
}
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("variant", "VCF", vcf)
this.o = out
this.select ++= List("\"AC == 2\"")
}
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS {
for ( vcf <- vcfs )
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
this.o = out
}
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
def commandLine = "cut -f 1-8 %s > %s".format(in, out)
this.jobQueue = JOB_QUEUE;
}
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
this.noST = true
this.evalModule :+= "ValidationReport"
}
}