Due to a problem in the way that dynamic type selection works, I've added an explicit (temporary) ability to restrict VE to specific variant types (SNPs, INDELs, etc), so that calculations will work when a site has a SNP in dbSNP but is called as an indel, causing the SNP site to mysteriously disappear from the comp track, a huge problem for validation report. VEU updated to allow both dynamic type (old) and just returning everything in the track.
Also, created a standard Queue script that calculates a suite of standard indel and SNP assessment results. Will be the basis for a general evaluation Queue script with standardized data files for SNPs and Indels. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5385 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2f1e249aed
commit
5c979633f0
|
|
@ -68,6 +68,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
@Argument(fullName="doNotUseAllStandardStratifications", shortName="noST", doc="Do not use the standard stratification modules by default (instead, only those that are specified with the -S option)")
|
||||
protected Boolean NO_STANDARD_STRATIFICATIONS = false;
|
||||
|
||||
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
|
||||
protected Set<VariantContext.Type> typesToUse = null;
|
||||
|
||||
// Evaluator arguments
|
||||
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
|
||||
protected String[] MODULES_TO_USE = {};
|
||||
|
|
@ -201,7 +204,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
|
||||
if (tracker != null) {
|
||||
// track sample vc
|
||||
HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames);
|
||||
HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null);
|
||||
|
||||
for ( String compName : compNames ) {
|
||||
VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null;
|
||||
|
|
@ -210,6 +213,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
|
|||
for ( String sampleName : sampleNamesForStratification ) {
|
||||
VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null;
|
||||
|
||||
if ( typesToUse != null ) {
|
||||
if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null;
|
||||
if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null;
|
||||
// if ( eval != null ) logger.info("Keeping " + eval);
|
||||
}
|
||||
|
||||
HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
|
||||
for ( VariantStratifier vs : stratificationObjects ) {
|
||||
ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
|
||||
|
|
|
|||
|
|
@ -267,27 +267,35 @@ public class VariantEvalUtils {
|
|||
* @param evalNames the evaluation track names
|
||||
* @return the set of allowable variation types
|
||||
*/
|
||||
public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
|
||||
EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION);
|
||||
public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker,
|
||||
ReferenceContext ref,
|
||||
Set<String> compNames,
|
||||
Set<String> evalNames,
|
||||
boolean dynamicSelectTypes ) {
|
||||
if ( dynamicSelectTypes ) { // todo -- this code is really conceptually broken
|
||||
EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.NO_VARIATION);
|
||||
|
||||
if (tracker != null) {
|
||||
Collection<VariantContext> evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);
|
||||
if (tracker != null) {
|
||||
Collection<VariantContext> evalvcs = tracker.getVariantContexts(ref, evalNames, null, ref.getLocus(), true, false);
|
||||
|
||||
for (VariantContext vc : evalvcs) {
|
||||
allowableTypes.add(vc.getType());
|
||||
}
|
||||
|
||||
if (allowableTypes.size() == 1) {
|
||||
// We didn't find any variation in the eval track, so now let's look at the comp track for allowable types
|
||||
Collection<VariantContext> compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false);
|
||||
|
||||
for (VariantContext vc : compvcs) {
|
||||
for (VariantContext vc : evalvcs) {
|
||||
allowableTypes.add(vc.getType());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return allowableTypes;
|
||||
if (allowableTypes.size() == 1) {
|
||||
// We didn't find any variation in the eval track, so now let's look at the comp track for allowable types
|
||||
Collection<VariantContext> compvcs = tracker.getVariantContexts(ref, compNames, null, ref.getLocus(), true, false);
|
||||
|
||||
for (VariantContext vc : compvcs) {
|
||||
allowableTypes.add(vc.getType());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return allowableTypes;
|
||||
} else {
|
||||
return EnumSet.allOf(VariantContext.Type.class);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -345,10 +353,9 @@ public class VariantEvalUtils {
|
|||
* @param subsetBySample if false, do not separate the track into per-sample VCs
|
||||
* @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need
|
||||
* to do this)
|
||||
* @param allowNoCalls if false, don't accept no-call loci from a variant track
|
||||
* @return a mapping of track names to a list of VariantContext objects
|
||||
*/
|
||||
public HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample, boolean allowNoCalls) {
|
||||
public HashMap<String, HashMap<String, VariantContext>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
|
||||
HashMap<String, HashMap<String, VariantContext>> bindings = new HashMap<String, HashMap<String, VariantContext>>();
|
||||
|
||||
for (String trackName : trackNames) {
|
||||
|
|
@ -365,7 +372,7 @@ public class VariantEvalUtils {
|
|||
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
|
||||
}
|
||||
|
||||
if ((byFilter || !vcsub.isFiltered()) && (allowNoCalls || vcsub.getType() != VariantContext.Type.NO_VARIATION)) {
|
||||
if ((byFilter || !vcsub.isFiltered())) {
|
||||
vcs.put(VariantEvalWalker.getAllSampleName(), vcsub);
|
||||
}
|
||||
|
||||
|
|
@ -374,7 +381,7 @@ public class VariantEvalUtils {
|
|||
for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) {
|
||||
VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName);
|
||||
|
||||
if ((byFilter || !samplevc.isFiltered()) && (allowNoCalls || samplevc.getType() != VariantContext.Type.NO_VARIATION)) {
|
||||
if ((byFilter || !samplevc.isFiltered())) {
|
||||
vcs.put(sampleName, samplevc);
|
||||
}
|
||||
}
|
||||
|
|
@ -397,10 +404,10 @@ public class VariantEvalUtils {
|
|||
* @param evalNames the list of eval names to process
|
||||
* @return a mapping of track names to a list of VariantContext objects
|
||||
*/
|
||||
public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames) {
|
||||
public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames, boolean dynamicSelectTypes) {
|
||||
HashMap<String, HashMap<String, VariantContext>> vcs = new HashMap<String, HashMap<String, VariantContext>>();
|
||||
|
||||
EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames);
|
||||
EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes);
|
||||
|
||||
boolean byFilter = false;
|
||||
boolean perSampleIsEnabled = false;
|
||||
|
|
@ -412,8 +419,8 @@ public class VariantEvalUtils {
|
|||
}
|
||||
}
|
||||
|
||||
HashMap<String, HashMap<String, VariantContext>> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled, true);
|
||||
HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false, false);
|
||||
HashMap<String, HashMap<String, VariantContext>> evalBindings = bindVariantContexts(tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled);
|
||||
HashMap<String, HashMap<String, VariantContext>> compBindings = bindVariantContexts(tracker, ref, compNames, allowableTypes, byFilter, false, false);
|
||||
|
||||
vcs.putAll(compBindings);
|
||||
vcs.putAll(evalBindings);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,147 @@
|
|||
package oneoffs.depristo
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.RodBind
|
||||
import org.broadinstitute.sting.queue.extensions.gatk._
|
||||
|
||||
class StandardVariantEvalation extends QScript {
|
||||
@Argument(doc="gatkJarFile", required=false)
|
||||
var gatkJarFile: File = new File("/home/radon01/depristo/dev/GenomeAnalysisTKFromLaptop/trunk/dist/GenomeAnalysisTK.jar")
|
||||
|
||||
@Argument(shortName = "R", doc="ref", required=false)
|
||||
var referenceFile: File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||
|
||||
@Argument(shortName = "intervals", doc="intervals", required=false)
|
||||
val TARGET_INTERVAL: String = "20";
|
||||
|
||||
val DATA_DIR = "data/" // todo -- make into an argument
|
||||
|
||||
val MY_DBSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf");
|
||||
|
||||
val JOB_QUEUE = "hour"
|
||||
|
||||
// todo -- add arguments to include other files for SNPs and indels
|
||||
|
||||
val VARIANT_TYPES: List[String] = List("indels", "snps")
|
||||
val VARIANT_TYPE_VT: Map[String, List[org.broad.tribble.util.variantcontext.VariantContext.Type]] = Map(
|
||||
"indels" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.INDEL, org.broad.tribble.util.variantcontext.VariantContext.Type.MIXED, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION),
|
||||
"snps" -> List(org.broad.tribble.util.variantcontext.VariantContext.Type.SNP, org.broad.tribble.util.variantcontext.VariantContext.Type.NO_VARIATION)
|
||||
)
|
||||
|
||||
class Comp(val name: String, val evalType: String, val filename: String, val MakeHomVar: Boolean = false) {
|
||||
val originalFile = new File(DATA_DIR + filename)
|
||||
val file: File = if ( MakeHomVar ) swapExt(originalFile, ".vcf",".homvar.vcf") else originalFile
|
||||
val sitesFile = swapExt(file, ".vcf", ".sites.vcf")
|
||||
}
|
||||
|
||||
class Eval(val name: String, val evalType: String, val file: File) {}
|
||||
|
||||
var COMPS: List[Comp] = Nil
|
||||
def addComp(comp: Comp) { COMPS = comp :: COMPS }
|
||||
|
||||
var EVALS: List[Eval] = Nil
|
||||
def addEval(eval: Eval) { EVALS = eval :: EVALS }
|
||||
|
||||
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK {
|
||||
this.logging_level = "INFO";
|
||||
this.jarFile = gatkJarFile;
|
||||
this.intervalsString = List(TARGET_INTERVAL);
|
||||
this.reference_sequence = referenceFile;
|
||||
this.jobQueue = JOB_QUEUE;
|
||||
this.memoryLimit = Some(2)
|
||||
}
|
||||
|
||||
def script = {
|
||||
//
|
||||
// Standard evaluation files for indels
|
||||
//
|
||||
addComp(new Comp("homVarNA12878GATK", "indels", "Indels.NA12878_WGS.filtered_Q50.0_QD5.0_SB-1.0_HR18.vcf", true))
|
||||
addComp(new Comp("CG38", "indels", "CG.Indels.leftAligned.b37.vcf"))
|
||||
addComp(new Comp("homVarNA12878CG", "indels", "NA12878.CG.b37.indels.vcf", true))
|
||||
addComp(new Comp("Pilot1Validation", "indels", "pilot1_indel_validation_2009.b37.vcf"))
|
||||
addComp(new Comp("Pilot1Validation", "indels", "NA12878.validated.curated.polymorphic.indels.vcf"))
|
||||
|
||||
//
|
||||
// INDEL call sets
|
||||
//
|
||||
addEval(new Eval("dindel", "indels", new File("20110208.chr20.dindel2.EUR.sites.vcf")))
|
||||
addEval(new Eval("si", "indels", new File("20101123.chr20.si.v2.EUR.sites.vcf")))
|
||||
addEval(new Eval("gatk", "indels", new File("EUR.phase1.chr20.broad.filtered.indels.sites.vcf")))
|
||||
|
||||
//
|
||||
// Standard evaluation files for SNPs
|
||||
//
|
||||
addComp(new Comp("homVarNA12878GATK", "snps", "NA12878.HiSeq19.cut.vcf", true))
|
||||
addComp(new Comp("CG38", "snps", "CG.38samples.b37.vcf"))
|
||||
addComp(new Comp("homVarNA12878CG", "snps", "NA12878.CG.b37.snps.vcf", true))
|
||||
|
||||
//
|
||||
// SNP call sets
|
||||
//
|
||||
addEval(new Eval("gatk", "snps", new File("EUR+.phase1.chr20.broad.recal.vrcut1p0.sites.vcf")))
|
||||
|
||||
//
|
||||
// create hom-var versions of key files
|
||||
//
|
||||
for ( comp <- COMPS )
|
||||
if ( comp.MakeHomVar )
|
||||
add(new SelectHomVars(comp.originalFile, comp.file))
|
||||
|
||||
for ( comp <- COMPS )
|
||||
add(new JustSites(comp.file, comp.sitesFile))
|
||||
|
||||
//
|
||||
// Loop over evaluation types
|
||||
//
|
||||
for ( evalType <- VARIANT_TYPES ) {
|
||||
var evalsOfType = EVALS.filter(_.evalType == evalType)
|
||||
val compsOfType = COMPS.filter(_.evalType == evalType)
|
||||
|
||||
if ( evalsOfType.size > 1 ) {
|
||||
val union: File = new File("union.%s.vcf".format(evalType))
|
||||
add(new MyCombine(evalsOfType.map(_.file), union));
|
||||
evalsOfType = new Eval("union", evalType, union) :: evalsOfType
|
||||
}
|
||||
|
||||
val VE = new MyEval()
|
||||
VE.VT = VARIANT_TYPE_VT(evalType)
|
||||
VE.o = new File(evalType + ".eval")
|
||||
|
||||
// add evals
|
||||
for ( calls <- evalsOfType )
|
||||
VE.rodBind :+= RodBind("eval_" + calls.name, "VCF", calls.file)
|
||||
|
||||
// add comps
|
||||
//VE.rodBind :+= RodBind("dbsnp", "VCF", MY_DBSNP)
|
||||
for ( comp <- compsOfType )
|
||||
VE.rodBind :+= RodBind("comp_" + comp.name, "VCF", comp.sitesFile)
|
||||
|
||||
add(VE)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class SelectHomVars(@Input(doc="foo") vcf: File, @Output(doc="foo") out: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
|
||||
this.rodBind :+= RodBind("variant", "VCF", vcf)
|
||||
this.o = out
|
||||
this.select ++= List("\"AC == 2\"")
|
||||
}
|
||||
|
||||
class MyCombine(@Input(doc="foo") vcfs: List[File], @Output(doc="foo") out: File) extends CombineVariants with UNIVERSAL_GATK_ARGS {
|
||||
for ( vcf <- vcfs )
|
||||
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
|
||||
this.o = out
|
||||
}
|
||||
|
||||
class JustSites(@Input(doc="foo") in: File, @Output(doc="foo") out: File) extends CommandLineFunction {
|
||||
def commandLine = "cut -f 1-8 %s > %s".format(in, out)
|
||||
this.jobQueue = JOB_QUEUE;
|
||||
}
|
||||
|
||||
class MyEval() extends VariantEval with UNIVERSAL_GATK_ARGS {
|
||||
this.noST = true
|
||||
this.evalModule :+= "ValidationReport"
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue