Fixing VariantEval logic and having it use the new rod system.

This commit is contained in:
Eric Banks 2011-08-11 13:39:34 -04:00
parent f1b09db39e
commit 265c3d744b
3 changed files with 148 additions and 127 deletions

View File

@ -60,14 +60,16 @@ import java.util.*;
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGenotyper.UGStatistics> implements TreeReducible<UnifiedGenotyper.UGStatistics>, AnnotatorCompatibleWalker {
@ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
/**
* A dbSNP VCF file from which to annotate.
*
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
@ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }

View File

@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers.varianteval;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
@ -27,7 +27,6 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -46,6 +45,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Output
protected PrintStream out;
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
public List<RodBinding<VariantContext>> evals;
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
// Help arguments
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit")
protected Boolean LIST = false;
@ -61,7 +69,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
protected Set<String> SAMPLE_EXPRESSIONS;
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected String[] KNOWN_NAMES = {DbSNPHelper.STANDARD_DBSNP_TRACK_NAME};
protected String[] KNOWN_NAMES = {dbsnp.dbsnp.getName()};
// Stratification arguments
@Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false)
@ -115,6 +123,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// The set of all possible evaluation contexts
private HashMap<StateKey, NewEvaluationContext> evaluationContexts = null;
// important stratifications
private boolean byFilterIsEnabled = false;
private boolean perSampleIsEnabled = false;
// Output report
private GATKReport report = null;
@ -134,27 +146,21 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Just list the modules, and exit quickly.
if (LIST) { variantEvalUtils.listModulesAndExit(); }
// Categorize each rod as an eval or a comp rod.
for ( ReferenceOrderedDataSource d : this.getToolkit().getRodDataSources() ) {
if ( d.getName().startsWith("eval") ) {
evalNames.add(d.getName());
} else if ( d.getName().startsWith("comp") || d.getName().startsWith(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
compNames.add(d.getName());
} else {
logger.info(String.format("Not evaluating ROD binding '%s' because the name did not start with %s, comp, or eval", d.getName(), Utils.join(", ", KNOWN_NAMES)));
}
}
// Barf if we don't have any eval tracks.
if (evalNames.size() == 0) {
throw new UserException("No evaluation tracks were specified. Please bind one or more callsets to evaluate using the -B argument with a trackname that starts with the word 'eval'.");
}
// Add a dummy comp track if none exists
if (compNames.size() == 0) {
compNames.add("none");
if ( comps.size() == 0 ) {
comps.add(new RodBinding(VariantContext.class, "none", "UNBOUND", "", new Tags()));
}
// Cache the rod names
for ( RodBinding<VariantContext> compRod : comps )
compNames.add(compRod.getName());
for ( RodBinding<VariantContext> evalRod : evals )
evalNames.add(evalRod.getName());
if ( dbsnp.dbsnp.isBound() )
compNames.add(dbsnp.dbsnp.getName());
// Set up set of known names
knownNames.addAll(Arrays.asList(KNOWN_NAMES));
@ -190,6 +196,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Initialize the set of stratifications and evaluations to use
stratificationObjects = variantEvalUtils.initializeStratificationObjects(this, NO_STANDARD_STRATIFICATIONS, STRATIFICATIONS_TO_USE);
Set<Class<? extends VariantEvaluator>> evaluationObjects = variantEvalUtils.initializeEvaluationObjects(NO_STANDARD_MODULES, MODULES_TO_USE);
for ( VariantStratifier vs : getStratificationObjects() ) {
if ( vs.getClass().getSimpleName().equals("Filter") )
byFilterIsEnabled = true;
else if ( vs.getClass().getSimpleName().equals("Sample") )
perSampleIsEnabled = true;
}
// Initialize the evaluation contexts
evaluationContexts = variantEvalUtils.initializeEvaluationContexts(stratificationObjects, evaluationObjects, null, null);
@ -221,15 +233,68 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if (tracker != null) {
String aastr = (ancestralAlignments == null) ? null : new String(ancestralAlignments.getSubsequenceAt(ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop()).getBases());
// track sample vc
HashMap<String, HashMap<String, VariantContext>> vcs = variantEvalUtils.getVariantContexts(tracker, ref, compNames, evalNames, typesToUse != null);
// --------- track --------- sample - VariantContexts -
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> evalVCs = variantEvalUtils.bindVariantContexts(tracker, ref, evals, byFilterIsEnabled, true, perSampleIsEnabled);
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> compVCs = variantEvalUtils.bindVariantContexts(tracker, ref, comps, byFilterIsEnabled, false, false);
for ( String compName : compNames ) {
VariantContext comp = vcs.containsKey(compName) && vcs.get(compName) != null && vcs.get(compName).containsKey(ALL_SAMPLE_NAME) ? vcs.get(compName).get(ALL_SAMPLE_NAME) : null;
// for each eval track
for ( final RodBinding<VariantContext> evalRod : evals ) {
final HashMap<String, Set<VariantContext>> evalSet = evalVCs.containsKey(evalRod) ? evalVCs.get(evalRod) : new HashMap<String, Set<VariantContext>>(0);
for ( String evalName : evalNames ) {
for ( String sampleName : sampleNamesForStratification ) {
VariantContext eval = vcs.containsKey(evalName) && vcs.get(evalName) != null ? vcs.get(evalName).get(sampleName) : null;
// for each sample stratifier
for ( final String sampleName : sampleNamesForStratification ) {
Set<VariantContext> evalSetBySample = evalSet.get(sampleName);
if ( evalSetBySample == null ) {
evalSetBySample = new HashSet<VariantContext>(1);
evalSetBySample.add(null);
}
// for each eval in the track
for ( VariantContext eval : evalSetBySample ) {
// deal with ancestral alleles if requested
if ( eval != null && aastr != null ) {
HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
newAts.put("ANCESTRALALLELE", aastr);
eval = VariantContext.modifyAttributes(eval, newAts);
}
// for each comp track
for ( final RodBinding<VariantContext> compRod : comps ) {
// no sample stratification for comps
final Set<VariantContext> compSet = compVCs.get(compRod) == null ? new HashSet<VariantContext>(0) : compVCs.get(compRod).values().iterator().next();
// find the comp
final VariantContext comp = findMatchingComp(eval, compSet);
HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
for ( VariantStratifier vs : stratificationObjects ) {
ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
for ( StateKey stateKey : stateKeysHash ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey);
// eval against the comp
synchronized (nec) {
nec.apply(tracker, ref, context, comp, eval);
}
// eval=null against all comps of different type
for ( VariantContext otherComp : compSet ) {
if ( otherComp != comp ) {
synchronized (nec) {
nec.apply(tracker, ref, context, otherComp, null);
}
}
}
}
}
// todo: Eric, this is really the problem. We select single eval and comp VCs independently
// todo: discarding multiple eval tracks at the sites and not providing matched comps
@ -247,37 +312,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// todo: like subset to sample, etc. So you probably will want a master map that maps
// todo: from special eval bindings to the digested VC for efficiency.
if ( typesToUse != null ) {
if ( eval != null && ! typesToUse.contains(eval.getType()) ) eval = null;
if ( comp != null && ! typesToUse.contains(comp.getType()) ) comp = null;
// if ( eval != null ) logger.info("Keeping " + eval);
}
if (eval != null && aastr != null) {
HashMap<String, Object> newAts = new HashMap<String, Object>(eval.getAttributes());
newAts.put("ANCESTRALALLELE", aastr);
eval = VariantContext.modifyAttributes(eval, newAts);
}
HashMap<VariantStratifier, ArrayList<String>> stateMap = new HashMap<VariantStratifier, ArrayList<String>>();
for ( VariantStratifier vs : stratificationObjects ) {
ArrayList<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
for ( StateKey stateKey : stateKeysHash ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey);
synchronized (nec) {
nec.apply(tracker, ref, context, comp, eval);
}
}
}
}
}
@ -286,6 +320,25 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return null;
}
private VariantContext findMatchingComp(final VariantContext eval, final Set<VariantContext> comps) {
// if no comps, return null
if ( comps == null || comps.isEmpty() )
return null;
// if no eval, return any comp
if ( eval == null )
return comps.iterator().next();
// find a matching comp
for ( VariantContext comp : comps ) {
if ( comp.getType() == eval.getType() )
return comp;
}
// if no matching comp, return null
return null;
}
public Integer treeReduce(Integer lhs, Integer rhs) { return null; }
@Override

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.util;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.report.GATKReport;
@ -56,8 +57,9 @@ public class VariantEvalUtils {
/**
* Initialize required, standard and user-specified stratification objects
*
* @param noStandardStrats don't use the standard stratifications
* @param modulesToUse the list of stratification modules to use
* @param variantEvalWalker the parent walker
* @param noStandardStrats don't use the standard stratifications
* @param modulesToUse the list of stratification modules to use
* @return set of stratifications to use
*/
public TreeSet<VariantStratifier> initializeStratificationObjects(VariantEvalWalker variantEvalWalker, boolean noStandardStrats, String[] modulesToUse) {
@ -256,23 +258,6 @@ public class VariantEvalUtils {
return report;
}
/**
* Figure out what the allowable variation types are based on the eval context
*
* @param tracker the reference metadata tracker
* @param ref the reference context
* @param compNames the comp track names
* @param evalNames the evaluation track names
* @return the set of allowable variation types
*/
public EnumSet<VariantContext.Type> getAllowableVariationTypes(RefMetaDataTracker tracker,
ReferenceContext ref,
Set<String> compNames,
Set<String> evalNames,
boolean dynamicSelectTypes ) {
return EnumSet.allOf(VariantContext.Type.class);
}
/**
* Subset a VariantContext to a single sample
*
@ -321,78 +306,59 @@ public class VariantEvalUtils {
*
* @param tracker the metadata tracker
* @param ref the reference context
* @param trackNames the list of track names to process
* @param allowableTypes a set of allowable variation types
* @param tracks the list of tracks to process
* @param byFilter if false, only accept PASSing VariantContexts. Otherwise, accept both PASSing and filtered
* sites
* @param subsetBySample if false, do not separate the track into per-sample VCs
* @param trackPerSample if false, don't stratify per sample (and don't cut up the VariantContext like we would need
* to do this)
* @return a mapping of track names to a list of VariantContext objects
*
* @return the mapping of track to VC list that should be populated
*/
protected void bindVariantContexts(HashMap<String, HashMap<String, VariantContext>> bindings, RefMetaDataTracker tracker, ReferenceContext ref, Set<String> trackNames, EnumSet<VariantContext.Type> allowableTypes, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
for (String trackName : trackNames) {
HashMap<String, VariantContext> vcs = new HashMap<String, VariantContext>();
public HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, List<RodBinding<VariantContext>> tracks, boolean byFilter, boolean subsetBySample, boolean trackPerSample) {
if ( tracker == null )
return null;
VariantContext vc = tracker == null ? null : tracker.getFirstValue(VariantContext.class, trackName, ref.getLocus());
HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>> bindings = new HashMap<RodBinding<VariantContext>, HashMap<String, Set<VariantContext>>>();
// First, filter the VariantContext to represent only the samples for evaluation
if (vc != null) {
for ( RodBinding<VariantContext> track : tracks ) {
HashMap<String, Set<VariantContext>> mapping = new HashMap<String, Set<VariantContext>>();
for ( VariantContext vc : tracker.getValues(track, ref.getLocus()) ) {
// First, filter the VariantContext to represent only the samples for evaluation
VariantContext vcsub = vc;
if (subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation())) {
if ( subsetBySample && vc.hasGenotypes() && vc.hasGenotypes(variantEvalWalker.getSampleNamesForEvaluation()) ) {
vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation());
}
if ((byFilter || !vcsub.isFiltered())) {
vcs.put(VariantEvalWalker.getAllSampleName(), vcsub);
if ( (byFilter || !vcsub.isFiltered()) ) {
addMapping(mapping, VariantEvalWalker.getAllSampleName(), vcsub);
}
// Now, if stratifying, split the subsetted vc per sample and add each as a new context
if (vc.hasGenotypes() && trackPerSample) {
for (String sampleName : variantEvalWalker.getSampleNamesForEvaluation()) {
if ( vc.hasGenotypes() && trackPerSample ) {
for ( String sampleName : variantEvalWalker.getSampleNamesForEvaluation() ) {
VariantContext samplevc = getSubsetOfVariantContext(vc, sampleName);
if ((byFilter || !samplevc.isFiltered())) {
vcs.put(sampleName, samplevc);
if ( byFilter || !samplevc.isFiltered() ) {
addMapping(mapping, sampleName, samplevc);
}
}
}
bindings.put(trackName, vcs);
bindings.put(track, mapping);
}
}
return bindings;
}
/**
* Maps track names to sample name to VariantContext objects. For eval tracks, VariantContexts per specified sample
* are also included.
*
* @param tracker the metadata tracker
* @param ref the reference context
* @param compNames the list of comp names to process
* @param evalNames the list of eval names to process
* @return a mapping of track names to a list of VariantContext objects
*/
public HashMap<String, HashMap<String, VariantContext>> getVariantContexts(RefMetaDataTracker tracker, ReferenceContext ref, Set<String> compNames, Set<String> evalNames, boolean dynamicSelectTypes) {
HashMap<String, HashMap<String, VariantContext>> vcs = new HashMap<String, HashMap<String, VariantContext>>();
EnumSet<VariantContext.Type> allowableTypes = getAllowableVariationTypes(tracker, ref, compNames, evalNames, dynamicSelectTypes);
boolean byFilter = false;
boolean perSampleIsEnabled = false;
for (VariantStratifier vs : variantEvalWalker.getStratificationObjects()) {
if (vs.getClass().getSimpleName().equals("Filter")) {
byFilter = true;
} else if (vs.getClass().getSimpleName().equals("Sample")) {
perSampleIsEnabled = true;
}
}
bindVariantContexts(vcs, tracker, ref, evalNames, allowableTypes, byFilter, true, perSampleIsEnabled);
bindVariantContexts(vcs, tracker, ref, compNames, allowableTypes, byFilter, false, false);
return vcs;
private void addMapping(HashMap<String, Set<VariantContext>> mappings, String sample, VariantContext vc) {
if ( !mappings.containsKey(sample) )
mappings.put(sample, new HashSet<VariantContext>());
mappings.get(sample).add(vc);
}
/**