Significant fixes for the Genomic Annotator.

1. Rip out all of Ben's code intended to circumvent the stable VCF Writer output system in multi-threaded mode (I threw up a little when 
I saw this code).  This will improve memory consumption when running with -nt.
2. Don't annotate indels or > bi-allelic sites.
3. Fix bug where not all records were making it into the output VCF.
4. General code clean up.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4118 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-08-25 20:16:50 +00:00
parent 41e53d37e1
commit 4678613893
2 changed files with 28 additions and 102 deletions

View File

@ -27,17 +27,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.annotator; package org.broadinstitute.sting.playground.gatk.walkers.annotator;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.Map.Entry; import java.util.Map.Entry;
import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.util.variantcontext.VariantContext;
@ -51,14 +41,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.walkers.By; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
@ -69,11 +54,9 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
* *
* For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator * For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/ */
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class)) @Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class))
//@Allows(value={DataSource.READS, DataSource.REFERENCE})
//@Reference(window=@Window(start=-50,stop=50))
@By(DataSource.REFERENCE) @By(DataSource.REFERENCE)
public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, LinkedList<VariantContext>> implements TreeReducible<LinkedList<VariantContext>> { public class GenomicAnnotator extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@Output(doc="File to which variants should be written",required=true) @Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null; protected VCFWriter vcfWriter = null;
@ -98,15 +81,11 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
private boolean strict = true; private boolean strict = true;
private boolean multiThreadedMode = false; //whether map will be called by more than one thread.
/** /**
* Prepare the output file and the list of available features. * Prepare the output file and the list of available features.
*/ */
public void initialize() { public void initialize() {
multiThreadedMode = getToolkit().getArguments().numberOfThreads > 1;
// get the list of all sample names from the various VCF input rods // get the list of all sample names from the various VCF input rods
TreeSet<String> samples = new TreeSet<String>(); TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>()); SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
@ -251,8 +230,7 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
* *
* @return 0 * @return 0
*/ */
public LinkedList<VariantContext> reduceInit() { return new LinkedList<VariantContext>(); } public Integer reduceInit() { return 0; }
/** /**
* We want reads that span deletions * We want reads that span deletions
@ -269,93 +247,43 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
* @param context the context for the given locus * @param context the context for the given locus
* @return 1 if the locus was successfully processed, 0 if otherwise * @return 1 if the locus was successfully processed, 0 if otherwise
*/ */
public LinkedList<VariantContext> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
LinkedList<VariantContext> result = new LinkedList<VariantContext>();
if ( tracker == null ) if ( tracker == null )
return result; return 0;
List<Object> rods = tracker.getReferenceMetaData("variant"); Set<VariantContext> results = new LinkedHashSet<VariantContext>();
// ignore places where we don't have a variant for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) {
if ( rods.size() == 0 ) if ( vc.isFiltered() ||
return result; (vc.isVariant() && (!vc.isSNP() || !vc.isBiallelic())) ) {
results.add(vc);
Object variant = rods.get(0);
if( BaseUtils.isNBase(ref.getBase()) ) {
return result; //TODO Currently, VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()) fails when base is 'N'. is this right?
}
VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant, ref);
if ( vc == null )
return result;
// if the reference base is not ambiguous, we can annotate
Collection<VariantContext> annotatedVCs = Arrays.asList(vc);
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
if ( stratifiedContexts != null ) {
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
}
}
if(multiThreadedMode) {
//keep results in memory, only writing them in onTraversalDone(..) after they have been merged via treeReduce(..)
for(VariantContext annotatedVC : annotatedVCs ) {
result.add(annotatedVC);
}
} else { } else {
//write results to disk immediately Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
for(VariantContext annotatedVC : annotatedVCs ) { if ( stratifiedContexts != null )
vcfWriter.add(annotatedVC,ref.getBase()); results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
else
results.add(vc);
} }
} }
for ( VariantContext vc : results )
vcfWriter.add(vc ,ref.getBase());
return result; return 1;
} }
public Integer reduce(Integer value, Integer sum) {
/** return sum + value;
* Merge lists.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public LinkedList<VariantContext> reduce(LinkedList<VariantContext> value, LinkedList<VariantContext> sum) {
sum.addAll(value);
return sum;
} }
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
/**
* Merge lists.
*/
public LinkedList<VariantContext> treeReduce(LinkedList<VariantContext> lhs, LinkedList<VariantContext> rhs) {
lhs.addAll(rhs);
return lhs;
} }
public void onTraversalDone(Integer sum) {
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param totalOutputRecords all VCs seen.
*/
public void onTraversalDone(LinkedList<VariantContext> totalOutputRecords) {
if(multiThreadedMode) {
//finally write results to disk
for(VariantContext vc : totalOutputRecords ) {
vcfWriter.add(vc, vc.getReference().getBases()[0]);
}
}
//out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords); //out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords);
Map<String, Integer> inputTableHitCounter = engine.getInputTableHitCounter(); Map<String, Integer> inputTableHitCounter = engine.getInputTableHitCounter();
for(Entry<String, Integer> e : inputTableHitCounter.entrySet()) { for ( Entry<String, Integer> e : inputTableHitCounter.entrySet() ) {
final String bindingName = e.getKey(); final String bindingName = e.getKey();
final int counter = e.getValue(); final int counter = e.getValue();
//final float percent = 100 * counter /(float) totalOutputVCFRecords; //final float percent = 100 * counter /(float) totalOutputVCFRecords;
@ -363,7 +291,5 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
System.out.printf(" %d annotated with %s.\n", counter, bindingName ); System.out.printf(" %d annotated with %s.\n", counter, bindingName );
} }
} }
} }

View File

@ -26,7 +26,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest {
*/ */
String[] md5WithDashSArg = {"94edacdaee0dd58508d35d4d6040e31b"}; String[] md5WithDashSArg = {"5942c1bdc736f016af248994e036777a"};
WalkerTestSpec specWithSArg = new WalkerTestSpec( WalkerTestSpec specWithSArg = new WalkerTestSpec(
"-T GenomicAnnotator -R " + b36KGReference + "-T GenomicAnnotator -R " + b36KGReference +
" -B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" + " -B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" +