Significant fixes for the Genomic Annotator.

1. Rip out all of Ben's code intended to circumvent the stable VCF Writer output system in multi-threaded mode (I threw up a little when 
I saw this code).  This will improve memory consumption when running with -nt.
2. Don't annotate indels or > bi-allelic sites.
3. Fix bug where not all records were making it into the output VCF.
4. General code clean up.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4118 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-08-25 20:16:50 +00:00
parent 41e53d37e1
commit 4678613893
2 changed files with 28 additions and 102 deletions

View File

@ -27,17 +27,7 @@
package org.broadinstitute.sting.playground.gatk.walkers.annotator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.*;
import java.util.Map.Entry;
import org.broad.tribble.util.variantcontext.VariantContext;
@ -51,14 +41,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.walkers.By;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.collections.Pair;
@ -69,11 +54,9 @@ import org.broadinstitute.sting.utils.vcf.VCFUtils;
*
* For details, see: http://www.broadinstitute.org/gsa/wiki/index.php/GenomicAnnotator
*/
//@Requires(value={DataSource.READS, DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class))
//@Allows(value={DataSource.READS, DataSource.REFERENCE})
//@Reference(window=@Window(start=-50,stop=50))
@Requires(value={DataSource.REFERENCE},referenceMetaData=@RMD(name="variant",type=VariantContext.class))
@By(DataSource.REFERENCE)
public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, LinkedList<VariantContext>> implements TreeReducible<LinkedList<VariantContext>> {
public class GenomicAnnotator extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@ -98,15 +81,11 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
private boolean strict = true;
private boolean multiThreadedMode = false; //whether map will be called by more than one thread.
/**
* Prepare the output file and the list of available features.
*/
public void initialize() {
multiThreadedMode = getToolkit().getArguments().numberOfThreads > 1;
// get the list of all sample names from the various VCF input rods
TreeSet<String> samples = new TreeSet<String>();
SampleUtils.getUniquifiedSamplesFromRods(getToolkit(), samples, new HashMap<Pair<String, String>, String>());
@ -251,8 +230,7 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
*
* @return 0
*/
public LinkedList<VariantContext> reduceInit() { return new LinkedList<VariantContext>(); }
public Integer reduceInit() { return 0; }
/**
* We want reads that span deletions
@ -269,93 +247,43 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
* @param context the context for the given locus
* @return 1 if the locus was successfully processed, 0 if otherwise
*/
public LinkedList<VariantContext> map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
LinkedList<VariantContext> result = new LinkedList<VariantContext>();
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return result;
return 0;
List<Object> rods = tracker.getReferenceMetaData("variant");
// ignore places where we don't have a variant
if ( rods.size() == 0 )
return result;
Object variant = rods.get(0);
if( BaseUtils.isNBase(ref.getBase()) ) {
return result; //TODO Currently, VariantContextAdaptors.toVCF(annotatedVC, ref.getBase()) fails when base is 'N'. is this right?
}
VariantContext vc = VariantContextAdaptors.toVariantContext("variant", variant, ref);
if ( vc == null )
return result;
// if the reference base is not ambiguous, we can annotate
Collection<VariantContext> annotatedVCs = Arrays.asList(vc);
if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
if ( stratifiedContexts != null ) {
annotatedVCs = engine.annotateContext(tracker, ref, stratifiedContexts, vc);
Set<VariantContext> results = new LinkedHashSet<VariantContext>();
for (VariantContext vc : tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false)) {
if ( vc.isFiltered() ||
(vc.isVariant() && (!vc.isSNP() || !vc.isBiallelic())) ) {
results.add(vc);
} else {
Map<String, StratifiedAlignmentContext> stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getBasePileup());
if ( stratifiedContexts != null )
results.addAll(engine.annotateContext(tracker, ref, stratifiedContexts, vc));
else
results.add(vc);
}
}
if(multiThreadedMode) {
//keep results in memory, only writing them in onTraversalDone(..) after they have been merged via treeReduce(..)
for(VariantContext annotatedVC : annotatedVCs ) {
result.add(annotatedVC);
}
} else {
//write results to disk immediately
for(VariantContext annotatedVC : annotatedVCs ) {
vcfWriter.add(annotatedVC,ref.getBase());
}
}
for ( VariantContext vc : results )
vcfWriter.add(vc ,ref.getBase());
return result;
return 1;
}
/**
* Merge lists.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public LinkedList<VariantContext> reduce(LinkedList<VariantContext> value, LinkedList<VariantContext> sum) {
sum.addAll(value);
return sum;
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Merge lists.
*/
public LinkedList<VariantContext> treeReduce(LinkedList<VariantContext> lhs, LinkedList<VariantContext> rhs) {
lhs.addAll(rhs);
return lhs;
public Integer treeReduce(Integer lhs, Integer rhs) {
return lhs + rhs;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param totalOutputRecords all VCs seen.
*/
public void onTraversalDone(LinkedList<VariantContext> totalOutputRecords) {
if(multiThreadedMode) {
//finally write results to disk
for(VariantContext vc : totalOutputRecords ) {
vcfWriter.add(vc, vc.getReference().getBases()[0]);
}
}
public void onTraversalDone(Integer sum) {
//out.printf("Generated %d annotated VCF records.\n", totalOutputVCFRecords);
Map<String, Integer> inputTableHitCounter = engine.getInputTableHitCounter();
for(Entry<String, Integer> e : inputTableHitCounter.entrySet()) {
for ( Entry<String, Integer> e : inputTableHitCounter.entrySet() ) {
final String bindingName = e.getKey();
final int counter = e.getValue();
//final float percent = 100 * counter /(float) totalOutputVCFRecords;
@ -363,7 +291,5 @@ public class GenomicAnnotator extends RodWalker<LinkedList<VariantContext>, Link
System.out.printf(" %d annotated with %s.\n", counter, bindingName );
}
}
}

View File

@ -26,7 +26,7 @@ public class GenomicAnnotatorIntegrationTest extends WalkerTest {
*/
String[] md5WithDashSArg = {"94edacdaee0dd58508d35d4d6040e31b"};
String[] md5WithDashSArg = {"5942c1bdc736f016af248994e036777a"};
WalkerTestSpec specWithSArg = new WalkerTestSpec(
"-T GenomicAnnotator -R " + b36KGReference +
" -B:variant,vcf /humgen/gsa-hpprojects/GATK/data/Annotations/examples/CEU_hapmap_nogt_23_subset.vcf" +