Users can now use VariantAnnotator to add annotations from one VCF to another. For example, if you want to annotate your target VCF with the AC field value from the rod bound to CEU1kg, you can specify -E CEU1kg.AC and records will be annotated with CEU1kg.AC=N when a record exists in that rod at the given position.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4598 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-29 16:38:31 +00:00
parent 1b3fc8ddd2
commit 1c056ea791
3 changed files with 119 additions and 57 deletions

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
@ -40,7 +41,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.classloader.PackageUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.CommandLineUtils;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
@ -68,15 +68,20 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected List<String> annotationGroupsToUse = new ArrayList<String>();
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
protected List<String> expressionsToUse = new ArrayList<String>();
@Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false)
protected Boolean USE_ALL_ANNOTATIONS = false;
@Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
protected Boolean LIST = false;
@Hidden
@Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
protected String ASSUME_SINGLE_SAMPLE = null;
@Hidden
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
@ -132,7 +137,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
if ( USE_ALL_ANNOTATIONS )
engine = new VariantAnnotatorEngine(getToolkit());
else
engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse);
engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse, expressionsToUse);
// setup the header fields
// note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones

View File

@ -60,6 +60,7 @@ public class VariantAnnotatorEngine {
private List<InfoFieldAnnotation> requestedInfoAnnotations;
private List<GenotypeAnnotation> requestedGenotypeAnnotations;
private List<VAExpression> requestedExpressions = new ArrayList<VAExpression>();
private HashMap<String, String> dbAnnotations = new HashMap<String, String>();
@ -77,17 +78,44 @@ public class VariantAnnotatorEngine {
// printing out stats at the end.
private Map<String, Integer> inputTableHitCounter = new HashMap<String, Integer>();
private static class VAExpression {
public String fullName, bindingName, fieldName;
public VAExpression(String fullEpression) {
int indexOfDot = fullEpression.lastIndexOf(".");
if ( indexOfDot == -1 )
throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format");
fullName = fullEpression;
bindingName = fullEpression.substring(0, indexOfDot);
fieldName = fullEpression.substring(indexOfDot+1);
}
}
// use this constructor if you want all possible annotations
public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
requestedInfoAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(InfoFieldAnnotation.class);
requestedGenotypeAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(GenotypeAnnotation.class);
initialize(engine);
initializeDBs(engine);
}
// use this constructor if you want to select specific annotations (and/or interfaces)
// use this constructor if you want to select specific annotations (and/or interfaces), but no expressions
public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List<String> annotationGroupsToUse, List<String> annotationsToUse) {
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
initializeDBs(engine);
}
// use this constructor if you want to select specific annotations (and/or interfaces) plus expressions
public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> expressionsToUse) {
initializeAnnotations(annotationGroupsToUse, annotationsToUse);
initializeDBs(engine);
// set up the expressions
for ( String expression : expressionsToUse )
requestedExpressions.add(new VAExpression(expression));
}
private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse) {
// create a map for all annotation classes which implement our top-level interfaces
HashMap<String, Class> classMap = new HashMap<String, Class>();
for ( Class c : PackageUtils.getClassesImplementingInterface(InfoFieldAnnotation.class) )
@ -130,11 +158,9 @@ public class VariantAnnotatorEngine {
if ( GenotypeAnnotation.class.isAssignableFrom(c) )
requestedGenotypeAnnotations.add((GenotypeAnnotation)PackageUtils.getSimpleInstance(c));
}
initialize(engine);
}
private void initialize(GenomeAnalysisEngine engine) {
private void initializeDBs(GenomeAnalysisEngine engine) {
// check to see whether comp rods were included
List<ReferenceOrderedDataSource> dataSources = engine.getRodDataSources();
@ -180,6 +206,50 @@ public class VariantAnnotatorEngine {
Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
// annotate db occurrences
annotateDBs(tracker, ref, vc, infoAnnotations);
// annotate expressions where available
annotateExpressions(tracker, ref, infoAnnotations);
// process the info field
List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
infoAnnotationOutputsList.add(new LinkedHashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line.
infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in
// go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
{
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
if ( annotationsFromCurrentType == null ) {
continue;
}
if(annotationType instanceof GenomicAnnotation)
{
infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType );
}
else
{
// add the annotations to each output line.
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
infoAnnotationOutput.putAll(annotationsFromCurrentType);
}
}
}
// annotate genotypes
Map<String, Genotype> genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc);
// create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) );
}
return returnValue;
}
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
for ( Map.Entry<String, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
@ -198,64 +268,43 @@ public class VariantAnnotatorEngine {
infoAnnotations.put(dbSet.getValue(), overlapsComp ? true : false);
}
}
}
//Process the info field
List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
infoAnnotationOutputsList.add(new LinkedHashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line.
infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in
private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
for ( VAExpression expression : requestedExpressions ) {
Collection<VariantContext> VCs = tracker.getVariantContexts(ref, expression.bindingName, null, ref.getLocus(), false, true);
if ( VCs.size() == 0 )
continue;
//go through all the requested info annotationTypes
for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
{
Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
if ( annotationsFromCurrentType == null ) {
VariantContext vc = VCs.iterator().next();
if ( vc.hasAttribute(expression.fieldName) )
infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
}
}
private Map<String, Genotype> annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
if ( requestedGenotypeAnnotations.size() == 0 )
return vc.getGenotypes();
Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc.getNSamples());
for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
Genotype genotype = g.getValue();
StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey());
if ( context == null ) {
genotypes.put(g.getKey(), genotype);
continue;
}
if(annotationType instanceof GenomicAnnotation)
{
infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType );
}
else
{
//add the annotations to each output line.
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
infoAnnotationOutput.putAll(annotationsFromCurrentType);
}
Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
Map<String, Object> result = annotation.annotate(tracker, ref, context, vc, genotype);
if ( result != null )
genotypeAnnotations.putAll(result);
}
genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased()));
}
//Process genotypes
Map<String, Genotype> genotypes;
if ( requestedGenotypeAnnotations.size() == 0 ) {
genotypes = vc.getGenotypes();
} else {
genotypes = new HashMap<String, Genotype>(vc.getNSamples());
for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
Genotype genotype = g.getValue();
StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey());
if ( context == null ) {
genotypes.put(g.getKey(), genotype);
continue;
}
Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
Map<String, Object> result = annotation.annotate(tracker, ref, context, vc, genotype);
if ( result != null )
genotypeAnnotations.putAll(result);
}
genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased()));
}
}
//Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) );
}
return returnValue;
return genotypes;
}
// Finish processing data from GenomicAnnotation.

View File

@ -107,6 +107,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
executeTest("getting DB tag with HM3", spec);
}
@Test
public void testUsingExpression() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString() + " -B:foo,VCF " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1,
Arrays.asList("9362ae272d5e2ff2cfe7db307a324b6d"));
executeTest("using expression", spec);
}
@Test
public void testTabixAnnotations() {
final String MD5 = "6c7a6a1c0027bf82656542a9b2671a35";