From 1c056ea791e94c4f8095604ef7558fa635c76dc2 Mon Sep 17 00:00:00 2001
From: ebanks <ebanks@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 29 Oct 2010 16:38:31 +0000
Subject: [PATCH] Users can now use VariantAnnotator to add annotations from
 one VCF to another.  For example, if you want to annotate your target VCF
 with the AC field value from the rod bound to CEU1kg, you can specify -E
 CEU1kg.AC and records will be annotated with CEU1kg.AC=N when a record exists
 in that rod at the given position.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4598 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../walkers/annotator/VariantAnnotator.java   |   9 +-
 .../annotator/VariantAnnotatorEngine.java     | 159 ++++++++++++------
 .../VariantAnnotatorIntegrationTest.java      |   8 +
 3 files changed, 119 insertions(+), 57 deletions(-)
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index b70b00f94..d2941c637 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator;
 
 import org.broad.tribble.util.variantcontext.VariantContext;
 import org.broad.tribble.vcf.*;
+import org.broadinstitute.sting.commandline.Hidden;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext;
@@ -40,7 +41,6 @@ import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.classloader.PackageUtils;
 import org.broadinstitute.sting.utils.SampleUtils;
 import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.CommandLineUtils;
 import org.broadinstitute.sting.commandline.Output;
 import org.broadinstitute.sting.utils.vcf.VCFUtils;
 
@@ -68,15 +68,20 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
     @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
     protected List<String> annotationGroupsToUse = new ArrayList<String>();
 
+    @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
+    protected List<String> expressionsToUse = new ArrayList<String>();
+
     @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false)
     protected Boolean USE_ALL_ANNOTATIONS = false;
 
     @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
     protected Boolean LIST = false;
 
+    @Hidden
     @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
     protected String ASSUME_SINGLE_SAMPLE = null;
 
+    @Hidden
     @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
     protected boolean indelsOnly = false;
 
@@ -132,7 +137,7 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> {
         if ( USE_ALL_ANNOTATIONS )
             engine = new VariantAnnotatorEngine(getToolkit());
         else
-            engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse);
+            engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse, expressionsToUse);
 
         // setup the header fields
         // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones
diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index 3cded6029..df892075e 100755
--- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -60,6 +60,7 @@ public class VariantAnnotatorEngine {
 
     private List<InfoFieldAnnotation> requestedInfoAnnotations;
     private List<GenotypeAnnotation> requestedGenotypeAnnotations;
+    private List<VAExpression> requestedExpressions = new ArrayList<VAExpression>();
 
     private HashMap<String, String> dbAnnotations = new HashMap<String, String>();
 
@@ -77,17 +78,44 @@ public class VariantAnnotatorEngine {
     // printing out stats at the end.
     private Map<String, Integer> inputTableHitCounter = new HashMap<String, Integer>();
 
+    private static class VAExpression {
+        public String fullName, bindingName, fieldName;
+
+        public VAExpression(String fullEpression) {
+            int indexOfDot = fullEpression.lastIndexOf(".");
+            if ( indexOfDot == -1 )
+                throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format");
+
+            fullName = fullEpression;
+            bindingName = fullEpression.substring(0, indexOfDot);
+            fieldName = fullEpression.substring(indexOfDot+1);
+        }
+    }
 
     // use this constructor if you want all possible annotations
     public VariantAnnotatorEngine(GenomeAnalysisEngine engine) {
         requestedInfoAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(InfoFieldAnnotation.class);
         requestedGenotypeAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(GenotypeAnnotation.class);
-        initialize(engine);
+        initializeDBs(engine);
     }
 
-    // use this constructor if you want to select specific annotations (and/or interfaces)
+    // use this constructor if you want to select specific annotations (and/or interfaces), but no expressions
     public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List<String> annotationGroupsToUse, List<String> annotationsToUse) {
+        initializeAnnotations(annotationGroupsToUse, annotationsToUse);
+        initializeDBs(engine);
+    }
 
+    // use this constructor if you want to select specific annotations (and/or interfaces) plus expressions
+    public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List<String> annotationGroupsToUse, List<String> annotationsToUse, List<String> expressionsToUse) {
+        initializeAnnotations(annotationGroupsToUse, annotationsToUse);
+        initializeDBs(engine);
+
+        // set up the expressions
+        for ( String expression : expressionsToUse )
+            requestedExpressions.add(new VAExpression(expression));
+    }
+
+    private void initializeAnnotations(List<String> annotationGroupsToUse, List<String> annotationsToUse) {
         // create a map for all annotation classes which implement our top-level interfaces
         HashMap<String, Class> classMap = new HashMap<String, Class>();
         for ( Class c : PackageUtils.getClassesImplementingInterface(InfoFieldAnnotation.class) )
@@ -130,11 +158,9 @@ public class VariantAnnotatorEngine {
             if ( GenotypeAnnotation.class.isAssignableFrom(c) )
                 requestedGenotypeAnnotations.add((GenotypeAnnotation)PackageUtils.getSimpleInstance(c));
         }
-
-        initialize(engine);
     }
 
-    private void initialize(GenomeAnalysisEngine engine) {
+    private void initializeDBs(GenomeAnalysisEngine engine) {
 
         // check to see whether comp rods were included
         List<ReferenceOrderedDataSource> dataSources = engine.getRodDataSources();
@@ -180,6 +206,50 @@ public class VariantAnnotatorEngine {
         Map<String, Object> infoAnnotations = new LinkedHashMap<String, Object>(vc.getAttributes());
 
         // annotate db occurrences
+        annotateDBs(tracker, ref, vc, infoAnnotations);
+
+        // annotate expressions where available
+        annotateExpressions(tracker, ref, infoAnnotations);
+
+        // process the info field
+        List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
+        infoAnnotationOutputsList.add(new LinkedHashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line.
+        infoAnnotationOutputsList.get(0).putAll(infoAnnotations);  // put the DB membership info in
+
+        // go through all the requested info annotationTypes
+        for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
+        {
+            Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
+            if ( annotationsFromCurrentType == null ) {
+                continue;
+            }
+
+            if(annotationType instanceof GenomicAnnotation)
+            {
+                infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType );
+            }
+            else
+            {
+                // add the annotations to each output line.
+                for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+                    infoAnnotationOutput.putAll(annotationsFromCurrentType);
+                }
+            }
+        }
+
+        // annotate genotypes
+        Map<String, Genotype> genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc);
+
+        // create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
+        Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
+        for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
+            returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) );
+        }
+
+        return returnValue;
+    }
+
+    private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
         for ( Map.Entry<String, String> dbSet : dbAnnotations.entrySet() ) {
             if ( dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                 DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME));
@@ -198,64 +268,43 @@ public class VariantAnnotatorEngine {
                 infoAnnotations.put(dbSet.getValue(), overlapsComp ? true : false);
             }
         }
+    }
 
-        //Process the info field
-        List<Map<String, Object>> infoAnnotationOutputsList = new LinkedList<Map<String, Object>>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file
-        infoAnnotationOutputsList.add(new LinkedHashMap<String, Object>(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line.
-        infoAnnotationOutputsList.get(0).putAll(infoAnnotations);  // put the DB membership info in
+    private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, Object> infoAnnotations) {
+        for ( VAExpression expression : requestedExpressions ) {
+            Collection<VariantContext> VCs = tracker.getVariantContexts(ref, expression.bindingName, null, ref.getLocus(), false, true);
+            if ( VCs.size() == 0 )
+                continue;
 
-        //go through all the requested info annotationTypes
-        for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations )
-        {
-            Map<String, Object> annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc);
-            if ( annotationsFromCurrentType == null ) {
+            VariantContext vc = VCs.iterator().next();
+            if ( vc.hasAttribute(expression.fieldName) )
+                infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName));
+        }
+    }
+
+    private Map<String, Genotype> annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map<String, StratifiedAlignmentContext> stratifiedContexts, VariantContext vc) {
+        if ( requestedGenotypeAnnotations.size() == 0 )
+            return vc.getGenotypes();
+
+        Map<String, Genotype> genotypes = new HashMap<String, Genotype>(vc.getNSamples());
+        for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
+            Genotype genotype = g.getValue();
+            StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey());
+            if ( context == null ) {
+                genotypes.put(g.getKey(), genotype);
                 continue;
             }
 
-            if(annotationType instanceof GenomicAnnotation)
-            {
-                infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType );
-            }
-            else
-            {
-                //add the annotations to each output line.
-                for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
-                    infoAnnotationOutput.putAll(annotationsFromCurrentType);
-                }
+            Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
+            for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
+                Map<String, Object> result = annotation.annotate(tracker, ref, context, vc, genotype);
+                if ( result != null )
+                    genotypeAnnotations.putAll(result);
             }
+            genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased()));
         }
 
-        //Process genotypes
-        Map<String, Genotype> genotypes;
-        if ( requestedGenotypeAnnotations.size() == 0 ) {
-            genotypes = vc.getGenotypes();
-        } else {
-            genotypes = new HashMap<String, Genotype>(vc.getNSamples());
-            for ( Map.Entry<String, Genotype> g : vc.getGenotypes().entrySet() ) {
-                Genotype genotype = g.getValue();
-                StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey());
-                if ( context == null ) {
-                    genotypes.put(g.getKey(), genotype);
-                    continue;
-                }
-
-                Map<String, Object> genotypeAnnotations = new HashMap<String, Object>(genotype.getAttributes());
-                for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) {
-                    Map<String, Object> result = annotation.annotate(tracker, ref, context, vc, genotype);
-                    if ( result != null )
-                        genotypeAnnotations.putAll(result);
-                }
-                genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased()));
-            }
-        }
-
-        //Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList
-        Collection<VariantContext> returnValue = new LinkedList<VariantContext>();
-        for(Map<String, Object> infoAnnotationOutput : infoAnnotationOutputsList) {
-            returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) );
-        }
-
-        return returnValue;
+        return genotypes;
     }
 
     // Finish processing data from GenomicAnnotation.
diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
index a46ed91c3..a797287d2 100755
--- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@@ -107,6 +107,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
         executeTest("getting DB tag with HM3", spec);
     }
 
+    @Test
+    public void testUsingExpression() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString() + " -B:foo,VCF " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1,
+                Arrays.asList("9362ae272d5e2ff2cfe7db307a324b6d"));
+        executeTest("using expression", spec);
+    }
+
     @Test
     public void testTabixAnnotations() {
         final String MD5 = "6c7a6a1c0027bf82656542a9b2671a35";