From 1c056ea791e94c4f8095604ef7558fa635c76dc2 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 29 Oct 2010 16:38:31 +0000 Subject: [PATCH] Users can now use VariantAnnotator to add annotations from one VCF to another. For example, if you want to annotate your target VCF with the AC field value from the rod bound to CEU1kg, you can specify -E CEU1kg.AC and records will be annotated with CEU1kg.AC=N when a record exists in that rod at the given position. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4598 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/annotator/VariantAnnotator.java | 9 +- .../annotator/VariantAnnotatorEngine.java | 159 ++++++++++++------ .../VariantAnnotatorIntegrationTest.java | 8 + 3 files changed, 119 insertions(+), 57 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index b70b00f94..d2941c637 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; @@ -40,7 +41,6 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.PackageUtils; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.CommandLineUtils; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.utils.vcf.VCFUtils; @@ -68,15 +68,20 @@ public class VariantAnnotator extends RodWalker { @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) protected List annotationGroupsToUse = new ArrayList(); + @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) + protected List expressionsToUse = new ArrayList(); + @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false) protected Boolean USE_ALL_ANNOTATIONS = false; @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; + @Hidden @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) protected String ASSUME_SINGLE_SAMPLE = null; + @Hidden @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; @@ -132,7 +137,7 @@ public class VariantAnnotator extends RodWalker { if ( USE_ALL_ANNOTATIONS ) engine = new VariantAnnotatorEngine(getToolkit()); else - engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse); + engine = new VariantAnnotatorEngine(getToolkit(), annotationGroupsToUse, annotationsToUse, expressionsToUse); // setup the header fields // note that if any of the definitions conflict with our new ones, then we want to overwrite the old ones diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 3cded6029..df892075e 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -60,6 +60,7 @@ public class VariantAnnotatorEngine { private List requestedInfoAnnotations; private List requestedGenotypeAnnotations; + private List requestedExpressions = new ArrayList(); private HashMap dbAnnotations = new HashMap(); @@ -77,17 +78,44 @@ public class VariantAnnotatorEngine { // printing out stats at the end. private Map inputTableHitCounter = new HashMap(); + private static class VAExpression { + public String fullName, bindingName, fieldName; + + public VAExpression(String fullEpression) { + int indexOfDot = fullEpression.lastIndexOf("."); + if ( indexOfDot == -1 ) + throw new UserException.BadArgumentValue(fullEpression, "it should be in rodname.value format"); + + fullName = fullEpression; + bindingName = fullEpression.substring(0, indexOfDot); + fieldName = fullEpression.substring(indexOfDot+1); + } + } // use this constructor if you want all possible annotations public VariantAnnotatorEngine(GenomeAnalysisEngine engine) { requestedInfoAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(InfoFieldAnnotation.class); requestedGenotypeAnnotations = PackageUtils.getInstancesOfClassesImplementingInterface(GenotypeAnnotation.class); - initialize(engine); + initializeDBs(engine); } - // use this constructor if you want to select specific annotations (and/or interfaces) + // use this constructor if you want to select specific annotations (and/or interfaces), but no expressions public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List annotationGroupsToUse, List annotationsToUse) { + initializeAnnotations(annotationGroupsToUse, annotationsToUse); + initializeDBs(engine); + } + // use this constructor if you want to select specific annotations (and/or interfaces) plus expressions + public VariantAnnotatorEngine(GenomeAnalysisEngine engine, List annotationGroupsToUse, List annotationsToUse, List expressionsToUse) { + initializeAnnotations(annotationGroupsToUse, annotationsToUse); + initializeDBs(engine); + + // set up the expressions + for ( String expression : expressionsToUse ) + requestedExpressions.add(new VAExpression(expression)); + } + + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse) { // create a map for all annotation classes which implement our top-level interfaces HashMap classMap = new HashMap(); for ( Class c : PackageUtils.getClassesImplementingInterface(InfoFieldAnnotation.class) ) @@ -130,11 +158,9 @@ public class VariantAnnotatorEngine { if ( GenotypeAnnotation.class.isAssignableFrom(c) ) requestedGenotypeAnnotations.add((GenotypeAnnotation)PackageUtils.getSimpleInstance(c)); } - - initialize(engine); } - private void initialize(GenomeAnalysisEngine engine) { + private void initializeDBs(GenomeAnalysisEngine engine) { // check to see whether comp rods were included List dataSources = engine.getRodDataSources(); @@ -180,6 +206,50 @@ public class VariantAnnotatorEngine { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences + annotateDBs(tracker, ref, vc, infoAnnotations); + + // annotate expressions where available + annotateExpressions(tracker, ref, infoAnnotations); + + // process the info field + List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file + infoAnnotationOutputsList.add(new LinkedHashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line. + infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in + + // go through all the requested info annotationTypes + for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) + { + Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); + if ( annotationsFromCurrentType == null ) { + continue; + } + + if(annotationType instanceof GenomicAnnotation) + { + infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType ); + } + else + { + // add the annotations to each output line. + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + infoAnnotationOutput.putAll(annotationsFromCurrentType); + } + } + } + + // annotate genotypes + Map genotypes = annotateGenotypes(tracker, ref, stratifiedContexts, vc); + + // create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList + Collection returnValue = new LinkedList(); + for(Map infoAnnotationOutput : infoAnnotationOutputsList) { + returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) ); + } + + return returnValue; + } + + private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { for ( Map.Entry dbSet : dbAnnotations.entrySet() ) { if ( dbSet.getKey().equals(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) { DbSNPFeature dbsnp = DbSNPHelper.getFirstRealSNP(tracker.getReferenceMetaData(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME)); @@ -198,64 +268,43 @@ public class VariantAnnotatorEngine { infoAnnotations.put(dbSet.getValue(), overlapsComp ? true : false); } } + } - //Process the info field - List> infoAnnotationOutputsList = new LinkedList>(); //each element in infoAnnotationOutputs corresponds to a single line in the output VCF file - infoAnnotationOutputsList.add(new LinkedHashMap(vc.getAttributes())); //keep the existing info-field annotations. After this infoAnnotationOutputsList.size() == 1, which means the output VCF file has 1 additional line. - infoAnnotationOutputsList.get(0).putAll(infoAnnotations); // put the DB membership info in + private void annotateExpressions(RefMetaDataTracker tracker, ReferenceContext ref, Map infoAnnotations) { + for ( VAExpression expression : requestedExpressions ) { + Collection VCs = tracker.getVariantContexts(ref, expression.bindingName, null, ref.getLocus(), false, true); + if ( VCs.size() == 0 ) + continue; - //go through all the requested info annotationTypes - for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) - { - Map annotationsFromCurrentType = annotationType.annotate(tracker, ref, stratifiedContexts, vc); - if ( annotationsFromCurrentType == null ) { + VariantContext vc = VCs.iterator().next(); + if ( vc.hasAttribute(expression.fieldName) ) + infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } + } + + private Map annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( requestedGenotypeAnnotations.size() == 0 ) + return vc.getGenotypes(); + + Map genotypes = new HashMap(vc.getNSamples()); + for ( Map.Entry g : vc.getGenotypes().entrySet() ) { + Genotype genotype = g.getValue(); + StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey()); + if ( context == null ) { + genotypes.put(g.getKey(), genotype); continue; } - if(annotationType instanceof GenomicAnnotation) - { - infoAnnotationOutputsList = processGenomicAnnotation( infoAnnotationOutputsList, annotationsFromCurrentType ); - } - else - { - //add the annotations to each output line. - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - infoAnnotationOutput.putAll(annotationsFromCurrentType); - } + Map genotypeAnnotations = new HashMap(genotype.getAttributes()); + for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + Map result = annotation.annotate(tracker, ref, context, vc, genotype); + if ( result != null ) + genotypeAnnotations.putAll(result); } + genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased())); } - //Process genotypes - Map genotypes; - if ( requestedGenotypeAnnotations.size() == 0 ) { - genotypes = vc.getGenotypes(); - } else { - genotypes = new HashMap(vc.getNSamples()); - for ( Map.Entry g : vc.getGenotypes().entrySet() ) { - Genotype genotype = g.getValue(); - StratifiedAlignmentContext context = stratifiedContexts.get(g.getKey()); - if ( context == null ) { - genotypes.put(g.getKey(), genotype); - continue; - } - - Map genotypeAnnotations = new HashMap(genotype.getAttributes()); - for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - Map result = annotation.annotate(tracker, ref, context, vc, genotype); - if ( result != null ) - genotypeAnnotations.putAll(result); - } - genotypes.put(g.getKey(), new Genotype(g.getKey(), genotype.getAlleles(), genotype.getNegLog10PError(), genotype.getFilters(), genotypeAnnotations, genotype.genotypesArePhased())); - } - } - - //Create a separate VariantContext (aka. output line) for each element in infoAnnotationOutputsList - Collection returnValue = new LinkedList(); - for(Map infoAnnotationOutput : infoAnnotationOutputsList) { - returnValue.add( new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, infoAnnotationOutput) ); - } - - return returnValue; + return genotypes; } // Finish processing data from GenomicAnnotation. diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index a46ed91c3..a797287d2 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -107,6 +107,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { executeTest("getting DB tag with HM3", spec); } + @Test + public void testUsingExpression() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -B:foo,VCF " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" -B:variant,VCF " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1, + Arrays.asList("9362ae272d5e2ff2cfe7db307a324b6d")); + executeTest("using expression", spec); + } + @Test public void testTabixAnnotations() { final String MD5 = "6c7a6a1c0027bf82656542a9b2671a35";